////////////////////////////////////////////////
// Predict
// NOTE(review): this section and the "Train" section below each declare their
// own usings, namespace and Main; presumably they are two separate projects.
////////////////////////////////////////////////
using Microsoft.ML.Data;
using Microsoft.ML;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace PredictCategorie
{
    /// <summary>
    /// One row of the semicolon-separated input file (columns CatSWM;ArticleFR;IDCategorie).
    /// </summary>
    public class InputObject
    {
        [LoadColumn(0)]
        public string CatSWM { get; set; }

        [LoadColumn(1)]
        public string ArticleFR { get; set; }

        // Exposed to ML.NET under the column name "Label".
        [LoadColumn(2)]
        [ColumnName("Label")]
        public float IDCategorie { get; set; }
    }

    /////////////// from Example BEGIN
    /// <summary>
    /// Minimal forward-only <see cref="IDataView"/> over a sequence of <see cref="InputObject"/>.
    /// Columns are, by index: 0 = CatSWM (text), 1 = ArticleFR (text), 2 = Label (Single).
    /// </summary>
    public class InputObjectDataView : IDataView
    {
        private readonly IEnumerable<InputObject> _data;

        /// <summary>The wrapped rows.</summary>
        public IEnumerable<InputObject> Data { get { return _data; } }

        /// <summary>Schema built once in the constructor.</summary>
        public DataViewSchema Schema { get; }

        /// <summary>Shuffling is not supported; the <c>rand</c> arguments below are ignored.</summary>
        public bool CanShuffle => false;

        /// <summary>Wraps <paramref name="data"/> and builds the three-column schema.</summary>
        /// <param name="data">Rows to expose.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public InputObjectDataView(IEnumerable<InputObject> data)
        {
            _data = data ?? throw new ArgumentNullException(nameof(data));

            var builder = new DataViewSchema.Builder();
            builder.AddColumn("CatSWM", TextDataViewType.Instance);
            builder.AddColumn("ArticleFR", TextDataViewType.Instance);
            // Named "Label" (not "IDCategorie") so the schema agrees with the
            // [ColumnName("Label")] attribute that the training program loads with;
            // the saved model's MapValueToKey("Label") step looks the column up by name.
            builder.AddColumn("Label", NumberDataViewType.Single);
            Schema = builder.ToSchema();
        }

        /// <summary>The row count is not known up front.</summary>
        public long? GetRowCount() => null;

        /// <summary>
        /// Creates a cursor in which only the columns listed in
        /// <paramref name="columnsNeeded"/> are active.
        /// </summary>
        public DataViewRowCursor GetRowCursor(
            IEnumerable<DataViewSchema.Column> columnsNeeded,
            Random rand = null)
            => new Cursor(
                this,
                columnsNeeded.Any(c => c.Index == 0),
                columnsNeeded.Any(c => c.Index == 1),
                columnsNeeded.Any(c => c.Index == 2));

        /// <summary>Always returns a single cursor; <paramref name="n"/> is ignored.</summary>
        public DataViewRowCursor[] GetRowCursorSet(
            IEnumerable<DataViewSchema.Column> columnsNeeded,
            int n,
            Random rand = null)
            => new[] { GetRowCursor(columnsNeeded, rand) };

        /// <summary>Cursor that walks the parent's enumerable one row at a time.</summary>
        public class Cursor : DataViewRowCursor
        {
            private bool _disposed;
            private long _position;
            private readonly IEnumerator<InputObject> _enumerator;

            // One slot per schema column; a null slot marks an inactive column.
            private readonly Delegate[] _getters;

            public override long Position => _position;
            public override long Batch => 0;
            public override DataViewSchema Schema { get; }

            public Cursor(
                InputObjectDataView parent,
                bool wantsCatSWM,
                bool wantsArticleFR,
                bool wantsIDCategorie)
            {
                Schema = parent.Schema;
                _position = -1;
                _enumerator = parent.Data.GetEnumerator();
                _getters = new Delegate[]
                {
                    wantsCatSWM
                        ? (ValueGetter<ReadOnlyMemory<char>>)CatSWMGetterImplementation
                        : null,
                    wantsArticleFR
                        ? (ValueGetter<ReadOnlyMemory<char>>)ArticleFRGetterImplementation
                        : null,
                    wantsIDCategorie
                        ? (ValueGetter<float>)IDCategorieGetterImplementation
                        : null
                };
            }

            protected override void Dispose(bool disposing)
            {
                if (_disposed)
                {
                    return;
                }

                if (disposing)
                {
                    _enumerator.Dispose();
                    _position = -1;
                }

                _disposed = true;
                base.Dispose(disposing);
            }

            private void CatSWMGetterImplementation(ref ReadOnlyMemory<char> value)
                => value = _enumerator.Current.CatSWM.AsMemory();

            private void ArticleFRGetterImplementation(ref ReadOnlyMemory<char> value)
                => value = _enumerator.Current.ArticleFR.AsMemory();

            private void IDCategorieGetterImplementation(ref float value)
                => value = _enumerator.Current.IDCategorie;

            private void IdGetterImplementation(ref DataViewRowId id)
                => id = new DataViewRowId((ulong)_position, 0);

            /// <summary>Returns the getter stored for <paramref name="column"/>.</summary>
            /// <exception cref="ArgumentOutOfRangeException">The column is not active.</exception>
            public override ValueGetter<TValue> GetGetter<TValue>(
                DataViewSchema.Column column)
            {
                if (!IsColumnActive(column))
                {
                    throw new ArgumentOutOfRangeException(nameof(column));
                }

                return (ValueGetter<TValue>)_getters[column.Index];
            }

            public override ValueGetter<DataViewRowId> GetIdGetter()
                => IdGetterImplementation;

            public override bool IsColumnActive(DataViewSchema.Column column)
                => _getters[column.Index] != null;

            /// <summary>Advances to the next row; disposes the cursor once the rows run out.</summary>
            public override bool MoveNext()
            {
                if (_disposed)
                {
                    return false;
                }

                if (_enumerator.MoveNext())
                {
                    _position++;
                    return true;
                }

                Dispose();
                return false;
            }
        }
    }
    /////////////// from Example END

    class Program
    {
        /// <summary>
        /// Reads the test CSV, applies the saved data preparation pipeline and the
        /// saved model, and writes one "CatSWM;ArticleFR;prediction" line per input row.
        /// </summary>
        static void Main(string[] args)
        {
            Console.WriteLine("On va trouver la catégorie!");

            // path and file location definition
            string filePath = new DirectoryInfo(Environment.CurrentDirectory).Parent.Parent.Parent.Parent.FullName + @"\MLwork\categorie\";
            string fileName = "cat_TestOpenIndexOnly_classification_04.10.2024";

            // file to write
            string filenameOut = $"{filePath}{fileName}_out.csv";
            if (File.Exists(filenameOut))
            {
                File.Delete(filenameOut);
            }

            using StreamWriter sw = File.CreateText(filenameOut);

            // the 1st line
            sw.WriteLine("CatSWM;ArticleFR;IDcategorie;LibelleCategorie;SMP");

            // file to open
            Console.WriteLine("Read input file");
            string filenameIn = $"{filePath}{fileName}.csv";
            string[] lines = System.IO.File.ReadAllLines(filenameIn);

            // read file and store
            // Column 0 is CatSWM and column 1 is ArticleFR (the header check below
            // relies on that order), so each value goes into the matching list.
            var cat = new List<string>();
            var art = new List<string>();
            foreach (string line in lines)
            {
                string[] lineElements = line.Split(';');

                // Skip blank or truncated lines instead of failing on a missing index.
                if (lineElements.Length < 2)
                {
                    continue;
                }

                // Skip the header row.
                if (lineElements[0] == "CatSWM")
                {
                    continue;
                }

                cat.Add(lineElements[0]);
                art.Add(lineElements[1]);
            }

            int count = cat.Count;

            // https://docs.microsoft.com/fr-fr/dotnet/machine-learning/how-to-guides/save-load-machine-learning-models-ml-net
            // Create MLContext
            MLContext mlContext = new MLContext();

            // Define data preparation and trained model schemas
            DataViewSchema dataPrepPipelineSchema, modelSchema;

            // Load data preparation pipeline
            Console.WriteLine("Load data preparation pipeline");
            string dataPrepName = "\\files\\data_preparation_pipeline_categorie";
            string dataPrepFile = $"{filePath}{dataPrepName}.zip";
            ITransformer dataPrepPipeline = mlContext.Model.Load(dataPrepFile, out dataPrepPipelineSchema);

            // Load Trained Model
            Console.WriteLine("Load Trained Model");
            string modelName = "\\files\\model_lbfgs";
            string modelFile = $"{filePath}{modelName}.zip";
            ITransformer trainedModel = mlContext.Model.Load(modelFile, out modelSchema);

            Console.WriteLine("Load IDataView");
            var categorieData = new List<InputObject>(count);
            for (int i = 0; i < count; i++)
            {
                categorieData.Add(new InputObject { CatSWM = cat[i], ArticleFR = art[i] });
            }

            var inputData = new InputObjectDataView(categorieData);

            // Predicted Data
            Console.WriteLine("Predict");

            // The model was trained on the output of the data preparation pipeline,
            // so the raw rows must go through that pipeline first to get "Features".
            IDataView preparedData = dataPrepPipeline.Transform(inputData);
            IDataView predictions = trainedModel.Transform(preparedData);
            float[] scoreColumn = predictions.GetColumn<float>("PredictedLabel").ToArray();

            /////////////////////////////
            // output
            Console.WriteLine("Write ouptput");
            for (int i = 0; i < count; i++)
            {
                if (i % 500 == 0)
                {
                    Console.WriteLine($"{i}/{count} DONE");
                }

                // Same column order as the header line: CatSWM;ArticleFR;prediction.
                sw.WriteLine($"{cat[i]};{art[i]};{scoreColumn[i]}");
            }
        }
    }
}

////////////////////////////////////////////////
// Train
////////////////////////////////////////////////
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace TrainCategorie
{
    /// <summary>
    /// One row of the semicolon-separated training file (columns CatSWM;ArticleFR;IDCategorie).
    /// </summary>
    public class InputObject
    {
        [LoadColumn(0)]
        public string CatSWM { get; set; }

        [LoadColumn(1)]
        public string ArticleFR { get; set; }

        // Exposed to ML.NET under the column name "Label".
        [LoadColumn(2)]
        [ColumnName("Label")]
        public float IDCategorie { get; set; }
    }

    class Program
    {
        /// <summary>
        /// Loads the training CSV, fits the data preparation pipeline, trains the
        /// L-BFGS multiclass model and saves both as zip files next to the input.
        /// </summary>
        static void Main(string[] args)
        {
            Console.WriteLine("On va entraîner la catégorie!!");

            // path and file location definition
            string filePath = new DirectoryInfo(Environment.CurrentDirectory).Parent.Parent.Parent.Parent.FullName + @"\MLwork\categorie\files";
            string fileName = "cat_TrainOpenIndex_classification_04.10.2024";

            // file to open
            Console.WriteLine("Read input file");
            string filenameIn = $"{filePath}\\{fileName}.csv";

            // https://docs.microsoft.com/fr-fr/dotnet/machine-learning/how-to-guides/load-data-ml-net
            // Create MLContext
            MLContext mlContext = new MLContext();

            // Load the raw training rows
            Console.WriteLine("Load Pipeline");
            IDataView rawData = mlContext.Data.LoadFromTextFile<InputObject>(filenameIn, separatorChar: ';', hasHeader: true);

            // https://docs.microsoft.com/fr-fr/dotnet/machine-learning/how-to-guides/train-machine-learning-model-ml-net
            // NOTE: no train/test split is done, so the accuracy printed later is
            // measured on the same rows the model was trained on.
            IDataView trainData = rawData;
            IDataView testData = rawData;

            // https://docs.microsoft.com/en-us/dotnet/machine-learning/how-to-guides/prepare-data-ml-net
            // Define Data Prep Estimator
            // 1. Featurize both text columns in place (str -> float vector)
            // 2. Concatenate ArticleFR and CatSWM into a single vector column called Features
            // 3. Normalize the Features vector
            // The text featurizers are part of this chain so that the saved data
            // preparation pipeline can be applied directly to raw text rows.
            Console.WriteLine("Convert str->float");
            Console.WriteLine("Concatenate Features");
            IEstimator<ITransformer> dataPrepEstimator =
                mlContext.Transforms.Text.FeaturizeText("ArticleFR")
                    .Append(mlContext.Transforms.Text.FeaturizeText("CatSWM"))
                    .Append(mlContext.Transforms.Concatenate("Features", "ArticleFR", "CatSWM"))
                    .Append(mlContext.Transforms.NormalizeMinMax("Features"));

            // Create data prep transformer
            ITransformer dataPrepTransformer = dataPrepEstimator.Fit(trainData);

            // Apply transforms to data
            Console.WriteLine("Prepare Train Data");
            IDataView transformedTrainingData = dataPrepTransformer.Transform(trainData);
            Console.WriteLine("Prepare Test Data");
            IDataView transformedTestData = dataPrepTransformer.Transform(testData);

            // Lbfgs needed Trainer
            Console.WriteLine("Train Lbfgs");
            TrainLbfgs(mlContext, transformedTrainingData, transformedTestData, filePath);

            // Save Data Prep transformer (its input schema is the raw row schema)
            Console.WriteLine("Save Data Prep transformer");
            mlContext.Model.Save(dataPrepTransformer, trainData.Schema, $"{filePath}\\data_preparation_pipeline_categorie.zip");
        }

        /// <summary>
        /// Alternative trainer kept from the tutorial: fits an SDCA regression on the
        /// prepared data and prints R squared. Not called by <see cref="Main"/>.
        /// </summary>
        private static void TrainSdca(MLContext mlContext, IDataView transformedTrainingData, IDataView transformedTestData)
        {
            // Define StochasticDualCoordinateAscent regression algorithm estimator
            Console.WriteLine("Build ML");
            var sdcaEstimator = mlContext.Regression.Trainers.Sdca();

            // Build machine learning model
            Console.WriteLine("Train ML");
            var trainedModel = sdcaEstimator.Fit(transformedTrainingData);

            // Measure trained model performance
            // Use trained model to make inferences on test data
            IDataView testDataPredictions = trainedModel.Transform(transformedTestData);

            // Extract model metrics and get RSquared
            Console.WriteLine("Evaluate Test Data");
            RegressionMetrics trainedModelMetrics = mlContext.Regression.Evaluate(testDataPredictions);
            double rSquared = trainedModelMetrics.RSquared;
            Console.WriteLine($"rSquared={rSquared}");
        }

        /// <summary>
        /// Trains an L-BFGS maximum entropy multiclass model on the prepared data,
        /// prints its micro accuracy on <paramref name="transformedTestData"/> and
        /// saves it as <c>model_lbfgs.zip</c> under <paramref name="file_path"/>.
        /// </summary>
        /// <param name="mlContext">Shared ML.NET context.</param>
        /// <param name="transformedTrainingData">Prepared rows with "Label" and "Features".</param>
        /// <param name="transformedTestData">Prepared rows used for evaluation.</param>
        /// <param name="file_path">Directory that receives the model file.</param>
        private static void TrainLbfgs(MLContext mlContext, IDataView transformedTrainingData, IDataView transformedTestData, string file_path)
        {
            // https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.standardtrainerscatalog.lbfgslogisticregression?view=ml-dotnet
            // https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.standardtrainerscatalog.lbfgsmaximumentropy?view=ml-dotnet
            Console.WriteLine("Build ML");

            // The trainer runs with its default settings. To tune it, pass a
            // LbfgsMaximumEntropyMulticlassTrainer.Options instance (e.g. HistorySize,
            // L1Regularization, NumberOfThreads) to LbfgsMaximumEntropy.
            var pipeline =
                // Convert the numeric category ids into key types.
                mlContext.Transforms.Conversion.MapValueToKey("Label")
                    // Apply LbfgsMaximumEntropy multiclass trainer.
                    .Append(mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy())
                    // Map the predicted key back to the original category id so the
                    // prediction program can read "PredictedLabel" as a float.
                    .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

            // Train the model.
            Console.WriteLine("Train ML");
            var trainedModel = pipeline.Fit(transformedTrainingData);

            // Use trained model to make inferences on test data
            Console.WriteLine("transform trained model");
            IDataView testDataPredictions = trainedModel.Transform(transformedTestData);

            // Extract model metrics and get accuracy
            Console.WriteLine("Evaluate Test Data");
            var trainedModelMetrics = mlContext.MulticlassClassification.Evaluate(testDataPredictions);
            double accuracy = trainedModelMetrics.MicroAccuracy;
            Console.WriteLine($"accuracy={accuracy}");

            // Save Trained Model
            Console.WriteLine("Save Trained Model");
            mlContext.Model.Save(trainedModel, transformedTrainingData.Schema, $"{file_path}\\model_lbfgs.zip");
        }
    }
}

////////////////////////////////////////////////
// csv file
////////////////////////////////////////////////
// CatSWM;ArticleFR;IDCategorie
// cat1;name1;1
// ;name2;2
// cat3;nam3;3