Last active
August 26, 2022 20:15
-
-
Save luisquintanilla/a7fb1eb3ce00685a29f6fcd25abe1a42 to your computer and use it in GitHub Desktop.
AutoML on BBD Large Dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Install AutoML NuGet package" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"#i \"nuget:https://pkgs.dev.azure.com/dnceng/public/_packaging/MachineLearning/nuget/v3/index.json\"\n", | |
"\n", | |
"#r \"nuget:Microsoft.ML.AutoML,0.20.0-preview.22424.1\"" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": "<div><div><strong>Restore sources</strong><ul><li><span>https://pkgs.dev.azure.com/dnceng/public/_packaging/MachineLearning/nuget/v3/index.json</span></li></ul></div><div></div><div><strong>Installed Packages</strong><ul><li><span>Microsoft.ML.AutoML, 0.20.0-preview.22424.1</span></li></ul></div></div>" | |
}, | |
"execution_count": 1, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/markdown": "Loading extensions from `Microsoft.ML.AutoML.Interactive.dll`" | |
}, | |
"execution_count": 1, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Import packages" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"using Microsoft.ML;\n", | |
"using Microsoft.ML.Data;\n", | |
"using Microsoft.ML.AutoML;" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Define input schema" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"/// <summary>Schema of one row of the BBD CSV file as read by the text loader.</summary>\n", | |
"public class Input\n", | |
"{\n", | |
"    /// <summary>Columns 0 through 149999, loaded as a single 150000-element float vector.</summary>\n", | |
"    [LoadColumn(0, 149999), VectorType(150000)]\n", | |
"    public float[] Features { get; set; }\n", | |
"\n", | |
"    /// <summary>Column 150000, loaded as a boolean.</summary>\n", | |
"    [LoadColumn(150000)]\n", | |
"    public bool Label { get; set; }\n", | |
"}" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Define data path" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var dataPath = @\"C:\\Datasets\\BBD_Full.csv\";" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Initialize MLContext" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"// Pass a fixed seed so the random operations below (row shuffle, train/test split)\n", | |
"// are repeatable from one run of the notebook to the next.\n", | |
"var mlContext = new MLContext(seed: 0);" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Load data into IDataView" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var data = mlContext.Data.LoadFromTextFile<Input>(dataPath, hasHeader:true,separatorChar:',');" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Shuffle data\n", | |
"\n", | |
"**NOTE: Shuffling is not needed when training on the full dataset. It is done here so that both 0 and 1 values appear in the *Label* column, since only 5k rows are sampled to simplify training in this sample.**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var shuffledData = mlContext.Data.ShuffleRows(data);" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Take 5k rows\n", | |
"\n", | |
"Sampling 5k rows makes training faster. This step is not needed when using the entire dataset." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var sample = mlContext.Data.TakeRows(shuffledData,5000);" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var trainTestData = mlContext.Data.TrainTestSplit(sample, testFraction:0.1);" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Display IDataView Schema" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"data.Schema" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": "<table><thead><tr><th><i>index</i></th><th>Name</th><th>Index</th><th>IsHidden</th><th>Type</th><th>Annotations</th></tr></thead><tbody><tr><td>0</td><td>Features</td><td><div class=\"dni-plaintext\">0</div></td><td><div class=\"dni-plaintext\">False</div></td><td><table><thead><tr><th>Dimensions</th><th>IsKnownSize</th><th>ItemType</th><th>Size</th><th>RawType</th></tr></thead><tbody><tr><td><div class=\"dni-plaintext\">[ 150000 ]</div></td><td><div class=\"dni-plaintext\">True</div></td><td><div class=\"dni-plaintext\">{ Single: RawType: System.Single }</div></td><td><div class=\"dni-plaintext\">150000</div></td><td><div class=\"dni-plaintext\">Microsoft.ML.Data.VBuffer<System.Single></div></td></tr></tbody></table></td><td><table><thead><tr><th>Schema</th></tr></thead><tbody><tr><td><div class=\"dni-plaintext\">[ ]</div></td></tr></tbody></table></td></tr><tr><td>1</td><td>Label</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">False</div></td><td><table><thead><tr><th>RawType</th></tr></thead><tbody><tr><td><div class=\"dni-plaintext\">System.Boolean</div></td></tr></tbody></table></td><td><table><thead><tr><th>Schema</th></tr></thead><tbody><tr><td><div class=\"dni-plaintext\">[ ]</div></td></tr></tbody></table></td></tr></tbody></table>" | |
}, | |
"execution_count": 1, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Define pipeline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var pipeline = \n", | |
"\tmlContext.Auto().Featurizer(trainTestData.TrainSet,numericColumns:new[] {\"Features\"})\n", | |
"\t\t.Append(mlContext.Auto().BinaryClassification());" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Initialize AutoML experiment and configure settings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var experiment = mlContext.Auto().CreateExperiment();\n", | |
"\n", | |
"experiment\n", | |
"\t.SetPipeline(pipeline)\n", | |
"\t.SetTrainingTimeInSeconds(60)\n", | |
"\t.SetBinaryClassificationMetric(BinaryClassificationMetric.Accuracy, labelColumn:\"Label\")\n", | |
"\t.SetDataset(trainTestData.TrainSet, trainTestData.TestSet);" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Run AutoML experiment" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"var result = await experiment.RunAsync();" | |
], | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Display metric" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"result.Metric" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": "<div class=\"dni-plaintext\">0.9866220735785953</div>" | |
}, | |
"execution_count": 1, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Save model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"dotnet_interactive": { | |
"language": "csharp" | |
} | |
}, | |
"source": [ | |
"mlContext.Model.Save(result.Model, data.Schema, \"BBDModel.zip\");" | |
], | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": ".NET (C#)", | |
"language": "C#", | |
"name": ".net-csharp" | |
}, | |
"language_info": { | |
"file_extension": ".cs", | |
"mimetype": "text/x-csharp", | |
"name": "C#", | |
"pygments_lexer": "csharp", | |
"version": "8.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment