luisquintanilla · August 24, 2022 17:04
diff --git a/WordCount.ipynb b/WordCount.ipynb
 {
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Install packages"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "// Pin the package version so a fresh kernel restores the same Microsoft.ML build recorded in the saved output.\n",
        "#r \"nuget:Microsoft.ML,1.7.1\""
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": "<div><div></div><div></div><div><strong>Installed Packages</strong><ul><li><span>Microsoft.ML, 1.7.1</span></li></ul></div></div>"
          },
          "execution_count": 1,
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Import packages"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "using System;\n",
        "using System.Linq;\n",
        "using Microsoft.ML;\n",
        "using Microsoft.ML.Data;"
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Create data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "var corpus = new [] \n",
        "{\n",
        "\tnew {Text = \"The quick brown fox jumped over the lazy dog. Dog is so lazy. Quick!\"},\n",
        "\tnew {Text = \"The lazy dog was jumped over by the quick brown fox. Fox is not lazy\"}\n",
        "};"
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Initialize MLContext"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "var ctx = new MLContext();\n",
        ""
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Load data into IDataView"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "var data = ctx.Data.LoadFromEnumerable(corpus);"
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Define pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "// Normalize the raw text (punctuation dropped), then count single words (1-grams) per document.\n",
        "var pipeline = ctx.Transforms.Text\n",
        "\t.NormalizeText(\n",
        "\t\toutputColumnName: \"NormalizedText\",\n",
        "\t\tinputColumnName: \"Text\",\n",
        "\t\tkeepPunctuations: false)\n",
        "\t.Append(ctx.Transforms.Text.ProduceWordBags(\n",
        "\t\toutputColumnName: \"WB\",\n",
        "\t\tinputColumnName: \"NormalizedText\",\n",
        "\t\tngramLength: 1));"
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Apply pipeline to data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "var transformed = pipeline.Fit(data).Transform(data);"
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Get word counts in word bag\n",
        "\n",
        "The `WB` column holds one count vector per document: each slot is the number of times the corresponding word occurs in that document."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "var wordCounts = transformed.GetColumn<float[]>(\"WB\");"
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
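        "## Get words\n",
        "\n",
        "The slot names of the `WB` column are the words themselves, in the same order as the counts in each vector."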
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "VBuffer<ReadOnlyMemory<char>> slotNames = default;\n",
        "transformed.Schema[\"WB\"].GetSlotNames(ref slotNames);"
      ],
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "var words = slotNames.DenseValues();"
      ],
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Map words to word counts"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "dotnet_interactive": {
          "language": "csharp"
        }
      },
      "source": [
        "// Pair every word with its count, producing one sequence of (word, count) tuples per document.\n",
        "wordCounts.Select(countsForDocument => words.Zip(countsForDocument))"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": "<table><thead><tr><th><i>index</i></th><th>value</th></tr></thead><tbody><tr><td>0</td><td><div class=\"dni-plaintext\">[ ( the, 2 ), ( quick, 2 ), ( brown, 1 ), ( fox, 1 ), ( jumped, 1 ), ( over, 1 ), ( lazy, 2 ), ( dog, 2 ), ( is, 1 ), ( so, 1 ), ( was, 0 ), ( by, 0 ), ( not, 0 ) ]</div></td></tr><tr><td>1</td><td><div class=\"dni-plaintext\">[ ( the, 2 ), ( quick, 1 ), ( brown, 1 ), ( fox, 2 ), ( jumped, 1 ), ( over, 1 ), ( lazy, 2 ), ( dog, 1 ), ( is, 1 ), ( so, 0 ), ( was, 1 ), ( by, 1 ), ( not, 1 ) ]</div></td></tr></tbody></table>"
          },
          "execution_count": 1,
          "metadata": {}
        }
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": ".NET (C#)",
      "language": "C#",
      "name": ".net-csharp"
    },
    "language_info": {
      "file_extension": ".cs",
      "mimetype": "text/x-csharp",
      "name": "C#",
      "pygments_lexer": "csharp",
      "version": "8.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Install packages"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"// Pin the package version so a fresh kernel restores the same Microsoft.ML build recorded in the saved output.\n",
	"#r \"nuget:Microsoft.ML,1.7.1\""
	],
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": "<div><div></div><div></div><div><strong>Installed Packages</strong><ul><li><span>Microsoft.ML, 1.7.1</span></li></ul></div></div>"
	},
	"execution_count": 1,
	"metadata": {}
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Import packages"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"using System;\n",
	"using System.Linq;\n",
	"using Microsoft.ML;\n",
	"using Microsoft.ML.Data;"
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Create data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"var corpus = new [] \n",
	"{\n",
	"\tnew {Text = \"The quick brown fox jumped over the lazy dog. Dog is so lazy. Quick!\"},\n",
	"\tnew {Text = \"The lazy dog was jumped over by the quick brown fox. Fox is not lazy\"}\n",
	"};"
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Initialize MLContext"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"var ctx = new MLContext();\n",
	""
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Load data into IDataView"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"var data = ctx.Data.LoadFromEnumerable(corpus);"
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Define pipeline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"// Normalize the raw text (punctuation dropped), then count single words (1-grams) per document.\n",
	"var pipeline = ctx.Transforms.Text\n",
	"\t.NormalizeText(\n",
	"\t\toutputColumnName: \"NormalizedText\",\n",
	"\t\tinputColumnName: \"Text\",\n",
	"\t\tkeepPunctuations: false)\n",
	"\t.Append(ctx.Transforms.Text.ProduceWordBags(\n",
	"\t\toutputColumnName: \"WB\",\n",
	"\t\tinputColumnName: \"NormalizedText\",\n",
	"\t\tngramLength: 1));"
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Apply pipeline to data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"var transformed = pipeline.Fit(data).Transform(data);"
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Get word counts in word bag\n",
	"\n",
	"The `WB` column holds one count vector per document: each slot is the number of times the corresponding word occurs in that document."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"var wordCounts = transformed.GetColumn<float[]>(\"WB\");"
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
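	"## Get words\n",
	"\n",
	"The slot names of the `WB` column are the words themselves, in the same order as the counts in each vector."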
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"VBuffer<ReadOnlyMemory<char>> slotNames = default;\n",
	"transformed.Schema[\"WB\"].GetSlotNames(ref slotNames);"
	],
	"outputs": []
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"var words = slotNames.DenseValues();"
	],
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Map words to word counts"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"dotnet_interactive": {
	"language": "csharp"
	}
	},
	"source": [
	"// Pair every word with its count, producing one sequence of (word, count) tuples per document.\n",
	"wordCounts.Select(countsForDocument => words.Zip(countsForDocument))"
	],
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": "<table><thead><tr><th><i>index</i></th><th>value</th></tr></thead><tbody><tr><td>0</td><td><div class=\"dni-plaintext\">[ ( the, 2 ), ( quick, 2 ), ( brown, 1 ), ( fox, 1 ), ( jumped, 1 ), ( over, 1 ), ( lazy, 2 ), ( dog, 2 ), ( is, 1 ), ( so, 1 ), ( was, 0 ), ( by, 0 ), ( not, 0 ) ]</div></td></tr><tr><td>1</td><td><div class=\"dni-plaintext\">[ ( the, 2 ), ( quick, 1 ), ( brown, 1 ), ( fox, 2 ), ( jumped, 1 ), ( over, 1 ), ( lazy, 2 ), ( dog, 1 ), ( is, 1 ), ( so, 0 ), ( was, 1 ), ( by, 1 ), ( not, 1 ) ]</div></td></tr></tbody></table>"
	},
	"execution_count": 1,
	"metadata": {}
	}
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": ".NET (C#)",
	"language": "C#",
	"name": ".net-csharp"
	},
	"language_info": {
	"file_extension": ".cs",
	"mimetype": "text/x-csharp",
	"name": "C#",
	"pygments_lexer": "csharp",
	"version": "8.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}