sagorbrur · May 31, 2021 16:27
diff --git a/loading_custom_dataset_in_huggingface_datasets.ipynb b/loading_custom_dataset_in_huggingface_datasets.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "loading_custom_dataset_in_huggingface_datasets.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyMSiKJdfR4unmYGYAHiWnhH",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/sagorbrur/0188b189de3bc548c3c936421d7a35a9/loading_custom_dataset_in_huggingface_datasets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "v2ZqYwyiGlJ8"
      },
      "source": [
        "# Loading a custom NER dataset with Hugging Face Datasets"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "o6ob7siJGfv0"
      },
      "source": [
        "# Run this once if `datasets` is not installed (e.g. on a fresh Colab runtime):\n",
        "# %pip install -q datasets"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7bG8e4A8GtuF"
      },
      "source": [
        "# Load a custom NER dataset from local JSON Lines files.\n",
        "#\n",
        "# Each data file must contain one JSON object per line, for example:\n",
        "#   {\"id\": \"1\", \"tokens\": [\"I\", \"live\", \"in\", \"Dhaka\"], \"ner_tags\": [0, 0, 0, 5]}\n",
        "#   {\"id\": \"2\", \"tokens\": [\"Rita\", \"lives\", \"in\", \"Dhaka\"], \"ner_tags\": [1, 0, 0, 5]}\n",
        "#\n",
        "# \"id\" is a unique JSON string so that it matches the string feature declared below;\n",
        "# each \"ner_tags\" entry is an index into NER_LABEL_NAMES (0 -> \"O\", 1 -> \"B-PER\", 5 -> \"B-LOC\").\n",
        "import datasets\n",
        "from datasets import load_dataset\n",
        "\n",
        "# Tag inventory: the position of a name in this list is its integer class id.\n",
        "NER_LABEL_NAMES = [\n",
        "    \"O\",\n",
        "    \"B-PER\",\n",
        "    \"I-PER\",\n",
        "    \"B-ORG\",\n",
        "    \"I-ORG\",\n",
        "    \"B-LOC\",\n",
        "    \"I-LOC\",\n",
        "    \"B-MISC\",\n",
        "    \"I-MISC\",\n",
        "]\n",
        "\n",
        "features = datasets.Features(\n",
        "    {\n",
        "        \"id\": datasets.Value(\"string\"),\n",
        "        \"tokens\": datasets.Sequence(datasets.Value(\"string\")),\n",
        "        \"ner_tags\": datasets.Sequence(\n",
        "            datasets.features.ClassLabel(\n",
        "                # Derived from the list so the count cannot drift from the names.\n",
        "                num_classes=len(NER_LABEL_NAMES),\n",
        "                names=NER_LABEL_NAMES,\n",
        "            )\n",
        "        ),\n",
        "    }\n",
        ")\n",
        "\n",
        "# Placeholder paths: point each split at your own JSON Lines file.\n",
        "datafiles = {\n",
        "    \"train\": \"./mypath/train.json\",\n",
        "    \"validation\": \"./mypath/valid.json\",\n",
        "    \"test\": \"./mypath/test.json\",\n",
        "}\n",
        "dataset = load_dataset(\"json\", data_files=datafiles, features=features)"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "loading_custom_dataset_in_huggingface_datasets.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyMSiKJdfR4unmYGYAHiWnhH",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/sagorbrur/0188b189de3bc548c3c936421d7a35a9/loading_custom_dataset_in_huggingface_datasets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "v2ZqYwyiGlJ8"
	},
	"source": [
	"# Loading a custom NER dataset with Hugging Face Datasets"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "o6ob7siJGfv0"
	},
	"source": [
	"# Run this once if `datasets` is not installed (e.g. on a fresh Colab runtime):\n",
	"# %pip install -q datasets"
	],
	"execution_count": 2,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "7bG8e4A8GtuF"
	},
	"source": [
	"# Load a custom NER dataset from local JSON Lines files.\n",
	"#\n",
	"# Each data file must contain one JSON object per line, for example:\n",
	"#   {\"id\": \"1\", \"tokens\": [\"I\", \"live\", \"in\", \"Dhaka\"], \"ner_tags\": [0, 0, 0, 5]}\n",
	"#   {\"id\": \"2\", \"tokens\": [\"Rita\", \"lives\", \"in\", \"Dhaka\"], \"ner_tags\": [1, 0, 0, 5]}\n",
	"#\n",
	"# \"id\" is a unique JSON string so that it matches the string feature declared below;\n",
	"# each \"ner_tags\" entry is an index into NER_LABEL_NAMES (0 -> \"O\", 1 -> \"B-PER\", 5 -> \"B-LOC\").\n",
	"import datasets\n",
	"from datasets import load_dataset\n",
	"\n",
	"# Tag inventory: the position of a name in this list is its integer class id.\n",
	"NER_LABEL_NAMES = [\n",
	"    \"O\",\n",
	"    \"B-PER\",\n",
	"    \"I-PER\",\n",
	"    \"B-ORG\",\n",
	"    \"I-ORG\",\n",
	"    \"B-LOC\",\n",
	"    \"I-LOC\",\n",
	"    \"B-MISC\",\n",
	"    \"I-MISC\",\n",
	"]\n",
	"\n",
	"features = datasets.Features(\n",
	"    {\n",
	"        \"id\": datasets.Value(\"string\"),\n",
	"        \"tokens\": datasets.Sequence(datasets.Value(\"string\")),\n",
	"        \"ner_tags\": datasets.Sequence(\n",
	"            datasets.features.ClassLabel(\n",
	"                # Derived from the list so the count cannot drift from the names.\n",
	"                num_classes=len(NER_LABEL_NAMES),\n",
	"                names=NER_LABEL_NAMES,\n",
	"            )\n",
	"        ),\n",
	"    }\n",
	")\n",
	"\n",
	"# Placeholder paths: point each split at your own JSON Lines file.\n",
	"datafiles = {\n",
	"    \"train\": \"./mypath/train.json\",\n",
	"    \"validation\": \"./mypath/valid.json\",\n",
	"    \"test\": \"./mypath/test.json\",\n",
	"}\n",
	"dataset = load_dataset(\"json\", data_files=datafiles, features=features)"
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}
No results found