Created
May 31, 2021 16:27
-
-
Save sagorbrur/0188b189de3bc548c3c936421d7a35a9 to your computer and use it in GitHub Desktop.
loading_custom_dataset_in_huggingface_datasets.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "loading_custom_dataset_in_huggingface_datasets.ipynb", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "authorship_tag": "ABX9TyMSiKJdfR4unmYGYAHiWnhH", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/sagorbrur/0188b189de3bc548c3c936421d7a35a9/loading_custom_dataset_in_huggingface_datasets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "v2ZqYwyiGlJ8" | |
| }, | |
| "source": [ | |
| "# Loading a custom NER dataset in Hugging Face datasets" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "o6ob7siJGfv0" | |
| }, | |
| "source": [ | |
| "# !pip install datasets" | |
| ], | |
| "execution_count": 2, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "7bG8e4A8GtuF" | |
| }, | |
| "source": [ | |
| "# Loading custom NER data\n", | |
| "\"\"\"\n", | |
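| "Prepare your NER dataset in JSON Lines format (one JSON object per line) and save it as a .json file.\n", | |
| "Here is an example (each ner_tags value is an index into the label names defined below, e.g. 0 = O, 1 = B-PER, 5 = B-LOC):\n", | |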
| "{\"id\": 1, \"tokens\": [\"I\", \"live\", \"in\", \"Dhaka\"], \"ner_tags\": [0, 0, 0, 5]}\n", | |
| "{\"id\": 2, \"tokens\": [\"Rita\", \"lives\", \"in\", \"Dhaka\"], \"ner_tags\": [1, 0, 0, 5]}\n", | |
| "\"\"\"\n", | |
| "import datasets\n", | |
| "from datasets import load_dataset\n", | |
| "\n", | |
| "features = datasets.Features(\n", | |
| " {\n", | |
| " \"id\": datasets.Value(\"string\"),\n", | |
| " \"tokens\": datasets.Sequence(datasets.Value(\"string\")),\n", | |
| " \"ner_tags\": datasets.Sequence(\n", | |
| " datasets.features.ClassLabel(\n", | |
| " num_classes=9,\n", | |
| " names=[\n", | |
| " \"O\",\n", | |
| " \"B-PER\",\n", | |
| " \"I-PER\",\n", | |
| " \"B-ORG\",\n", | |
| " \"I-ORG\",\n", | |
| " \"B-LOC\",\n", | |
| " \"I-LOC\",\n", | |
| " \"B-MISC\",\n", | |
| " \"I-MISC\"\n", | |
| " ]\n", | |
| " )\n", | |
| " ),\n", | |
| " }\n", | |
| ")\n", | |
| "\n", | |
| "datafiles = {\n", | |
| " \"train\": \"./mypath/train.json\",\n", | |
| " \"validation\": \"./mypath/valid.json\",\n", | |
| " \"test\": \"./mypath/test.json\"\n", | |
| "}\n", | |
| "dataset = load_dataset('json', data_files=datafiles, features=features)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment