AlJohri · December 19, 2019 06:00
diff --git a/ner.ipynb b/ner.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Name Entity Recognition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting server with command: java -Xmx4G -cp /Users/johria/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-18490e6ceb8743bd.props -preload ner\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import warnings\n",
    "from tqdm.auto import tqdm\n",
    "from IPython.display import HTML\n",
    "\n",
    "# NLTK\n",
    "import nltk\n",
    "\n",
    "# Spacy\n",
    "import spacy\n",
    "from spacy import displacy\n",
    "\n",
    "spacy_nlp = spacy.load(\"en_core_web_lg\")\n",
    "\n",
    "# Stanford CoreNLP\n",
    "from stanfordnlp.server import CoreNLPClient\n",
    "\n",
    "corenlp_model = \"~/stanfordnlp_resources/stanford-corenlp-full-2018-10-05\"\n",
    "os.environ[\"CORENLP_HOME\"] = os.path.expanduser(corenlp_model)\n",
    "corenlp_client = CoreNLPClient(annotators=[\"ner\"], timeout=60000, memory=\"4G\")\n",
    "corenlp_client.start()\n",
    "\n",
    "# AllenNLP\n",
    "from allennlp.predictors.predictor import Predictor\n",
    "from allennlp.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer\n",
    "\n",
    "allennlp_models = \"https://s3-us-west-2.amazonaws.com/allennlp/models/\"\n",
    "allennlp_ner_model = allennlp_models + \"ner-model-2018.12.18.tar.gz\"\n",
    "with warnings.catch_warnings():\n",
    "    warnings.simplefilter(\"ignore\")\n",
    "    allennlp_predictor = Predictor.from_path(allennlp_ner_model)\n",
    "allennlp_predictor._tokenizer = WhitespaceTokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_ner(x, label=\"labels\"):\n",
    "    for start, end, tag in x[label]:\n",
    "        print(x[\"text\"][start:end], tag)\n",
    "\n",
    "\n",
    "def visualize_ner(result):\n",
    "    return displacy.render(\n",
    "        {\n",
    "            \"text\": result[\"text\"],\n",
    "            \"ents\": [\n",
    "                {\"start\": x[0], \"end\": x[1], \"label\": x[2]} for x in result[\"labels\"]\n",
    "            ],\n",
    "        },\n",
    "        style=\"ent\",\n",
    "        manual=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.006108,
     "end_time": "2019-12-18T03:08:05.600714",
     "exception": false,
     "start_time": "2019-12-18T03:08:05.594606",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "papermill": {
     "duration": 9.337781,
     "end_time": "2019-12-18T03:08:14.945137",
     "exception": false,
     "start_time": "2019-12-18T03:08:05.607356",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def ner_spacy(text):\n",
    "    doc = spacy_nlp(text)\n",
    "    labels = [(ent.start_char, ent.end_char, ent.label_,) for ent in doc.ents]\n",
    "    return {\"text\": text, \"labels\": labels}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.007375,
     "end_time": "2019-12-18T03:08:14.958855",
     "exception": false,
     "start_time": "2019-12-18T03:08:14.951480",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## AllenNLP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "ENTITY_MAPPING = {\n",
    "    \"PER\": \"PERSON\",\n",
    "    \"LOC\": \"GPE\",\n",
    "}\n",
    "\n",
    "from itertools import tee\n",
    "\n",
    "\n",
    "def parwise(lst):\n",
    "    a, b = tee(lst, 2)\n",
    "    next(b, None)\n",
    "    return zip(a, b)\n",
    "\n",
    "\n",
    "from collections import namedtuple\n",
    "\n",
    "Label = namedtuple(\"Label\", [\"start\", \"end\", \"tag\"])\n",
    "\n",
    "\n",
    "def convert_to_labels(labels):\n",
    "    def translate_tag(tag):\n",
    "        x, y = tag.split(\"-\")\n",
    "        return x + \"-\" + ENTITY_MAPPING.get(y, y)\n",
    "\n",
    "    return [Label(start, end, translate_tag(tag),) for (start, end, tag) in labels]\n",
    "\n",
    "\n",
    "def merge_labels(original_labels):\n",
    "    def merge(*args):\n",
    "        x, y = args[0], args[-1]\n",
    "        return Label(x.start, y.end, y.tag.split(\"-\")[1])\n",
    "\n",
    "    def mergable(x, y):\n",
    "        x_tag_split = x.tag.split(\"-\")\n",
    "        x_base = x_tag_split[1] if len(x_tag_split) == 2 else x_tag_split[0]\n",
    "        return x.end + 1 == y.start and x_base == y.tag.split(\"-\")[1]\n",
    "\n",
    "    labels = convert_to_labels(original_labels)\n",
    "\n",
    "    def inner(labels):\n",
    "        if len(labels) == 1:\n",
    "            return labels\n",
    "        current, next_ = labels.pop(0), labels.pop(0)\n",
    "        if mergable(current, next_):\n",
    "            return inner([merge(current, next_)] + labels)\n",
    "        else:\n",
    "            return [current] + inner([next_] + labels)\n",
    "\n",
    "    return inner(labels)\n",
    "\n",
    "\n",
    "def ner_allennlp(text):\n",
    "    # Github Issue shows how to use external tokenized text\n",
    "    # https://github.com/allenai/allennlp/issues/1678\n",
    "    \n",
    "    text = text.strip()\n",
    "    \n",
    "    doc = spacy_nlp(text)\n",
    "    labels = []\n",
    "    index = 0\n",
    "    sents = list(doc.sents)\n",
    "    for sent in tqdm(sents):\n",
    "        # DEBUG\n",
    "        # print(len(sent.text_with_ws), repr(sent.text_with_ws))\n",
    "        tokens = list(sent)\n",
    "        if all(x.is_space or x.is_punct for x in tokens):\n",
    "            index += len(sent.text_with_ws)\n",
    "        else:\n",
    "            sent_counter = 0\n",
    "            string_tokens = [str(x) for x in tokens]\n",
    "            obj = allennlp_predictor.predict(sentence=\" \".join(string_tokens))\n",
    "            # hack to re-insert new line characters as individual tokens\n",
    "            # into the allennlp output if there is a newline character\n",
    "            # in the middle of a `sent`\n",
    "            new_line_indicies = [i for i, x in enumerate(string_tokens) if x.strip(' ') == \"\\n\" or x.strip(' ') == '\\xa0']\n",
    "            if len(string_tokens) != len(obj[\"words\"]) and len(new_line_indicies) > 0:\n",
    "                for i in new_line_indicies:\n",
    "                    obj[\"words\"].insert(i, string_tokens[i])\n",
    "                    obj[\"tags\"].insert(i, \"O\")\n",
    "            # DEBUG:\n",
    "            # print(new_line_indicies)\n",
    "            # print(string_tokens)\n",
    "            # print(obj[\"words\"])\n",
    "            for i, (tag, word, token) in enumerate(zip(obj[\"tags\"], obj[\"words\"], tokens)):\n",
    "                if word != str(token):\n",
    "                    raise Exception(f\"{word} != {token}\")\n",
    "                if tag != \"O\":\n",
    "                    label = (index, index + len(token), tag)\n",
    "                    labels.append(label)\n",
    "                index += len(token.text_with_ws)\n",
    "                sent_counter += len(token.text_with_ws)\n",
    "                # DEBUG:\n",
    "                # print(text[label[0]:label[1]], label[2], repr(token.text_with_ws))\n",
    "            else:\n",
    "                # for some reason, the end punctuation of the sentence does not include\n",
    "                # the newline character in `token.text_with_ws` but the newline is included\n",
    "                # in the sent.text_with_ws. so we just add the remaining characters to\n",
    "                # increment the index to the next sentence\n",
    "                index += len(sent.text_with_ws) - sent_counter\n",
    "\n",
    "    return {\"text\": text, \"original_labels\": labels, \"labels\": merge_labels(labels)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ner_nltk(text):\n",
    "    \"\"\"\n",
    "    doesn't seem to be any way to easily get the character indicies for each ne chunk unless\n",
    "    I search for the chunk (ngram) in the sentence to get the indicies\n",
    "    https://stackoverflow.com/questions/36831354/absolute-position-of-leaves-in-nltk-tree\n",
    "    \"\"\"\n",
    "\n",
    "    labels = []\n",
    "    for sent in nltk.sent_tokenize(text):\n",
    "        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):\n",
    "            # print(chunk)\n",
    "            if hasattr(chunk, \"label\"):\n",
    "                s = \" \".join(c[0] for c in chunk)\n",
    "                tag = chunk.label()\n",
    "                labels.append((s, tag))\n",
    "    return {\"text\": text, \"labels\": labels}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.009161,
     "end_time": "2019-12-18T03:08:26.897949",
     "exception": false,
     "start_time": "2019-12-18T03:08:26.888788",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Stanford CoreNLP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "papermill": {
     "duration": 52.175936,
     "end_time": "2019-12-18T03:09:19.161228",
     "exception": false,
     "start_time": "2019-12-18T03:08:26.985292",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def ner_corenlp(text):\n",
    "    \"\"\"\n",
    "    Stanford CoreNLP assumes that the input text is stripped.\n",
    "    \"\"\"\n",
    "    text = text.strip()\n",
    "    labels = []\n",
    "    ann = corenlp_client.annotate(text)\n",
    "    for sent in ann.sentence:\n",
    "        for mention in sent.mentions:\n",
    "            start = (\n",
    "                sent.characterOffsetBegin\n",
    "                + sent.token[mention.tokenStartInSentenceInclusive].beginChar\n",
    "            )\n",
    "            end = (\n",
    "                sent.characterOffsetBegin\n",
    "                + sent.token[mention.tokenEndInSentenceExclusive - 1].endChar\n",
    "            )\n",
    "            labels.append((start, end, mention.ner))\n",
    "    return {\"text\": text, \"labels\": labels}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare NER Systems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_parquet(\"../stories.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<strong>ner_spacy</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">The rock-star \n",
       "<mark class=\"entity\" style=\"background: #c887fb; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Democrat\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">NORP</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Beto O'Rourke\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       ", a candidate for president, once supported the bulldozing of a low-income neighborhood in his hometown of \n",
       "<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    El Paso-\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
       "</mark>\n",
       " a project spearheaded by his father-in-law.</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong>ner_corenlp</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">The rock-star \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Democrat\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MISC</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Beto O'Rourke\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       ", a \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    candidate\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">TITLE</span>\n",
       "</mark>\n",
       " for \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    president\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">TITLE</span>\n",
       "</mark>\n",
       ", \n",
       "<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    once\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
       "</mark>\n",
       " supported the bulldozing of a low-income neighborhood in his hometown of \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    El Paso\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CITY</span>\n",
       "</mark>\n",
       "\n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    his\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " hometown of El Paso- a project spearheaded by \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    his\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " father-in-law.</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong>ner_allennlp</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1244c83eba674be79220c2eeddc885c4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">The rock-star \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Democrat\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">U-MISC</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Beto O'Rourke\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       ", a candidate for president, once supported the bulldozing of a low-income neighborhood in his hometown of \n",
       "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    El Paso-\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
       "</mark>\n",
       " a project spearheaded by his father-in-law.</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sentence1 = \"The rock-star Democrat Beto O'Rourke, a candidate for president, once supported the bulldozing of a low-income neighborhood in his hometown of El Paso- a project spearheaded by his father-in-law.\"\n",
    "for fn in [ner_spacy, ner_corenlp, ner_allennlp]:\n",
    "    display(HTML(\"<strong>\" + fn.__name__ + \"</strong>\"))\n",
    "    visualize_ner(fn(sentence1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<strong>ner_spacy</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">President \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Trump\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " said \n",
       "<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Friday\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
       "</mark>\n",
       " that he will not fire \n",
       "<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    White House\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
       "</mark>\n",
       " counselor \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Kellyanne Conway\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " for repeated violations of \n",
       "<mark class=\"entity\" style=\"background: #ff8197; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    the Hatch Act\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">LAW</span>\n",
       "</mark>\n",
       ", which bars federal employees from engaging in political activity in the course of their work.</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong>ner_corenlp</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    President\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">TITLE</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Trump\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " said \n",
       "<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Friday\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
       "</mark>\n",
       " that he will not fire \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    White House\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORGANIZATION</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    counselor\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">TITLE</span>\n",
       "</mark>\n",
       " \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Kellyanne Conway\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " for repeated violations of the \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Hatch Act\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MISC</span>\n",
       "</mark>\n",
       "\n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    he\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " will not fire White House counselor Kellyanne Conway for repeated violations of the Hatch Act, which bars federal employees from engaging in political activity in the course of their work.</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong>ner_allennlp</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8448f8bf734c4170a1988fbd7262a914",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">President \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Trump\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">U-PERSON</span>\n",
       "</mark>\n",
       " said Friday that he will not fire \n",
       "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    White House\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
       "</mark>\n",
       " counselor \n",
       "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Kellyanne Conway\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
       "</mark>\n",
       " for repeated violations of the \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Hatch Act\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MISC</span>\n",
       "</mark>\n",
       ", which bars federal employees from engaging in political activity in the course of their work.</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sentence2 = \"President Trump said Friday that he will not fire White House counselor Kellyanne Conway for repeated violations of the Hatch Act, which bars federal employees from engaging in political activity in the course of their work.\"\n",
    "for fn in [ner_spacy, ner_corenlp, ner_allennlp]:\n",
    "    display(HTML(\"<strong>\" + fn.__name__ + \"</strong>\"))\n",
    "    visualize_ner(fn(sentence2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # ner_corenlp\n",
    "# text = df.query('type != \"Transcript\"').sample(1).iloc[0].text\n",
    "# for fn in [ner_spacy, ner_allennlp]:\n",
    "#     print(fn.__name__)\n",
    "#     visualize_ner(fn(text))\n",
    "#     print('---------------------------------------------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  },
  "papermill": {
   "duration": 78.452419,
   "end_time": "2019-12-18T03:09:20.962876",
   "environment_variables": {},
   "exception": null,
   "input_path": "ner.ipynb",
   "output_path": "ner.ipynb",
   "parameters": {},
   "start_time": "2019-12-18T03:08:02.510457",
   "version": "1.2.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }