Created
August 7, 2019 22:58
-
-
Save abevieiramota/f265d2bf2800098deef8f76901d79d93 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<style>.container { width:100% !important; }</style>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Make the notebook's `.container` element span 100% of the page width.\n", | |
"# `display` and `HTML` come from the public `IPython.display` module; importing\n", | |
"# `display` from `IPython.core.display` is a deprecated path.\n", | |
"from IPython.display import HTML, display\n", | |
"\n", | |
"display(HTML(\"<style>.container { width:100% !important; }</style>\"))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
" \u001b[93mInfo about spaCy\u001b[0m\n", | |
"\n", | |
" spaCy version 2.0.16 \n", | |
" Location /home/abevieiramota/anaconda3/envs/webnlg/lib/python3.6/site-packages/spacy\n", | |
" Platform Linux-4.4.0-43-Microsoft-x86_64-with-debian-stretch-sid\n", | |
" Python version 3.6.7 \n", | |
" Models en, en_core_web_lg\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"{'spaCy version': '2.0.16',\n", | |
" 'Location': '/home/abevieiramota/anaconda3/envs/webnlg/lib/python3.6/site-packages/spacy',\n", | |
" 'Platform': 'Linux-4.4.0-43-Microsoft-x86_64-with-debian-stretch-sid',\n", | |
" 'Python version': '3.6.7',\n", | |
" 'Models': 'en, en_core_web_lg'}" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import spacy\n", | |
"from spacy import displacy\n", | |
"\n", | |
"spacy.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nlp = spacy.load('en_core_web_lg')\n", | |
"\n", | |
"# Read both files line by line. Only the newline terminator is stripped (not a\n", | |
"# blind last-character slice), so a final line without a trailing newline keeps\n", | |
"# its last character.\n", | |
"# NOTE(review): UTF-8 is assumed for both files - confirm.\n", | |
"with open('uncomp_sentences', encoding='utf-8') as f:\n", | |
"    uncomp = [line.rstrip('\\n') for line in f]\n", | |
"\n", | |
"with open('comp_sentences', encoding='utf-8') as f:\n", | |
"    comp = [line.rstrip('\\n') for line in f]\n", | |
"\n", | |
"# Pair line i of the first file with line i of the second as (uncomp, comp);\n", | |
"# zip stops at the shorter of the two lists.\n", | |
"data = list(zip(uncomp, comp))\n", | |
"\n", | |
"# Drop the intermediate lists; the pairs are kept in `data`.\n", | |
"del uncomp\n", | |
"del comp" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(Scientists at Newcastle University claim they have created human sperm using embryonic stem cells..,\n", | |
" Scientists claim they have created human sperm using stem cells.)" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"doc_uncomp = nlp(data[60][0])\n", | |
"doc_comp = nlp(data[60][1])\n", | |
"\n", | |
"doc_uncomp, doc_comp" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"0\" class=\"displacy\" width=\"2500\" height=\"487.0\" style=\"max-width: none; height: 487.0px; color: #000000; background: #ffffff; font-family: Arial\">\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">Scientists</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"225\">at</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"225\">ADP</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"400\">Newcastle</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"400\">PROPN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"575\">University</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"575\">PROPN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">claim</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"925\">they</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"925\">PRON</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1100\">have</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1100\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1275\">created</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1275\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1450\">human</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1450\">ADJ</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1625\">sperm</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1625\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1800\">using</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1800\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1975\">embryonic</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1975\">ADJ</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2150\">stem</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2150\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2325\">cells..</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2325\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-0\" stroke-width=\"2px\" d=\"M70,352.0 C70,2.0 750.0,2.0 750.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-0\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M70,354.0 L62,342.0 78,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-1\" stroke-width=\"2px\" d=\"M70,352.0 C70,264.5 210.0,264.5 210.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-1\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M210.0,354.0 L218.0,342.0 202.0,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-2\" stroke-width=\"2px\" d=\"M420,352.0 C420,264.5 560.0,264.5 560.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-2\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M420,354.0 L412,342.0 428,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-3\" stroke-width=\"2px\" d=\"M245,352.0 C245,177.0 565.0,177.0 565.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-3\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M565.0,354.0 L573.0,342.0 557.0,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-4\" stroke-width=\"2px\" d=\"M945,352.0 C945,177.0 1265.0,177.0 1265.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-4\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M945,354.0 L937,342.0 953,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-5\" stroke-width=\"2px\" d=\"M1120,352.0 C1120,264.5 1260.0,264.5 1260.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-5\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1120,354.0 L1112,342.0 1128,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-6\" stroke-width=\"2px\" d=\"M770,352.0 C770,89.5 1270.0,89.5 1270.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-6\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">ccomp</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1270.0,354.0 L1278.0,342.0 1262.0,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-7\" stroke-width=\"2px\" d=\"M1470,352.0 C1470,264.5 1610.0,264.5 1610.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-7\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1470,354.0 L1462,342.0 1478,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-8\" stroke-width=\"2px\" d=\"M1295,352.0 C1295,177.0 1615.0,177.0 1615.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-8\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1615.0,354.0 L1623.0,342.0 1607.0,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-9\" stroke-width=\"2px\" d=\"M1295,352.0 C1295,89.5 1795.0,89.5 1795.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-9\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">advcl</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1795.0,354.0 L1803.0,342.0 1787.0,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-10\" stroke-width=\"2px\" d=\"M1995,352.0 C1995,177.0 2315.0,177.0 2315.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-10\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1995,354.0 L1987,342.0 2003,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-11\" stroke-width=\"2px\" d=\"M2170,352.0 C2170,264.5 2310.0,264.5 2310.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-11\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M2170,354.0 L2162,342.0 2178,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-12\" stroke-width=\"2px\" d=\"M1820,352.0 C1820,89.5 2320.0,89.5 2320.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-12\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M2320.0,354.0 L2328.0,342.0 2312.0,342.0\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"</svg>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"displacy.render(doc_uncomp, jupyter=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"0\" class=\"displacy\" width=\"1800\" height=\"399.5\" style=\"max-width: none; height: 399.5px; color: #000000; background: #ffffff; font-family: Arial\">\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">Scientists</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"225\">claim</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"225\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"400\">they</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"400\">PRON</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"575\">have</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"575\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">created</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"925\">human</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"925\">ADJ</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1100\">sperm</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1100\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1275\">using</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1275\">VERB</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1450\">stem</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1450\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"309.5\">\n", | |
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1625\">cells.</tspan>\n", | |
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1625\">NOUN</tspan>\n", | |
"</text>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-0\" stroke-width=\"2px\" d=\"M70,264.5 C70,177.0 215.0,177.0 215.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-0\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M70,266.5 L62,254.5 78,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-1\" stroke-width=\"2px\" d=\"M420,264.5 C420,89.5 745.0,89.5 745.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-1\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M420,266.5 L412,254.5 428,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-2\" stroke-width=\"2px\" d=\"M595,264.5 C595,177.0 740.0,177.0 740.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-2\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M595,266.5 L587,254.5 603,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-3\" stroke-width=\"2px\" d=\"M245,264.5 C245,2.0 750.0,2.0 750.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-3\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">ccomp</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M750.0,266.5 L758.0,254.5 742.0,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-4\" stroke-width=\"2px\" d=\"M945,264.5 C945,177.0 1090.0,177.0 1090.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-4\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M945,266.5 L937,254.5 953,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-5\" stroke-width=\"2px\" d=\"M770,264.5 C770,89.5 1095.0,89.5 1095.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-5\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1095.0,266.5 L1103.0,254.5 1087.0,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-6\" stroke-width=\"2px\" d=\"M770,264.5 C770,2.0 1275.0,2.0 1275.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-6\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">advcl</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1275.0,266.5 L1283.0,254.5 1267.0,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-7\" stroke-width=\"2px\" d=\"M1470,264.5 C1470,177.0 1615.0,177.0 1615.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-7\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1470,266.5 L1462,254.5 1478,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"\n", | |
"<g class=\"displacy-arrow\">\n", | |
" <path class=\"displacy-arc\" id=\"arrow-0-8\" stroke-width=\"2px\" d=\"M1295,264.5 C1295,89.5 1620.0,89.5 1620.0,264.5\" fill=\"none\" stroke=\"currentColor\"/>\n", | |
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n", | |
" <textPath xlink:href=\"#arrow-0-8\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n", | |
" </text>\n", | |
" <path class=\"displacy-arrowhead\" d=\"M1620.0,266.5 L1628.0,254.5 1612.0,254.5\" fill=\"currentColor\"/>\n", | |
"</g>\n", | |
"</svg>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"displacy.render(doc_comp, jupyter=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"uncomp_tree = doc_uncomp.print_tree()[0]\n", | |
"comp_tree = doc_comp.print_tree()[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('claim', 'claim')" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"uncomp_tree['word'], comp_tree['word']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_tree_level(doc):\n", | |
"    \"\"\"Flatten the parse tree(s) of `doc` into one record per tree node.\n", | |
"\n", | |
"    Every tree returned by `doc.print_tree()` is walked depth-first, so a\n", | |
"    document with several trees is fully covered and a document with no\n", | |
"    trees yields an empty list instead of raising IndexError.\n", | |
"\n", | |
"    Args:\n", | |
"        doc: object whose `print_tree()` returns a list of nested dicts with\n", | |
"            the keys 'word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc'\n", | |
"            and 'modifiers' (a list of child dicts of the same shape).\n", | |
"\n", | |
"    Returns:\n", | |
"        A list of dicts holding the node fields above (without 'modifiers')\n", | |
"        plus 'n_modifiers' (number of direct children) and 'level' (depth,\n", | |
"        0 for a root). Siblings are emitted last-to-first because pending\n", | |
"        nodes are kept on a stack.\n", | |
"    \"\"\"\n", | |
"    copied_fields = ('word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc')\n", | |
"\n", | |
"    # Reversed so that popping from the end visits the first tree first.\n", | |
"    pending = [(tree, 0) for tree in reversed(doc.print_tree())]\n", | |
"    records = []\n", | |
"\n", | |
"    while pending:\n", | |
"        node, level = pending.pop()\n", | |
"\n", | |
"        # Caveat: beware of repeated words - a record carries only the word,\n", | |
"        # so it is necessary to identify which token it is, not which word.\n", | |
"        record = {field: node[field] for field in copied_fields}\n", | |
"        record['n_modifiers'] = len(node['modifiers'])\n", | |
"        record['level'] = level\n", | |
"        records.append(record)\n", | |
"\n", | |
"        pending.extend((child, level + 1) for child in node['modifiers'])\n", | |
"\n", | |
"    return records" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>NE</th>\n", | |
" <th>POS_coarse</th>\n", | |
" <th>POS_fine</th>\n", | |
" <th>arc</th>\n", | |
" <th>lemma</th>\n", | |
" <th>level</th>\n", | |
" <th>n_modifiers</th>\n", | |
" <th>word</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBP</td>\n", | |
" <td>ROOT</td>\n", | |
" <td>claim</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>claim</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td></td>\n", | |
" <td>PUNCT</td>\n", | |
" <td>.</td>\n", | |
" <td>punct</td>\n", | |
" <td>..</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>..</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBN</td>\n", | |
" <td>ccomp</td>\n", | |
" <td>create</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>created</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBG</td>\n", | |
" <td>advcl</td>\n", | |
" <td>use</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>using</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NNS</td>\n", | |
" <td>dobj</td>\n", | |
" <td>cell</td>\n", | |
" <td>3</td>\n", | |
" <td>2</td>\n", | |
" <td>cells</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" NE POS_coarse POS_fine arc lemma level n_modifiers word\n", | |
"0 VERB VBP ROOT claim 0 3 claim\n", | |
"1 PUNCT . punct .. 1 0 ..\n", | |
"2 VERB VBN ccomp create 1 4 created\n", | |
"3 VERB VBG advcl use 2 1 using\n", | |
"4 NOUN NNS dobj cell 3 2 cells" | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_uncomp = pd.DataFrame(get_tree_level(doc_uncomp))\n", | |
"df_comp = pd.DataFrame(get_tree_level(doc_comp))\n", | |
"\n", | |
"df_uncomp.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>NE_uncomp</th>\n", | |
" <th>POS_coarse_uncomp</th>\n", | |
" <th>POS_fine_uncomp</th>\n", | |
" <th>arc_uncomp</th>\n", | |
" <th>lemma_uncomp</th>\n", | |
" <th>level_uncomp</th>\n", | |
" <th>n_modifiers_uncomp</th>\n", | |
" <th>word</th>\n", | |
" <th>NE_comp</th>\n", | |
" <th>POS_coarse_comp</th>\n", | |
" <th>POS_fine_comp</th>\n", | |
" <th>arc_comp</th>\n", | |
" <th>lemma_comp</th>\n", | |
" <th>level_comp</th>\n", | |
" <th>n_modifiers_comp</th>\n", | |
" <th>join</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NNS</td>\n", | |
" <td>nsubj</td>\n", | |
" <td>scientist</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Scientists</td>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NNS</td>\n", | |
" <td>nsubj</td>\n", | |
" <td>scientist</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td></td>\n", | |
" <td>PRON</td>\n", | |
" <td>PRP</td>\n", | |
" <td>nsubj</td>\n", | |
" <td>-PRON-</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>they</td>\n", | |
" <td></td>\n", | |
" <td>PRON</td>\n", | |
" <td>PRP</td>\n", | |
" <td>nsubj</td>\n", | |
" <td>-PRON-</td>\n", | |
" <td>2.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBP</td>\n", | |
" <td>aux</td>\n", | |
" <td>have</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>have</td>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBP</td>\n", | |
" <td>aux</td>\n", | |
" <td>have</td>\n", | |
" <td>2.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td></td>\n", | |
" <td>ADJ</td>\n", | |
" <td>JJ</td>\n", | |
" <td>amod</td>\n", | |
" <td>human</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>human</td>\n", | |
" <td></td>\n", | |
" <td>ADJ</td>\n", | |
" <td>JJ</td>\n", | |
" <td>amod</td>\n", | |
" <td>human</td>\n", | |
" <td>3.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NN</td>\n", | |
" <td>dobj</td>\n", | |
" <td>sperm</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>sperm</td>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NN</td>\n", | |
" <td>dobj</td>\n", | |
" <td>sperm</td>\n", | |
" <td>2.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NN</td>\n", | |
" <td>compound</td>\n", | |
" <td>stem</td>\n", | |
" <td>4</td>\n", | |
" <td>0</td>\n", | |
" <td>stem</td>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NN</td>\n", | |
" <td>compound</td>\n", | |
" <td>stem</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NNS</td>\n", | |
" <td>dobj</td>\n", | |
" <td>cell</td>\n", | |
" <td>3</td>\n", | |
" <td>2</td>\n", | |
" <td>cells</td>\n", | |
" <td></td>\n", | |
" <td>NOUN</td>\n", | |
" <td>NNS</td>\n", | |
" <td>dobj</td>\n", | |
" <td>cell</td>\n", | |
" <td>3.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBG</td>\n", | |
" <td>advcl</td>\n", | |
" <td>use</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>using</td>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBG</td>\n", | |
" <td>advcl</td>\n", | |
" <td>use</td>\n", | |
" <td>2.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBN</td>\n", | |
" <td>ccomp</td>\n", | |
" <td>create</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>created</td>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBN</td>\n", | |
" <td>ccomp</td>\n", | |
" <td>create</td>\n", | |
" <td>1.0</td>\n", | |
" <td>4.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBP</td>\n", | |
" <td>ROOT</td>\n", | |
" <td>claim</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>claim</td>\n", | |
" <td></td>\n", | |
" <td>VERB</td>\n", | |
" <td>VBP</td>\n", | |
" <td>ROOT</td>\n", | |
" <td>claim</td>\n", | |
" <td>0.0</td>\n", | |
" <td>3.0</td>\n", | |
" <td>both</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>ORG</td>\n", | |
" <td>PROPN</td>\n", | |
" <td>NNP</td>\n", | |
" <td>pobj</td>\n", | |
" <td>Newcastle University</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>Newcastle University</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>left_only</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td></td>\n", | |
" <td>ADP</td>\n", | |
" <td>IN</td>\n", | |
" <td>prep</td>\n", | |
" <td>at</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>at</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>left_only</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td></td>\n", | |
" <td>ADJ</td>\n", | |
" <td>JJ</td>\n", | |
" <td>amod</td>\n", | |
" <td>embryonic</td>\n", | |
" <td>4</td>\n", | |
" <td>0</td>\n", | |
" <td>embryonic</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>left_only</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td></td>\n", | |
" <td>PUNCT</td>\n", | |
" <td>.</td>\n", | |
" <td>punct</td>\n", | |
" <td>..</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>..</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>left_only</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" NE_uncomp POS_coarse_uncomp POS_fine_uncomp arc_uncomp \\\n", | |
"11 NOUN NNS nsubj \n", | |
"10 PRON PRP nsubj \n", | |
"9 VERB VBP aux \n", | |
"8 ADJ JJ amod \n", | |
"7 NOUN NN dobj \n", | |
"5 NOUN NN compound \n", | |
"4 NOUN NNS dobj \n", | |
"3 VERB VBG advcl \n", | |
"2 VERB VBN ccomp \n", | |
"0 VERB VBP ROOT \n", | |
"13 ORG PROPN NNP pobj \n", | |
"12 ADP IN prep \n", | |
"6 ADJ JJ amod \n", | |
"1 PUNCT . punct \n", | |
"\n", | |
" lemma_uncomp level_uncomp n_modifiers_uncomp \\\n", | |
"11 scientist 1 1 \n", | |
"10 -PRON- 2 0 \n", | |
"9 have 2 0 \n", | |
"8 human 3 0 \n", | |
"7 sperm 2 1 \n", | |
"5 stem 4 0 \n", | |
"4 cell 3 2 \n", | |
"3 use 2 1 \n", | |
"2 create 1 4 \n", | |
"0 claim 0 3 \n", | |
"13 Newcastle University 3 0 \n", | |
"12 at 2 1 \n", | |
"6 embryonic 4 0 \n", | |
"1 .. 1 0 \n", | |
"\n", | |
" word NE_comp POS_coarse_comp POS_fine_comp arc_comp \\\n", | |
"11 Scientists NOUN NNS nsubj \n", | |
"10 they PRON PRP nsubj \n", | |
"9 have VERB VBP aux \n", | |
"8 human ADJ JJ amod \n", | |
"7 sperm NOUN NN dobj \n", | |
"5 stem NOUN NN compound \n", | |
"4 cells NOUN NNS dobj \n", | |
"3 using VERB VBG advcl \n", | |
"2 created VERB VBN ccomp \n", | |
"0 claim VERB VBP ROOT \n", | |
"13 Newcastle University NaN NaN NaN NaN \n", | |
"12 at NaN NaN NaN NaN \n", | |
"6 embryonic NaN NaN NaN NaN \n", | |
"1 .. NaN NaN NaN NaN \n", | |
"\n", | |
" lemma_comp level_comp n_modifiers_comp join \n", | |
"11 scientist 1.0 0.0 both \n", | |
"10 -PRON- 2.0 0.0 both \n", | |
"9 have 2.0 0.0 both \n", | |
"8 human 3.0 0.0 both \n", | |
"7 sperm 2.0 1.0 both \n", | |
"5 stem 4.0 0.0 both \n", | |
"4 cell 3.0 1.0 both \n", | |
"3 use 2.0 1.0 both \n", | |
"2 create 1.0 4.0 both \n", | |
"0 claim 0.0 3.0 both \n", | |
"13 NaN NaN NaN left_only \n", | |
"12 NaN NaN NaN left_only \n", | |
"6 NaN NaN NaN left_only \n", | |
"1 NaN NaN NaN left_only " | |
] | |
}, | |
"execution_count": 53, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.merge(df_uncomp, df_comp, on='word', how='left', suffixes=('_uncomp', '_comp'), indicator='join')\n", | |
"\n", | |
"df.sort_values('join', ascending=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment