Created
March 30, 2019 05:12
-
-
Save eriknomitch/a4f7d0be0567e2526414b12de6142d01 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import string\n", | |
"import spacy\n", | |
"from spacy.tokens import Token\n", | |
"from spacy.tokenizer import Tokenizer\n", | |
"import inflect\n", | |
"import numpy as np\n", | |
"\n", | |
"from IPython.core.display import display, HTML" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Globals" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Library" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Inflection engine that `pluralize` uses to build plural forms.\n", | |
"ie = inflect.engine()\n", | |
"\n", | |
"# spaCy pipeline that parses the sample text.\n", | |
"nlp = spacy.load(\"en_core_web_lg\")\n", | |
"\n", | |
"# Standalone tokenizer over the pipeline's vocabulary, used for single filter words.\n", | |
"tokenizer = Tokenizer(nlp.vocab)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Sample passage that the rest of the notebook highlights.\n", | |
"sample_text = \"\"\"Apple’s special event this week was all about apples and oranges and software subscription services, signaling a new focus on business aside from its bread and butter of consumer electronics.\n", | |
"\n", | |
"Nestled among the software announcements, there was one announcement that piqued my interest, the physical version of Apple’s new credit card, Apple Card and it's new fruit line Apple Fruits.\"\"\"\n", | |
"\n", | |
"doc = nlp(sample_text)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Raw words handed over by the filter (plural forms are derived later)\n", | |
"filter_matches = {\"apple\", \"event\"}\n", | |
"\n", | |
"# Negative words; tokenized during match setup\n", | |
"negative_matches = {\"no\", \"not\", \"unavailable\"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Utility" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pluralize(word):\n", | |
"    \"\"\"Return the plural of `word`; words that are already plural come back unchanged.\"\"\"\n", | |
"    # `singular_noun(...)` is falsy when `word` is already singular, which is\n", | |
"    # exactly the case that still needs pluralizing.\n", | |
"    singular_form = ie.singular_noun(word)\n", | |
"    return word if singular_form else ie.plural(word)\n", | |
"\n", | |
"def tokenize(word):\n", | |
"    \"\"\"Run the standalone tokenizer on `word` and return its first token.\"\"\"\n", | |
"    tokens = tokenizer(word)\n", | |
"    return tokens[0]\n", | |
"\n", | |
"def list_to_tokens(words):\n", | |
"    \"\"\"Tokenize every entry of `words` and return the tokens as a list.\"\"\"\n", | |
"    return list(map(tokenize, words))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Match Setup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Filter words plus the pluralized form of each one\n", | |
"matches = filter_matches | {pluralize(term) for term in filter_matches}\n", | |
"\n", | |
"# Token versions of both word sets for NLP comparisons\n", | |
"negative_matches_tokens = list_to_tokens(negative_matches)\n", | |
"matches_tokens = list_to_tokens(matches)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Main" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def match_getter(token):\n", | |
"    \"\"\"Score how closely `token` matches the filter words.\n", | |
"\n", | |
"    Returns 1.0 when the lower-cased token text is one of `matches`,\n", | |
"    0.0 when the token has no vector or `matches_tokens` is empty, and\n", | |
"    otherwise the highest similarity between the token and any entry of\n", | |
"    `matches_tokens`.\n", | |
"    \"\"\"\n", | |
"    # Save some time on exact (case-insensitive) matches\n", | |
"    if token.text.lower() in matches:\n", | |
"        return 1.0\n", | |
"\n", | |
"    if not token.has_vector:\n", | |
"        return 0.0\n", | |
"\n", | |
"    # `default` stops max() from raising ValueError when there are no match tokens.\n", | |
"    return max((m.similarity(token) for m in matches_tokens), default=0.0)\n", | |
"\n", | |
"# Expose the score as `token._.is_match`; force=True overwrites an earlier\n", | |
"# registration so this cell can be re-run.\n", | |
"Token.set_extension(\"is_match\", getter=match_getter, force=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#print([(w.text, w.pos_) for w in doc])\n", | |
"#print(type(doc[0]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def output_highlighted(similarity_threshold, document=None):\n", | |
"    \"\"\"Display a document as HTML with matching tokens highlighted.\n", | |
"\n", | |
"    Tokens whose `_.is_match` score is below `similarity_threshold` are\n", | |
"    rendered as plain spans. The others get a green background whose\n", | |
"    opacity is the score, in bold when the score equals 1.0.\n", | |
"\n", | |
"    similarity_threshold: minimum score a token needs to be highlighted.\n", | |
"    document: iterable of tokens exposing `.text` and `._.is_match`;\n", | |
"        defaults to the notebook-level `doc`.\n", | |
"    \"\"\"\n", | |
"    # Local stdlib import so the function does not depend on the import cell.\n", | |
"    from html import escape\n", | |
"\n", | |
"    if document is None:\n", | |
"        document = doc\n", | |
"\n", | |
"    np_similarity_threshold = np.float32(similarity_threshold)\n", | |
"\n", | |
"    words_highlighted = []\n", | |
"\n", | |
"    for w in document:\n", | |
"        similarity = np.float32(w._.is_match)\n", | |
"\n", | |
"        # Token text is arbitrary input: escape it so `<`, `>` and `&` cannot break the markup.\n", | |
"        text = escape(w.text, quote=False)\n", | |
"\n", | |
"        if np.less(similarity, np_similarity_threshold):\n", | |
"            word = f\"<span>{text}</span>\"\n", | |
"        else:\n", | |
"            weight = \"bold\" if similarity == 1.0 else \"normal\"\n", | |
"            word = f\"<span style='background: rgba(0, 255, 0, {similarity}); font-weight: {weight}'>{text}</span>\"\n", | |
"\n", | |
"        words_highlighted.append(word)\n", | |
"\n", | |
"    words_output = \" \".join(words_highlighted)\n", | |
"\n", | |
"    # Header labels are escaped for the same reason as the token text.\n", | |
"    matches_label = escape(\", \".join(filter_matches), quote=False)\n", | |
"    plurals_label = escape(\", \".join(matches), quote=False)\n", | |
"\n", | |
"    output = f\"\"\"\n", | |
"    <div style='background: white; padding: 20px; width: 800px; color: black;'>\n", | |
"      <div>\n", | |
"        matches: <b>{matches_label}</b>\n", | |
"        <br/>\n", | |
"        w/ plurals: <b>{plurals_label}</b>\n", | |
"        <br/>\n", | |
"        SIMILARITY_THRESHOLD: <b>{float(similarity_threshold)}</b>\n", | |
"      </div>\n", | |
"      <hr/>\n", | |
"      <p style=\"font-size: 16px;\">\n", | |
"        {words_output}\n", | |
"      </p>\n", | |
"    </div>\"\"\"\n", | |
"\n", | |
"    display(HTML(output))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"\n", | |
"\n", | |
"FULL SIMILARITY SHOWN\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div style='background: white; padding: 20px; width: 800px; color: black;'>\n", | |
" <div>\n", | |
" matches: <b>event, apple</b>\n", | |
" </br>\n", | |
" w/ plurals: <b>event, events, apple, apples</b>\n", | |
" <br/>\n", | |
" SIMILIARTY_THRESHOLD: <b>0.0</b>\n", | |
" </div>\n", | |
" <hr/>\n", | |
" <p style=\"font-size: 16px;\">\n", | |
" <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.3808760643005371); font-weight: normal'>’s</span> <span style='background: rgba(0, 255, 0, 0.5234818458557129); font-weight: normal'>special</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>event</span> <span style='background: rgba(0, 255, 0, 0.4608474373817444); font-weight: normal'>this</span> <span style='background: rgba(0, 255, 0, 0.43837690353393555); font-weight: normal'>week</span> <span style='background: rgba(0, 255, 0, 0.35579434037208557); font-weight: normal'>was</span> <span style='background: rgba(0, 255, 0, 0.42519861459732056); font-weight: normal'>all</span> <span style='background: rgba(0, 255, 0, 0.3430214524269104); font-weight: normal'>about</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>apples</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.7780942320823669); font-weight: normal'>oranges</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.3079564571380615); font-weight: normal'>software</span> <span style='background: rgba(0, 255, 0, 0.22379352152347565); font-weight: normal'>subscription</span> <span style='background: rgba(0, 255, 0, 0.34125784039497375); font-weight: normal'>services</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 0.19052930176258087); font-weight: normal'>signaling</span> <span style='background: rgba(0, 255, 0, 0.33145979046821594); font-weight: normal'>a</span> <span style='background: rgba(0, 255, 0, 0.33666253089904785); font-weight: normal'>new</span> <span style='background: rgba(0, 255, 0, 0.3979387879371643); font-weight: normal'>focus</span> <span style='background: 
rgba(0, 255, 0, 0.25729233026504517); font-weight: normal'>on</span> <span style='background: rgba(0, 255, 0, 0.34057262539863586); font-weight: normal'>business</span> <span style='background: rgba(0, 255, 0, 0.3790111243724823); font-weight: normal'>aside</span> <span style='background: rgba(0, 255, 0, 0.2546783685684204); font-weight: normal'>from</span> <span style='background: rgba(0, 255, 0, 0.32460078597068787); font-weight: normal'>its</span> <span style='background: rgba(0, 255, 0, 0.5796013474464417); font-weight: normal'>bread</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.608683168888092); font-weight: normal'>butter</span> <span style='background: rgba(0, 255, 0, 0.3342975378036499); font-weight: normal'>of</span> <span style='background: rgba(0, 255, 0, 0.28704968094825745); font-weight: normal'>consumer</span> <span style='background: rgba(0, 255, 0, 0.28327369689941406); font-weight: normal'>electronics</span> <span style='background: rgba(0, 255, 0, 0.20869764685630798); font-weight: normal'>.</span> <span style='background: rgba(0, 255, 0, 0.0); font-weight: normal'>\n", | |
"\n", | |
"</span> <span style='background: rgba(0, 255, 0, 0.2163950502872467); font-weight: normal'>Nestled</span> <span style='background: rgba(0, 255, 0, 0.3320554494857788); font-weight: normal'>among</span> <span style='background: rgba(0, 255, 0, 0.42156127095222473); font-weight: normal'>the</span> <span style='background: rgba(0, 255, 0, 0.3079564571380615); font-weight: normal'>software</span> <span style='background: rgba(0, 255, 0, 0.5097285509109497); font-weight: normal'>announcements</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 0.4293268024921417); font-weight: normal'>there</span> <span style='background: rgba(0, 255, 0, 0.35579434037208557); font-weight: normal'>was</span> <span style='background: rgba(0, 255, 0, 0.41250690817832947); font-weight: normal'>one</span> <span style='background: rgba(0, 255, 0, 0.47261321544647217); font-weight: normal'>announcement</span> <span style='background: rgba(0, 255, 0, 0.4123636484146118); font-weight: normal'>that</span> <span style='background: rgba(0, 255, 0, 0.11201054602861404); font-weight: normal'>piqued</span> <span style='background: rgba(0, 255, 0, 0.3132108151912689); font-weight: normal'>my</span> <span style='background: rgba(0, 255, 0, 0.35797837376594543); font-weight: normal'>interest</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 0.42156127095222473); font-weight: normal'>the</span> <span style='background: rgba(0, 255, 0, 0.3398952782154083); font-weight: normal'>physical</span> <span style='background: rgba(0, 255, 0, 0.21415886282920837); font-weight: normal'>version</span> <span style='background: rgba(0, 255, 0, 0.3342975378036499); font-weight: normal'>of</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.3808760643005371); 
font-weight: normal'>’s</span> <span style='background: rgba(0, 255, 0, 0.33666253089904785); font-weight: normal'>new</span> <span style='background: rgba(0, 255, 0, 0.2234654426574707); font-weight: normal'>credit</span> <span style='background: rgba(0, 255, 0, 0.2577524483203888); font-weight: normal'>card</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.2577524483203888); font-weight: normal'>Card</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.36499473452568054); font-weight: normal'>it</span> <span style='background: rgba(0, 255, 0, 0.27155137062072754); font-weight: normal'>'s</span> <span style='background: rgba(0, 255, 0, 0.33666253089904785); font-weight: normal'>new</span> <span style='background: rgba(0, 255, 0, 0.724179744720459); font-weight: normal'>fruit</span> <span style='background: rgba(0, 255, 0, 0.25101417303085327); font-weight: normal'>line</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.7155129909515381); font-weight: normal'>Fruits</span> <span style='background: rgba(0, 255, 0, 0.20869764685630798); font-weight: normal'>.</span>\n", | |
" </p>\n", | |
" </div>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"\n", | |
"\n", | |
"W/ A MATCH THRESHOLD\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div style='background: white; padding: 20px; width: 800px; color: black;'>\n", | |
" <div>\n", | |
" matches: <b>event, apple</b>\n", | |
" </br>\n", | |
" w/ plurals: <b>event, events, apple, apples</b>\n", | |
" <br/>\n", | |
" SIMILIARTY_THRESHOLD: <b>0.6</b>\n", | |
" </div>\n", | |
" <hr/>\n", | |
" <p style=\"font-size: 16px;\">\n", | |
" <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>special</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>event</span> <span>this</span> <span>week</span> <span>was</span> <span>all</span> <span>about</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>apples</span> <span>and</span> <span style='background: rgba(0, 255, 0, 0.7780942320823669); font-weight: normal'>oranges</span> <span>and</span> <span>software</span> <span>subscription</span> <span>services</span> <span>,</span> <span>signaling</span> <span>a</span> <span>new</span> <span>focus</span> <span>on</span> <span>business</span> <span>aside</span> <span>from</span> <span>its</span> <span>bread</span> <span>and</span> <span style='background: rgba(0, 255, 0, 0.608683168888092); font-weight: normal'>butter</span> <span>of</span> <span>consumer</span> <span>electronics</span> <span>.</span> <span>\n", | |
"\n", | |
"</span> <span>Nestled</span> <span>among</span> <span>the</span> <span>software</span> <span>announcements</span> <span>,</span> <span>there</span> <span>was</span> <span>one</span> <span>announcement</span> <span>that</span> <span>piqued</span> <span>my</span> <span>interest</span> <span>,</span> <span>the</span> <span>physical</span> <span>version</span> <span>of</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>new</span> <span>credit</span> <span>card</span> <span>,</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>Card</span> <span>and</span> <span>it</span> <span>'s</span> <span>new</span> <span style='background: rgba(0, 255, 0, 0.724179744720459); font-weight: normal'>fruit</span> <span>line</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.7155129909515381); font-weight: normal'>Fruits</span> <span>.</span>\n", | |
" </p>\n", | |
" </div>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"\n", | |
"\n", | |
"EXACT MATCHES ONLY\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div style='background: white; padding: 20px; width: 800px; color: black;'>\n", | |
" <div>\n", | |
" matches: <b>event, apple</b>\n", | |
" </br>\n", | |
" w/ plurals: <b>event, events, apple, apples</b>\n", | |
" <br/>\n", | |
" SIMILIARTY_THRESHOLD: <b>1.0</b>\n", | |
" </div>\n", | |
" <hr/>\n", | |
" <p style=\"font-size: 16px;\">\n", | |
" <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>special</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>event</span> <span>this</span> <span>week</span> <span>was</span> <span>all</span> <span>about</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>apples</span> <span>and</span> <span>oranges</span> <span>and</span> <span>software</span> <span>subscription</span> <span>services</span> <span>,</span> <span>signaling</span> <span>a</span> <span>new</span> <span>focus</span> <span>on</span> <span>business</span> <span>aside</span> <span>from</span> <span>its</span> <span>bread</span> <span>and</span> <span>butter</span> <span>of</span> <span>consumer</span> <span>electronics</span> <span>.</span> <span>\n", | |
"\n", | |
"</span> <span>Nestled</span> <span>among</span> <span>the</span> <span>software</span> <span>announcements</span> <span>,</span> <span>there</span> <span>was</span> <span>one</span> <span>announcement</span> <span>that</span> <span>piqued</span> <span>my</span> <span>interest</span> <span>,</span> <span>the</span> <span>physical</span> <span>version</span> <span>of</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>new</span> <span>credit</span> <span>card</span> <span>,</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>Card</span> <span>and</span> <span>it</span> <span>'s</span> <span>new</span> <span>fruit</span> <span>line</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>Fruits</span> <span>.</span>\n", | |
" </p>\n", | |
" </div>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# (heading, similarity threshold) for each rendering of the document\n", | |
"runs = (\n", | |
"    (\"FULL SIMILARITY SHOWN\", 0.0),\n", | |
"    (\"W/ A MATCH THRESHOLD\", 0.6),\n", | |
"    (\"EXACT MATCHES ONLY\", 1.0),\n", | |
")\n", | |
"\n", | |
"for heading, threshold in runs:\n", | |
"    print(f\"\\n\\n\\n{heading}\\n\")\n", | |
"    output_highlighted(threshold)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment