Last active
May 25, 2023 21:25
-
-
Save avidale/b4680e66c2e75a4fe0edeeb27c2e0a68 to your computer and use it in GitHub Desktop.
pyahocorasick entity search.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "pyahocorasick entity search.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyMtp3RWl9h5J4ZYS2bdqUy/", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/avidale/b4680e66c2e75a4fe0edeeb27c2e0a68/pyahocorasick-entity-search.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "JMvxoYqUxGoh", | |
"colab_type": "code", | |
"outputId": "38fff888-dd83-4ff8-d4df-eead0ccc1792", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 192 | |
} | |
}, | |
"source": [ | |
"!pip install pyahocorasick" | |
], | |
"execution_count": 0, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting pyahocorasick\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)\n", | |
"\r\u001b[K |█ | 10kB 19.6MB/s eta 0:00:01\r\u001b[K |██ | 20kB 3.2MB/s eta 0:00:01\r\u001b[K |███▏ | 30kB 4.6MB/s eta 0:00:01\r\u001b[K |████▏ | 40kB 3.0MB/s eta 0:00:01\r\u001b[K |█████▎ | 51kB 3.7MB/s eta 0:00:01\r\u001b[K |██████▎ | 61kB 4.4MB/s eta 0:00:01\r\u001b[K |███████▍ | 71kB 5.0MB/s eta 0:00:01\r\u001b[K |████████▍ | 81kB 3.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 92kB 4.3MB/s eta 0:00:01\r\u001b[K |██████████▌ | 102kB 4.8MB/s eta 0:00:01\r\u001b[K |███████████▌ | 112kB 4.8MB/s eta 0:00:01\r\u001b[K |████████████▋ | 122kB 4.8MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 133kB 4.8MB/s eta 0:00:01\r\u001b[K |██████████████▊ | 143kB 4.8MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 153kB 4.8MB/s eta 0:00:01\r\u001b[K |████████████████▉ | 163kB 4.8MB/s eta 0:00:01\r\u001b[K |█████████████████▉ | 174kB 4.8MB/s eta 0:00:01\r\u001b[K |███████████████████ | 184kB 4.8MB/s eta 0:00:01\r\u001b[K |████████████████████ | 194kB 4.8MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 204kB 4.8MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 215kB 4.8MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 225kB 4.8MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 235kB 4.8MB/s eta 0:00:01\r\u001b[K |█████████████████████████▏ | 245kB 4.8MB/s eta 0:00:01\r\u001b[K |██████████████████████████▎ | 256kB 4.8MB/s eta 0:00:01\r\u001b[K |███████████████████████████▎ | 266kB 4.8MB/s eta 0:00:01\r\u001b[K |████████████████████████████▍ | 276kB 4.8MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▍ | 286kB 4.8MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▍ | 296kB 4.8MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 307kB 4.8MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 317kB 4.8MB/s \n", | |
"\u001b[?25hBuilding wheels for collected packages: pyahocorasick\n", | |
" Building wheel for pyahocorasick (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81690 sha256=a062c805433a61a2f9055f2bda49463c4c6dc489ba58d2890290cc96a0512248\n", | |
" Stored in directory: /root/.cache/pip/wheels/0a/90/61/87a55f5b459792fbb2b7ba6b31721b06ff5cf6bde541b40994\n", | |
"Successfully built pyahocorasick\n", | |
"Installing collected packages: pyahocorasick\n", | |
"Successfully installed pyahocorasick-1.4.0\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jaZAAPYCxPM8", | |
"colab_type": "code", | |
"outputId": "90031e46-cbde-433d-bfea-5c8d315f9e9b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 279 | |
} | |
}, | |
"source": [ | |
"!wget https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.6.zip\n", | |
"!unzip simplemaps_worldcities_basicv1.6.zip" | |
], | |
"execution_count": 0, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2020-04-26 14:34:35-- https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.6.zip\n", | |
"Resolving simplemaps.com (simplemaps.com)... 104.26.12.95, 104.26.13.95, 2606:4700:20::681a:c5f, ...\n", | |
"Connecting to simplemaps.com (simplemaps.com)|104.26.12.95|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 1505132 (1.4M) [application/zip]\n", | |
"Saving to: ‘simplemaps_worldcities_basicv1.6.zip’\n", | |
"\n", | |
"\r simplemap 0%[ ] 0 --.-KB/s \rsimplemaps_worldcit 100%[===================>] 1.43M --.-KB/s in 0.1s \n", | |
"\n", | |
"2020-04-26 14:34:35 (11.7 MB/s) - ‘simplemaps_worldcities_basicv1.6.zip’ saved [1505132/1505132]\n", | |
"\n", | |
"Archive: simplemaps_worldcities_basicv1.6.zip\n", | |
" inflating: license.txt \n", | |
" inflating: worldcities.csv \n", | |
" inflating: worldcities.xlsx \n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "rkLfkdRU28La", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "SUgbE9gD_rFM", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
" import pandas as pd\n", | |
" import re\n", | |
" import ahocorasick" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "XVyfK30ZxnuJ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"\n", | |
" cities = pd.read_csv('worldcities.csv')\n", | |
"\n", | |
" def preprocess(text):\n", | |
"     \"\"\"Normalize `text` into the underscore-delimited form used for index keys.\n", | |
"\n", | |
"     Every character of `text` yields exactly one output character: its\n", | |
"     lowercase form if that is a single ASCII letter a-z, otherwise '_'.\n", | |
"     One '_' is then added on each side, so position i of `text` is\n", | |
"     position i + 1 of the result.\n", | |
"     \"\"\"\n", | |
"     normalized = []\n", | |
"     for char in text:\n", | |
"         lowered = char.lower()\n", | |
"         # Some characters lowercase to more than one code point (e.g. 'İ');\n", | |
"         # emit '_' for those so the output never drifts out of alignment.\n", | |
"         is_ascii_letter = len(lowered) == 1 and 'a' <= lowered <= 'z'\n", | |
"         normalized.append(lowered if is_ascii_letter else '_')\n", | |
"     return '_{}_'.format(''.join(normalized))\n", | |
"\n", | |
" # The Aho-Corasick index over `cities.city` is built in the next cell." | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FI-nJBClxH48", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
" # Build one Aho-Corasick automaton over the normalized city names; each key\n", | |
" # carries the original spelling of the city as its stored value.\n", | |
" index = ahocorasick.Automaton()\n", | |
" for name in cities.city:\n", | |
"     index.add_word(preprocess(name), name)\n", | |
" index.make_automaton()\n", | |
" # this object can be pickled to disk and then loaded back" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "q86GyF-TxeYD", | |
"colab_type": "code", | |
"outputId": "70ebae09-ec01-40e8-e7c7-ed90ae528165", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
" def find_cities(text, searcher):\n", | |
"     \"\"\"Return a dict mapping (start, end) spans of `text` to the city found there.\n", | |
"\n", | |
"     `searcher` is an automaton whose keys were produced by `preprocess` and\n", | |
"     whose values are the original city names. `text[start:end]` is the matched\n", | |
"     stretch of the input; the dict value is the name stored in the automaton.\n", | |
"     \"\"\"\n", | |
"     result = {}\n", | |
"     for end_index, city_name in searcher.iter(preprocess(text)):\n", | |
"         # end_index is the position of the key's trailing '_' in the padded\n", | |
"         # text; dropping the one-character left pad turns it into the\n", | |
"         # exclusive end offset of the city name in the original text.\n", | |
"         end = end_index - 1\n", | |
"         start = end - len(city_name)\n", | |
"         result[(start, end)] = city_name\n", | |
"     return result\n", | |
"\n", | |
" print(find_cities( 'Tver’ is somewhere between Moscow and Saint Petersburg', index))" | |
], | |
"execution_count": 0, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"{(0, 5): 'Tver’', (27, 33): 'Moscow', (38, 54): 'Saint Petersburg', (44, 54): 'Petersburg'}\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4LY7WTRw53Sp", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"text = 'Tver’ is somewhere between Moscow and Saint Petersburg'" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gllWDn1ZzPbJ", | |
"colab_type": "code", | |
"outputId": "21229c2d-0ba1-4f4a-b3c1-bc6a8ebd528c", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 69 | |
} | |
}, | |
"source": [ | |
"%%time\n", | |
"print(find_cities(text, index))" | |
], | |
"execution_count": 0, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"{(0, 5): 'Tver’', (27, 33): 'Moscow', (38, 54): 'Saint Petersburg', (44, 54): 'Petersburg'}\n", | |
"CPU times: user 86 µs, sys: 18 µs, total: 104 µs\n", | |
"Wall time: 109 µs\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "46zt5JOy1ygL", | |
"colab_type": "code", | |
"outputId": "2f358ea5-7839-45be-f3a0-12a034618fb2", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 139 | |
} | |
}, | |
"source": [ | |
" %%time\n", | |
" \n", | |
" # Baseline for comparison: one plain substring search per city name.\n", | |
" for name in cities.city:\n", | |
"     position = text.find(name)\n", | |
"     if position >= 0:\n", | |
"         print(position, name)" | |
], | |
"execution_count": 0, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"27 Moscow\n", | |
"38 Saint Petersburg\n", | |
"0 Tver’\n", | |
"44 Petersburg\n", | |
"27 Moscow\n", | |
"CPU times: user 7.54 ms, sys: 0 ns, total: 7.54 ms\n", | |
"Wall time: 7.31 ms\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_TE85q735Kjl", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Scalability" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "f0odFcsK_wID", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment