Created
January 22, 2019 20:19
-
-
Save willismonroe/539fcbcab36d3428817d1fafde509cb8 to your computer and use it in GitHub Desktop.
M388 suffixes and prefixes.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import re\nfrom nltk.util import bigrams\nfrom collections import Counter\nfrom beautifultable import BeautifulTable", | |
"execution_count": 99, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Read the whole transliteration file in one go, then cut it into lines on newline characters.\nwith open('protoElamTranslits20111120.txt') as atf_file:\n    raw_text = atf_file.read()\ndata = raw_text.split('\\n')", | |
"execution_count": 100, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "data[:15]", | |
"execution_count": 101, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "['&P009331 = MDP 26S, 5233',\n '@obverse',\n '@column 1',\n '1. M010# , 2(N14)',\n '@reverse',\n '$ broken',\n '',\n '&P009342 = MDP 31, 003',\n '@obverse',\n '@column 1',\n '1. [...] ,',\n '2. x , 8(N14) 4(N01)',\n '3. M096~d , 4(N01) 2(N39B) 1(N24)#',\n '4. x , 5(N01)#',\n '@reverse']" | |
}, | |
"execution_count": 101, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Damage markers: an 'x' with a space on both sides, or a literal '[...]'.\ndamage_re = re.compile(r\"( x |\\[\\.\\.\\.\\])\")\n# Line-number pattern: one or more digits followed by a period, e.g. '1.'.\nline_re = re.compile(r\"\\d+\\.\")", | |
"execution_count": 102, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Keep only the numbered entries, i.e. lines that open with a line number such as '1.'.\nraw_lines = list(filter(line_re.match, data))\n# Discard every entry that carries a damage marker.\ncomplete_lines = [entry for entry in raw_lines if damage_re.search(entry) is None]\n# Strip the line number: keep what follows the first space, skipping entries that have no space at all.\nlines = [parts[2] for parts in (entry.partition(' ') for entry in complete_lines) if parts[1]]", | |
"execution_count": 103, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "lines[:5]", | |
"execution_count": 104, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "['M010# , 2(N14)',\n 'M096~d , 4(N01) 2(N39B) 1(N24)#',\n 'M388 , 9(N01)',\n 'M046 M254~b# , 1(N01)',\n 'M417~h , 3(N01)#']" | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Flatten all entries into a single space-delimited string.\nseparator = ' '\ncorpus = separator.join(lines)\n# Naive tokenisation: cut on every single space, so tokens of neighbouring entries end up adjacent.\nwords = corpus.split(separator)", | |
"execution_count": 105, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "m388_re = re.compile(r\"M388\")\n# Tally the neighbours of every token containing M388 in a single pass over the bigrams:\n# the token after it counts as a suffix, the token before it as a prefix.\n# NOTE(review): `words` is one flat token list, so a pair can straddle two entries - confirm that is intended.\nm388_suffixes = Counter()\nm388_prefixes = Counter()\nfor first, second in bigrams(words):\n    if m388_re.search(first):\n        m388_suffixes[second] += 1\n    if m388_re.search(second):\n        m388_prefixes[first] += 1", | |
"execution_count": 106, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# How many of the most frequent neighbours to show on each side of M388.\nTOP_N = 5\n# Pair the i-th most common suffix with the i-th most common prefix and flatten\n# each pair into [suffix, count, prefix, count]; zip stops at the shorter ranking.\nrows = [\n    [suffix, suffix_count, prefix, prefix_count]\n    for (suffix, suffix_count), (prefix, prefix_count)\n    in zip(m388_suffixes.most_common(TOP_N), m388_prefixes.most_common(TOP_N))\n]\n\nt_m388 = BeautifulTable()\nt_m388.column_headers = [\"suffix\", \"count\", \"prefix\", \"count\"]\nfor row in rows:\n    t_m388.append_row(row)\n# Blank spacer column between the suffix and prefix halves, sized from the rows\n# actually added instead of a hard-coded 5.\nif rows:\n    t_m388.insert_column(2, ' ', [' '] * len(rows))\nprint(t_m388)", | |
"execution_count": 107, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "+--------+-------+---+--------+-------+\n| suffix | count | | prefix | count |\n+--------+-------+---+--------+-------+\n| , | 93 | | 1(N01) | 35 |\n+--------+-------+---+--------+-------+\n| M218 | 13 | | , | 19 |\n+--------+-------+---+--------+-------+\n| M066 | 11 | | M305 | 12 |\n+--------+-------+---+--------+-------+\n| M387 | 7 | | 2(N01) | 10 |\n+--------+-------+---+--------+-------+\n| M347 | 7 | | M054 | 10 |\n+--------+-------+---+--------+-------+\n" | |
} | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.6.7", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "M388 suffixes and prefixes.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment