Last active
June 3, 2018 05:44
-
-
Save willismonroe/e3dbc9ba0ee834befae82fb641535783 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json, pprint | |
class ORACC_text_reader:
    """Render one ORACC text, supplied as a JSON string, line by line.

    The constructor parses the JSON and looks for the list of nodes that make
    up the text body.  Each ``output_*`` method then walks that list and
    returns a list of strings, one per text line:

    * a node whose ``'node'`` is ``'d'`` and that carries a ``'label'`` closes
      the current line and starts a new one (the label becomes the line
      header when ``with_line_headers`` is true);
    * a node whose ``'node'`` is ``'l'`` contributes one word to the current
      line, preceded by a single space.

    The first returned element is whatever was accumulated before the first
    labelled ``'d'`` node (``'o'`` with headers, ``''`` without).

    Attributes:
        DEBUG: when true, diagnostic dumps are printed to stdout.
        data: the fully parsed JSON document.
        text: the list of body nodes, or ``[]`` if none could be located.
    """

    def __init__(self, json_string, DEBUG=False):
        """Parse *json_string* and locate the text body.

        Args:
            json_string: the JSON document as a string.
            DEBUG: print diagnostics (including the whole document when the
                body cannot be located) if true.

        Raises:
            json.JSONDecodeError: if *json_string* is not valid JSON.
        """
        self.DEBUG = DEBUG
        self.data = json.loads(json_string)
        # Always define ``text`` so the output methods degrade to an empty
        # rendering instead of failing with AttributeError on odd documents.
        self.text = []
        try:
            # Every chunk that has its own 'cdl' overwrites the previous
            # match, so the last such chunk wins.
            for node in self.data['cdl'][0]['cdl']:
                if 'cdl' in node:
                    self.text = node['cdl'][0]['cdl']
        except (KeyError, IndexError, TypeError):
            # The document does not have the expected nesting; keep whatever
            # was found so far and optionally dump the data for inspection.
            if self.DEBUG:
                pprint.pprint(self.data)

    def _collect_lines(self, with_line_headers, render_word):
        """Assemble output lines, using *render_word* to render each word.

        Args:
            with_line_headers: prefix each line with its label when true.
            render_word: callable taking an ``'l'`` node and returning the
                string to append, or ``None`` to append nothing.

        Returns:
            list of str, one entry per line (see the class docstring).
        """
        output = []
        line = 'o' if with_line_headers else ''
        for node in self.text:
            kind = node['node']
            if kind == 'd' and 'label' in node:
                output.append(line)
                line = node['label'] if with_line_headers else ''
            elif kind == 'l':
                word = render_word(node)
                if word is not None:
                    line += ' ' + word
        # Flush the line that was still being built when the nodes ran out.
        output.append(line)
        return output

    @staticmethod
    def _form_field(field):
        """Return a renderer that picks ``f[field]``, else ``f['form']``."""
        def render(node):
            f = node['f']
            return f[field] if field in f else f['form']
        return render

    def output_translit(self, with_line_headers=True):
        """Return the lines built from each word's ``'frag'`` value."""
        return self._collect_lines(with_line_headers,
                                   lambda node: node['frag'])

    def output_norm(self, with_line_headers=True):
        """Return the lines built from ``f['norm']``, else ``f['form']``."""
        return self._collect_lines(with_line_headers,
                                   self._form_field('norm'))

    def output_sense(self, with_line_headers=True):
        """Return the lines built from ``f['sense']``, else ``f['form']``."""
        return self._collect_lines(with_line_headers,
                                   self._form_field('sense'))

    def output_cuneiform(self, with_line_headers=True):
        """Return the lines built from the ``'gdl_utf8'`` values of each word.

        Words whose ``f['gdl']`` list is empty contribute nothing to the
        line.  All diagnostic output is emitted only when ``DEBUG`` is true.
        """
        return self._collect_lines(with_line_headers, self._cuneiform_word)

    def _cuneiform_word(self, node):
        """Concatenate the ``'gdl_utf8'`` values found in one word's gdl.

        Returns:
            The concatenated string, or ``None`` when the gdl list is empty.

        Raises:
            KeyError: if an entry lacks the keys the respective branch reads.
        """
        translit = node['frag']
        gdl = node['f']['gdl']

        if not gdl:
            if self.DEBUG:
                print("Can't process ✗")
                pprint.pprint(node)
                print()
            return None

        if len(gdl) == 1:
            only = gdl[0]
            if 'group' in only:
                parts = []
                for el in only['group']:
                    if 'gdl_utf8' in el:
                        parts.append(el['gdl_utf8'])
                    else:
                        # NOTE(review): only the first 'seq' member is used
                        # here, while the multi-entry path below uses all of
                        # them -- confirm whether that asymmetry is intended.
                        parts.append(el['seq'][0]['gdl_utf8'])
                sign = ''.join(parts)
            else:
                sign = only['gdl_utf8']
            if self.DEBUG:
                print("Single gdl ✓ {} = {}".format(translit, sign))
            return sign

        sign = ''
        for el in gdl:
            if 'gdl_utf8' in el:
                sign += el['gdl_utf8']
            elif 'seq' in el:
                for member in el['seq']:
                    sign += member['gdl_utf8']
            elif 'group' in el:
                for member in el['group']:
                    sign += member['gdl_utf8']
            elif self.DEBUG:
                # Unknown entry shape: skipped, reported only in debug mode.
                print("Error with multi-gdl ✗ {} = {}".format(translit, sign))
                pprint.pprint(gdl)
        if self.DEBUG:
            print("Multi-gdl ✓ {} = {}".format(translit, sign))
        return sign
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import json\n", | |
"import pprint\n", | |
"import collections\n", | |
"import tabulate" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from oracc_reader import ORACC_text_reader" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"big_text = ''\n", | |
"for filename in os.listdir('corpusjson/'):\n", | |
" if filename.endswith(\".json\"):\n", | |
" oracc_reader = ORACC_text_reader(open('corpusjson/' + filename).read(), DEBUG=True)\n", | |
" output = oracc_reader.output_norm(with_line_headers=False)\n", | |
" big_text += ''.join(output)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"words_follow_moon = []\n", | |
"big_text_words = big_text.split()\n", | |
"for i, w in enumerate(big_text_words):\n", | |
" if w == \"Sin\":\n", | |
" words_follow_moon.append(' '.join(big_text_words[i+1:i+3]))\n", | |
"\n", | |
"words_follow_sun = []\n", | |
"for i, w in enumerate(big_text_words):\n", | |
" if w == \"Šamaš\":\n", | |
" words_follow_sun.append(' '.join(big_text_words[i+1:i+3]))\n", | |
" \n", | |
"words_follow_planets = []\n", | |
"for i, w in enumerate(big_text_words):\n", | |
" if w in [\"Sagmegar\", \"Dilbat\", \"Kayyamanu\", \"Šihṭu\", \"Ṣalbatanu\"]:\n", | |
" words_follow_planets.append(' '.join(big_text_words[i+1:i+3]))\n", | |
" \n", | |
"c_moon = collections.Counter(words_follow_moon)\n", | |
"c_sun = collections.Counter(words_follow_sun)\n", | |
"c_planets = collections.Counter(words_follow_planets)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Most common words following the moon:\n", | |
"Word Count\n", | |
"-------------- -------\n", | |
"u Šamaš 207\n", | |
"ina tāmartīšu 105\n", | |
"tarbāṣu ilmīma 84\n", | |
"ūm 01-KAM₂ 44\n", | |
"itti Šamaš 19\n", | |
"ina Simani 18\n", | |
"...\n", | |
"Total: 933\n", | |
"\n", | |
"Most common words following the sun:\n", | |
"Word Count\n", | |
"------------ -------\n", | |
"itti ahāmeš 89\n", | |
"šutātû šar 50\n", | |
"šitqulū māti 22\n", | |
"innammarma 1 14\n", | |
"lā uqqīma 13\n", | |
"šitqulū atmû 10\n", | |
"...\n", | |
"Total: 422\n", | |
"\n", | |
"Most common words following any of the planets:\n", | |
"Word Count\n", | |
"-------------- -------\n", | |
"ina libbi 26\n", | |
"ina libbīšu 21\n", | |
"x x 20\n", | |
"ina harrāni 10\n", | |
"ina erēb-šamši 8\n", | |
"ina Nisanni 7\n", | |
"...\n", | |
"Total: 315\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"Most common words following the moon:\")\n", | |
"print(tabulate.tabulate(c_moon.most_common()[:6] +\n", | |
" [('...','')] + [(\"Total:\", str(len(words_follow_moon)))],\n", | |
" headers=[\"Word\", \"Count\"]))\n", | |
"print()\n", | |
"print(\"Most common words following the sun:\")\n", | |
"print(tabulate.tabulate(c_sun.most_common()[:6] +\n", | |
" [('...','')] + [(\"Total:\", str(len(words_follow_sun)))],\n", | |
" headers=[\"Word\", \"Count\"]))\n", | |
"print()\n", | |
"print(\"Most common words following any of the planets:\")\n", | |
"print(tabulate.tabulate(c_planets.most_common()[:6] +\n", | |
" [('...','')] + [(\"Total:\", str(len(words_follow_planets)))],\n", | |
" headers=[\"Word\", \"Count\"]))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compare with: http://oracc.museum.upenn.edu/saao/saa08/P336558/html