willismonroe · June 3, 2018 05:44 · willismonroe · Jun 2, 2018
diff --git a/ORACC Reader.ipynb b/ORACC Reader.ipynb
diff --git a/oracc_reader.py b/oracc_reader.py
 import json, pprint

 class ORACC_text_reader:
     """Render the words of one ORACC JSON text as a list of line strings.

     The constructor parses the JSON and stores the list of text nodes in
     ``self.text``.  Every ``output_*`` method walks that list: a node whose
     ``'node'`` value is ``'d'`` and which carries a ``'label'`` starts a new
     output line, and every node whose ``'node'`` value is ``'l'``
     contributes one word to the current line.  All other nodes are ignored.
     """

     def __init__(self, json_string, DEBUG=False):
         """Parse *json_string* and locate the node list of the text.

         Args:
             json_string: the JSON document, as a string.
             DEBUG: when true, diagnostics are printed to stdout.

         Raises:
             ValueError: if *json_string* is not valid JSON (from
                 ``json.loads``).
         """
         self.DEBUG = DEBUG
         self.data = json.loads(json_string)
         # Always define the attribute so that the output_* methods work
         # (and yield an empty text) for documents without the expected
         # layout, instead of failing later with AttributeError.
         self.text = []
         try:
             for node in self.data['cdl'][0]['cdl']:
                 if 'cdl' in node:
                     # When several entries qualify, the last one wins.
                     self.text = node['cdl'][0]['cdl']
         except (KeyError, IndexError, TypeError):
             # Unexpected document shape: keep whatever was found so far.
             if self.DEBUG:
                 pprint.pprint(self.data)

     def _collect_lines(self, render_word, with_line_headers):
         """Assemble the output lines shared by all ``output_*`` methods.

         Args:
             render_word: callable taking an ``'l'`` node and returning the
                 text of that word, or ``None`` to add nothing for it.
             with_line_headers: when true, each line starts with the label
                 of the ``'d'`` node that opened it, and the words found
                 before the first label are prefixed with ``'o'``.

         Returns:
             A list of strings.  The first element always holds the words
             found before the first labelled line; each word is preceded
             by a single space.
         """
         output = []
         line = 'o' if with_line_headers else ''
         for node in self.text:
             kind = node.get('node')
             if kind == 'd' and 'label' in node:
                 # A labelled 'd' node closes the current line.
                 output.append(line)
                 line = node['label'] if with_line_headers else ''
             elif kind == 'l':
                 word = render_word(node)
                 if word is not None:
                     line += ' ' + word
         output.append(line)
         return output

     @staticmethod
     def _form_field(key):
         """Return a renderer for ``f[key]`` that falls back to ``f['form']``."""
         def render(node):
             fields = node['f']
             return fields[key] if key in fields else fields['form']
         return render

     def output_translit(self, with_line_headers=True):
         """Return the lines of the text built from each word's ``'frag'``."""
         return self._collect_lines(lambda node: node['frag'],
                                    with_line_headers)

     def output_norm(self, with_line_headers=True):
         """Return the lines built from ``f['norm']`` (else ``f['form']``)."""
         return self._collect_lines(self._form_field('norm'),
                                    with_line_headers)

     def output_sense(self, with_line_headers=True):
         """Return the lines built from ``f['sense']`` (else ``f['form']``)."""
         return self._collect_lines(self._form_field('sense'),
                                    with_line_headers)

     def _element_signs(self, element):
         """Return the cuneiform string of one gdl element.

         An element's own ``'gdl_utf8'`` is used when present; otherwise
         the members of its ``'seq'`` or ``'group'`` list are resolved
         recursively and concatenated (members that cannot be resolved
         contribute nothing).  Returns ``None`` when the element has none
         of these keys.
         """
         if 'gdl_utf8' in element:
             return element['gdl_utf8']
         for key in ('seq', 'group'):
             if key in element:
                 parts = (self._element_signs(child) for child in element[key])
                 return ''.join(part or '' for part in parts)
         return None

     def _word_signs(self, node):
         """Return the cuneiform signs of one ``'l'`` node.

         Returns ``None`` (nothing is added to the line) when the word has
         no gdl elements at all.  Diagnostics are printed only when
         ``self.DEBUG`` is set.
         """
         translit = node.get('frag', '')  # only used in debug messages
         gdl = node['f'].get('gdl', [])
         if not gdl:
             if self.DEBUG:
                 print("Can't process ✗")
                 pprint.pprint(node)
                 print()
             return None

         sign = ''
         for element in gdl:
             found = self._element_signs(element)
             if found is None:
                 # Element without usable sign data: skip it.
                 if self.DEBUG:
                     print("Error with multi-gdl ✗ {} = {}".format(translit, sign))
                     pprint.pprint(gdl)
             else:
                 sign += found

         if self.DEBUG:
             if len(gdl) == 1:
                 print("Single gdl ✓ {} = {}".format(translit, sign))
             else:
                 print("Multi-gdl ✓ {} = {}".format(translit, sign))
         return sign

     def output_cuneiform(self, with_line_headers=True):
         """Return the lines of the text rendered from ``f['gdl']`` sign data.

         Each word is the concatenation of the ``'gdl_utf8'`` strings found
         in its gdl elements, looking inside ``'seq'`` and ``'group'``
         members where an element has no ``'gdl_utf8'`` of its own.  Words
         with and without a single top-level gdl element are handled the
         same way.
         """
         return self._collect_lines(self._word_signs, with_line_headers)
diff --git a/SAA8.ipynb b/SAA8.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pprint\n",
    "import collections\n",
    "import tabulate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from oracc_reader import ORACC_text_reader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "big_text = ''\n",
    "for filename in os.listdir('corpusjson/'):\n",
    "    if filename.endswith(\".json\"):\n",
    "        oracc_reader = ORACC_text_reader(open('corpusjson/' + filename).read(), DEBUG=True)\n",
    "        output = oracc_reader.output_norm(with_line_headers=False)\n",
    "        big_text += ''.join(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "words_follow_moon = []\n",
    "big_text_words = big_text.split()\n",
    "for i, w in enumerate(big_text_words):\n",
    "    if w == \"Sin\":\n",
    "        words_follow_moon.append(' '.join(big_text_words[i+1:i+3]))\n",
    "\n",
    "words_follow_sun = []\n",
    "for i, w in enumerate(big_text_words):\n",
    "    if w == \"Šamaš\":\n",
    "        words_follow_sun.append(' '.join(big_text_words[i+1:i+3]))\n",
    "        \n",
    "words_follow_planets = []\n",
    "for i, w in enumerate(big_text_words):\n",
    "    if w in [\"Sagmegar\", \"Dilbat\", \"Kayyamanu\", \"Šihṭu\", \"Ṣalbatanu\"]:\n",
    "        words_follow_planets.append(' '.join(big_text_words[i+1:i+3]))\n",
    "        \n",
    "c_moon = collections.Counter(words_follow_moon)\n",
    "c_sun = collections.Counter(words_follow_sun)\n",
    "c_planets = collections.Counter(words_follow_planets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most common words following the moon:\n",
      "Word            Count\n",
      "--------------  -------\n",
      "u Šamaš         207\n",
      "ina tāmartīšu   105\n",
      "tarbāṣu ilmīma  84\n",
      "ūm 01-KAM₂      44\n",
      "itti Šamaš      19\n",
      "ina Simani      18\n",
      "...\n",
      "Total:          933\n",
      "\n",
      "Most common words following the sun:\n",
      "Word          Count\n",
      "------------  -------\n",
      "itti ahāmeš   89\n",
      "šutātû šar    50\n",
      "šitqulū māti  22\n",
      "innammarma 1  14\n",
      "lā uqqīma     13\n",
      "šitqulū atmû  10\n",
      "...\n",
      "Total:        422\n",
      "\n",
      "Most common words following any of the planets:\n",
      "Word            Count\n",
      "--------------  -------\n",
      "ina libbi       26\n",
      "ina libbīšu     21\n",
      "x x             20\n",
      "ina harrāni     10\n",
      "ina erēb-šamši  8\n",
      "ina Nisanni     7\n",
      "...\n",
      "Total:          315\n"
     ]
    }
   ],
   "source": [
    "print(\"Most common words following the moon:\")\n",
    "print(tabulate.tabulate(c_moon.most_common()[:6] +\n",
    "                        [('...','')] + [(\"Total:\", str(len(words_follow_moon)))],\n",
    "                        headers=[\"Word\", \"Count\"]))\n",
    "print()\n",
    "print(\"Most common words following the sun:\")\n",
    "print(tabulate.tabulate(c_sun.most_common()[:6] +\n",
    "                        [('...','')] + [(\"Total:\", str(len(words_follow_sun)))],\n",
    "                        headers=[\"Word\", \"Count\"]))\n",
    "print()\n",
    "print(\"Most common words following any of the planets:\")\n",
    "print(tabulate.tabulate(c_planets.most_common()[:6] +\n",
    "                        [('...','')] + [(\"Total:\", str(len(words_follow_planets)))],\n",
    "                        headers=[\"Word\", \"Count\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	import json, pprint

	class ORACC_text_reader:
	    """Render the words of one ORACC JSON text as a list of line strings.

	    The constructor parses the JSON and stores the list of text nodes in
	    ``self.text``.  Every ``output_*`` method walks that list: a node whose
	    ``'node'`` value is ``'d'`` and which carries a ``'label'`` starts a new
	    output line, and every node whose ``'node'`` value is ``'l'``
	    contributes one word to the current line.  All other nodes are ignored.
	    """

	    def __init__(self, json_string, DEBUG=False):
	        """Parse *json_string* and locate the node list of the text.

	        Args:
	            json_string: the JSON document, as a string.
	            DEBUG: when true, diagnostics are printed to stdout.

	        Raises:
	            ValueError: if *json_string* is not valid JSON (from
	                ``json.loads``).
	        """
	        self.DEBUG = DEBUG
	        self.data = json.loads(json_string)
	        # Always define the attribute so that the output_* methods work
	        # (and yield an empty text) for documents without the expected
	        # layout, instead of failing later with AttributeError.
	        self.text = []
	        try:
	            for node in self.data['cdl'][0]['cdl']:
	                if 'cdl' in node:
	                    # When several entries qualify, the last one wins.
	                    self.text = node['cdl'][0]['cdl']
	        except (KeyError, IndexError, TypeError):
	            # Unexpected document shape: keep whatever was found so far.
	            if self.DEBUG:
	                pprint.pprint(self.data)

	    def _collect_lines(self, render_word, with_line_headers):
	        """Assemble the output lines shared by all ``output_*`` methods.

	        Args:
	            render_word: callable taking an ``'l'`` node and returning the
	                text of that word, or ``None`` to add nothing for it.
	            with_line_headers: when true, each line starts with the label
	                of the ``'d'`` node that opened it, and the words found
	                before the first label are prefixed with ``'o'``.

	        Returns:
	            A list of strings.  The first element always holds the words
	            found before the first labelled line; each word is preceded
	            by a single space.
	        """
	        output = []
	        line = 'o' if with_line_headers else ''
	        for node in self.text:
	            kind = node.get('node')
	            if kind == 'd' and 'label' in node:
	                # A labelled 'd' node closes the current line.
	                output.append(line)
	                line = node['label'] if with_line_headers else ''
	            elif kind == 'l':
	                word = render_word(node)
	                if word is not None:
	                    line += ' ' + word
	        output.append(line)
	        return output

	    @staticmethod
	    def _form_field(key):
	        """Return a renderer for ``f[key]`` that falls back to ``f['form']``."""
	        def render(node):
	            fields = node['f']
	            return fields[key] if key in fields else fields['form']
	        return render

	    def output_translit(self, with_line_headers=True):
	        """Return the lines of the text built from each word's ``'frag'``."""
	        return self._collect_lines(lambda node: node['frag'],
	                                   with_line_headers)

	    def output_norm(self, with_line_headers=True):
	        """Return the lines built from ``f['norm']`` (else ``f['form']``)."""
	        return self._collect_lines(self._form_field('norm'),
	                                   with_line_headers)

	    def output_sense(self, with_line_headers=True):
	        """Return the lines built from ``f['sense']`` (else ``f['form']``)."""
	        return self._collect_lines(self._form_field('sense'),
	                                   with_line_headers)

	    def _element_signs(self, element):
	        """Return the cuneiform string of one gdl element.

	        An element's own ``'gdl_utf8'`` is used when present; otherwise
	        the members of its ``'seq'`` or ``'group'`` list are resolved
	        recursively and concatenated (members that cannot be resolved
	        contribute nothing).  Returns ``None`` when the element has none
	        of these keys.
	        """
	        if 'gdl_utf8' in element:
	            return element['gdl_utf8']
	        for key in ('seq', 'group'):
	            if key in element:
	                parts = (self._element_signs(child) for child in element[key])
	                return ''.join(part or '' for part in parts)
	        return None

	    def _word_signs(self, node):
	        """Return the cuneiform signs of one ``'l'`` node.

	        Returns ``None`` (nothing is added to the line) when the word has
	        no gdl elements at all.  Diagnostics are printed only when
	        ``self.DEBUG`` is set.
	        """
	        translit = node.get('frag', '')  # only used in debug messages
	        gdl = node['f'].get('gdl', [])
	        if not gdl:
	            if self.DEBUG:
	                print("Can't process ✗")
	                pprint.pprint(node)
	                print()
	            return None

	        sign = ''
	        for element in gdl:
	            found = self._element_signs(element)
	            if found is None:
	                # Element without usable sign data: skip it.
	                if self.DEBUG:
	                    print("Error with multi-gdl ✗ {} = {}".format(translit, sign))
	                    pprint.pprint(gdl)
	            else:
	                sign += found

	        if self.DEBUG:
	            if len(gdl) == 1:
	                print("Single gdl ✓ {} = {}".format(translit, sign))
	            else:
	                print("Multi-gdl ✓ {} = {}".format(translit, sign))
	        return sign

	    def output_cuneiform(self, with_line_headers=True):
	        """Return the lines of the text rendered from ``f['gdl']`` sign data.

	        Each word is the concatenation of the ``'gdl_utf8'`` strings found
	        in its gdl elements, looking inside ``'seq'`` and ``'group'``
	        members where an element has no ``'gdl_utf8'`` of its own.  Words
	        with and without a single top-level gdl element are handled the
	        same way.
	        """
	        return self._collect_lines(self._word_signs, with_line_headers)
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import json\n",
	"import pprint\n",
	"import collections\n",
	"import tabulate"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"from oracc_reader import ORACC_text_reader"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"big_text = ''\n",
	"for filename in os.listdir('corpusjson/'):\n",
	" if filename.endswith(\".json\"):\n",
	" oracc_reader = ORACC_text_reader(open('corpusjson/' + filename).read(), DEBUG=True)\n",
	" output = oracc_reader.output_norm(with_line_headers=False)\n",
	" big_text += ''.join(output)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"words_follow_moon = []\n",
	"big_text_words = big_text.split()\n",
	"for i, w in enumerate(big_text_words):\n",
	" if w == \"Sin\":\n",
	" words_follow_moon.append(' '.join(big_text_words[i+1:i+3]))\n",
	"\n",
	"words_follow_sun = []\n",
	"for i, w in enumerate(big_text_words):\n",
	" if w == \"Šamaš\":\n",
	" words_follow_sun.append(' '.join(big_text_words[i+1:i+3]))\n",
	" \n",
	"words_follow_planets = []\n",
	"for i, w in enumerate(big_text_words):\n",
	" if w in [\"Sagmegar\", \"Dilbat\", \"Kayyamanu\", \"Šihṭu\", \"Ṣalbatanu\"]:\n",
	" words_follow_planets.append(' '.join(big_text_words[i+1:i+3]))\n",
	" \n",
	"c_moon = collections.Counter(words_follow_moon)\n",
	"c_sun = collections.Counter(words_follow_sun)\n",
	"c_planets = collections.Counter(words_follow_planets)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Most common words following the moon:\n",
	"Word Count\n",
	"-------------- -------\n",
	"u Šamaš 207\n",
	"ina tāmartīšu 105\n",
	"tarbāṣu ilmīma 84\n",
	"ūm 01-KAM₂ 44\n",
	"itti Šamaš 19\n",
	"ina Simani 18\n",
	"...\n",
	"Total: 933\n",
	"\n",
	"Most common words following the sun:\n",
	"Word Count\n",
	"------------ -------\n",
	"itti ahāmeš 89\n",
	"šutātû šar 50\n",
	"šitqulū māti 22\n",
	"innammarma 1 14\n",
	"lā uqqīma 13\n",
	"šitqulū atmû 10\n",
	"...\n",
	"Total: 422\n",
	"\n",
	"Most common words following any of the planets:\n",
	"Word Count\n",
	"-------------- -------\n",
	"ina libbi 26\n",
	"ina libbīšu 21\n",
	"x x 20\n",
	"ina harrāni 10\n",
	"ina erēb-šamši 8\n",
	"ina Nisanni 7\n",
	"...\n",
	"Total: 315\n"
	]
	}
	],
	"source": [
	"print(\"Most common words following the moon:\")\n",
	"print(tabulate.tabulate(c_moon.most_common()[:6] +\n",
	" [('...','')] + [(\"Total:\", str(len(words_follow_moon)))],\n",
	" headers=[\"Word\", \"Count\"]))\n",
	"print()\n",
	"print(\"Most common words following the sun:\")\n",
	"print(tabulate.tabulate(c_sun.most_common()[:6] +\n",
	" [('...','')] + [(\"Total:\", str(len(words_follow_sun)))],\n",
	" headers=[\"Word\", \"Count\"]))\n",
	"print()\n",
	"print(\"Most common words following any of the planets:\")\n",
	"print(tabulate.tabulate(c_planets.most_common()[:6] +\n",
	" [('...','')] + [(\"Total:\", str(len(words_follow_planets)))],\n",
	" headers=[\"Word\", \"Count\"]))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}