Created
July 9, 2018 17:38
-
-
Save willismonroe/c4f2aaa310062db3341d36eda1c325e6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter\n", | |
"from itertools import chain" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from Importer.file_importer import FileImport\n", | |
"from Importer.cdli_corpus import CDLICorpus\n", | |
"from ATFConverter.tokenizer import Tokenizer\n", | |
"from ATFConverter.atf_converter import ATFConverter" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Frequency analysis on the ARM1 corpus\n", | |
"These two cells show how to load and analyze a CDLI corpus file for most common words using the ARM1 corpus as an example. The stopwords can be customized to match the text as the researcher sees fit." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fi = FileImport('texts/ARM1Akkadian.txt')\n", | |
"fi.read_file()\n", | |
"cc = CDLICorpus()\n", | |
"cc.ingest_text_file(fi.file_lines)\n", | |
"tk = Tokenizer()\n", | |
"atf = ATFConverter()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 78, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('a-wi-lum', 133),\n", | |
" ('lu', 108),\n", | |
" ('be-el', 78),\n", | |
" ('_e2_', 76),\n", | |
" ('a-wi-lim', 65),\n", | |
" ('i-na-ad-di-in', 50),\n", | |
" ('šu-u₂', 45),\n", | |
" ('_a-ša3_', 44),\n", | |
" ('_ku₃-babbar_', 43),\n", | |
" ('ku₃-babbar_', 40)]" | |
] | |
}, | |
"execution_count": 78, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"stopwords = ['a-na', 'u3', 'sza', '[...]', 'i-na', '=',\n", | |
" 'ARM', '01,', 'lang', 'akk', 'um-ma', 'la',\n", | |
" 'u2-ul', 'mesz_', 'asz-szum', '0.1', 'broken',\n", | |
" 'isz-tu', '_lu2_', 'ki-a-am', '1(disz)', 'ki-ma',\n", | |
" 'x', 'sza-a-ti', 'the', '_lu2', '...]', 'lu-u2',\n", | |
" 'sza#', 'a-na#', '_u4', 'beginning', 'of', '2(disz)',\n", | |
" '[a-na', 'szum-ma', 'hi-a_', 'ana', 'a-di']\n", | |
"bag_of_words = []\n", | |
"for lines in [text['transliteration'][0] for text in cc.texts]:\n", | |
" for line in lines:\n", | |
" for word in tk.word_tokenizer(line):\n", | |
" if word[0] not in stopwords:\n", | |
" bag_of_words.append('-'.join(atf.process(word[0].split('-'))))\n", | |
"Counter(bag_of_words).most_common(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
    "# Frequency analysis on the law code of Hammurabi\n", | |
    "These two cells show how to load and analyze a CDLI corpus file for most common words using the law code of Hammurabi as an example. The stopwords can be customized to match the text as the researcher sees fit." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fi = FileImport('texts/Akkadian.txt')\n", | |
"fi.read_file()\n", | |
"cc = CDLICorpus()\n", | |
"cc.ingest_text_file(fi.file_lines)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('a-wi-lum', 133),\n", | |
" ('be-el', 78),\n", | |
" ('_e2_', 76),\n", | |
" ('a-wi-lim', 65),\n", | |
" ('i-na-ad-di-in', 50),\n", | |
" ('_a-ša3_', 44),\n", | |
" ('_ku₃-babbar_', 43),\n", | |
" ('ku₃-babbar_', 40),\n", | |
" ('šu-a-ti', 39),\n", | |
" ('_dumu-meš_', 36)]" | |
] | |
}, | |
"execution_count": 77, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"stopwords = ['a-na', 'szum-ma', 'i-na', 'u3', 'sza', 'la',\n", | |
" 'lu', 'u2-ul', 'szu-u2']\n", | |
"bag_of_words = []\n", | |
"for lines in [text['transliteration'][0] for text in cc.texts]:\n", | |
" for line in lines:\n", | |
" for word in tk.word_tokenizer(line):\n", | |
" if word[0] not in stopwords:\n", | |
" bag_of_words.append('-'.join(atf.process(word[0].split('-'))))\n", | |
"Counter(bag_of_words).most_common(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment