Last active
September 22, 2017 02:51
-
-
Save markomanninen/9984423 to your computer and use it in GitHub Desktop.
Perseus Greek Isopsephy Project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
# Data mapping between roman and greek letters, isopsephy values and
# linguistic components.
#
# Resources:
# - http://www.perseus.tufts.edu/img/keyCaps.gif for source
# - http://www.class.uh.edu/mcl/faculty/pozzi/grnl1/intr/0.2.1.pract.vow.htm
# - http://en.wikipedia.org/wiki/Greek_alphabet
# - http://www.chlt.org/FirstGreekBook/JWW_FGB1.html
# - http://www.webtopos.gr/eng/languages/greek/alphabet/alpha.htm
#
# Segments:
# - vowel
# - consonant
# - numeral (letters that only survive as number signs: 6, 90 and 900)
#
# Subsegments:
# - short / long (vowels)
# - semivowel (liquid, sibilant and γ-nasal are not specified in the table)
# - double
# - mute
#
# Mutes (class and order are not specified in the table):
#   {class-order}   {letter}
#   labial-smooth   π
#   labial-middle   β
#   labial-rough    φ
#   palatal-smooth  κ
#   palatal-middle  γ
#   palatal-rough   χ
#   lingual-smooth  τ
#   lingual-middle  δ
#   lingual-rough   θ
#
# Seven vowels (roman keys): a e h i o u w
#
# Every entry is keyed by its isopsephy value, which is repeated under the
# 'value' key. The numeral entries carry no 'subsegment' and no 'roman' key.
data = {
    # letters from 1 to 9
    # alpha: http://en.wiktionary.org/wiki/ἄλφα
    1: {'greek': 'α', 'capital': 'Α', 'name': 'αλφα',
        'segment': 'vowel', 'subsegment': 'short',
        'roman': 'a', 'value': 1},
    # beta: http://en.wiktionary.org/wiki/βῆτα
    2: {'greek': 'β', 'capital': 'Β', 'name': 'βητα',
        'segment': 'consonant', 'subsegment': 'mute',
        'roman': 'b', 'value': 2},
    # gamma: http://en.wiktionary.org/wiki/γάμμα
    3: {'greek': 'γ', 'capital': 'Γ', 'name': 'γαμμα',
        'segment': 'consonant', 'subsegment': 'mute',
        'roman': 'g', 'value': 3},
    # delta: http://en.wiktionary.org/wiki/δέλτα
    4: {'greek': 'δ', 'capital': 'Δ', 'name': 'δελτα',
        'segment': 'consonant', 'subsegment': 'mute',
        'roman': 'd', 'value': 4},
    # epsilon: http://en.wiktionary.org/wiki/epsilon
    5: {'greek': 'ε', 'capital': 'Ε', 'name': 'ε ψιλον',
        'segment': 'vowel', 'subsegment': 'short',
        'roman': 'e', 'value': 5},
    # digamma/stigma/episemon/wau: http://en.wikipedia.org/wiki/Digamma
    6: {'greek': 'ϛ', 'small2': 'ϝ',
        'capital': 'Ϛ', 'capital2': 'Ϝ',
        'name': 'διγαμμα', 'name2': 'στιγμα',
        'name3': 'επισημον', 'name4': 'βαυ',
        'segment': 'numeral', 'value': 6},
    # zeta: http://en.wiktionary.org/wiki/ζῆτα
    7: {'greek': 'ζ', 'capital': 'Ζ', 'name': 'ζητα',
        'segment': 'consonant', 'subsegment': 'double',
        'roman': 'z', 'value': 7},
    # eta: http://en.wiktionary.org/wiki/ἦτα
    8: {'greek': 'η', 'capital': 'Η', 'name': 'ητα',
        'segment': 'vowel', 'subsegment': 'long',
        'roman': 'h', 'value': 8},
    # theta: http://en.wiktionary.org/wiki/θῆτα
    9: {'greek': 'θ', 'capital': 'Θ', 'name': 'θητα',
        'segment': 'consonant', 'subsegment': 'mute',
        'roman': 'q', 'value': 9},

    # letters from 10 to 90
    # iota: http://en.wiktionary.org/wiki/ἰῶτα
    10: {'greek': 'ι', 'capital': 'Ι', 'name': 'ιωτα',
         'segment': 'vowel', 'subsegment': 'short',
         'roman': 'i', 'value': 10},
    # kappa: http://en.wiktionary.org/wiki/κάππα
    20: {'greek': 'κ', 'capital': 'Κ', 'name': 'καππα',
         'segment': 'consonant', 'subsegment': 'mute',
         'roman': 'k', 'value': 20},
    # lambda: http://en.wiktionary.org/wiki/λάμβδα
    30: {'greek': 'λ', 'capital': 'Λ', 'name': 'λαμβδα',
         'segment': 'consonant', 'subsegment': 'semivowel',
         'roman': 'l', 'value': 30},
    # mu: http://en.wiktionary.org/wiki/mu
    40: {'greek': 'μ', 'capital': 'Μ', 'name': 'μυ',
         'segment': 'consonant', 'subsegment': 'semivowel',
         'roman': 'm', 'value': 40},
    # nu: http://en.wiktionary.org/wiki/νῦ
    50: {'greek': 'ν', 'capital': 'Ν', 'name': 'νυ',
         'segment': 'consonant', 'subsegment': 'semivowel',
         'roman': 'n', 'value': 50},
    # xi: http://en.wiktionary.org/wiki/ξεῖ
    60: {'greek': 'ξ', 'capital': 'Ξ', 'name': 'ξει',
         'segment': 'consonant', 'subsegment': 'double',
         'roman': 'c', 'value': 60},
    # omicron: http://en.wiktionary.org/wiki/omicron
    70: {'greek': 'ο', 'capital': 'Ο', 'name': 'ο μικρον',
         'segment': 'vowel', 'subsegment': 'short',
         'roman': 'o', 'value': 70},
    # pi: http://en.wiktionary.org/wiki/πεῖ
    80: {'greek': 'π', 'capital': 'Π', 'name': 'πει',
         'segment': 'consonant', 'subsegment': 'mute',
         'roman': 'p', 'value': 80},
    # koppa: http://en.wikipedia.org/wiki/Koppa_(letter)
    # http://www.webtopos.gr/eng/languages/greek/alphabet/earlyletters.htm
    90: {'greek': 'ϙ', 'small2': 'ϟ',
         'capital': 'Ϙ', 'capital2': 'Ϟ',
         'name': 'κοππα',
         'segment': 'numeral', 'value': 90},

    # letters from 100 to 900
    # rho: http://en.wiktionary.org/wiki/ῥῶ
    100: {'greek': 'ρ', 'capital': 'Ρ', 'name': 'ρω',
          'segment': 'consonant', 'subsegment': 'semivowel',
          'roman': 'r', 'value': 100},
    # sigma: http://en.wiktionary.org/wiki/σίγμα
    200: {'greek': 'σ', 'small2': 'ϲ', 'small3': 'ς',
          'capital': 'Σ', 'capital2': 'Ϲ', 'capital3': 'Σ',
          'name': 'σιγμα',
          'segment': 'consonant', 'subsegment': 'semivowel',
          'roman': 's', 'value': 200},
    # tau: http://en.wiktionary.org/wiki/tau
    300: {'greek': 'τ', 'capital': 'Τ', 'name': 'ταυ',
          'segment': 'consonant', 'subsegment': 'mute',
          'roman': 't', 'value': 300},
    # upsilon: http://en.wiktionary.org/wiki/upsilon
    400: {'greek': 'υ', 'capital': 'Υ', 'name': 'υ ψιλον',
          'segment': 'vowel', 'subsegment': 'short',
          'roman': 'u', 'value': 400},
    # phi: http://en.wiktionary.org/wiki/phi
    500: {'greek': 'φ', 'capital': 'Φ', 'name': 'φει',
          'segment': 'consonant', 'subsegment': 'mute',
          'roman': 'f', 'value': 500},
    # khi, chi: http://en.wiktionary.org/wiki/chi
    600: {'greek': 'χ', 'capital': 'Χ', 'name': 'χει',
          'segment': 'consonant', 'subsegment': 'mute',
          'roman': 'x', 'value': 600},
    # psi: http://en.wiktionary.org/wiki/psi
    700: {'greek': 'ψ', 'capital': 'Ψ', 'name': 'ψει',
          'segment': 'consonant', 'subsegment': 'double',
          'roman': 'y', 'value': 700},
    # omega: http://en.wiktionary.org/wiki/omega
    800: {'greek': 'ω', 'capital': 'Ω', 'name': 'ω μεγα',
          'segment': 'vowel', 'subsegment': 'long',
          'roman': 'w', 'value': 800},
    # sampi/disigma: http://en.wikipedia.org/wiki/Sampi
    # http://www.tlg.uci.edu/~opoudjis/unicode/other_nonattic.html#sampi
    # http://www.parthia.com/fonts/sampi.htm
    # http://www.jstor.org/stable/636031
    900: {'greek': 'ϡ', 'small2': 'ͳ',
          'capital': 'Ϡ', 'capital2': 'Ͳ',
          'name': 'σαμπι', 'name2': 'δισιγμα',
          'segment': 'numeral', 'value': 900},
}
# Lookup tables derived from ``data`` (note that the two *_letters names read
# the opposite way round from what they hold):
#   greek_roman_values  - every known letter form, roman or greek and in
#                         either case, mapped to its isopsephy value
#   greek_roman_letters - roman letter -> greek letter
#   roman_greek_letters - greek letter form -> roman letter
greek_roman_values = {}
greek_roman_letters = {}
roman_greek_letters = {}
# entry keys that hold a letter form; 'capital3' is deliberately not listed
keys = ['roman', 'greek', 'capital', 'capital2', 'small2', 'small3', 'small4']
for num, d in data.items():
    for k in keys:
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3
        if k not in d:
            continue
        greek_roman_values[d[k]] = num
        if k == 'roman':
            # a roman letter stands for the primary greek form, in both cases
            greek_roman_letters[d[k]] = d['greek']
            greek_roman_letters[d[k].upper()] = d['capital']
            greek_roman_values[d[k].upper()] = num
        elif 'roman' in d:
            # greek forms of the numeral-only letters have no roman
            # counterpart and are therefore left out of this table
            if k in ('capital', 'capital2'):
                roman_greek_letters[d[k]] = d['roman'].upper()
            else:
                roman_greek_letters[d[k]] = d['roman']

# One alternation per table. The keys are escaped so that a future key
# containing a regex metacharacter cannot change the meaning of the pattern.
regex_greek_roman_values = re.compile(
    '|'.join(re.escape(key) for key in greek_roman_values))
regex_greek_to_roman_letters = re.compile(
    '|'.join(re.escape(key) for key in roman_greek_letters))
regex_roman_to_greek_letters = re.compile(
    '|'.join(re.escape(key) for key in greek_roman_letters))
def isopsephy(str):
    """
    Return the isopsephy (numerical) value of a word or sentence.

    ``str`` may be a roman letter transliteration or greek text. Every
    character that has an entry in ``greek_roman_values`` contributes its
    value and the values are summed letter by letter. All other characters
    (spaces, digits, punctuation, letters without a value such as j and v)
    are ignored; text without any known letter gives 0.
    """
    # Sum the matched letters directly. Rewriting the text into a string of
    # numbers and parsing it back made literal digits in the input count as
    # values and raised ValueError on any other unmapped character.
    return sum(greek_roman_values[letter]
               for letter in regex_greek_roman_values.findall(str))
def to_roman(word):
    """
    Transliterate a greek word to roman letters.

    Each greek letter form listed in ``roman_greek_letters`` (primary,
    capital, capital2, small2, small3 and small4 forms) is swapped for its
    roman letter. Capital letters stay capital; any other character is
    passed through unchanged.
    """
    def as_roman(match):
        return roman_greek_letters[match.group()]

    return regex_greek_to_roman_letters.sub(as_roman, word)
def to_greek(word):
    """
    Turn a roman letter (transliterated) word into greek letters.

    a-z and A-Z, except j, J, v and V, are swapped for the corresponding
    greek letter. Capital letters stay capital; any other character is
    passed through unchanged.
    """
    def as_greek(match):
        return greek_roman_letters[match.group()]

    return regex_roman_to_greek_letters.sub(as_greek, word)
# Store the isopsephy value of every letter name next to the name itself:
# 'name' -> 'name_value', 'name2' -> 'name_value2', and so on.
names = {'name': 'name_value', 'name2': 'name_value2', 'name3': 'name_value3', 'name4': 'name_value4'}
for num, d in data.items():
    for k, v in names.items():
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3
        if k in d:
            d[v] = isopsephy(d[k])
# accents / diacritics for simplified greek letters
#
# Each row pairs a plain lowercase letter with the accented forms that are
# reduced to it. Accented capitals are reduced to the lowercase letter too.
#
# NOTE(review): some forms appear more than once in a row. They are kept on
# purpose because look-alike glyphs may be distinct code points (for example
# the tonos and oxia variants of an accented vowel) - confirm before removing.
#
# The former key ' Ὄ' (with a leading space) is gone: it made the diacritics
# regex match "space + Ὄ" and swallow the word separator before that letter.
# The letter itself is still listed, without the space.
_ACCENTED_FORMS = (
    ('υ', ('Ὑ', 'Ὕ', 'ὖ', 'ῦ', 'ύ', 'ὗ', 'ὐ', 'ὑ', 'ϋ', 'ὔ', 'ὺ', 'ὕ', 'ὓ')),
    ('η', ('Ἠ', 'Ἦ', 'Ἤ', 'Ἡ', 'ἠ', 'ἦ', 'ὴ', 'ῇ', 'ἡ', 'ή', 'ῃ', 'ῆ',
           'ἥ', 'ἢ', 'ᾖ', 'ἤ', 'ῄ', 'ᾗ', 'ᾔ', 'ἣ', 'ἧ', 'ᾐ', 'ή', 'ῇ')),
    ('ι', ('Ἰ', 'Ἴ', 'Ἱ', 'ῖ', 'ί', 'ῖ', 'ὶ', 'ἰ', 'ἵ', 'ἴ', 'ἱ', 'ἶ',
           'ΐ', 'ἷ', 'ϊ', 'ῒ', 'ἳ', 'ἳ', 'ί')),
    ('ο', ('Ὁ', 'Ὃ', 'Ὅ', 'Ὅ', 'Ὀ', 'Ὄ', 'ὁ', 'ὃ', 'ό', 'ὸ', 'ὅ', 'ὄ', 'ὀ')),
    ('α', ('ὰ', 'ά', 'ἀ', 'ᾳ', 'ἄ', 'ἂ', 'ἃ', 'ᾶ', 'ᾷ', 'ἅ', 'ἁ', 'ἆ',
           'ά', 'ά')),
    ('ω', ('Ὡ', 'ῷ', 'ῳ', 'ώ', 'ῶ', 'ώ', 'ὡ', 'ᾧ', 'ὥ', 'ὢ', 'ὼ', 'ὦ',
           'ὧ', 'ὤ', 'ὠ', 'ῴ')),
    ('ε', ('Ἐ', 'Ἔ', 'Ἑ', 'Ἓ', 'Ἕ', 'έ', 'ἐ', 'ἔ', 'ἑ', 'ὲ', 'ἕ', 'ἐ', 'ἓ')),
    ('α', ('Ἀ', 'Ἁ', 'Ἆ', 'Ἄ', 'Ἅ', 'ά')),
    ('ρ', ('Ῥ', 'ῥ')),
)
# accented form -> plain lowercase letter
accents = {form: base for base, forms in _ACCENTED_FORMS for form in forms}
# matches every run of characters other than lowercase a-z and the space
regex_roman = re.compile(r'[^a-z ]+')


def preprocess_roman(str):
    """
    Strip a transliterated word down to lowercase a-z and spaces.

    Every other character (digits, punctuation, uppercase letters, ...) is
    removed, for example: "a)a/atos tau1" -> "aaatos tau".
    """
    cleaned = re.sub(regex_roman, '', str)
    return cleaned
# one alternation over every accented form listed in ``accents``
regex_greek = re.compile('|'.join(accents))


def preprocess_greek(str):
    """
    Replace accented greek letters with their plain counterparts.

    Every form that has an entry in ``accents`` is swapped for the letter it
    maps to; all other characters are passed through unchanged.
    """
    def plain_letter(match):
        return accents[match.group()]

    return regex_greek.sub(plain_letter, str)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import pandas as pd | |
from os.path import isfile | |
from urllib import urlretrieve | |
from lxml import etree | |
from isopsephy import isopsephy, data, to_roman, to_greek, preprocess_roman, preprocess_greek | |
#fileName = "Perseus_text_1999.04.0058.xml" | |
def load_dataframe(fileName):
    """
    Return the word DataFrame for the Perseus text ``fileName``.

    ``fileName`` is given without extension. When ``<fileName>.csv`` exists
    it is read and returned; otherwise the work is handed over to
    ``load_dataframe_xml(fileName)`` and its result is returned.
    """
    csvFileName = "%s.csv" % fileName
    if isfile(csvFileName):
        # single-argument print() prints the same text on Python 2 and 3
        print("Retrieving data from local csv copy...")
        return pd.read_csv(csvFileName)
    return load_dataframe_xml(fileName)
def load_dataframe_xml(fileName):
    """
    Build the word DataFrame from the Perseus XML file ``<fileName>.xml``.

    ``fileName`` is given without extension. When there is no local copy of
    the XML file it is first downloaded from the Perseus hopper and saved
    under that name. One row is produced for every ``entry`` element found
    under ``text/body/div0``, with the columns:

    - Id: the ``id`` attribute of the entry
    - Word: the greek spelling made from the transliteration
    - Transliteration: the ``key`` attribute reduced to a-z and spaces
    - Translation: every translation text of the entry, joined with ``|``
      (an empty string when the entry has none)
    - Isopsephy: isopsephy value of the transliteration
    - CharCount: length of the transliteration

    The frame is also written to ``<fileName>.csv`` and then returned.
    Errors raised while downloading or parsing the XML propagate to the
    caller.
    """
    xmlFileName = "%s.xml" % fileName
    if isfile(xmlFileName):
        print("Retrieving data from local xml copy...")
    else:
        print("Downloading and saving data to the local copy...")
        urlretrieve("http://www.perseus.tufts.edu/hopper/dltext?doc=%s" % xmlFileName, xmlFileName)
    # A parse failure is left to propagate: nothing below can work without
    # the tree, and printing the error only postponed the crash to a
    # confusing NameError on ``tree``.
    tree = etree.parse(xmlFileName)
    print("XML tree loaded for pursuit!")
    ids = []
    words = []
    transliterations = []
    translations = []
    isopsephies = []
    charcounts = []
    # perseus dictionary words are divided to alphabetic sections
    for div in tree.xpath("text/body/div0"):
        # entry is the main element
        for entry in div.xpath("entry"):
            # collects word ids
            ids.append(entry.attrib['id'])
            # sanitize transliterated word and add to the collection
            word = preprocess_roman(entry.attrib['key'])
            transliterations.append(word)
            # calculate characters
            charcounts.append(len(word))
            # get isopsephy value of the word (still transliterated)
            isopsephies.append(isopsephy(word))
            # create greek version of the word and add to the collection
            words.append(to_greek(word))
            # Gather the translations of ALL senses and add exactly one
            # value per entry, so this column always lines up with the
            # others no matter how many senses (zero included) an entry has.
            trs = []
            for sense in entry.xpath("sense"):
                for tr in sense.xpath("trans/tr//text()"):
                    # Python 2 hands out unicode objects for non-ASCII text,
                    # which are stored utf-8 encoded as before; on Python 3
                    # the text is already str and must stay joinable.
                    if not isinstance(tr, str):
                        tr = tr.encode('utf-8')
                    trs.append(tr)
            translations.append('|'.join(trs))
    df = pd.DataFrame({'Id': ids,
                       'Word': words,
                       'Transliteration': transliterations,
                       'Translation': translations,
                       'Isopsephy': isopsephies,
                       'CharCount': charcounts
                       })
    df.to_csv("%s.csv" % fileName, index=False)
    print("Data collected from xml file and saved to csv")
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment