Skip to content

Instantly share code, notes, and snippets.

@markomanninen
Last active September 22, 2017 02:51
Show Gist options
  • Save markomanninen/9984423 to your computer and use it in GitHub Desktop.
Save markomanninen/9984423 to your computer and use it in GitHub Desktop.
Perseus Greek Isopsephy Project
# -*- coding: utf-8 -*-
import re
# Letter records keyed by isopsephy value (1-9, 10-90, 100-900).
data = {}
"""
Data mapping between roman and greek letters, isopsephy values and linguistic components
Resources:
- http://www.perseus.tufts.edu/img/keyCaps.gif for source
- http://www.class.uh.edu/mcl/faculty/pozzi/grnl1/intr/0.2.1.pract.vow.htm
- http://en.wikipedia.org/wiki/Greek_alphabet
- http://www.chlt.org/FirstGreekBook/JWW_FGB1.html
- http://www.webtopos.gr/eng/languages/greek/alphabet/alpha.htm
Segments:
- vowel
- consonant
Subsegments:
- semivowel (liquid, sibilant and γ-nasal are not specified in the data table)
- double
- mute
Mutes (not specified in the data table):
{class-order} {letter}
labial-smooth π
labial-middle β
labial-rough φ
palatal-smooth κ
palatal-middle γ
palatal-rough χ
lingual-smooth τ
lingual-middle δ
lingual-rough θ
Seven vowels: a e h i o w u
"""
# Units: the letters valued 1 to 9.
# Every record repeats its own key under 'value'.
data.update({
    # alpha: http://en.wiktionary.org/wiki/ἄλφα
    1: {'value': 1, 'roman': 'a', 'greek': 'α', 'capital': 'Α',
        'name': 'αλφα', 'segment': 'vowel', 'subsegment': 'short'},
    # beta: http://en.wiktionary.org/wiki/βῆτα
    2: {'value': 2, 'roman': 'b', 'greek': 'β', 'capital': 'Β',
        'name': 'βητα', 'segment': 'consonant', 'subsegment': 'mute'},
    # gamma: http://en.wiktionary.org/wiki/γάμμα
    3: {'value': 3, 'roman': 'g', 'greek': 'γ', 'capital': 'Γ',
        'name': 'γαμμα', 'segment': 'consonant', 'subsegment': 'mute'},
    # delta: http://en.wiktionary.org/wiki/δέλτα
    4: {'value': 4, 'roman': 'd', 'greek': 'δ', 'capital': 'Δ',
        'name': 'δελτα', 'segment': 'consonant', 'subsegment': 'mute'},
    # epsilon: http://en.wiktionary.org/wiki/epsilon
    5: {'value': 5, 'roman': 'e', 'greek': 'ε', 'capital': 'Ε',
        'name': 'ε ψιλον', 'segment': 'vowel', 'subsegment': 'short'},
    # digamma / stigma / episemon / wau: http://en.wikipedia.org/wiki/Digamma
    # Numeral sign: the record has no 'roman' and no 'subsegment' key.
    6: {'value': 6, 'greek': 'ϛ', 'small2': 'ϝ',
        'capital': 'Ϛ', 'capital2': 'Ϝ',
        'name': 'διγαμμα', 'name2': 'στιγμα', 'name3': 'επισημον', 'name4': 'βαυ',
        'segment': 'numeral'},
    # zeta: http://en.wiktionary.org/wiki/ζῆτα
    7: {'value': 7, 'roman': 'z', 'greek': 'ζ', 'capital': 'Ζ',
        'name': 'ζητα', 'segment': 'consonant', 'subsegment': 'double'},
    # eta: http://en.wiktionary.org/wiki/ἦτα
    8: {'value': 8, 'roman': 'h', 'greek': 'η', 'capital': 'Η',
        'name': 'ητα', 'segment': 'vowel', 'subsegment': 'long'},
    # theta: http://en.wiktionary.org/wiki/θῆτα
    9: {'value': 9, 'roman': 'q', 'greek': 'θ', 'capital': 'Θ',
        'name': 'θητα', 'segment': 'consonant', 'subsegment': 'mute'},
})
# Tens: the letters valued 10 to 90.
# Every record repeats its own key under 'value'.
data.update({
    # iota: http://en.wiktionary.org/wiki/ἰῶτα
    10: {'value': 10, 'roman': 'i', 'greek': 'ι', 'capital': 'Ι',
         'name': 'ιωτα', 'segment': 'vowel', 'subsegment': 'short'},
    # kappa: http://en.wiktionary.org/wiki/κάππα
    20: {'value': 20, 'roman': 'k', 'greek': 'κ', 'capital': 'Κ',
         'name': 'καππα', 'segment': 'consonant', 'subsegment': 'mute'},
    # lambda: http://en.wiktionary.org/wiki/λάμβδα
    30: {'value': 30, 'roman': 'l', 'greek': 'λ', 'capital': 'Λ',
         'name': 'λαμβδα', 'segment': 'consonant', 'subsegment': 'semivowel'},
    # mu: http://en.wiktionary.org/wiki/mu
    40: {'value': 40, 'roman': 'm', 'greek': 'μ', 'capital': 'Μ',
         'name': 'μυ', 'segment': 'consonant', 'subsegment': 'semivowel'},
    # nu: http://en.wiktionary.org/wiki/νῦ
    50: {'value': 50, 'roman': 'n', 'greek': 'ν', 'capital': 'Ν',
         'name': 'νυ', 'segment': 'consonant', 'subsegment': 'semivowel'},
    # xi: http://en.wiktionary.org/wiki/ξεῖ
    60: {'value': 60, 'roman': 'c', 'greek': 'ξ', 'capital': 'Ξ',
         'name': 'ξει', 'segment': 'consonant', 'subsegment': 'double'},
    # omicron: http://en.wiktionary.org/wiki/omicron
    70: {'value': 70, 'roman': 'o', 'greek': 'ο', 'capital': 'Ο',
         'name': 'ο μικρον', 'segment': 'vowel', 'subsegment': 'short'},
    # pi: http://en.wiktionary.org/wiki/πεῖ
    80: {'value': 80, 'roman': 'p', 'greek': 'π', 'capital': 'Π',
         'name': 'πει', 'segment': 'consonant', 'subsegment': 'mute'},
    # koppa: http://en.wikipedia.org/wiki/Koppa_(letter)
    # http://www.webtopos.gr/eng/languages/greek/alphabet/earlyletters.htm
    # Numeral sign: the record has no 'roman' and no 'subsegment' key.
    90: {'value': 90, 'greek': 'ϙ', 'small2': 'ϟ',
         'capital': 'Ϙ', 'capital2': 'Ϟ',
         'name': 'κοππα',
         'segment': 'numeral'},
})
# Hundreds: the letters valued 100 to 900.
# Every record repeats its own key under 'value'.
data.update({
    # rho: http://en.wiktionary.org/wiki/ῥῶ
    100: {'value': 100, 'roman': 'r', 'greek': 'ρ', 'capital': 'Ρ',
          'name': 'ρω', 'segment': 'consonant', 'subsegment': 'semivowel'},
    # sigma: http://en.wiktionary.org/wiki/σίγμα
    200: {'value': 200, 'roman': 's',
          'greek': 'σ', 'small2': 'ϲ', 'small3': 'ς',
          'capital': 'Σ', 'capital2': 'Ϲ', 'capital3': 'Σ',
          'name': 'σιγμα', 'segment': 'consonant', 'subsegment': 'semivowel'},
    # tau: http://en.wiktionary.org/wiki/tau
    300: {'value': 300, 'roman': 't', 'greek': 'τ', 'capital': 'Τ',
          'name': 'ταυ', 'segment': 'consonant', 'subsegment': 'mute'},
    # upsilon: http://en.wiktionary.org/wiki/upsilon
    400: {'value': 400, 'roman': 'u', 'greek': 'υ', 'capital': 'Υ',
          'name': 'υ ψιλον', 'segment': 'vowel', 'subsegment': 'short'},
    # phi: http://en.wiktionary.org/wiki/phi
    500: {'value': 500, 'roman': 'f', 'greek': 'φ', 'capital': 'Φ',
          'name': 'φει', 'segment': 'consonant', 'subsegment': 'mute'},
    # khi, chi: http://en.wiktionary.org/wiki/chi
    600: {'value': 600, 'roman': 'x', 'greek': 'χ', 'capital': 'Χ',
          'name': 'χει', 'segment': 'consonant', 'subsegment': 'mute'},
    # psi: http://en.wiktionary.org/wiki/psi
    700: {'value': 700, 'roman': 'y', 'greek': 'ψ', 'capital': 'Ψ',
          'name': 'ψει', 'segment': 'consonant', 'subsegment': 'double'},
    # omega: http://en.wiktionary.org/wiki/omega
    800: {'value': 800, 'roman': 'w', 'greek': 'ω', 'capital': 'Ω',
          'name': 'ω μεγα', 'segment': 'vowel', 'subsegment': 'long'},
    # sampi / disigma
    # http://en.wikipedia.org/wiki/Sampi
    # http://www.tlg.uci.edu/~opoudjis/unicode/other_nonattic.html#sampi
    # http://www.parthia.com/fonts/sampi.htm
    # http://www.jstor.org/stable/636031
    # Numeral sign: the record has no 'roman' and no 'subsegment' key.
    900: {'value': 900, 'greek': 'ϡ', 'small2': 'ͳ',
          'capital': 'Ϡ', 'capital2': 'Ͳ',
          'name': 'σαμπι', 'name2': 'δισιγμα',
          'segment': 'numeral'},
})
# Lookup tables derived from the letter records in `data`.
# NOTE: the two *_letters names read backwards relative to what they hold;
# they are kept as they are because other code refers to them by name.
# any letter form (roman or greek, lower or upper case) -> isopsephy value
greek_roman_values = {}
# roman letter -> greek letter, case preserved
greek_roman_letters = {}
# greek letter form -> roman letter, case preserved
roman_greek_letters = {}
# record fields that hold a single letter form; 'capital3' is not listed,
# so it takes no part in the tables
keys = ['roman', 'greek', 'capital', 'capital2', 'small2', 'small3', 'small4']
for num, d in data.items():
    for k in keys:
        # `in` replaces dict.has_key(), which no longer exists in Python 3
        if k not in d:
            continue
        greek_roman_values[d[k]] = num
        if k == 'roman':
            greek_roman_letters[d[k]] = d['greek']
            greek_roman_letters[d[k].upper()] = d['capital']
            greek_roman_values[d[k].upper()] = num
        elif 'roman' in d:
            # numeral signs have no roman letter and are skipped here
            if k in ('capital', 'capital2'):
                roman_greek_letters[d[k]] = d['roman'].upper()
            else:
                roman_greek_letters[d[k]] = d['roman']
# one alternation per table, matching any single key of that table
regex_greek_roman_values = re.compile('|'.join(greek_roman_values.keys()))
regex_greek_to_roman_letters = re.compile('|'.join(roman_greek_letters.keys()))
regex_roman_to_greek_letters = re.compile('|'.join(greek_roman_letters.keys()))
def isopsephy(str):
    """
    Return the isopsephy (numerical) value of a word or sentence.

    Str is a roman letter representation (transliteration) of the greek word
    or sentence, or the greek text itself; every letter found in
    greek_roman_values contributes its value to the sum.

    Characters without a value (spaces, digits, punctuation and unmapped
    letters such as j and v) are ignored. An input without any known letter
    gives 0.
    """
    # Sum the matched letters directly. Rewriting the text into numbers and
    # parsing it back failed on any leftover character and glued stray digits
    # onto neighbouring values.
    return sum(greek_roman_values[letter]
               for letter in regex_greek_roman_values.findall(str))
def to_roman(word):
    """
    Transliterate the greek letters of word into roman letters.

    Every greek letter form held in roman_greek_letters (primary, capital,
    capital2, small2, small3 and small4) is replaced by its roman letter;
    capital forms give upper case roman letters. All other characters are
    left as they are.
    """
    def _romanize(match):
        return roman_greek_letters[match.group()]

    return regex_greek_to_roman_letters.sub(_romanize, word)
def to_greek(word):
    """
    Turn a roman letter (transliterated) word into greek letters.

    The letters a-z and A-Z, apart from j, J, v and V, are replaced by the
    corresponding greek letters; upper case gives greek capitals. All other
    characters are left as they are.
    """
    def _hellenize(match):
        return greek_roman_letters[match.group()]

    return regex_roman_to_greek_letters.sub(_hellenize, word)
# For each letter record, store the isopsephy value of every name it carries
# under the matching name_value key (name -> name_value, name2 -> name_value2, ...).
names = {'name': 'name_value', 'name2': 'name_value2', 'name3': 'name_value3', 'name4': 'name_value4'}
for num, d in data.items():
    for k, v in names.items():
        # `in` replaces dict.has_key(), which no longer exists in Python 3
        if k in d:
            d[v] = isopsephy(d[k])
# accents / diacritics for simplified greek letters
#
# Maps each letter form carrying diacritics to its plain lower case base
# letter (capital forms are mapped to the lower case letter as well).
#
# The table used to hold one more key: a space followed by 'Ὄ'. It is left
# out because the pattern built from these keys would then also consume the
# space in front of a word starting with that letter.
#
# NOTE(review): several forms below look identical. They may be distinct
# code points (e.g. tonos vs. oxia), so none of them has been pruned.
accents = {}
for _plain, _variants in (
    ('υ', ('Ὑ', 'Ὕ', 'ὖ', 'ῦ', 'ύ', 'ὗ', 'ὐ', 'ὑ', 'ϋ', 'ὔ', 'ὺ', 'ὕ', 'ὓ')),
    ('η', ('Ἠ', 'Ἦ', 'Ἤ', 'Ἡ', 'ἠ', 'ἦ', 'ὴ', 'ῇ', 'ἡ', 'ή', 'ῃ', 'ῆ',
           'ἥ', 'ἢ', 'ᾖ', 'ἤ', 'ῄ', 'ᾗ', 'ᾔ', 'ἣ', 'ἧ', 'ᾐ', 'ή', 'ῇ')),
    ('ι', ('Ἰ', 'Ἴ', 'Ἱ', 'ῖ', 'ί', 'ῖ', 'ὶ', 'ἰ', 'ἵ', 'ἴ', 'ἱ', 'ἶ',
           'ΐ', 'ἷ', 'ϊ', 'ῒ', 'ἳ', 'ἳ', 'ί')),
    ('ο', ('Ὁ', 'Ὃ', 'Ὅ', 'Ὅ', 'Ὀ', 'Ὄ', 'ὁ', 'ὃ', 'ό', 'ὸ', 'ὅ', 'ὄ', 'ὀ')),
    ('α', ('ὰ', 'ά', 'ἀ', 'ᾳ', 'ἄ', 'ἂ', 'ἃ', 'ᾶ', 'ᾷ', 'ἅ', 'ἁ', 'ἆ', 'ά', 'ά')),
    ('ω', ('Ὡ', 'ῷ', 'ῳ', 'ώ', 'ῶ', 'ώ', 'ὡ', 'ᾧ', 'ὥ', 'ὢ', 'ὼ', 'ὦ',
           'ὧ', 'ὤ', 'ὠ', 'ῴ')),
    ('ε', ('Ἐ', 'Ἔ', 'Ἑ', 'Ἓ', 'Ἕ', 'έ', 'ἐ', 'ἔ', 'ἑ', 'ὲ', 'ἕ', 'ἐ', 'ἓ')),
    ('α', ('Ἀ', 'Ἁ', 'Ἆ', 'Ἄ', 'Ἅ', 'ά')),
    ('ρ', ('Ῥ', 'ῥ')),
):
    for _variant in _variants:
        accents[_variant] = _plain
# matches every run of characters other than a-z and the space
regex_roman = re.compile(r'[^a-z ]+')


def preprocess_roman(str):
    """
    Strip everything except the lower case letters a-z and spaces from str.

    For example: a)a/atos tau1 -> aaatos tau
    """
    cleaned = regex_roman.sub('', str)
    return cleaned
# one alternation over every key of the accents table
regex_greek = re.compile('|'.join(accents))


def preprocess_greek(str):
    """
    Replace each letter form found in the accents table with the plain
    letter it is mapped to; all other characters are left as they are.
    """
    def _strip_accent(match):
        return accents[match.group()]

    return regex_greek.sub(_strip_accent, str)
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# -*- coding: utf-8 -*-
import re
import pandas as pd
from os.path import isfile
from urllib import urlretrieve
from lxml import etree
from isopsephy import isopsephy, data, to_roman, to_greek, preprocess_roman, preprocess_greek
#fileName = "Perseus_text_1999.04.0058.xml"
def load_dataframe(fileName):
    """
    Return the word table for fileName as a pandas DataFrame.

    fileName is the base name without extension. When "<fileName>.csv"
    exists it is read and returned; otherwise the work is handed over to
    load_dataframe_xml(fileName).
    """
    csvFileName = "%s.csv" % fileName
    if isfile(csvFileName):
        # print(...) with a single string prints the same text on Python 2 and 3
        print("Retrieving data from local csv copy...")
        return pd.read_csv(csvFileName)
    return load_dataframe_xml(fileName)
def load_dataframe_xml(fileName):
    """
    Build the word table from the Perseus dictionary file "<fileName>.xml".

    fileName is the base name without extension. When no local copy of the
    xml file exists it is downloaded from www.perseus.tufts.edu first.
    Each dictionary entry gives one row with the columns Id, Word,
    Transliteration, Translation, Isopsephy and CharCount. The table is
    written to "<fileName>.csv" and returned as a pandas DataFrame.

    Any error raised while parsing the xml file is printed and re-raised.
    """
    xmlFileName = "%s.xml" % fileName
    # print(...) with a single string prints the same text on Python 2 and 3
    if isfile(xmlFileName):
        print("Retrieving data from local xml copy...")
    else:
        print("Downloading and saving data to the local copy...")
        urlretrieve("http://www.perseus.tufts.edu/hopper/dltext?doc=%s" % xmlFileName, xmlFileName)
    try:
        tree = etree.parse(xmlFileName)
    except Exception as e:
        # Nothing below can work without a parsed tree, so report the
        # problem and let the caller see the real error.
        print(e)
        raise
    print("XML tree loaded for pursuit!")

    ids = []
    words = []
    transliterations = []
    translations = []
    isopsephies = []
    charcounts = []
    # perseus dictionary words are divided to alphabetic sections
    for div in tree.xpath("text/body/div0"):
        # entry is the main element
        for entry in div.xpath("entry"):
            # collect the word id
            ids.append(entry.attrib['id'])
            # sanitize the transliterated word and add it to the collection
            word = preprocess_roman(entry.attrib['key'])
            transliterations.append(word)
            # count characters
            charcounts.append(len(word))
            # isopsephy value of the (still transliterated) word
            isopsephies.append(isopsephy(word))
            # greek version of the word
            words.append(to_greek(word))
            # Gather the translations of all senses and add exactly one item
            # per entry, so that every column ends up with the same length
            # (an entry without any sense gives an empty string).
            trs = []
            for sense in entry.xpath("sense"):
                for tr in sense.xpath("trans/tr//text()"):
                    trs.append(tr.encode('utf-8'))
            translations.append('|'.join(trs))
    df = pd.DataFrame({'Id': ids,
                       'Word': words,
                       'Transliteration': transliterations,
                       'Translation': translations,
                       'Isopsephy': isopsephies,
                       'CharCount': charcounts
                       })
    df.to_csv("%s.csv" % fileName, index=False)
    print("Data collected from xml file and saved to csv")
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment