markomanninen · September 22, 2017 02:51
diff --git a/isopsephy.py b/isopsephy.py
 # -*- coding: utf-8 -*-

 import re

 data = {}

 """

 Data mapping between roman and greek letters, isopsephy values and linguistic components

 Resources:

 - http://www.perseus.tufts.edu/img/keyCaps.gif for source
 - http://www.class.uh.edu/mcl/faculty/pozzi/grnl1/intr/0.2.1.pract.vow.htm
 - http://en.wikipedia.org/wiki/Greek_alphabet
 - http://www.chlt.org/FirstGreekBook/JWW_FGB1.html
 - http://www.webtopos.gr/eng/languages/greek/alphabet/alpha.htm

 Segments:
 - vowel
 - consonant

 Subsegments:
 - semivowel (liquid, sibilant and γ-nasal not specified on data table)
 - double
 - mute

 Mutes (not specified on data table):

 {class-order} {letter}

 labial-smooth π
 labial-middle β
 labial-rough φ

 palatal-smooth κ
 palatal-middle γ
 palatal-rough χ

 lingual-smooth τ
 lingual-middle δ
 lingual-rough θ


 Seven vowels: a e h i o w u

 """

 # Letter table keyed by isopsephy value. Every entry is a dict holding the
 # lowercase glyph ('greek'), the uppercase glyph ('capital'), the unaccented
 # letter name ('name'), a 'segment' (vowel / consonant / numeral) and the
 # numeric 'value', which repeats the key. Alphabetic letters also carry a
 # 'subsegment' and the 'roman' transliteration letter. Variant glyphs and
 # alternative names are stored under 'small2'/'small3', 'capital2'/'capital3'
 # and 'name2'..'name4'.
 # NOTE(review): the roman letters (q=θ, c=ξ, x=χ, y=ψ, w=ω, h=η, f=φ) look
 # like the Beta Code keys used by Perseus -- confirm against the keyCaps
 # reference listed above.

 # letters from 1 to 9
 #alpha:http://en.wiktionary.org/wiki/ἄλφα
 data[1] = {'greek': 'α', 
           'capital': 'Α',
           'name': 'αλφα',
           'segment': 'vowel',
           'subsegment': 'short',
           'roman': 'a',
           'value': 1}
 #beta:http://en.wiktionary.org/wiki/βῆτα
 data[2] = {'greek': 'β',
           'capital': 'Β',
           'name': 'βητα',
           'segment': 'consonant',
           'subsegment': 'mute',
           'roman': 'b',
           'value': 2}
 #gamma:http://en.wiktionary.org/wiki/γάμμα
 data[3] = {'greek': 'γ',
           'capital': 'Γ',
           'name': 'γαμμα',
           'segment': 'consonant',
           'subsegment': 'mute',
           'roman': 'g',
           'value': 3}
 #delta:http://en.wiktionary.org/wiki/δέλτα
 data[4] = {'greek': 'δ',
           'capital': 'Δ',
           'name': 'δελτα',
           'segment': 'consonant',
           'subsegment': 'mute',
           'roman': 'd',
           'value': 4}
 #epsilon:http://en.wiktionary.org/wiki/epsilon
 data[5] = {'greek': 'ε',
           'capital': 'Ε',
           'name': 'ε ψιλον',
           'segment': 'vowel',
           'subsegment': 'short',
           'roman': 'e',
           'value': 5}
 #digamma/stigma/episemon/wau
 #http://en.wikipedia.org/wiki/Digamma
 # Numeral-only sign: 'subsegment' and 'roman' are left out (commented), so
 # this entry has no roman transliteration letter.
 data[6] = {'greek': 'ϛ', 'small2': 'ϝ',
           'capital': 'Ϛ', 'capital2': 'Ϝ',
           'name': 'διγαμμα', 'name2': 'στιγμα', 'name3': 'επισημον', 'name4': 'βαυ',
           'segment': 'numeral',
           #'subsegment': '',
           #'roman': '_',
           'value': 6}
 #zeta:http://en.wiktionary.org/wiki/ζῆτα
 data[7] = {'greek': 'ζ',
           'capital': 'Ζ',
           'name': 'ζητα',
           'segment': 'consonant',
           'subsegment': 'double',
           'roman': 'z',
           'value': 7}
 #eta:http://en.wiktionary.org/wiki/ἦτα
 data[8] = {'greek': 'η',
           'capital': 'Η',
           'name': 'ητα',
           'segment': 'vowel',
           'subsegment': 'long',
           'roman': 'h',
           'value': 8}
 #theta:http://en.wiktionary.org/wiki/θῆτα
 data[9] = {'greek': 'θ',
           'capital': 'Θ',
           'name': 'θητα',
           'segment': 'consonant',
           'subsegment': 'mute',
           'roman': 'q',
           'value': 9}

 # letters from 10 to 90
 #iota:http://en.wiktionary.org/wiki/ἰῶτα
 data[10] = {'greek': 'ι',
            'capital': 'Ι',
            'name': 'ιωτα',
            'segment': 'vowel',
            'subsegment': 'short',
            'roman': 'i',
            'value': 10}
 #kappa:http://en.wiktionary.org/wiki/κάππα
 data[20] = {'greek': 'κ',
            'capital': 'Κ',
            'name': 'καππα',
            'segment': 'consonant',
            'subsegment': 'mute',
            'roman': 'k',
            'value': 20}
 #lambda:http://en.wiktionary.org/wiki/λάμβδα
 data[30] = {'greek': 'λ',
            'capital': 'Λ',
            'name': 'λαμβδα',
            'segment': 'consonant',
            'subsegment': 'semivowel',
            'roman': 'l',
            'value': 30}
 #mu:http://en.wiktionary.org/wiki/mu
 data[40] = {'greek': 'μ',
            'capital': 'Μ',
            'name': 'μυ',
            'segment': 'consonant',
            'subsegment': 'semivowel',
            'roman': 'm',
            'value': 40}
 #nu:http://en.wiktionary.org/wiki/νῦ
 data[50] = {'greek': 'ν',
            'capital': 'Ν',
            'name': 'νυ',
            'segment': 'consonant',
            'subsegment': 'semivowel',
            'roman': 'n',
            'value': 50}
 #xi:http://en.wiktionary.org/wiki/ξεῖ
 data[60] = {'greek': 'ξ',
            'capital': 'Ξ',
            'name': 'ξει',
            'segment': 'consonant',
            'subsegment': 'double',
            'roman': 'c',
            'value': 60}
 #omicron:http://en.wiktionary.org/wiki/omicron
 data[70] = {'greek': 'ο',
            'capital': 'Ο',
            'name': 'ο μικρον',
            'segment': 'vowel',
            'subsegment': 'short',
            'roman': 'o',
            'value': 70}
 #pi:http://en.wiktionary.org/wiki/πεῖ
 data[80] = {'greek': 'π',
            'capital': 'Π',
            'name': 'πει',
            'segment': 'consonant',
            'subsegment': 'mute',
            'roman': 'p',
            'value': 80}
 #koppa:http://en.wikipedia.org/wiki/Koppa_(letter)
 #http://www.webtopos.gr/eng/languages/greek/alphabet/earlyletters.htm
 # Numeral-only sign: no 'subsegment' and no 'roman' letter.
 data[90] = {'greek': 'ϙ', 'small2': 'ϟ',
            'capital': 'Ϙ', 'capital2': 'Ϟ',
            'name': 'κοππα',
            'segment': 'numeral',
            #'subsegment': '',
            #'roman': '_',
            'value': 90}

 # letters from 100 to 900
 #rho:http://en.wiktionary.org/wiki/ῥῶ
 data[100] = {'greek': 'ρ',
             'capital': 'Ρ',
             'name': 'ρω',
             'segment': 'consonant',
             'subsegment': 'semivowel',
             'roman': 'r',
             'value': 100}
 #sigma:http://en.wiktionary.org/wiki/σίγμα
 # 'small2'/'capital2' hold the lunate sigma and 'small3' the final sigma;
 # 'capital3' repeats the 'capital' glyph.
 data[200] = {'greek': 'σ', 'small2': 'ϲ', 'small3': 'ς',
             'capital': 'Σ', 'capital2': 'Ϲ', 'capital3': 'Σ',
             'name': 'σιγμα',
             'segment': 'consonant',
             'subsegment': 'semivowel',
             'roman': 's',
             'value': 200}
 #tau:http://en.wiktionary.org/wiki/tau
 data[300] = {'greek': 'τ',
             'capital': 'Τ',
             'name': 'ταυ',
             'segment': 'consonant',
             'subsegment': 'mute',
             'roman': 't',
             'value': 300}
 #upsilon:http://en.wiktionary.org/wiki/upsilon
 data[400] = {'greek': 'υ',
             'capital': 'Υ',
             'name': 'υ ψιλον',
             'segment': 'vowel',
             'subsegment': 'short',
             'roman': 'u',
             'value': 400}
 #phi:http://en.wiktionary.org/wiki/phi
 data[500] = {'greek': 'φ',
             'capital': 'Φ',
             'name': 'φει',
             'segment': 'consonant',
             'subsegment': 'mute',
             'roman': 'f',
             'value': 500}
 #khi, chi:http://en.wiktionary.org/wiki/chi
 data[600] = {'greek': 'χ',
             'capital': 'Χ',
             'name': 'χει',
             'segment': 'consonant',
             'subsegment': 'mute',
             'roman': 'x',
             'value': 600}
 #psi:http://en.wiktionary.org/wiki/psi
 data[700] = {'greek': 'ψ',
             'capital': 'Ψ',
             'name': 'ψει',
             'segment': 'consonant',
             'subsegment': 'double',
             'roman': 'y',
             'value': 700}
 #omega:http://en.wiktionary.org/wiki/omega
 data[800] = {'greek': 'ω',
             'capital': 'Ω',
             'name': 'ω μεγα',
             'segment': 'vowel',
             'subsegment': 'long',
             'roman': 'w',
             'value': 800}
 #sampi/disigma
 #http://en.wikipedia.org/wiki/Sampi
 #http://www.tlg.uci.edu/~opoudjis/unicode/other_nonattic.html#sampi
 #http://www.parthia.com/fonts/sampi.htm
 #http://www.jstor.org/stable/636031
 # Numeral-only sign: no 'subsegment' and no 'roman' letter.
 data[900] = {'greek': 'ϡ', 'small2': 'ͳ',
             'capital': 'Ϡ', 'capital2': 'Ͳ',
             'name': 'σαμπι', 'name2': 'δισιγμα',
             'segment': 'numeral',
             #'subsegment': '',
             #'roman': '_',
             'value': 900}

 # Lookup tables derived from `data`:
 #   greek_roman_values  - any letter form named in `keys` (plus the uppercase
 #                         roman letter) -> isopsephy value
 #   greek_roman_letters - roman letter -> greek letter (case is preserved)
 #   roman_greek_letters - greek letter form -> roman letter (case is preserved)
 # NOTE: the last two names read the other way round from what they contain.
 greek_roman_values = {}
 greek_roman_letters = {}
 roman_greek_letters = {}

 keys = ['roman', 'greek', 'capital', 'capital2', 'small2', 'small3', 'small4']
 for num, d in data.items():
    for k in keys:
        # `in` replaces the deprecated dict.has_key() (gone in Python 3)
        if k not in d:
            continue
        greek_roman_values[d[k]] = num
        if k == 'roman':
            greek_roman_letters[d[k]] = d['greek']
            greek_roman_letters[d[k].upper()] = d['capital']
            greek_roman_values[d[k].upper()] = num
        elif 'roman' in d:
            # numeral-only signs have no 'roman' letter and are skipped here
            if k in ('capital', 'capital2'):
                roman_greek_letters[d[k]] = d['roman'].upper()
            else:
                roman_greek_letters[d[k]] = d['roman']

 # One alternation per table; each pattern matches exactly the table's keys.
 regex_greek_roman_values = re.compile('|'.join(greek_roman_values.keys()))
 regex_greek_to_roman_letters = re.compile('|'.join(roman_greek_letters.keys()))
 regex_roman_to_greek_letters = re.compile('|'.join(greek_roman_letters.keys()))

 def isopsephy(str):
    """
    Return the isopsephy (numerical) value of a word or sentence.

    Str is a roman letter representation (transliteration) of the greek word
    or sentence; greek letters work as well because greek_roman_values holds
    both. Every recognised letter adds its value, letter by letter. Any other
    character (space, digit, punctuation, unmapped letter) is ignored.
    """
    # Sum the matched letters directly. The earlier approach rewrote the text
    # into "value value ..." and summed int() of the whitespace-split tokens,
    # which counted digits already present in the input, turned "a-b" into
    # 1 + (-2) and raised ValueError on unmapped letters such as "j".
    return sum(greek_roman_values[letter]
               for letter in regex_greek_roman_values.findall(str))

 def to_roman(word):
    """
    Return ``word`` with its greek letters replaced by roman letters.

    Every key of roman_greek_letters (primary, capital, capital2, small2,
    small3 and small4 forms) is swapped for its roman letter; capital forms
    give capital roman letters. All other characters are left as they are.
    """
    def _roman_for(match):
        # look the matched greek glyph up in the greek -> roman table
        return roman_greek_letters[match.group()]

    return regex_greek_to_roman_letters.sub(_roman_for, word)

 def to_greek(word):
    """
    Return ``word`` with its roman letters replaced by greek letters.

    Covers a-z and A-Z apart from j, J, v and V, which have no greek
    counterpart in the table. Capital letters give capital greek letters.
    """
    def _greek_for(match):
        # look the matched roman letter up in the roman -> greek table
        return greek_roman_letters[match.group()]

    return regex_roman_to_greek_letters.sub(_greek_for, word)

 # For every letter entry store the isopsephy value of each recorded letter
 # name next to it: 'name' -> 'name_value', 'name2' -> 'name_value2', ...
 names = {'name': 'name_value', 'name2': 'name_value2', 'name3': 'name_value3', 'name4': 'name_value4'}
 for num, d in data.items():
    for k, v in names.items():
        # `in` replaces the deprecated dict.has_key() (gone in Python 3)
        if k in d:
            d[v] = isopsephy(d[k])

 # accents / diacritics for simplified greek letters
 # Maps a letter carrying accents, breathings, diaeresis or iota subscript to
 # its bare base letter. Capital forms map to the lowercase base as well.
 # NOTE(review): some keys below are assigned more than once. Entries that
 # look identical may still be distinct code points (e.g. tonos vs. oxia), so
 # none of them are removed here.
 accents = {}

 accents['Ὑ'] = 'υ'
 accents['Ὕ'] = 'υ'
 accents['ὖ'] = 'υ'
 accents['ῦ'] = 'υ'
 accents['ύ'] = 'υ'
 accents['ὗ'] = 'υ'
 accents['ὐ'] = 'υ'
 accents['ὑ'] = 'υ'
 accents['ϋ'] = 'υ'
 accents['ὔ'] = 'υ'
 accents['ὺ'] = 'υ'
 accents['ὕ'] = 'υ'
 accents['ὓ'] = 'υ'

 accents['Ἠ'] = 'η'
 accents['Ἦ'] = 'η'
 accents['Ἤ'] = 'η'
 accents['Ἡ'] = 'η'
 accents['ἠ'] = 'η'
 accents['ἦ'] = 'η'
 accents['ὴ'] = 'η'
 accents['ῇ'] = 'η'
 accents['ἡ'] = 'η'
 accents['ή'] = 'η'
 accents['ῃ'] = 'η'
 accents['ῆ'] = 'η'
 accents['ἥ'] = 'η'
 accents['ἢ'] = 'η'
 accents['ᾖ'] = 'η'
 accents['ἤ'] = 'η'
 accents['ῄ'] = 'η'
 accents['ᾗ'] = 'η'
 accents['ᾔ'] = 'η'
 accents['ἣ'] = 'η'
 accents['ἧ'] = 'η'
 accents['ᾐ'] = 'η'
 accents['ή'] = 'η'
 accents['ῇ'] = 'η'

 accents['Ἰ'] = 'ι'
 accents['Ἴ'] = 'ι'
 accents['Ἱ'] = 'ι'
 accents['ῖ'] = 'ι'
 accents['ί'] = 'ι'
 accents['ῖ'] = 'ι'
 accents['ὶ'] = 'ι'
 accents['ἰ'] = 'ι'
 accents['ἵ'] = 'ι'
 accents['ἴ'] = 'ι'
 accents['ἱ'] = 'ι'
 accents['ἶ'] = 'ι'
 accents['ΐ'] = 'ι'
 accents['ἷ'] = 'ι'
 accents['ϊ'] = 'ι'
 accents['ῒ'] = 'ι'
 accents['ἳ'] = 'ι'
 accents['ἳ'] = 'ι'
 accents['ί'] = 'ι'

 accents['Ὁ'] = 'ο'
 accents['Ὃ'] = 'ο'
 accents['Ὅ'] = 'ο'
 accents['Ὅ'] = 'ο'
 # This key used to carry a stray leading space (' Ὄ'). The regex built from
 # the keys then consumed the blank before the letter and glued words together.
 accents['Ὄ'] = 'ο'
 accents['Ὀ'] = 'ο'
 accents['Ὄ'] = 'ο'
 accents['ὁ'] = 'ο'
 accents['ὃ'] = 'ο'
 accents['ό'] = 'ο'
 accents['ὸ'] = 'ο'
 accents['ὅ'] = 'ο'
 accents['ὄ'] = 'ο'
 accents['ὀ'] = 'ο'

 accents['ὰ'] = 'α'
 accents['ά'] = 'α'
 accents['ἀ'] = 'α'
 accents['ᾳ'] = 'α'
 accents['ἄ'] = 'α'
 accents['ἂ'] = 'α'
 accents['ἃ'] = 'α'
 accents['ᾶ'] = 'α'
 accents['ᾷ'] = 'α'
 accents['ἅ'] = 'α'
 accents['ἁ'] = 'α'
 accents['ἆ'] = 'α'
 accents['ά'] = 'α'
 accents['ά'] = 'α'

 accents['Ὡ'] = 'ω'
 accents['ῷ'] = 'ω'
 accents['ῳ'] = 'ω'
 accents['ώ'] = 'ω'
 accents['ῶ'] = 'ω'
 accents['ώ'] = 'ω'
 accents['ὡ'] = 'ω'
 accents['ᾧ'] = 'ω'
 accents['ὥ'] = 'ω'
 accents['ὢ'] = 'ω'
 accents['ὼ'] = 'ω'
 accents['ὦ'] = 'ω'
 accents['ὧ'] = 'ω'
 accents['ὤ'] = 'ω'
 accents['ὠ'] = 'ω'
 accents['ῴ'] = 'ω'

 accents['Ἐ'] = 'ε'
 accents['Ἔ'] = 'ε'
 accents['Ἑ'] = 'ε'
 accents['Ἓ'] = 'ε'
 accents['Ἕ'] = 'ε'
 accents['έ'] = 'ε'
 accents['ἐ'] = 'ε'
 accents['ἔ'] = 'ε'
 accents['ἑ'] = 'ε'
 accents['ὲ'] = 'ε'
 accents['ἕ'] = 'ε'
 accents['ἐ'] = 'ε'
 accents['ἓ'] = 'ε'

 accents['Ἀ'] = 'α'
 accents['Ἁ'] = 'α'
 accents['Ἆ'] = 'α'
 accents['Ἄ'] = 'α'
 accents['Ἅ'] = 'α'
 accents['ά'] = 'α'

 accents['Ῥ'] = 'ρ'
 accents['ῥ'] = 'ρ'

 regex_roman = re.compile(r'[^a-z ]+')
 def preprocess_roman(str):
    """
    Drop every character that is not a lowercase a-z letter or a space.

    For example: a)a/atos tau1 -> aaatos tau
    """
    # Cutting the text at each unwanted run and gluing the pieces back
    # together is the same as deleting those runs.
    kept_pieces = regex_roman.split(str)
    return ''.join(kept_pieces)

 regex_greek = re.compile('|'.join(accents.keys()))
 def preprocess_greek(str):
    """
    Replace every letter listed in ``accents`` with its plain base letter.
    Characters that are not keys of ``accents`` pass through unchanged.
    """
    def _base_letter(match):
        # handle diacritics: matched glyph -> unaccented letter
        return accents[match.group()]

    return regex_greek.sub(_base_letter, str)
diff --git a/Perseus Greek Isopsephy Project.ipynb b/Perseus Greek Isopsephy Project.ipynb
diff --git a/perseus.py b/perseus.py
 # -*- coding: utf-8 -*-

 import re
 import pandas as pd
 from os.path import isfile
 from urllib import urlretrieve
 from lxml import etree
 from isopsephy import isopsephy, data, to_roman, to_greek, preprocess_roman, preprocess_greek

 #fileName = "Perseus_text_1999.04.0058.xml"

 def load_dataframe(fileName):
    """
    Load the word table for ``fileName`` (given without a file extension).

    When "<fileName>.csv" exists it is read with pandas and returned.
    Otherwise the work is handed to load_dataframe_xml(fileName).
    """
    csvFileName = "%s.csv" % fileName

    if isfile(csvFileName):
        # print(...) with one argument behaves the same on Python 2 and 3
        print("Retrieving data from local csv copy...")
        return pd.read_csv(csvFileName)
    return load_dataframe_xml(fileName)

 def load_dataframe_xml(fileName):
    """
    Build the word table from the Perseus XML dictionary "<fileName>.xml".

    The XML file is downloaded first when no local copy exists. For every
    text/body/div0/entry element this collects the entry id, the sanitized
    transliteration (taken from the 'key' attribute), its character count,
    its isopsephy value, the greek spelling and the '|'-joined translations
    of all its senses. The table is also saved to "<fileName>.csv".

    Returns a pandas DataFrame with the columns Id, Word, Transliteration,
    Translation, Isopsephy and CharCount.
    Re-raises whatever etree.parse raises when the XML cannot be parsed.
    """
    xmlFileName = "%s.xml" % fileName

    if isfile(xmlFileName):
        print("Retrieving data from local xml copy...")
    else:
        print("Downloading and saving data to the local copy...")
        urlretrieve("http://www.perseus.tufts.edu/hopper/dltext?doc=%s" % xmlFileName, xmlFileName)

    try:
        tree = etree.parse(xmlFileName)
    except Exception as e:
        # Report and re-raise. Carrying on would only fail a few lines later
        # with a NameError on the undefined `tree`.
        print(e)
        raise
    print("XML tree loaded for pursuit!")

    ids = []
    words = []
    transliterations = []
    translations = []
    isopsephies = []
    charcounts = []

    # perseus dictionary words are divided to alphabetic sections
    for div in tree.xpath("text/body/div0"):
        # entry is the main element
        for entry in div.xpath("entry"):
            # collects word ids
            ids.append(entry.attrib['id'])
            # sanitize transliterated word and add to the collection
            word = preprocess_roman(entry.attrib['key'])
            transliterations.append(word)
            # calculate characters
            charcounts.append(len(word))
            # get isopsephy value of the word (still transliterated)
            isopsephies.append(isopsephy(word))
            # create greek version of the word and add to the collection
            words.append(to_greek(word))
            # Get translations for the word. The list starts empty for every
            # entry and gathers all senses. Before, it was reset per sense
            # (only the last sense survived) and an entry without senses
            # reused the previous entry's list.
            trs = []
            for sense in entry.xpath("sense"):
                for tr in sense.xpath("trans/tr//text()"):
                    trs.append(tr.encode('utf-8'))
            translations.append('|'.join(trs))

    df = pd.DataFrame({'Id': ids,
                       'Word': words,
                       'Transliteration': transliterations,
                       'Translation': translations,
                       'Isopsephy': isopsephies,
                       'CharCount': charcounts
                        })

    df.to_csv("%s.csv" % fileName, index=False)

    print("Data collected from xml file and saved to csv")

    return df
	# -- coding: utf-8 --

	import re

	data = {}

	"""

	Data mapping between roman and greek letters, isopsephy values and linguistic components

	Resources:

	- http://www.perseus.tufts.edu/img/keyCaps.gif for source
	- http://www.class.uh.edu/mcl/faculty/pozzi/grnl1/intr/0.2.1.pract.vow.htm
	- http://en.wikipedia.org/wiki/Greek_alphabet
	- http://www.chlt.org/FirstGreekBook/JWW_FGB1.html
	- http://www.webtopos.gr/eng/languages/greek/alphabet/alpha.htm

	Segments:
	- vowel
	- consonant

	Subsegments:
	- semivowel (liquid, sibilant and γ-nasal not specified on data table)
	- double
	- mute

	Mutes (not specified on data table):

	{class-order} {letter}

	labial-smooth π
	labial-middle β
	labial-rough φ

	palatal-smooth κ
	palatal-middle γ
	palatal-rough χ

	lingual-smooth τ
	lingual-middle δ
	lingual-rough θ


	Seven vowels: a e h i o w u

	"""

	# Letter table keyed by isopsephy value. Every entry is a dict holding the
	# lowercase glyph ('greek'), the uppercase glyph ('capital'), the unaccented
	# letter name ('name'), a 'segment' (vowel / consonant / numeral) and the
	# numeric 'value', which repeats the key. Alphabetic letters also carry a
	# 'subsegment' and the 'roman' transliteration letter. Variant glyphs and
	# alternative names are stored under 'small2'/'small3', 'capital2'/'capital3'
	# and 'name2'..'name4'.
	# NOTE(review): the roman letters (q=θ, c=ξ, x=χ, y=ψ, w=ω, h=η, f=φ) look
	# like the Beta Code keys used by Perseus -- confirm against the keyCaps
	# reference listed above.

	# letters from 1 to 9
	#alpha:http://en.wiktionary.org/wiki/ἄλφα
	data[1] = {'greek': 'α',
	'capital': 'Α',
	'name': 'αλφα',
	'segment': 'vowel',
	'subsegment': 'short',
	'roman': 'a',
	'value': 1}
	#beta:http://en.wiktionary.org/wiki/βῆτα
	data[2] = {'greek': 'β',
	'capital': 'Β',
	'name': 'βητα',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'b',
	'value': 2}
	#gamma:http://en.wiktionary.org/wiki/γάμμα
	data[3] = {'greek': 'γ',
	'capital': 'Γ',
	'name': 'γαμμα',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'g',
	'value': 3}
	#delta:http://en.wiktionary.org/wiki/δέλτα
	data[4] = {'greek': 'δ',
	'capital': 'Δ',
	'name': 'δελτα',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'd',
	'value': 4}
	#epsilon:http://en.wiktionary.org/wiki/epsilon
	data[5] = {'greek': 'ε',
	'capital': 'Ε',
	'name': 'ε ψιλον',
	'segment': 'vowel',
	'subsegment': 'short',
	'roman': 'e',
	'value': 5}
	#digamma/stigma/episemon/wau
	#http://en.wikipedia.org/wiki/Digamma
	# Numeral-only sign: 'subsegment' and 'roman' are left out (commented), so
	# this entry has no roman transliteration letter.
	data[6] = {'greek': 'ϛ', 'small2': 'ϝ',
	'capital': 'Ϛ', 'capital2': 'Ϝ',
	'name': 'διγαμμα', 'name2': 'στιγμα', 'name3': 'επισημον', 'name4': 'βαυ',
	'segment': 'numeral',
	#'subsegment': '',
	#'roman': '_',
	'value': 6}
	#zeta:http://en.wiktionary.org/wiki/ζῆτα
	data[7] = {'greek': 'ζ',
	'capital': 'Ζ',
	'name': 'ζητα',
	'segment': 'consonant',
	'subsegment': 'double',
	'roman': 'z',
	'value': 7}
	#eta:http://en.wiktionary.org/wiki/ἦτα
	data[8] = {'greek': 'η',
	'capital': 'Η',
	'name': 'ητα',
	'segment': 'vowel',
	'subsegment': 'long',
	'roman': 'h',
	'value': 8}
	#theta:http://en.wiktionary.org/wiki/θῆτα
	data[9] = {'greek': 'θ',
	'capital': 'Θ',
	'name': 'θητα',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'q',
	'value': 9}

	# letters from 10 to 90
	#iota:http://en.wiktionary.org/wiki/ἰῶτα
	data[10] = {'greek': 'ι',
	'capital': 'Ι',
	'name': 'ιωτα',
	'segment': 'vowel',
	'subsegment': 'short',
	'roman': 'i',
	'value': 10}
	#kappa:http://en.wiktionary.org/wiki/κάππα
	data[20] = {'greek': 'κ',
	'capital': 'Κ',
	'name': 'καππα',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'k',
	'value': 20}
	#lambda:http://en.wiktionary.org/wiki/λάμβδα
	data[30] = {'greek': 'λ',
	'capital': 'Λ',
	'name': 'λαμβδα',
	'segment': 'consonant',
	'subsegment': 'semivowel',
	'roman': 'l',
	'value': 30}
	#mu:http://en.wiktionary.org/wiki/mu
	data[40] = {'greek': 'μ',
	'capital': 'Μ',
	'name': 'μυ',
	'segment': 'consonant',
	'subsegment': 'semivowel',
	'roman': 'm',
	'value': 40}
	#nu:http://en.wiktionary.org/wiki/νῦ
	data[50] = {'greek': 'ν',
	'capital': 'Ν',
	'name': 'νυ',
	'segment': 'consonant',
	'subsegment': 'semivowel',
	'roman': 'n',
	'value': 50}
	#xi:http://en.wiktionary.org/wiki/ξεῖ
	data[60] = {'greek': 'ξ',
	'capital': 'Ξ',
	'name': 'ξει',
	'segment': 'consonant',
	'subsegment': 'double',
	'roman': 'c',
	'value': 60}
	#omicron:http://en.wiktionary.org/wiki/omicron
	data[70] = {'greek': 'ο',
	'capital': 'Ο',
	'name': 'ο μικρον',
	'segment': 'vowel',
	'subsegment': 'short',
	'roman': 'o',
	'value': 70}
	#pi:http://en.wiktionary.org/wiki/πεῖ
	data[80] = {'greek': 'π',
	'capital': 'Π',
	'name': 'πει',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'p',
	'value': 80}
	#koppa:http://en.wikipedia.org/wiki/Koppa_(letter)
	#http://www.webtopos.gr/eng/languages/greek/alphabet/earlyletters.htm
	# Numeral-only sign: no 'subsegment' and no 'roman' letter.
	data[90] = {'greek': 'ϙ', 'small2': 'ϟ',
	'capital': 'Ϙ', 'capital2': 'Ϟ',
	'name': 'κοππα',
	'segment': 'numeral',
	#'subsegment': '',
	#'roman': '_',
	'value': 90}

	# letters from 100 to 900
	#rho:http://en.wiktionary.org/wiki/ῥῶ
	data[100] = {'greek': 'ρ',
	'capital': 'Ρ',
	'name': 'ρω',
	'segment': 'consonant',
	'subsegment': 'semivowel',
	'roman': 'r',
	'value': 100}
	#sigma:http://en.wiktionary.org/wiki/σίγμα
	# 'small2'/'capital2' hold the lunate sigma and 'small3' the final sigma;
	# 'capital3' repeats the 'capital' glyph.
	data[200] = {'greek': 'σ', 'small2': 'ϲ', 'small3': 'ς',
	'capital': 'Σ', 'capital2': 'Ϲ', 'capital3': 'Σ',
	'name': 'σιγμα',
	'segment': 'consonant',
	'subsegment': 'semivowel',
	'roman': 's',
	'value': 200}
	#tau:http://en.wiktionary.org/wiki/tau
	data[300] = {'greek': 'τ',
	'capital': 'Τ',
	'name': 'ταυ',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 't',
	'value': 300}
	#upsilon:http://en.wiktionary.org/wiki/upsilon
	data[400] = {'greek': 'υ',
	'capital': 'Υ',
	'name': 'υ ψιλον',
	'segment': 'vowel',
	'subsegment': 'short',
	'roman': 'u',
	'value': 400}
	#phi:http://en.wiktionary.org/wiki/phi
	data[500] = {'greek': 'φ',
	'capital': 'Φ',
	'name': 'φει',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'f',
	'value': 500}
	#khi, chi:http://en.wiktionary.org/wiki/chi
	data[600] = {'greek': 'χ',
	'capital': 'Χ',
	'name': 'χει',
	'segment': 'consonant',
	'subsegment': 'mute',
	'roman': 'x',
	'value': 600}
	#psi:http://en.wiktionary.org/wiki/psi
	data[700] = {'greek': 'ψ',
	'capital': 'Ψ',
	'name': 'ψει',
	'segment': 'consonant',
	'subsegment': 'double',
	'roman': 'y',
	'value': 700}
	#omega:http://en.wiktionary.org/wiki/omega
	data[800] = {'greek': 'ω',
	'capital': 'Ω',
	'name': 'ω μεγα',
	'segment': 'vowel',
	'subsegment': 'long',
	'roman': 'w',
	'value': 800}
	#sampi/disigma
	#http://en.wikipedia.org/wiki/Sampi
	#http://www.tlg.uci.edu/~opoudjis/unicode/other_nonattic.html#sampi
	#http://www.parthia.com/fonts/sampi.htm
	#http://www.jstor.org/stable/636031
	# Numeral-only sign: no 'subsegment' and no 'roman' letter.
	data[900] = {'greek': 'ϡ', 'small2': 'ͳ',
	'capital': 'Ϡ', 'capital2': 'Ͳ',
	'name': 'σαμπι', 'name2': 'δισιγμα',
	'segment': 'numeral',
	#'subsegment': '',
	#'roman': '_',
	'value': 900}

	# Lookup tables derived from `data`:
	#   greek_roman_values  - any letter form named in `keys` (plus the uppercase
	#                         roman letter) -> isopsephy value
	#   greek_roman_letters - roman letter -> greek letter (case is preserved)
	#   roman_greek_letters - greek letter form -> roman letter (case is preserved)
	# NOTE: the last two names read the other way round from what they contain.
	greek_roman_values = {}
	greek_roman_letters = {}
	roman_greek_letters = {}

	keys = ['roman', 'greek', 'capital', 'capital2', 'small2', 'small3', 'small4']
	for num, d in data.items():
	    for k in keys:
	        # `in` replaces the deprecated dict.has_key() (gone in Python 3)
	        if k not in d:
	            continue
	        greek_roman_values[d[k]] = num
	        if k == 'roman':
	            greek_roman_letters[d[k]] = d['greek']
	            greek_roman_letters[d[k].upper()] = d['capital']
	            greek_roman_values[d[k].upper()] = num
	        elif 'roman' in d:
	            # numeral-only signs have no 'roman' letter and are skipped here
	            if k in ('capital', 'capital2'):
	                roman_greek_letters[d[k]] = d['roman'].upper()
	            else:
	                roman_greek_letters[d[k]] = d['roman']

	# The separator must be a bare '|'. An escaped '\|' is a literal pipe in a
	# regex, so the pattern would never match a single letter.
	regex_greek_roman_values = re.compile('|'.join(greek_roman_values.keys()))
	regex_greek_to_roman_letters = re.compile('|'.join(roman_greek_letters.keys()))
	regex_roman_to_greek_letters = re.compile('|'.join(greek_roman_letters.keys()))

	def isopsephy(str):
	    """
	    Return the isopsephy (numerical) value of a word or sentence.

	    Str is a roman letter representation (transliteration) of the greek word
	    or sentence; greek letters work as well because greek_roman_values holds
	    both. Every recognised letter adds its value, letter by letter. Any other
	    character (space, digit, punctuation, unmapped letter) is ignored.
	    """
	    # Sum the matched letters directly. The earlier approach rewrote the text
	    # into "value value ..." and summed int() of the whitespace-split tokens,
	    # which counted digits already present in the input, turned "a-b" into
	    # 1 + (-2) and raised ValueError on unmapped letters such as "j".
	    return sum(greek_roman_values[letter]
	               for letter in regex_greek_roman_values.findall(str))

	def to_roman(word):
	    """
	    Create a roman letter version of the greek word.

	    This will change all greek (primary), capital, capital2, small2, small3
	    and small4 letters found in roman_greek_letters to roman letters.
	    Capital letters are honored; other characters are left unchanged.
	    """
	    # body re-nested under the def (the indentation had been lost)
	    return regex_greek_to_roman_letters.sub(lambda x: roman_greek_letters[x.group()], word)

	def to_greek(word):
	    """
	    Create a greek version of the roman letter word.

	    This will change a-zA-Z except j, J, v & V to the corresponding greek
	    letters. Capital letters are honored; other characters are left unchanged.
	    """
	    # body re-nested under the def (the indentation had been lost)
	    return regex_roman_to_greek_letters.sub(lambda x: greek_roman_letters[x.group()], word)

	# For every letter entry store the isopsephy value of each recorded letter
	# name next to it: 'name' -> 'name_value', 'name2' -> 'name_value2', ...
	names = {'name': 'name_value', 'name2': 'name_value2', 'name3': 'name_value3', 'name4': 'name_value4'}
	for num, d in data.items():
	    for k, v in names.items():
	        # `in` replaces the deprecated dict.has_key() (gone in Python 3)
	        if k in d:
	            d[v] = isopsephy(d[k])

	# accents / diacritics for simplified greek letters
	# Maps a letter carrying accents, breathings, diaeresis or iota subscript to
	# its bare base letter. Capital forms map to the lowercase base as well.
	# NOTE(review): some keys below are assigned more than once. Entries that
	# look identical may still be distinct code points (e.g. tonos vs. oxia), so
	# none of them are removed here.
	accents = {}

	accents['Ὑ'] = 'υ'
	accents['Ὕ'] = 'υ'
	accents['ὖ'] = 'υ'
	accents['ῦ'] = 'υ'
	accents['ύ'] = 'υ'
	accents['ὗ'] = 'υ'
	accents['ὐ'] = 'υ'
	accents['ὑ'] = 'υ'
	accents['ϋ'] = 'υ'
	accents['ὔ'] = 'υ'
	accents['ὺ'] = 'υ'
	accents['ὕ'] = 'υ'
	accents['ὓ'] = 'υ'

	accents['Ἠ'] = 'η'
	accents['Ἦ'] = 'η'
	accents['Ἤ'] = 'η'
	accents['Ἡ'] = 'η'
	accents['ἠ'] = 'η'
	accents['ἦ'] = 'η'
	accents['ὴ'] = 'η'
	accents['ῇ'] = 'η'
	accents['ἡ'] = 'η'
	accents['ή'] = 'η'
	accents['ῃ'] = 'η'
	accents['ῆ'] = 'η'
	accents['ἥ'] = 'η'
	accents['ἢ'] = 'η'
	accents['ᾖ'] = 'η'
	accents['ἤ'] = 'η'
	accents['ῄ'] = 'η'
	accents['ᾗ'] = 'η'
	accents['ᾔ'] = 'η'
	accents['ἣ'] = 'η'
	accents['ἧ'] = 'η'
	accents['ᾐ'] = 'η'
	accents['ή'] = 'η'
	accents['ῇ'] = 'η'

	accents['Ἰ'] = 'ι'
	accents['Ἴ'] = 'ι'
	accents['Ἱ'] = 'ι'
	accents['ῖ'] = 'ι'
	accents['ί'] = 'ι'
	accents['ῖ'] = 'ι'
	accents['ὶ'] = 'ι'
	accents['ἰ'] = 'ι'
	accents['ἵ'] = 'ι'
	accents['ἴ'] = 'ι'
	accents['ἱ'] = 'ι'
	accents['ἶ'] = 'ι'
	accents['ΐ'] = 'ι'
	accents['ἷ'] = 'ι'
	accents['ϊ'] = 'ι'
	accents['ῒ'] = 'ι'
	accents['ἳ'] = 'ι'
	accents['ἳ'] = 'ι'
	accents['ί'] = 'ι'

	accents['Ὁ'] = 'ο'
	accents['Ὃ'] = 'ο'
	accents['Ὅ'] = 'ο'
	accents['Ὅ'] = 'ο'
	# This key used to carry a stray leading space (' Ὄ'). The regex built from
	# the keys then consumed the blank before the letter and glued words together.
	accents['Ὄ'] = 'ο'
	accents['Ὀ'] = 'ο'
	accents['Ὄ'] = 'ο'
	accents['ὁ'] = 'ο'
	accents['ὃ'] = 'ο'
	accents['ό'] = 'ο'
	accents['ὸ'] = 'ο'
	accents['ὅ'] = 'ο'
	accents['ὄ'] = 'ο'
	accents['ὀ'] = 'ο'

	accents['ὰ'] = 'α'
	accents['ά'] = 'α'
	accents['ἀ'] = 'α'
	accents['ᾳ'] = 'α'
	accents['ἄ'] = 'α'
	accents['ἂ'] = 'α'
	accents['ἃ'] = 'α'
	accents['ᾶ'] = 'α'
	accents['ᾷ'] = 'α'
	accents['ἅ'] = 'α'
	accents['ἁ'] = 'α'
	accents['ἆ'] = 'α'
	accents['ά'] = 'α'
	accents['ά'] = 'α'

	accents['Ὡ'] = 'ω'
	accents['ῷ'] = 'ω'
	accents['ῳ'] = 'ω'
	accents['ώ'] = 'ω'
	accents['ῶ'] = 'ω'
	accents['ώ'] = 'ω'
	accents['ὡ'] = 'ω'
	accents['ᾧ'] = 'ω'
	accents['ὥ'] = 'ω'
	accents['ὢ'] = 'ω'
	accents['ὼ'] = 'ω'
	accents['ὦ'] = 'ω'
	accents['ὧ'] = 'ω'
	accents['ὤ'] = 'ω'
	accents['ὠ'] = 'ω'
	accents['ῴ'] = 'ω'

	accents['Ἐ'] = 'ε'
	accents['Ἔ'] = 'ε'
	accents['Ἑ'] = 'ε'
	accents['Ἓ'] = 'ε'
	accents['Ἕ'] = 'ε'
	accents['έ'] = 'ε'
	accents['ἐ'] = 'ε'
	accents['ἔ'] = 'ε'
	accents['ἑ'] = 'ε'
	accents['ὲ'] = 'ε'
	accents['ἕ'] = 'ε'
	accents['ἐ'] = 'ε'
	accents['ἓ'] = 'ε'

	accents['Ἀ'] = 'α'
	accents['Ἁ'] = 'α'
	accents['Ἆ'] = 'α'
	accents['Ἄ'] = 'α'
	accents['Ἅ'] = 'α'
	accents['ά'] = 'α'

	accents['Ῥ'] = 'ρ'
	accents['ῥ'] = 'ρ'

	regex_roman = re.compile(r'[^a-z ]+')
	def preprocess_roman(str):
	    """
	    Remove all special characters, leaving only a-z and spaces.

	    For example: a)a/atos tau1 -> aaatos tau
	    """
	    # body re-nested under the def (the indentation had been lost)
	    return regex_roman.sub('', str)

	# The separator must be a bare '|'. An escaped '\|' is a literal pipe in a
	# regex, so no single accented letter would ever match.
	regex_greek = re.compile('|'.join(accents.keys()))
	def preprocess_greek(str):
	    """
	    Handle diacritics: replace every letter listed in ``accents`` with its
	    plain base letter. Other characters pass through unchanged.
	    """
	    return regex_greek.sub(lambda x: accents[x.group()], str)
	# -- coding: utf-8 --

	import re
	import pandas as pd
	from os.path import isfile
	from urllib import urlretrieve
	from lxml import etree
	from isopsephy import isopsephy, data, to_roman, to_greek, preprocess_roman, preprocess_greek

	#fileName = "Perseus_text_1999.04.0058.xml"

	def load_dataframe(fileName):
	    """
	    Load the word table for ``fileName`` (given without a file extension).

	    When "<fileName>.csv" exists it is read with pandas and returned.
	    Otherwise the work is handed to load_dataframe_xml(fileName).
	    """
	    csvFileName = "%s.csv" % fileName

	    if isfile(csvFileName):
	        # print(...) with one argument behaves the same on Python 2 and 3
	        print("Retrieving data from local csv copy...")
	        return pd.read_csv(csvFileName)
	    return load_dataframe_xml(fileName)

	def load_dataframe_xml(fileName):
	    """
	    Build the word table from the Perseus XML dictionary "<fileName>.xml".

	    The XML file is downloaded first when no local copy exists. For every
	    text/body/div0/entry element this collects the entry id, the sanitized
	    transliteration (taken from the 'key' attribute), its character count,
	    its isopsephy value, the greek spelling and the '|'-joined translations
	    of all its senses. The table is also saved to "<fileName>.csv".

	    Returns a pandas DataFrame with the columns Id, Word, Transliteration,
	    Translation, Isopsephy and CharCount.
	    Re-raises whatever etree.parse raises when the XML cannot be parsed.
	    """
	    xmlFileName = "%s.xml" % fileName

	    if isfile(xmlFileName):
	        print("Retrieving data from local xml copy...")
	    else:
	        print("Downloading and saving data to the local copy...")
	        urlretrieve("http://www.perseus.tufts.edu/hopper/dltext?doc=%s" % xmlFileName, xmlFileName)

	    try:
	        tree = etree.parse(xmlFileName)
	    except Exception as e:
	        # Report and re-raise. Carrying on would only fail a few lines later
	        # with a NameError on the undefined `tree`.
	        print(e)
	        raise
	    print("XML tree loaded for pursuit!")

	    ids = []
	    words = []
	    transliterations = []
	    translations = []
	    isopsephies = []
	    charcounts = []

	    # perseus dictionary words are divided to alphabetic sections
	    for div in tree.xpath("text/body/div0"):
	        # entry is the main element
	        for entry in div.xpath("entry"):
	            # collects word ids
	            ids.append(entry.attrib['id'])
	            # sanitize transliterated word and add to the collection
	            word = preprocess_roman(entry.attrib['key'])
	            transliterations.append(word)
	            # calculate characters
	            charcounts.append(len(word))
	            # get isopsephy value of the word (still transliterated)
	            isopsephies.append(isopsephy(word))
	            # create greek version of the word and add to the collection
	            words.append(to_greek(word))
	            # Get translations for the word. The list starts empty for every
	            # entry and gathers all senses. Before, it was reset per sense
	            # (only the last sense survived) and an entry without senses
	            # reused the previous entry's list.
	            trs = []
	            for sense in entry.xpath("sense"):
	                for tr in sense.xpath("trans/tr//text()"):
	                    trs.append(tr.encode('utf-8'))
	            # plain '|' separator; the escaped '\|' would put a backslash
	            # into every saved translation
	            translations.append('|'.join(trs))

	    df = pd.DataFrame({'Id': ids,
	                       'Word': words,
	                       'Transliteration': transliterations,
	                       'Translation': translations,
	                       'Isopsephy': isopsephies,
	                       'CharCount': charcounts
	                       })

	    df.to_csv("%s.csv" % fileName, index=False)

	    print("Data collected from xml file and saved to csv")

	    return df