Skip to content

Instantly share code, notes, and snippets.

@markomanninen
Last active January 31, 2018 14:54
Show Gist options
  • Save markomanninen/a68f200b4e98f018d7618dab0365ffe5 to your computer and use it in GitHub Desktop.
Save markomanninen/a68f200b4e98f018d7618dab0365ffe5 to your computer and use it in GitHub Desktop.
Import Greek corpora to the local file system with the cltk library, then parse Homer's Iliad into cards and lines using a betacode decoder
# copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
import unicodedata
# Beta Code symbol -> Unicode code point.  Letters are keyed by their
# upper-case ASCII form and map to the *lower-case* Greek letter; the
# combining marks map to Unicode combining characters.
map_b2u = {
    # --- letters -----------------------------------------------------
    "A": "\u03b1",  # alpha
    "B": "\u03b2",  # beta
    "C": "\u03be",  # xi
    "D": "\u03b4",  # delta
    "E": "\u03b5",  # epsilon
    "F": "\u03c6",  # phi
    "G": "\u03b3",  # gamma
    "H": "\u03b7",  # eta
    "I": "\u03b9",  # iota
    "K": "\u03ba",  # kappa
    "L": "\u03bb",  # lambda
    "M": "\u03bc",  # mu
    "N": "\u03bd",  # nu
    "O": "\u03bf",  # omicron
    "P": "\u03c0",  # pi
    "Q": "\u03b8",  # theta
    "R": "\u03c1",  # rho
    "S": "\u03c3",  # medial sigma (the converter special-cases sigma)
    "T": "\u03c4",  # tau
    "U": "\u03c5",  # upsilon
    "V": "\u03dd",  # digamma
    "W": "\u03c9",  # omega
    "X": "\u03c7",  # chi
    "Y": "\u03c8",  # psi
    "Z": "\u03b6",  # zeta
    # --- combining diacritics ----------------------------------------
    ")": "\u0313",  # smooth breathing
    "(": "\u0314",  # rough breathing
    "/": "\u0301",  # acute
    "=": "\u0342",  # circumflex
    "\\": "\u0300",  # grave
    "+": "\u0308",  # diaeresis
    "|": "\u0345",  # iota subscript
    "?": "\u0323",  # dot below
    # --- punctuation -------------------------------------------------
    ":": "\u00b7",  # middle dot
    "-": "\u2010",  # hyphen
    "_": "\u2014",  # em dash
}
# Digit that may follow "S" in Beta Code to force a specific sigma form.
map_b2u_sigma = {
    "1": "\u03c3",  # medial sigma
    "2": "\u03c2",  # final sigma
    "3": "\u03f2",  # lunate sigma
}
class Converter:
    """Translate Beta Code text into Unicode Greek, one character at a time.

    Feed text with :meth:`input`; read the accumulated result with
    ``str(converter)``.  Output is a sequence of base characters followed by
    their combining marks (no Unicode normalization is applied).

    State:
        result_chunks   -- list of output characters, in output order
        capitalize_next -- True after ``*`` until the next letter is emitted
        last_is_letter  -- True while the most recent base character emitted
                           was a letter (marks may attach to it)
        hold            -- combining marks seen before their letter; they are
                           emitted right after the next letter.  Marks still
                           held when the input ends are not part of ``str()``.
    """

    def __init__(self):
        self.result_chunks = []
        self.capitalize_next = False
        self.last_is_letter = False
        self.hold = []

    def input(self, betacode):
        """Convert the string *betacode* and append the result to the output.

        ``*`` capitalizes the next letter.  ``S`` is context sensitive:
        followed by ``1``/``2``/``3`` it yields the explicitly requested
        sigma form (the digit is consumed); at the end of the input or before
        a non-letter it yields final sigma; before a letter or an apostrophe
        (elision) it yields medial sigma.  Every other character is looked up
        case-insensitively in ``map_b2u`` and passed through unchanged when
        it has no mapping.
        """
        i = 0
        while i < len(betacode):
            c = betacode[i]
            if c == '*':
                self.capitalize_next = True
                i += 1
                continue
            if c.upper() == 'S':
                if i == len(betacode) - 1:
                    self.append_out('\u03c2')  # final sigma
                    i += 1
                    continue
                c2 = betacode[i + 1]
                if c2 in map_b2u_sigma:
                    # explicit sigma variant; consume the digit as well
                    self.append_out(map_b2u_sigma[c2])
                    i += 2
                    continue
                if c2 == "'" or self.is_letter(c2):
                    self.append_out('\u03c3')  # medial sigma
                else:
                    self.append_out('\u03c2')  # final sigma
                i += 1
                continue
            self.append_out(map_b2u.get(c.upper(), c))
            i += 1

    def append_out(self, c):
        """Emit the single already-translated character *c*.

        Letters are upper-cased when a ``*`` is pending and are followed by
        any held combining marks.  Combining marks attach to the previous
        letter, unless there is none or a ``*`` is pending -- then they are
        held for the next letter.  Anything else is emitted as-is and ends
        the current letter.
        """
        if self.is_letter(c):
            self.last_is_letter = True
            if self.capitalize_next:
                c = c.upper()
                self.capitalize_next = False
            self.result_chunks.append(c)
            # if any held accents, they go on this letter
            self.result_chunks.extend(self.hold)
            self.hold = []
        elif self.is_nonspacing_mark(c):
            # Beta Code writes the diacritics of a capital between the '*'
            # and the letter ("*)/A"), so once '*' has been seen the marks
            # belong to the upcoming letter even if a letter precedes them.
            if self.last_is_letter and not self.capitalize_next:
                self.result_chunks.append(c)
            else:
                # accents after a non-letter. hold them for the next letter
                self.hold.append(c)
        else:
            self.last_is_letter = False
            self.result_chunks.append(c)

    def is_letter(self, c):
        """Return True if the single character *c* is in a Unicode letter category (L*)."""
        return unicodedata.category(c)[0] == 'L'

    def is_nonspacing_mark(self, c):
        """Return True if the single character *c* has Unicode category Mn."""
        return unicodedata.category(c) == 'Mn'

    def __str__(self):
        """Return everything converted so far as one string."""
        return ''.join(self.result_chunks)
def betacode_to_unicode(betacode):
    """Return the Unicode Greek rendering of the Beta Code string *betacode*.

    A fresh Converter is used for every call, so no state carries over
    between calls.
    """
    converter = Converter()
    converter.input(betacode)
    unicode_text = str(converter)
    return unicode_text
# Requires the third-party cltk package: pip install cltk
from cltk.corpus.utils.importer import CorpusImporter
# Importer bound to the 'greek' language corpora.
corpus_importer = CorpusImporter('greek')
# Bare attribute access whose value is discarded here -- presumably left
# over from interactive use to display the available corpus names.
corpus_importer.list_corpora
# load greek corpora from perseus
corpus = "greek_text_perseus"
# `corpus` is also used further down to build the local corpus path.
corpus_importer.import_corpus(corpus)
# the greek betacode decoder must be importable as the local module `betacode` (betacode.py)
# copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
import os, re, betacode
from os.path import expanduser
home = expanduser("~")
# By default corpora are downloaded to the user's home directory under
# cltk_data.  `dire` is a %-template: the single '%s' is the author
# directory.  It is built with os.path.join so the platform's separator is
# used, and the empty last component keeps the trailing separator that
# callers rely on when they append a file name.
dire = os.path.join(home, 'cltk_data', 'greek', 'text', corpus, '%s', 'opensource', '')
def filter_empty(x):
    """Return True when the string *x* contains anything besides whitespace."""
    stripped = x.strip()
    return stripped != ""
def betadecode(x):
    """Decode the Beta Code string *x* via ``betacode.betacode_to_unicode``."""
    decoded = betacode.betacode_to_unicode(x)
    return decoded
def remove_tags(x):
    """Strip ``<...>`` markup from *x* and return the rest decoded from Beta Code."""
    # shortest run of non-'<' characters enclosed in angle brackets
    without_markup = re.sub('<[^<]+?>', '', x)
    return betadecode(without_markup)
def get_file_content(name, file):
    """Return the full text of corpus file *file* of author directory *name*.

    The path is the module-level template ``dire`` filled in with *name*,
    followed by *file*.  The file is decoded as UTF-8 explicitly so the
    result does not depend on the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open((dire % name) + file, 'r', encoding='utf-8') as f:
        return f.read()
def get_milestones(content):
    """Split corpus XML *content* into cards, each a list of decoded lines.

    The text is lower-cased, split on the paragraph milestone tag (falling
    back to the card milestone tag when that yields no split), and every
    section is broken into lines that are stripped of tags, decoded from
    Beta Code and filtered for emptiness.  The section before the first
    milestone (metadata) is not returned.
    """
    lowered = content.lower()
    # for homer iliad
    sections = lowered.split("<milestone ed=\"p\" unit=\"para\"/>")
    if len(sections) < 2:
        # could not split, try the other variant (homer odyssey)
        sections = lowered.split('<milestone n="1" unit="card" ed="p"/>')
    cards = []
    for section in sections:
        decoded_lines = [remove_tags(raw_line) for raw_line in section.split("\n")]
        cards.append([line for line in decoded_lines if filter_empty(line)])
    # first item is metadata, discard it
    return cards[1:]
# Parse the Iliad and print a few size statistics.
author = "Homer"
# show which files are available for this author
print(os.listdir(dire % author))
file = "hom.il_gk.xml"
ml = get_milestones(get_file_content(author, file))
print("number of cards: %s" % len(ml))
print("number of lines: %s" % sum(len(card) for card in ml))
# words are counted by splitting each line on single spaces
print("number of words: %s" % sum(len(line.split(" ")) for card in ml for line in card))
# characters are counted with spaces removed
print("number of chars: %s" % sum(len(line.replace(" ", "")) for card in ml for line in card))
"""
number of cards: 1049
number of lines: 15683
number of words: 111862
number of chars: 732954
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment