Last active
January 31, 2018 14:54
-
-
Save markomanninen/a68f200b4e98f018d7618dab0365ffe5 to your computer and use it in GitHub Desktop.
Import Greek corpora to the local file system with the cltk library, then parse Homer's Iliad into cards and lines using a Beta Code decoder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
import unicodedata | |
# Beta Code character -> Unicode replacement. Keys are the upper-case Beta
# Code symbols; the converter upper-cases its input before looking them up.
map_b2u = {
    # letters
    'A': '\N{GREEK SMALL LETTER ALPHA}',
    'B': '\N{GREEK SMALL LETTER BETA}',
    'C': '\N{GREEK SMALL LETTER XI}',
    'D': '\N{GREEK SMALL LETTER DELTA}',
    'E': '\N{GREEK SMALL LETTER EPSILON}',
    'F': '\N{GREEK SMALL LETTER PHI}',
    'G': '\N{GREEK SMALL LETTER GAMMA}',
    'H': '\N{GREEK SMALL LETTER ETA}',
    'I': '\N{GREEK SMALL LETTER IOTA}',
    'K': '\N{GREEK SMALL LETTER KAPPA}',
    'L': '\N{GREEK SMALL LETTER LAMDA}',
    'M': '\N{GREEK SMALL LETTER MU}',
    'N': '\N{GREEK SMALL LETTER NU}',
    'O': '\N{GREEK SMALL LETTER OMICRON}',
    'P': '\N{GREEK SMALL LETTER PI}',
    'Q': '\N{GREEK SMALL LETTER THETA}',
    'R': '\N{GREEK SMALL LETTER RHO}',
    # medial sigma; the converter special-cases S to pick the right form
    'S': '\N{GREEK SMALL LETTER SIGMA}',
    'T': '\N{GREEK SMALL LETTER TAU}',
    'U': '\N{GREEK SMALL LETTER UPSILON}',
    'V': '\N{GREEK SMALL LETTER DIGAMMA}',
    'W': '\N{GREEK SMALL LETTER OMEGA}',
    'X': '\N{GREEK SMALL LETTER CHI}',
    'Y': '\N{GREEK SMALL LETTER PSI}',
    'Z': '\N{GREEK SMALL LETTER ZETA}',
    # combining diacritics
    ')': '\N{COMBINING COMMA ABOVE}',           # smooth breathing
    '(': '\N{COMBINING REVERSED COMMA ABOVE}',  # rough breathing
    '/': '\N{COMBINING ACUTE ACCENT}',          # acute
    '=': '\N{COMBINING GREEK PERISPOMENI}',     # circumflex
    '\\': '\N{COMBINING GRAVE ACCENT}',         # grave
    '+': '\N{COMBINING DIAERESIS}',             # diaeresis
    '|': '\N{COMBINING GREEK YPOGEGRAMMENI}',   # iota subscript
    '?': '\N{COMBINING DOT BELOW}',             # dot below
    # punctuation
    ':': '\N{MIDDLE DOT}',
    '-': '\N{HYPHEN}',
    '_': '\N{EM DASH}',
}
# Explicit sigma forms: the digit that follows an S in Beta Code selects
# which sigma glyph is produced.
map_b2u_sigma = {
    '1': '\N{GREEK SMALL LETTER SIGMA}',        # medial sigma
    '2': '\N{GREEK SMALL LETTER FINAL SIGMA}',  # final sigma
    '3': '\N{GREEK LUNATE SIGMA SYMBOL}',       # lunate sigma
}
class Converter:
    """Stateful translator from Beta Code to Unicode Greek.

    Feed text through ``input`` and read the accumulated result with
    ``str()``.  A base letter is always emitted before its combining marks,
    so the output is in decomposed form.
    """

    def __init__(self):
        # Output pieces in emission order; joined by __str__.
        self.result_chunks = []
        # Set by '*': the next letter emitted is upper-cased.
        self.capitalize_next = False
        # True while combining marks may attach to the letter already emitted.
        self.last_is_letter = False
        # Combining marks seen before their base letter; flushed right after
        # the next letter is emitted.
        self.hold = []

    def input(self, betacode):
        """Translate ``betacode`` and append the result to the output.

        ``*`` capitalizes the next letter, ``S`` is resolved to the proper
        sigma form, and every other character is looked up (upper-cased) in
        ``map_b2u``; characters missing from the table pass through as-is.
        """
        length = len(betacode)
        i = 0
        while i < length:
            c = betacode[i]
            if c == '*':
                self.capitalize_next = True
                # Marks written after '*' belong to the capital that follows,
                # never to a letter emitted before the '*', so make sure they
                # are held instead of being attached to the previous letter.
                self.last_is_letter = False
                i += 1
                continue
            if c.upper() == 'S':
                # Slice yields '' when the S is the last input character.
                i += self._append_sigma(betacode[i + 1:i + 2])
                continue
            self.append_out(map_b2u.get(c.upper(), c))
            i += 1

    def _append_sigma(self, following):
        """Emit the sigma form implied by ``following``.

        ``following`` is the input character right after the S, or ``''`` at
        end of input.  Returns the number of input characters consumed.
        """
        if following and following in map_b2u_sigma:
            # Explicit form selector (S1/S2/S3): consume the digit as well.
            self.append_out(map_b2u_sigma[following])
            return 2
        if following == "'" or (following and self.is_letter(following)):
            # Inside a word, or before an apostrophe: medial sigma.
            self.append_out('\u03c3')
            return 1
        # End of input or followed by a non-letter: final sigma.
        self.append_out('\u03c2')
        return 1

    def append_out(self, c):
        """Append one translated character, placing combining marks correctly.

        Letters are capitalized if requested and followed by any held marks;
        combining marks attach to the previous letter or are held for the
        next one; anything else is appended and ends the current letter.
        """
        if self.is_letter(c):
            self.last_is_letter = True
            if self.capitalize_next:
                c = c.upper()
                self.capitalize_next = False
            self.result_chunks.append(c)
            # if any held accents, they go on this letter
            self.result_chunks.extend(self.hold)
            self.hold = []
        elif self.is_nonspacing_mark(c):
            if self.last_is_letter:
                self.result_chunks.append(c)
            else:
                # accents after a non-letter. hold them for the next letter
                self.hold.append(c)
        else:
            self.last_is_letter = False
            self.result_chunks.append(c)

    def is_letter(self, c):
        """Return True if ``c`` is in a Unicode letter category (L*)."""
        return unicodedata.category(c)[0] == 'L'

    def is_nonspacing_mark(self, c):
        """Return True if ``c`` is a Unicode nonspacing mark (Mn)."""
        return unicodedata.category(c) == 'Mn'

    def __str__(self):
        """Return everything emitted so far as a single string."""
        return ''.join(self.result_chunks)
def betacode_to_unicode(betacode):
    """Convert a Beta Code string to Unicode using a fresh ``Converter``."""
    converter = Converter()
    converter.input(betacode)
    unicode_text = str(converter)
    return unicode_text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install cltk
from cltk.corpus.utils.importer import CorpusImporter

corpus_importer = CorpusImporter('greek')
# Bare attribute access: only has a visible effect when evaluated in a
# notebook/REPL (presumably it lists the available corpora - TODO confirm).
corpus_importer.list_corpora

# load greek corpora from perseus
corpus = "greek_text_perseus"
corpus_importer.import_corpus(corpus)

# download greek betacode decoder
# copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
import os
import re
import betacode
from os.path import expanduser

home = expanduser("~")
# by default corpora is downloaded to the user root under cltk_data
# Path template for one author's directory: '%s' is filled in with the author
# name. Built with os.path.join so the separators match the host platform
# (the previous hard-coded '\\' only worked on Windows); the trailing ''
# keeps the trailing separator that callers rely on when appending a file
# name.
dire = os.path.join(home, 'cltk_data', 'greek', 'text', corpus, '%s', 'opensource', '')
def filter_empty(x):
    """Return True when ``x`` still has content after stripping whitespace."""
    stripped = x.strip()
    return stripped != ""
def betadecode(x):
    """Decode the Beta Code string ``x`` with the ``betacode`` module."""
    decoded = betacode.betacode_to_unicode(x)
    return decoded
def remove_tags(x):
    """Strip ``<...>`` markup from ``x`` and decode what is left from Beta Code."""
    without_markup = re.sub('<[^<]+?>', '', x)
    return betadecode(without_markup)
def get_file_content(name, file):
    """Read and return the text of ``file`` from the directory of author ``name``.

    The directory is obtained by substituting ``name`` into the module-level
    ``dire`` template.
    """
    path = (dire % name) + file
    with open(path, 'r') as handle:
        text = handle.read()
    return text
def get_milestones(content):
    """Split a document into milestone sections of decoded, non-empty lines.

    The text is lower-cased and cut at the milestone tag (the Iliad variant
    first, the Odyssey variant as a fallback).  Each section is split into
    lines, every line has its tags removed and is decoded from Beta Code, and
    empty lines are dropped.  The first section (metadata before the first
    milestone) is discarded.  Returns a list of lists of strings.
    """
    lowered = content.lower()
    # for homer iliad
    sections = lowered.split("<milestone ed=\"p\" unit=\"para\"/>")
    if len(sections) < 2:
        # could not split, try the variant used for homer odyssey
        sections = lowered.split('<milestone n="1" unit="card" ed="p"/>')
    cards = []
    for section in sections:
        decoded_lines = [remove_tags(raw_line) for raw_line in section.split("\n")]
        cards.append([line for line in decoded_lines if filter_empty(line)])
    # first item is metadata, discard it
    return cards[1:]
# Driver: parse one work of one author and print summary statistics.
author = "Homer"
# Show which files are available for this author.
print(os.listdir(dire % author))
file = "hom.il_gk.xml"
ml = get_milestones(get_file_content(author, file))
print("number of cards: %s" % len(ml))
print("number of lines: %s" % sum(len(card) for card in ml))
# Words are counted by splitting each line on single spaces.
print("number of words: %s" % sum(len(line.split(" ")) for card in ml for line in card))
# Characters are counted with spaces removed.
print("number of chars: %s" % sum(len(line.replace(" ", "")) for card in ml for line in card))
"""
number of cards: 1049
number of lines: 15683
number of words: 111862
number of chars: 732954
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment