markomanninen · January 31, 2018 14:54
diff --git a/betacode.py b/betacode.py
 # copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
 import unicodedata

 # Beta Code symbol -> Unicode character, assembled one group at a time.
 # Keys are upper case; the converter upper-cases each input character
 # before looking it up here.
 map_b2u = {}

 # Letters (lower-case Greek; '*' capitalisation is applied by the converter).
 map_b2u.update({
    'A': '\u03b1',  # alpha
    'B': '\u03b2',  # beta
    'C': '\u03be',  # xi
    'D': '\u03b4',  # delta
    'E': '\u03b5',  # epsilon
    'F': '\u03c6',  # phi
    'G': '\u03b3',  # gamma
    'H': '\u03b7',  # eta
    'I': '\u03b9',  # iota
    'K': '\u03ba',  # kappa
    'L': '\u03bb',  # lambda
    'M': '\u03bc',  # mu
    'N': '\u03bd',  # nu
    'O': '\u03bf',  # omicron
    'P': '\u03c0',  # pi
    'Q': '\u03b8',  # theta
    'R': '\u03c1',  # rho
    'S': '\u03c3',  # medial sigma (see special case in translator)
    'T': '\u03c4',  # tau
    'U': '\u03c5',  # upsilon
    'V': '\u03dd',  # digamma
    'W': '\u03c9',  # omega
    'X': '\u03c7',  # chi
    'Y': '\u03c8',  # psi
    'Z': '\u03b6',  # zeta
 })

 # Combining diacritics.
 map_b2u.update({
    ')': '\u0313',   # smooth breathing
    '(': '\u0314',   # rough breathing
    '/': '\u0301',   # acute
    '=': '\u0342',   # circumflex
    '\\': '\u0300',  # grave
    '+': '\u0308',   # diaeresis
    '|': '\u0345',   # iota subscript
    '?': '\u0323',   # dot below
 })

 # Punctuation.
 map_b2u.update({
    ':': '\u00b7',  # middle dot
    '-': '\u2010',  # hyphen
    '_': '\u2014',  # em dash
 })

 # Sigma form selected by the digit that follows S in Beta Code:
 # S1 -> medial sigma, S2 -> final sigma, S3 -> lunate sigma.
 map_b2u_sigma = {
    str(digit): sigma
    for digit, sigma in enumerate(('\u03c3', '\u03c2', '\u03f2'), start=1)
 }

 class Converter:
    """Stateful Beta Code to Unicode Greek converter.

    Feed Beta Code text through ``input`` (one or more calls); ``str()`` on
    the converter returns everything converted so far.  The output is a
    sequence of base letters, each followed by its combining marks; no
    Unicode normalisation is applied.  Marks that are still held when the
    text ends (no letter followed them) are not part of the output.
    """

    def __init__(self):
        # Converted output pieces, joined by __str__.
        self.result_chunks = []
        # Set by '*'; the next letter emitted is upper-cased.
        self.capitalize_next = False
        # True while the most recent non-mark character emitted was a letter.
        self.last_is_letter = False
        # Combining marks waiting for the letter they belong to.
        self.hold = []

    def input(self, betacode):
        """Convert ``betacode`` and append the result to the output.

        ``*`` upper-cases the next letter.  ``S`` becomes the sigma form
        named by a following ``1``/``2``/``3`` (the digit is consumed);
        otherwise medial sigma when followed by a letter or an apostrophe
        and final sigma in every other position, including end of input.
        Any other character is looked up case-insensitively in ``map_b2u``
        and passed through unchanged when it has no entry.
        """
        i = 0
        length = len(betacode)
        while i < length:
            c = betacode[i]
            if c == '*':
                self.capitalize_next = True
                i += 1
                continue
            if c.upper() == 'S':
                # One character of look-ahead; '' when S is the last character.
                following = betacode[i + 1:i + 2]
                if following in map_b2u_sigma:
                    # Explicit sigma form: consume both S and the digit.
                    self.append_out(map_b2u_sigma[following])
                    i += 2
                    continue
                # An apostrophe marks elision, so the sigma is still word-internal.
                if following == "'" or (following and self.is_letter(following)):
                    self.append_out('\u03c3')  # medial sigma
                else:
                    self.append_out('\u03c2')  # final sigma
                i += 1
                continue

            self.append_out(map_b2u.get(c.upper(), c))
            i += 1

    def append_out(self, c):
        """Emit one already-translated character ``c``.

        Letters are upper-cased if ``*`` is pending and are followed by any
        held marks.  A combining mark is attached to the previous letter,
        unless no letter precedes it or a ``*`` is pending; in those cases
        it is held for the next letter.  Anything else is emitted as is.
        """
        if self.is_letter(c):
            self.last_is_letter = True
            if self.capitalize_next:
                c = c.upper()
                self.capitalize_next = False
            self.result_chunks.append(c)
            # if any held accents, they go on this letter
            self.result_chunks.extend(self.hold)
            self.hold = []
        elif self.is_nonspacing_mark(c):
            # Beta Code writes a capital's breathing/accent between '*' and
            # the letter (e.g. *)A), so a pending '*' means the mark belongs
            # to the upcoming letter even if a letter was just emitted.
            if self.last_is_letter and not self.capitalize_next:
                self.result_chunks.append(c)
            else:
                # accents after a non-letter. hold them for the next letter
                self.hold.append(c)
        else:
            self.last_is_letter = False
            self.result_chunks.append(c)

    def is_letter(self, c):
        """Return True if the single character ``c`` is in a Unicode letter category (L*)."""
        return unicodedata.category(c)[0] == 'L'

    def is_nonspacing_mark(self, c):
        """Return True if the single character ``c`` has Unicode category Mn."""
        return unicodedata.category(c) == 'Mn'

    def __str__(self):
        """Return the text converted so far."""
        return ''.join(self.result_chunks)

 def betacode_to_unicode(betacode):
    """Return the Unicode Greek text for the Beta Code string ``betacode``.

    Runs the whole string through a fresh ``Converter`` and returns its
    accumulated output.
    """
    converter = Converter()
    converter.input(betacode)
    unicode_text = str(converter)
    return unicode_text
diff --git a/perseus_local_file_parser.py b/perseus_local_file_parser.py
 #pip install cltk
 from cltk.corpus.utils.importer import CorpusImporter
 # Importer object for the 'greek' language.
 corpus_importer = CorpusImporter('greek')
 # NOTE(review): bare attribute access whose value is discarded. Presumably a
 # leftover from an interactive session where it displayed the available
 # corpora - confirm before removing.
 corpus_importer.list_corpora

 # load greek corpora from perseus
 corpus = "greek_text_perseus"
 # NOTE(review): presumably downloads (or refreshes) the corpus on every run of
 # this script - confirm against the CLTK documentation.
 corpus_importer.import_corpus(corpus)

 # greek betacode decoder: betacode.py (source URL below) must be importable, e.g. saved next to this script
 # copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
 import os, re, betacode
 from os.path import expanduser
 home = expanduser("~")

 # By default corpora are downloaded to the user's home directory under
 # cltk_data. The '%s' placeholder is filled in with the author directory name.
 # os.path.join uses the platform's separator (the former hard-coded '\\' only
 # worked on Windows); the trailing '' keeps the final separator so a file
 # name can be appended directly.
 dire = os.path.join(home, 'cltk_data', 'greek', 'text', corpus, '%s', 'opensource', '')

 def filter_empty(x):
    """Return True when ``x`` contains something other than whitespace."""
    stripped = x.strip()
    if stripped == "":
        return False
    return True

 def betadecode(x):
    """Return the Unicode Greek rendering of the Beta Code string ``x``.

    Thin wrapper around ``betacode.betacode_to_unicode``.
    """
    decode = betacode.betacode_to_unicode
    return decode(x)

 def remove_tags(x):
    """Delete ``<...>`` markup from ``x`` and Beta-decode what remains."""
    # The pattern lazily matches '<', one or more non-'<' characters, then '>'.
    without_markup = re.sub('<[^<]+?>', '', x)
    return betadecode(without_markup)

 def get_file_content(name, file):
    """Return the full text of ``file`` from the directory ``dire % name``.

    ``name`` fills the ``%s`` placeholder in the module-level ``dire``
    template; ``file`` is appended to the resulting directory string.
    """
    path = (dire % name) + file
    # NOTE(review): no encoding is given, so decoding follows the platform's
    # default text encoding - confirm this is acceptable for the corpus files.
    with open(path, 'r') as handle:
        text = handle.read()
    return text

 def get_milestones(content):
    """Split ``content`` at milestone tags and decode each chunk's lines.

    The text is lower-cased, then split on the ``unit="para"`` milestone
    tag; when that tag does not occur, the ``n="1" unit="card"`` tag is
    tried instead.  Everything before the first tag is treated as metadata
    and dropped.

    Returns a list with one entry per remaining chunk; each entry is the
    list of that chunk's non-blank lines after tag removal and Beta Code
    decoding.  The result is empty when neither tag occurs.
    """
    content = content.lower()
    # split to lines by using milestones tag as an indicator
    # for homer iliad
    milestones = content.split("<milestone ed=\"p\" unit=\"para\"/>")
    # could not split, try other version
    if len(milestones) < 2:
        # for homer odyssey
        # NOTE(review): this only matches milestone tags with n="1" - confirm
        # that this is the intended granularity.
        milestones = content.split('<milestone n="1" unit="card" ed="p"/>')
    # first item is metadata: discard it *before* decoding so no work is
    # spent on text that is thrown away
    # filter empty lines and remove tags + decode betacode to greek
    return [
        [line for line in map(remove_tags, chunk.split("\n")) if filter_empty(line)]
        for chunk in milestones[1:]
    ]

 # Author directory inside the corpus; list the files available for it.
 author = "Homer"
 print(os.listdir(dire % author))

 # Work to analyse, parsed into a list of cards, each a list of decoded lines.
 file = "hom.il_gk.xml"
 ml = get_milestones(get_file_content(author, file))

 print("number of cards: %s" % len(ml))
 print("number of lines: %s" % sum(len(card) for card in ml))
 # Words are the pieces between single spaces, so consecutive spaces count
 # as empty words.
 print("number of words: %s" % sum(len(line.split(" ")) for card in ml for line in card))
 # Every character except the space is counted, combining marks included.
 print("number of chars: %s" % sum(len(line.replace(" ", "")) for card in ml for line in card))

 """
 number of cards: 1049
 number of lines: 15683
 number of words: 111862
 number of chars: 732954
 """
	# copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
	import unicodedata

	# Beta Code symbol -> Unicode character.  Keys are upper case; the
	# converter upper-cases each input character before looking it up here.
	map_b2u = {
	    # Letters (lower-case Greek; '*' capitalisation is applied by the converter).
	    'A': '\u03b1',  # alpha
	    'B': '\u03b2',  # beta
	    'C': '\u03be',  # xi
	    'D': '\u03b4',  # delta
	    'E': '\u03b5',  # epsilon
	    'F': '\u03c6',  # phi
	    'G': '\u03b3',  # gamma
	    'H': '\u03b7',  # eta
	    'I': '\u03b9',  # iota
	    'K': '\u03ba',  # kappa
	    'L': '\u03bb',  # lambda
	    'M': '\u03bc',  # mu
	    'N': '\u03bd',  # nu
	    'O': '\u03bf',  # omicron
	    'P': '\u03c0',  # pi
	    'Q': '\u03b8',  # theta
	    'R': '\u03c1',  # rho
	    'S': '\u03c3',  # medial sigma (see special case in translator)
	    'T': '\u03c4',  # tau
	    'U': '\u03c5',  # upsilon
	    'V': '\u03dd',  # digamma
	    'W': '\u03c9',  # omega
	    'X': '\u03c7',  # chi
	    'Y': '\u03c8',  # psi
	    'Z': '\u03b6',  # zeta
	    # Combining diacritics.
	    ')': '\u0313',  # smooth breathing
	    '(': '\u0314',  # rough breathing
	    '/': '\u0301',  # acute
	    '=': '\u0342',  # circumflex
	    '\\': '\u0300',  # grave
	    '+': '\u0308',  # diaeresis
	    # The key is the bare pipe character; the former '\|' was a
	    # two-character key (backslash + pipe) that never matched the input.
	    '|': '\u0345',  # iota subscript
	    '?': '\u0323',  # dot below
	    # Punctuation.
	    ':': '\u00b7',  # middle dot
	    '-': '\u2010',  # hyphen
	    '_': '\u2014',  # em dash
	}

	# Sigma form selected by the digit that follows S in Beta Code:
	# S1 -> medial sigma, S2 -> final sigma, S3 -> lunate sigma.
	map_b2u_sigma = {
	    str(digit): sigma
	    for digit, sigma in enumerate(('\u03c3', '\u03c2', '\u03f2'), start=1)
	}

	class Converter:
	    """Stateful Beta Code to Unicode Greek converter.

	    Feed Beta Code text through ``input`` (one or more calls); ``str()`` on
	    the converter returns everything converted so far.  The output is a
	    sequence of base letters, each followed by its combining marks; no
	    Unicode normalisation is applied.  Marks that are still held when the
	    text ends (no letter followed them) are not part of the output.
	    """

	    def __init__(self):
	        # Converted output pieces, joined by __str__.
	        self.result_chunks = []
	        # Set by '*'; the next letter emitted is upper-cased.
	        self.capitalize_next = False
	        # True while the most recent non-mark character emitted was a letter.
	        self.last_is_letter = False
	        # Combining marks waiting for the letter they belong to.
	        self.hold = []

	    def input(self, betacode):
	        """Convert ``betacode`` and append the result to the output.

	        ``*`` upper-cases the next letter.  ``S`` becomes the sigma form
	        named by a following ``1``/``2``/``3`` (the digit is consumed);
	        otherwise medial sigma when followed by a letter or an apostrophe
	        and final sigma in every other position, including end of input.
	        Any other character is looked up case-insensitively in ``map_b2u``
	        and passed through unchanged when it has no entry.
	        """
	        i = 0
	        length = len(betacode)
	        while i < length:
	            c = betacode[i]
	            if c == '*':
	                self.capitalize_next = True
	                i += 1
	                continue
	            if c.upper() == 'S':
	                # One character of look-ahead; '' when S is the last character.
	                following = betacode[i + 1:i + 2]
	                if following in map_b2u_sigma:
	                    # Explicit sigma form: consume both S and the digit.
	                    self.append_out(map_b2u_sigma[following])
	                    i += 2
	                    continue
	                # An apostrophe marks elision, so the sigma is still word-internal.
	                if following == "'" or (following and self.is_letter(following)):
	                    self.append_out('\u03c3')  # medial sigma
	                else:
	                    self.append_out('\u03c2')  # final sigma
	                i += 1
	                continue

	            self.append_out(map_b2u.get(c.upper(), c))
	            i += 1

	    def append_out(self, c):
	        """Emit one already-translated character ``c``.

	        Letters are upper-cased if ``*`` is pending and are followed by any
	        held marks.  A combining mark is attached to the previous letter,
	        unless no letter precedes it or a ``*`` is pending; in those cases
	        it is held for the next letter.  Anything else is emitted as is.
	        """
	        if self.is_letter(c):
	            self.last_is_letter = True
	            if self.capitalize_next:
	                c = c.upper()
	                self.capitalize_next = False
	            self.result_chunks.append(c)
	            # if any held accents, they go on this letter
	            self.result_chunks.extend(self.hold)
	            self.hold = []
	        elif self.is_nonspacing_mark(c):
	            # Beta Code writes a capital's breathing/accent between '*' and
	            # the letter (e.g. *)A), so a pending '*' means the mark belongs
	            # to the upcoming letter even if a letter was just emitted.
	            if self.last_is_letter and not self.capitalize_next:
	                self.result_chunks.append(c)
	            else:
	                # accents after a non-letter. hold them for the next letter
	                self.hold.append(c)
	        else:
	            self.last_is_letter = False
	            self.result_chunks.append(c)

	    def is_letter(self, c):
	        """Return True if the single character ``c`` is in a Unicode letter category (L*)."""
	        return unicodedata.category(c)[0] == 'L'

	    def is_nonspacing_mark(self, c):
	        """Return True if the single character ``c`` has Unicode category Mn."""
	        return unicodedata.category(c) == 'Mn'

	    def __str__(self):
	        """Return the text converted so far."""
	        return ''.join(self.result_chunks)

	def betacode_to_unicode(betacode):
	    """Return the Unicode Greek text for the Beta Code string ``betacode``.

	    Runs the whole string through a fresh ``Converter`` and returns its
	    accumulated output.
	    """
	    converter = Converter()
	    converter.input(betacode)
	    return str(converter)
	#pip install cltk
	from cltk.corpus.utils.importer import CorpusImporter
	# Importer object for the 'greek' language.
	corpus_importer = CorpusImporter('greek')
	# NOTE(review): bare attribute access whose value is discarded. Presumably a
	# leftover from an interactive session where it displayed the available
	# corpora - confirm before removing.
	corpus_importer.list_corpora

	# load greek corpora from perseus
	corpus = "greek_text_perseus"
	# NOTE(review): presumably downloads (or refreshes) the corpus on every run of
	# this script - confirm against the CLTK documentation.
	corpus_importer.import_corpus(corpus)

	# greek betacode decoder: betacode.py (source URL below) must be importable, e.g. saved next to this script
	# copyright (c) https://raw.githubusercontent.com/epilanthanomai/hexameter/master/betacode.py
	import os, re, betacode
	from os.path import expanduser
	home = expanduser("~")

	# By default corpora are downloaded to the user's home directory under
	# cltk_data. The '%s' placeholder is filled in with the author directory name.
	# os.path.join uses the platform's separator (the former hard-coded '\\' only
	# worked on Windows); the trailing '' keeps the final separator so a file
	# name can be appended directly.
	dire = os.path.join(home, 'cltk_data', 'greek', 'text', corpus, '%s', 'opensource', '')

	def filter_empty(x):
	    """Return True when ``x`` contains something other than whitespace."""
	    return x.strip() != ""

	def betadecode(x):
	    """Return the Unicode Greek rendering of the Beta Code string ``x``.

	    Thin wrapper around ``betacode.betacode_to_unicode``.
	    """
	    return betacode.betacode_to_unicode(x)

	def remove_tags(x):
	    """Delete ``<...>`` markup from ``x`` and Beta-decode what remains."""
	    # The pattern lazily matches '<', one or more non-'<' characters, then '>'.
	    return betadecode(re.sub('<[^<]+?>', '', x))

	def get_file_content(name, file):
	    """Return the full text of ``file`` from the directory ``dire % name``.

	    ``name`` fills the ``%s`` placeholder in the module-level ``dire``
	    template; ``file`` is appended to the resulting directory string.
	    """
	    # NOTE(review): no encoding is given, so decoding follows the platform's
	    # default text encoding - confirm this is acceptable for the corpus files.
	    with open((dire % name) + file, 'r') as f:
	        return f.read()

	def get_milestones(content):
	    """Split ``content`` at milestone tags and decode each chunk's lines.

	    The text is lower-cased, then split on the ``unit="para"`` milestone
	    tag; when that tag does not occur, the ``n="1" unit="card"`` tag is
	    tried instead.  Everything before the first tag is treated as metadata
	    and dropped.

	    Returns a list with one entry per remaining chunk; each entry is the
	    list of that chunk's non-blank lines after tag removal and Beta Code
	    decoding.  The result is empty when neither tag occurs.
	    """
	    content = content.lower()
	    # split to lines by using milestones tag as an indicator
	    # for homer iliad
	    milestones = content.split("<milestone ed=\"p\" unit=\"para\"/>")
	    # could not split, try other version
	    if len(milestones) < 2:
	        # for homer odyssey
	        # NOTE(review): this only matches milestone tags with n="1" - confirm
	        # that this is the intended granularity.
	        milestones = content.split('<milestone n="1" unit="card" ed="p"/>')
	    # first item is metadata: discard it *before* decoding so no work is
	    # spent on text that is thrown away
	    # filter empty lines and remove tags + decode betacode to greek
	    return [
	        [line for line in map(remove_tags, chunk.split("\n")) if filter_empty(line)]
	        for chunk in milestones[1:]
	    ]

	# Author directory inside the corpus; list the files available for it.
	author = "Homer"
	print(os.listdir(dire % author))

	# Work to analyse, parsed into a list of cards, each a list of decoded lines.
	file = "hom.il_gk.xml"
	ml = get_milestones(get_file_content(author, file))

	print("number of cards: %s" % len(ml))
	print("number of lines: %s" % sum(len(card) for card in ml))
	# Words are the pieces between single spaces, so consecutive spaces count
	# as empty words.
	print("number of words: %s" % sum(len(line.split(" ")) for card in ml for line in card))
	# Every character except the space is counted, combining marks included.
	print("number of chars: %s" % sum(len(line.replace(" ", "")) for card in ml for line in card))

	"""
	number of cards: 1049
	number of lines: 15683
	number of words: 111862
	number of chars: 732954
	"""