Parsing treebank data for http://proiel.github.io/ and http://perseusdl.github.io/treebank_data/
import json
from collections import defaultdict
import os

from MyCapytain.common.utils import xmlparser

# Logging related dependencies
import logging
import time
import math

# Multi Proc
from multiprocessing import Pool
from lxml import etree
import copyreg
from io import StringIO
import re

pdlnormalizer = re.compile("([a-zA-Z]+)")
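# NOTE: pdlnormalizer keeps only the first run of ASCII letters of a Perseus lemma,
# dropping numeric disambiguation suffixes (e.g. "sum1" -> "sum"); it therefore
# assumes Latin-alphabet lemmata (see its use in PERSEUS_DISTRIBUTED below).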
def element_unpickler(data):
    return etree.fromstring(data)


def element_pickler(element):
    data = etree.tostring(element)
    return element_unpickler, (data,)


copyreg.pickle(etree._Element, element_pickler, element_unpickler)


def elementtree_unpickler(data):
    data = StringIO(data)
    return etree.parse(data)


def elementtree_pickler(tree):
    data = StringIO()
    tree.write(data)
    return elementtree_unpickler, (data.getvalue(),)


copyreg.pickle(etree._ElementTree, elementtree_pickler, elementtree_unpickler)
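# The two copyreg registrations above make lxml Element/ElementTree objects picklable
# (by round-tripping them through their XML serialisation), which is needed so that
# the XML nodes can be sent to worker processes via multiprocessing.Pool.map below.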
NUM_PROC = 4
def PROIEL_DISTRIBUTED(params):
    """ Distributed worker used to parse a chunk of the (rather large) PROIEL resource faster

    :param params: Tuple of (start index of the chunk, list of <token> nodes, complete xml tree)
    :return: Tuple of (start index of the chunk, list of parsed token tuples)

    .. note:: This is a module-level function because instance methods cannot be pickled without a hack.
        See https://bytes.com/topic/python/answers/552476-why-cant-you-pickle-instancemethods

    .. todo:: Move away from xpath here and build a list of attribute dicts,
        plus a dict of dependencies to feed in, instead of the XML
    """
    start, source_tokens, xml = params
    tokens = []
    for word in source_tokens:
        attrib = dict(word.attrib)
        citation = attrib["citation-part"]
        _form = attrib["form"]
        _lemma = attrib["lemma"]
        _pos = attrib["relation"]
        # .get() because the sentence root has no head-id attribute
        _id, head = attrib["id"], [attrib.get("head-id")]
        # Is this line really useful?
        deps = xml.xpath("//token[@head-id='%s']/@id" % _id)
        tokens.append((_id, _form, _lemma, head, deps, _pos, citation))
    return start, tokens
def makePeID(sid, wid):
    """ Build a token identifier that is unique across the whole Perseus file (sentence id + word id) """
    return "{}:{}".format(sid, wid)
def PERSEUS_DISTRIBUTED(params):
    """ Distributed worker used to parse one sentence of the Perseus treebank faster

    :param params: Tuple of (sentence id, list of word attribute dicts, sentence node, sentence attribute dict)
    :return: Tuple of (sentence id, list of parsed token tuples)

    .. note:: This is a module-level function because instance methods cannot be pickled without a hack.
        See https://bytes.com/topic/python/answers/552476-why-cant-you-pickle-instancemethods

    .. todo:: Move away from xpath here and build a list of attribute dicts,
        plus a dict of dependencies to feed in, instead of the XML
    """
    start, source_tokens, sentence, sentence_attrs = params
    tokens = []
    for attrib in source_tokens:
        citation = sentence_attrs["subdoc"]
        _form = attrib["form"]
        _lemma = pdlnormalizer.search(attrib["lemma"]).group()
        _pos = attrib["relation"].lower()
        _id, head = attrib["id"], [makePeID(sentence_attrs["id"], attrib["head"])]
        # Is this line really useful?
        deps = [
            makePeID(sentence_attrs["id"], i)
            for i in sentence.xpath("./word[@head='%s' and @postag!='u--------']/@id" % _id)
        ]
        tokens.append((makePeID(sentence_attrs["id"], _id), _form, _lemma, head, deps, _pos, citation))
    return start, tokens
class Lemmatizer:
    @staticmethod
    def process(filename):
        return {}
class ProielParserLemmatizer(Lemmatizer):
    @staticmethod
    def process(corpus):
        logger = logging.getLogger(__name__)
        file = corpus.getPath("xml")
        _json = corpus.getPath("json")

        if os.path.isfile(_json):
            logger.info("Loading {} from cache".format(_json))
            with open(_json, "r") as f:
                data = json.load(f)
        else:
            logger.info("Processing {}".format(file))
            ids = {}
            citations = defaultdict(list)
            order = []
            lemmas = []

            # We parse the original file
            with open(file) as source:
                xml = xmlparser(source)

            # We retrieve each token in a list: the distributed worker reads token-level
            # attributes, so select <token> nodes (empty tokens without form or lemma are skipped)
            tokens = list(xml.xpath("//token[@form and @lemma]"))
            l = len(tokens)
            logging.info("{} lemmas to process".format(l))

            # Here we multiprocess, splitting the token list into 5% chunks
            steps = math.floor(l / 20)
            tokens_groups = [
                (i, tokens[i:i + steps], xml)
                for i in range(0, len(tokens), steps)
            ]
            proc_pool = Pool(NUM_PROC)
            tokens_groups = proc_pool.map(PROIEL_DISTRIBUTED, tokens_groups)

            # Regroup the chunks in their original order
            tokens_groups = {k: v for k, v in tokens_groups}
            ks = sorted(int(k) for k in tokens_groups.keys())
            tokens = [token for k in ks for token in tokens_groups[k]]

            for _id, _form, _lemma, head, deps, _pos, citation in tokens:
                if citation not in order:
                    order.append(citation)
                    lemmas.append([citation])
                ids[_id] = _lemma
                citations[citation].append(
                    (_id, (_form, _lemma, head, deps, _pos), )
                )
                lemmas[-1].append(_lemma)

            with open(_json, "w") as w:
                json.dump({
                    "produced": str(time.strftime("%c")),
                    "order": order,
                    "lemma": lemmas,
                    "tokens": citations,
                    "ids": ids
                }, w)
            data = citations
        return data
class PerseusParserLemmatizer(Lemmatizer):
    @staticmethod
    def process(corpus):
        logger = logging.getLogger(__name__)
        file = corpus.getPath("xml")
        _json = corpus.getPath("json")

        if os.path.isfile(_json):
            logger.info("Loading {} from cache".format(_json))
            with open(_json, "r") as f:
                data = json.load(f)
        else:
            logger.info("Processing {}".format(file))
            ids = {}
            citations = defaultdict(list)
            order = []
            lemmas = []

            # We parse the original file
            with open(file) as source:
                xml = xmlparser(source)

            # We retrieve each sentence in a list, together with its usable words
            sentences = list(xml.xpath("//sentence"))
            tokens = [
                (
                    sentence.get("id"),
                    [dict(token.attrib) for token in sentence.xpath("./word[@postag!='u--------' and @lemma!='']")],
                    sentence,
                    dict(sentence.attrib)
                )
                for sentence in sentences
            ]
            logging.info("{} lemmas to process".format(len(xml.xpath("//word[@postag!='u--------']"))))

            # Here we multiprocess, one sentence per task
            proc_pool = Pool(NUM_PROC)
            tokens_groups = proc_pool.map(PERSEUS_DISTRIBUTED, tokens)

            # Regroup the sentences in their original (numeric) order
            tokens_groups = {k: v for k, v in tokens_groups}
            ks = sorted(int(k) for k in tokens_groups.keys())
            tokens = [token for k in ks for token in tokens_groups[str(k)]]

            for _id, _form, _lemma, head, deps, _pos, citation in tokens:
                if citation not in order:
                    order.append(citation)
                    lemmas.append([citation])
                ids[_id] = _lemma
                citations[citation].append(
                    (_id, (_form, _lemma, head, deps, _pos), )
                )
                lemmas[-1].append(_lemma)

            with open(_json, "w") as w:
                json.dump({
                    "produced": str(time.strftime("%c")),
                    "order": order,
                    "lemma": lemmas,
                    "tokens": citations,
                    "ids": ids
                }, w)
            data = citations
        return data
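A minimal usage sketch, not part of the gist itself: the lemmatizers only rely on a corpus object exposing getPath("xml") and getPath("json"), so the DummyCorpus class and the file names below are hypothetical placeholders.

# Hypothetical usage sketch (assumes the gist above has been imported or pasted before it)
logging.basicConfig(level=logging.INFO)


class DummyCorpus:
    """ Hypothetical stand-in for the corpus object expected by the lemmatizers """
    def __init__(self, xml_path, json_path):
        self._paths = {"xml": xml_path, "json": json_path}

    def getPath(self, key):
        return self._paths[key]


if __name__ == "__main__":
    # The __main__ guard matters because process() spawns a multiprocessing.Pool
    corpus = DummyCorpus("caes-gal.proiel.xml", "caes-gal.proiel.json")
    citations = ProielParserLemmatizer.process(corpus)
    # `citations` maps a citation string to a list of
    # (id, (form, lemma, head, deps, relation)) entries
    for citation in list(citations)[:1]:
        print(citation, citations[citation][:3])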