import json
from collections import defaultdict
import os
from MyCapytain.common.utils import xmlparser
# Logging-related dependencies
import logging
import time
import math
# Multiprocessing
from multiprocessing import Pool
from lxml import etree
import copyreg
from io import StringIO
import re

# Keep only the alphabetic part of a lemma (see the illustration just below).
pdlnormalizer = re.compile("([a-zA-Z]+)")
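
# For illustration (hypothetical values): Perseus treebank lemmata often carry a
# trailing disambiguation digit, which .search().group() drops, e.g.
#   pdlnormalizer.search("voco1").group()  # -> "voco"
#   pdlnormalizer.search("sum1").group()   # -> "sum"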


# lxml elements and trees cannot be pickled by default, which Pool.map needs in
# order to ship them to worker processes; register reducers for both types.
def element_unpickler(data):
    return etree.fromstring(data)


def element_pickler(element):
    data = etree.tostring(element)
    return element_unpickler, (data,)


copyreg.pickle(etree._Element, element_pickler, element_unpickler)


def elementtree_unpickler(data):
    data = StringIO(data)
    return etree.parse(data)


def elementtree_pickler(tree):
    data = StringIO()
    tree.write(data)
    return elementtree_unpickler, (data.getvalue(),)


copyreg.pickle(etree._ElementTree, elementtree_pickler, elementtree_unpickler)
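
# A minimal, illustrative sanity check of the registration above (not part of the
# pipeline): lxml nodes now survive a pickle round trip, so they can be passed to
# worker processes.
#
#   import pickle
#   node = etree.fromstring('<token id="1" form="arma"/>')
#   clone = pickle.loads(pickle.dumps(node))
#   assert clone.get("form") == "arma"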

NUM_PROC = 4

def PROIEL_DISTRIBUTED(params):
    """ Parse one chunk of a PROIEL treebank in a worker process.

    :param params: Tuple of (start, source_tokens, xml) where start is the integer
        offset of the chunk, source_tokens is a list of <token> nodes and xml is
        the complete parsed document (used to resolve dependents).
    :return: Tuple of (start, tokens) where tokens is a list of
        (id, form, lemma, [head-id], [dependent-ids], relation, citation) tuples.

    .. note:: This is a module-level function because instance methods cannot be
        pickled without a hack.
        See https://bytes.com/topic/python/answers/552476-why-cant-you-pickle-instancemethods

    .. todo:: Move away from xpath here and instead build a list of attribute dicts
        plus a dict of dependencies, and feed those instead of the XML.
    """
    start, source_tokens, xml = params
    tokens = []
    for word in source_tokens:
        attrib = dict(word.attrib)
        citation = attrib["citation-part"]
        _form = attrib["form"]
        _lemma = attrib["lemma"]
        _pos = attrib["relation"]
        _id, head = attrib["id"], [attrib["head-id"]]
        # Is this xpath really needed? It scans the whole document for each token.
        deps = xml.xpath("//token[@head-id='%s']/@id" % _id)
        tokens.append((_id, _form, _lemma, head, deps, _pos, citation))
    return start, tokens
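
# Schematically (values are illustrative, not from a real treebank; token_nodes is
# a hypothetical slice of //token nodes), each worker call receives one chunk and
# returns rows keyed by the chunk offset:
#
#   start, rows = PROIEL_DISTRIBUTED((0, token_nodes[0:500], xml))
#   # rows[0] -> ("680720", "In", "in", ["680723"], [], "adv", "1.1")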


def makePeID(sid, wid):
    """ Build a document-wide token id from a sentence id and a word id,
    e.g. makePeID("42", "3") -> "42:3". """
    return "{}:{}".format(sid, wid)


def PERSEUS_DISTRIBUTED(params):
    """ Parse one sentence of a Perseus treebank in a worker process.

    :param params: Tuple of (start, source_tokens, sentence, sentence_attrs) where
        start is the sentence id, source_tokens is a list of <word> attribute dicts,
        sentence is the <sentence> node and sentence_attrs its attribute dict.
    :return: Tuple of (start, tokens) where tokens is a list of
        (id, form, lemma, [head-id], [dependent-ids], relation, citation) tuples,
        with ids namespaced by sentence through makePeID.

    .. note:: This is a module-level function because instance methods cannot be
        pickled without a hack.
        See https://bytes.com/topic/python/answers/552476-why-cant-you-pickle-instancemethods

    .. todo:: Move away from xpath here and instead build a list of attribute dicts
        plus a dict of dependencies, and feed those instead of the XML.
    """
    start, source_tokens, sentence, sentence_attrs = params
    tokens = []
    for attrib in source_tokens:
        citation = sentence_attrs["subdoc"]
        _form = attrib["form"]
        _lemma = pdlnormalizer.search(attrib["lemma"]).group()
        _pos = attrib["relation"].lower()
        _id, head = attrib["id"], [makePeID(sentence_attrs["id"], attrib["head"])]
        # Is this xpath really needed? It rescans the sentence for each word.
        deps = [
            makePeID(sentence_attrs["id"], i)
            for i in sentence.xpath("./word[@head='%s' and @postag!='u--------']/@id" % _id)
        ]
        tokens.append((makePeID(sentence_attrs["id"], _id), _form, _lemma, head, deps, _pos, citation))
    return start, tokens
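
# Likewise for Perseus (values are illustrative), one call per sentence; word ids
# are namespaced by the sentence id through makePeID:
#
#   start, rows = PERSEUS_DISTRIBUTED(
#       ("42",
#        [dict(w.attrib) for w in sentence.xpath("./word[@postag!='u--------' and @lemma!='']")],
#        sentence,
#        dict(sentence.attrib))
#   )
#   # rows[0] -> ("42:1", "arma", "arma", ["42:2"], ["42:3"], "obj", "1.1")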


class Lemmatizer:
    """ Base class: subclasses parse a treebank corpus and return its token data. """
    @staticmethod
    def process(corpus):
        return {}


class ProielParserLemmatizer(Lemmatizer):
    @staticmethod
    def process(corpus):
        logger = logging.getLogger(__name__)
        file = corpus.getPath("xml")
        _json = corpus.getPath("json")
        if os.path.isfile(_json):
            logger.info("Loading {} from cache".format(_json))
            with open(_json, "r") as f:
                # Keep the cached and freshly-computed return values consistent:
                # both are the citation -> tokens mapping.
                data = json.load(f)["tokens"]
        else:
            logger.info("Processing {}".format(file))
            ids = {}
            citations = defaultdict(list)
            order = []
            lemmas = []
            # We parse the original file
            with open(file) as source:
                xml = xmlparser(source)
            # We retrieve every annotated token in a list (empty tokens without
            # form or lemma are skipped)
            tokens = list(xml.xpath("//token[@form and @lemma]"))
            l = len(tokens)
            logger.info("{} lemmas to process".format(l))
            # Here we multiprocess over roughly 5% chunks
            steps = max(1, math.floor(l / 20))
            tokens_groups = [
                (i, tokens[i:i + steps], xml)
                for i in range(0, len(tokens), steps)
            ]
            proc_pool = Pool(NUM_PROC)
            tokens_groups = proc_pool.map(PROIEL_DISTRIBUTED, tokens_groups)
            # Regroup the chunks in their original order
            tokens_groups = {k: v for k, v in tokens_groups}
            tokens = [t for k in sorted(tokens_groups) for t in tokens_groups[k]]
            for _id, _form, _lemma, head, deps, _pos, citation in tokens:
                if citation not in order:
                    order.append(citation)
                    lemmas.append([citation])
                ids[_id] = _lemma
                citations[citation].append(
                    (_id, (_form, _lemma, head, deps, _pos), )
                )
                lemmas[-1].append(_lemma)
            with open(_json, "w") as w:
                json.dump({
                    "produced": str(time.strftime("%c")),
                    "order": order,
                    "lemma": lemmas,
                    "tokens": citations,
                    "ids": ids
                }, w)
            data = citations
        return data


class PerseusParserLemmatizer(Lemmatizer):
    @staticmethod
    def process(corpus):
        logger = logging.getLogger(__name__)
        file = corpus.getPath("xml")
        _json = corpus.getPath("json")
        if os.path.isfile(_json):
            logger.info("Loading {} from cache".format(_json))
            with open(_json, "r") as f:
                # Keep the cached and freshly-computed return values consistent:
                # both are the citation -> tokens mapping.
                data = json.load(f)["tokens"]
        else:
            logger.info("Processing {}".format(file))
            ids = {}
            citations = defaultdict(list)
            order = []
            lemmas = []
            # We parse the original file
            with open(file) as source:
                xml = xmlparser(source)
            # We retrieve each sentence with its annotated words (punctuation and
            # empty lemmas are skipped)
            sentences = list(xml.xpath("//sentence"))
            tokens = [
                (
                    sentence.get("id"),
                    [dict(token.attrib) for token in sentence.xpath("./word[@postag!='u--------' and @lemma!='']")],
                    sentence,
                    dict(sentence.attrib)
                )
                for sentence in sentences
            ]
            logger.info("{} lemmas to process".format(len(xml.xpath("//word[@postag!='u--------']"))))
            # Here we multiprocess, one sentence per task
            proc_pool = Pool(NUM_PROC)
            tokens_groups = proc_pool.map(PERSEUS_DISTRIBUTED, tokens)
            # Regroup the sentences in their original (numeric) order
            tokens_groups = {k: v for k, v in tokens_groups}
            tokens = [t for k in sorted(tokens_groups, key=int) for t in tokens_groups[k]]
            for _id, _form, _lemma, head, deps, _pos, citation in tokens:
                if citation not in order:
                    order.append(citation)
                    lemmas.append([citation])
                ids[_id] = _lemma
                citations[citation].append(
                    (_id, (_form, _lemma, head, deps, _pos), )
                )
                lemmas[-1].append(_lemma)
            with open(_json, "w") as w:
                json.dump({
                    "produced": str(time.strftime("%c")),
                    "order": order,
                    "lemma": lemmas,
                    "tokens": citations,
                    "ids": ids
                }, w)
            data = citations
        return data
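

# A minimal usage sketch. The `corpus` object is hypothetical here: it comes from
# the rest of the project and only needs a getPath(fmt) method returning the
# treebank XML path ("xml") and the JSON cache path ("json").
#
#   if __name__ == "__main__":
#       logging.basicConfig(level=logging.INFO)
#       citations = PerseusParserLemmatizer.process(corpus)  # or ProielParserLemmatizer
#       for ref, words in citations.items():
#           print(ref, [lemma for _, (form, lemma, head, deps, pos) in words])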