Last active
December 23, 2015 22:09
-
-
Save vadimii/6701404 to your computer and use it in GitHub Desktop.
Medical standard HTML file parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import re | |
from collections import defaultdict | |
from json import dumps | |
from glob import glob | |
from os.path import basename, splitext, join | |
from lxml import etree | |
REFLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE | |
REMOVEWS = re.compile(ur'[\xa0\s]+', REFLAGS) | |
REMOVEAST = re.compile(ur'\*+', REFLAGS) | |
REMODEL = re.compile(ur'.*модель\sпациента.*', REFLAGS) | |
def subtree_text(root):
    """Return the XPath ``string()`` value of ``root`` as one string.

    ``root`` must offer an lxml-style ``xpath`` method; the result of
    evaluating ``string()`` on it is joined into a single string.
    """
    fragments = root.xpath('string()')
    return ''.join(fragments)
def clear_string(wsstr):
    """Collapse each run matched by ``REMOVEWS`` to one space and trim.

    Returns ``wsstr`` with every whitespace run (non-breaking spaces
    included) replaced by a single space and with leading and trailing
    whitespace removed.
    """
    return REMOVEWS.sub(' ', wsstr).strip()
def subtree_clean_text(root):
    """Return the text of ``root`` (see ``subtree_text``) after it has
    been whitespace-normalised by ``clear_string``."""
    raw_text = subtree_text(root)
    return clear_string(raw_text)
def docpages(tree):
    """Return the page containers of a parsed HTML document.

    These are the ``<div>`` elements directly under ``/html/body`` whose
    ``class`` attribute starts with ``WordSection``.
    """
    page_query = '/html/body/div[starts-with(@class, "WordSection")]'
    return tree.xpath(page_query)
def parse_table(table):
    """Expand an HTML table into rows of cell texts, honouring spans.

    Every ``<td>``/``<th>`` is placed at the first free column of its
    row, and its cleaned text is copied into each grid position covered
    by its ``colspan``/``rowspan``.

    Args:
        table: an lxml element; its ``.//tr`` descendants are the rows.

    Yields:
        One list of cell texts per grid row, rows in ascending row
        order and cells in ascending column order.

    Raises:
        ValueError: if a ``colspan``/``rowspan`` attribute is not an
            integer.
    """
    # grid[row][col] -> text. Only ever written to, so a plain dict per
    # row is sufficient.
    grid = defaultdict(dict)
    for rowidx, row in enumerate(table.xpath('.//tr')):
        for colidx, cell in enumerate(row.xpath('.//td|.//th')):
            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))
            content = subtree_clean_text(cell)
            # Skip positions already taken by earlier cells of this row
            # or by a rowspan reaching down from a previous row.
            # ``get`` avoids creating an empty row as a side effect.
            occupied = grid.get(rowidx, ())
            while colidx in occupied:
                colidx += 1
            for i in range(rowidx, rowidx + rowspan):
                for j in range(colidx, colidx + colspan):
                    grid[i][j] = content
    # Emit by sorted index instead of relying on dict iteration order:
    # a rowspan may create column j of a later row before its column 0.
    for rowidx in sorted(grid):
        cells = grid[rowidx]
        yield [cells[colidx] for colidx in sorted(cells)]
def try_modeldef(text):
    """Split a ``name: value`` line into a cleaned ``[name, value]`` list.

    The text is cut at the first colon; any further colons in the value
    part are replaced by spaces. Both halves are passed through
    ``clear_string``.

    Returns ``None`` when ``text`` contains no colon.
    """
    name, colon, rest = text.partition(':')
    if not colon:
        return None
    value = rest.replace(':', ' ')
    return [clear_string(name), clear_string(value)]
def lparser():
    """Create a stateful classifier for the child elements of a page.

    The returned ``parseline(row)`` callable turns one element into an
    iterable of ``(state, elems)`` pairs:

    * a ``<table>`` element produces one pair per parsed table row, the
      first tagged ``'table-header'`` and the rest ``'table-row'``;
    * an element with no text produces nothing;
    * text matching ``REMODEL`` produces nothing but switches the parser
      into ``'model-def'`` mode;
    * in ``'model-def'`` mode, text that ``try_modeldef`` can split
      produces ``('model-def', [name, value])``;
    * anything else produces ``('text-row', [text])``.

    The mode is remembered between calls.
    """
    context = {'current': None}

    def table_rows(table):
        # Lazy on purpose: the tag is read from the shared context at the
        # moment each row is handed out, and flipped to 'table-row' only
        # after the consumer has taken the first (header) row.
        for cells in parse_table(table):
            yield (context['current'], cells)
            context['current'] = 'table-row'

    def parseline(row):
        if row.tag == 'table':
            context['current'] = 'table-header'
            return table_rows(row)

        rowtext = subtree_clean_text(row)
        if not rowtext:
            return []

        if REMODEL.match(rowtext):
            context['current'] = 'model-def'
            return []

        if context['current'] == 'model-def':
            parts = try_modeldef(rowtext)
            if parts:
                return [('model-def', parts)]

        return [('text-row', [rowtext])]

    return parseline
def table_captions(rows):
    """Insert a ``'table-caption'`` entry in front of every table header.

    The stream is processed with a one-item look-behind buffer. When a
    ``'table-header'`` arrives and the buffered entry is a
    ``'text-row'``, that text row is converted into the caption.
    Otherwise a caption of ``['NO CAPTION']`` is synthesised and the
    buffered entry is emitted unchanged ahead of it.

    Args:
        rows: iterable of ``(state, elems)`` pairs.

    Yields:
        The same pairs in order, with one ``('table-caption', caption)``
        pair placed directly before each ``'table-header'`` pair.
    """
    pending = None
    for state, elems in rows:
        if state == 'table-header':
            if pending is not None and pending[0] == 'text-row':
                caption = pending[1]
            else:
                # The buffered entry is not a caption (e.g. the last row
                # of a directly preceding table); flush it instead of
                # letting the synthetic caption overwrite it.
                if pending is not None:
                    yield pending
                caption = ['NO CAPTION']
            pending = ('table-caption', caption)
        if pending is not None:
            yield pending
        pending = (state, elems)
    if pending is not None:
        yield pending
def docrows(tree):
    """Return the classified, caption-annotated row stream of a document.

    Each direct child of each page returned by ``docpages`` is fed, in
    document order, through a single ``lparser`` classifier. The
    resulting ``(state, elems)`` pairs are then passed through
    ``table_captions``.
    """
    classify = lparser()

    def classified_rows():
        for page in docpages(tree):
            for child in page.iterchildren():
                for item in classify(child):
                    yield item

    return table_captions(classified_rows())
def _nest_pharm_items(items):
    """Fold flat drug-table rows into pharm group -> ATX group -> drug.

    A row is classified by the first of three header keys it carries. A
    pharmacotherapeutic-group row opens a new top-level entry, an ATX
    row is nested under the current pharm group, and an INN (drug) row
    is nested under the current ATX group. A row whose parent level has
    not been seen yet is attached to the nearest level that exists, or
    to the top level, so no row is lost. Rows carrying none of the three
    keys are dropped.
    """
    pharm_key = u'Фармакотерапевтическая группа'
    atx_key = u'АТХ группа'
    inn_key = u'Международное непатентованное наименование'

    top = []
    pharm_children = None
    atx_children = None
    for item in items:
        if pharm_key in item:
            item.pop(atx_key, None)
            item.pop(inn_key, None)
            pharm_children = []
            item[u'АТХ группы'] = pharm_children
            # A new pharm group must not inherit the previous group's
            # last ATX list.
            atx_children = None
            top.append(item)
        elif atx_key in item:
            item.pop(inn_key, None)
            atx_children = []
            item[u'Препараты'] = atx_children
            if pharm_children is not None:
                pharm_children.append(item)
            else:
                top.append(item)
        elif inn_key in item:
            if atx_children is not None:
                atx_children.append(item)
            elif pharm_children is not None:
                pharm_children.append(item)
            else:
                top.append(item)
    return top


def create_object(lines):
    """Assemble patient-model dicts from a stream of classified rows.

    Args:
        lines: iterable of ``(state, elems)`` pairs. Recognised states:
            ``'model-def'`` (``elems`` is a ``(key, value)`` pair),
            ``'table-caption'``, ``'table-header'`` and ``'table-row'``
            (``elems`` is an iterable of cell texts). Other states are
            ignored.

    Returns:
        A list of dicts shaped like
        ``{'description': {...}, 'tables': [{'caption': ..., 'items': [...]}]}``.
        A model is started by the first ``'table-caption'`` that follows
        one or more ``'model-def'`` pairs; table data seen before the
        first model is discarded. Each table item maps header text (with
        asterisks removed) to the non-empty cell in that column. Tables
        captioned ``'NO CAPTION'`` are renamed to ``u'Препараты'`` and
        their items nested by ``_nest_pharm_items``.

    Raises:
        IndexError: if a row has a non-empty cell in a column for which
            the current table header has no entry.
    """
    result = []
    pending = {}   # description collected for the model not yet started
    header = []    # header texts of the table currently being read

    for state, elems in lines:
        # Model bookkeeping.
        if state == 'model-def':
            key, val = elems
            pending[key] = val
        elif state == 'table-caption' and pending:
            result.append(dict(description=pending))
            pending = {}

        # Table bookkeeping; meaningless until a model exists.
        if not result:
            continue
        model = result[-1]
        if state == 'table-caption':
            tables = model.setdefault('tables', [])
            tables.append(dict(caption=''.join(elems), items=[]))
        elif state == 'table-header':
            # Equivalent to deleting every run of asterisks.
            header = [th.replace('*', '') for th in elems]
        elif state == 'table-row':
            item = {}
            for ix, el in enumerate(elems):
                if el:
                    item[header[ix]] = el
            model['tables'][-1]['items'].append(item)

    for model in result:
        for table in model['tables']:
            if table['caption'] == 'NO CAPTION':
                table['caption'] = u'Препараты'
                table['items'] = _nest_pharm_items(table['items'])
    return result
def proceed(html_path, parser=None):
    """Convert one HTML file into ``out/<name>.json``.

    The file is parsed with lxml, turned into model dicts by
    ``create_object(docrows(tree))`` and written as UTF-8 JSON followed
    by a newline. The output file takes the input's base name with a
    ``.json`` extension and is placed in the ``out`` directory, which
    must already exist.

    Args:
        html_path: path of the HTML file to read.
        parser: optional lxml parser to use. A fresh
            ``etree.HTMLParser()`` is created when omitted, so the
            function no longer depends on a module-level ``parser`` that
            only exists when the file is run as a script.
    """
    if parser is None:
        parser = etree.HTMLParser()
    # Binary mode hands lxml the raw bytes.
    with open(html_path, 'rb') as source:
        tree = etree.parse(source, parser)
    models = create_object(docrows(tree))
    json_str = dumps(models, indent=True, ensure_ascii=False, sort_keys=True)
    out_name = splitext(basename(html_path))[0] + '.json'
    out_path = join('out', out_name)
    with open(out_path, 'wb') as out:
        out.write(json_str.encode('utf-8'))
        out.write('\n'.encode('utf-8'))
if __name__ == '__main__':
    # Script entry point: convert every data/tom*.html file.
    # ``parser`` is deliberately a module-level name.
    parser = etree.HTMLParser()
    html_paths = glob('data/tom*.html')
    for html_path in html_paths:
        proceed(html_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment