Skip to content

Instantly share code, notes, and snippets.

@vadimii
Last active December 23, 2015 22:09
Show Gist options
  • Save vadimii/6701404 to your computer and use it in GitHub Desktop.
Save vadimii/6701404 to your computer and use it in GitHub Desktop.
Medical standard HTML file parser
# coding: utf-8
import re
from collections import defaultdict
from json import dumps
from glob import glob
from os.path import basename, splitext, join
from lxml import etree
# Flags shared by every pattern below: case-insensitive, Unicode-aware,
# multi-line matching.
REFLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE
# One or more whitespace characters, explicitly including the non-breaking
# space (\xa0); clear_string() replaces each such run with a single space.
REMOVEWS = re.compile(ur'[\xa0\s]+', REFLAGS)
# One or more consecutive asterisks; create_object() strips them from table
# header cells.
REMOVEAST = re.compile(ur'\*+', REFLAGS)
# Any line containing the phrase "модель пациента" ("patient model");
# lparser() uses it to detect the start of a model description.
REMODEL = re.compile(ur'.*модель\sпациента.*', REFLAGS)
def subtree_text(root):
    """Return the XPath string value of ``root`` as one string.

    ``root`` must provide an ``xpath`` method (an lxml element here); the
    ``string()`` query yields the text of the node and its descendants.
    """
    text = root.xpath('string()')
    return ''.join(text)
def clear_string(wsstr):
    """Normalise whitespace in ``wsstr``.

    Every run matched by ``REMOVEWS`` (whitespace and non-breaking spaces)
    becomes a single space, then leading/trailing whitespace is stripped.
    """
    collapsed = REMOVEWS.sub(' ', wsstr)
    return collapsed.strip()
def subtree_clean_text(root):
    """Return the text of ``root``'s subtree with whitespace normalised."""
    raw_text = subtree_text(root)
    return clear_string(raw_text)
def docpages(tree):
    """Return the ``<div>`` children of ``/html/body`` whose ``class``
    attribute starts with ``WordSection``.
    """
    query = '/html/body/div[starts-with(@class, "WordSection")]'
    return tree.xpath(query)
def parse_table(table):
    """Expand an HTML table element into rows of cell texts.

    Every ``td``/``th`` found under each ``tr`` is read with
    ``subtree_clean_text``.  A cell with ``colspan``/``rowspan`` has its text
    copied into every grid slot it covers, so merged cells appear once per
    covered column/row.

    Yields one list of cell texts per grid row, rows in ascending row index
    and cells in ascending column index.
    """
    # grid[row_index][col_index] -> cell text
    grid = defaultdict(dict)
    for rowidx, row in enumerate(table.xpath('.//tr')):
        for colidx, cell in enumerate(row.xpath('.//td|.//th')):
            colspan = int(cell.get('colspan', 1))
            rowspan = int(cell.get('rowspan', 1))
            content = subtree_clean_text(cell)
            # Slots may already be taken by a span from an earlier cell
            # (same row or a row above); move right to the first free one.
            while rowidx in grid and colidx in grid[rowidx]:
                colidx += 1
            for i in range(rowidx, rowidx + rowspan):
                for j in range(colidx, colidx + colspan):
                    grid[i][j] = content
    # Emit in explicit index order: a rowspan can fill a later column of a
    # row before its earlier columns, so dict iteration order must not be
    # relied on to reproduce the table layout.
    for i in sorted(grid):
        cells = grid[i]
        yield [cells[j] for j in sorted(cells)]
def try_modeldef(text):
    """Split a ``key: value`` line into its cleaned key and value.

    Returns ``None`` when ``text`` contains no colon.  Otherwise the part
    before the first colon is the key, the remainder is the value with any
    further colons replaced by spaces, and both are passed through
    ``clear_string``.
    """
    key, colon, rest = text.partition(':')
    if not colon:
        return None
    value = rest.replace(':', ' ')
    return map(clear_string, (key, value))
def lparser():
    """Create a stateful line parser for the children of a document page.

    The returned ``parseline(row)`` maps one element to an iterable of
    ``(state, elems)`` pairs:

    * a ``table`` element gives one pair per parsed table row, the first
      tagged ``'table-header'`` and the rest ``'table-row'``;
    * an element with no text gives nothing;
    * a line matching ``REMODEL`` switches to ``'model-def'`` mode and gives
      nothing;
    * in ``'model-def'`` mode, a line with a colon gives
      ``('model-def', [key, value])``;
    * anything else gives ``('text-row', [text])``.
    """
    # Mutable cell shared by the closures; remembers the last mode set.
    mode = {'current': None}

    def table_rows(table):
        # Lazily tag rows: whatever the mode is when a row is pulled is its
        # tag, and after the first row the mode becomes 'table-row'.
        for cells in parse_table(table):
            yield (mode['current'], cells)
            mode['current'] = 'table-row'

    def parseline(row):
        if row.tag == 'table':
            mode['current'] = 'table-header'
            return table_rows(row)

        rowtext = subtree_clean_text(row)
        if not rowtext:
            return []

        if REMODEL.match(rowtext):
            mode['current'] = 'model-def'
            return []

        in_model = mode['current'] == 'model-def'
        parts = try_modeldef(rowtext) if in_model else None
        if parts:
            return [(mode['current'], parts)]
        return [('text-row', [rowtext])]

    return parseline
def table_captions(rows):
    """Insert a ``'table-caption'`` row in front of every table header.

    ``rows`` is an iterable of ``(state, elems)`` pairs.  When a
    ``'table-header'`` row is immediately preceded by a ``'text-row'``, that
    text row is re-tagged as the caption; otherwise a synthetic
    ``('table-caption', ['NO CAPTION'])`` row is inserted.  All other rows
    are passed through unchanged and in order.

    Yields ``(state, elems)`` pairs.
    """
    # One-row look-behind buffer: a text row can only be classified once we
    # know whether the next row starts a table.
    pending = None
    for state, elems in rows:
        if state == 'table-header':
            if pending is not None and pending[0] == 'text-row':
                # The preceding text line is consumed as the caption.
                caption = pending[1]
            else:
                # The buffered row is not a caption: emit it before it is
                # replaced, otherwise it would be silently lost (e.g. the
                # last row of a table directly followed by another table).
                if pending is not None:
                    yield pending
                caption = ['NO CAPTION']
            pending = ('table-caption', caption)
        if pending is not None:
            yield pending
        pending = (state, elems)
    if pending is not None:
        yield pending
def docrows(tree):
    """Return the captioned ``(state, elems)`` row stream for ``tree``.

    Each child element of every page returned by ``docpages`` is fed
    through a single parser from ``lparser``; the combined stream is then
    passed through ``table_captions``.
    """
    parse = lparser()

    def parsed_lines():
        for page in docpages(tree):
            for child in page.iterchildren():
                for item in parse(child):
                    yield item

    return table_captions(parsed_lines())
def create_object(lines):
    """Assemble a row stream into a list of model dictionaries.

    ``lines`` is an iterable of ``(state, elems)`` pairs with ``state`` one
    of ``'model-def'``, ``'table-caption'``, ``'table-header'``,
    ``'table-row'`` (other states are ignored).

    Returns a list of dicts shaped like::

        {'description': {key: value, ...},
         'tables': [{'caption': ..., 'items': [{header: cell, ...}, ...]}]}

    A new model is started at the first table caption that follows one or
    more ``'model-def'`` rows; tables seen before any model are skipped.
    Tables captioned ``'NO CAPTION'`` are afterwards renamed and their flat
    rows nested by ``post_process_pharm``.
    """
    class State(object): pass
    mem = State()
    mem.result = []   # finished and in-progress models
    mem.model = {}    # description key/values collected for the next model
    mem.header = []   # header cells of the table currently being read

    def read_model(state, elems):
        if state == 'model-def':
            key, val = elems
            mem.model[key] = val
        elif state == 'table-caption':
            # The first table after a description closes that description.
            if mem.model:
                mem.result.append(dict(description=mem.model))
                mem.model = {}

    def read_table(state, elems):
        if not mem.result:
            return
        model = mem.result[-1]
        if state == 'table-caption':
            if 'tables' not in model:
                model['tables'] = []
            caption = ''.join(elems)
            model['tables'].append(dict(caption=caption, items=[]))
        elif state == 'table-header':
            mem.header = [REMOVEAST.sub('', th) for th in elems]
        elif state == 'table-row':
            item = {}
            # Empty cells are left out of the item.
            # NOTE(review): a row with more cells than the header raises
            # IndexError here.
            for ix, el in enumerate(elems):
                if el:
                    item[mem.header[ix]] = el
            model['tables'][-1]['items'].append(item)

    for state, elems in lines:
        read_model(state, elems)
        read_table(state, elems)

    def post_process_pharm(table):
        """Nest the flat rows of a drug table: group -> ATC group -> drug."""
        table['caption'] = u'Препараты'
        newitems = []
        # Currently open containers; None until the matching row is seen.
        pharmgroup = None
        atxgroup = None
        for item in table['items']:
            if u'Фармакотерапевтическая группа' in item:
                pharmgroup = []
                # A new group must not inherit the previous group's ATC
                # list, or its drugs would be attached to the wrong group.
                atxgroup = None
                item.pop(u'АТХ группа', None)
                item.pop(u'Международное непатентованное наименование', None)
                item[u'АТХ группы'] = pharmgroup
                newitems.append(item)
            elif u'АТХ группа' in item:
                atxgroup = []
                item.pop(u'Международное непатентованное наименование', None)
                item[u'Препараты'] = atxgroup
                # Without an enclosing group, keep the row at top level
                # instead of failing on an unassigned variable.
                if pharmgroup is None:
                    newitems.append(item)
                else:
                    pharmgroup.append(item)
            elif u'Международное непатентованное наименование' in item:
                # Attach to the nearest open container.
                if atxgroup is not None:
                    atxgroup.append(item)
                elif pharmgroup is not None:
                    pharmgroup.append(item)
                else:
                    newitems.append(item)
        table['items'] = newitems

    for model in mem.result:
        for table in model['tables']:
            if table['caption'] == 'NO CAPTION':
                post_process_pharm(table)
    return mem.result
def proceed(html_path):
    """Convert one HTML file into ``out/<basename>.json``.

    The file is parsed with lxml's HTML parser, turned into model dicts via
    ``docrows`` and ``create_object``, and written as UTF-8 encoded JSON
    followed by a newline.  The ``out`` directory is not created here, so
    it must already exist.
    """
    # Build the parser locally: a module-level ``parser`` is only bound when
    # the file runs as a script, so relying on it breaks imported use.
    html_parser = etree.HTMLParser()
    # Read raw bytes so no text-mode newline translation touches the input
    # before lxml decodes it.
    with open(html_path, 'rb') as source:
        tree = etree.parse(source, html_parser)
    models = create_object(docrows(tree))
    json_str = dumps(models, indent=True, ensure_ascii=False, sort_keys=True)
    out_name = splitext(basename(html_path))[0] + '.json'
    out_path = join('out', out_name)
    with open(out_path, 'wb') as out:
        out.write(json_str.encode('utf-8'))
        out.write('\n'.encode('utf-8'))
if __name__ == '__main__':
    # Script entry point: convert every data/tom*.html file.
    # ``parser`` is bound at module level before any call to proceed().
    parser = etree.HTMLParser()
    sources = glob('data/tom*.html')
    for html_path in sources:
        proceed(html_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment