Skip to content

Instantly share code, notes, and snippets.

@fredkingham
Last active September 29, 2021 16:46
Show Gist options
  • Save fredkingham/f1f6890ce391ea9b618c239de112b737 to your computer and use it in GitHub Desktop.
Save fredkingham/f1f6890ce391ea9b618c239de112b737 to your computer and use it in GitHub Desktop.
requires pyquery
from pyquery import PyQuery as pq
import json
import io
# Default path of the exported HTML report that this script parses.
FILE_NAME = "IPC_GT_5_Years.htm"


def process(file_name=FILE_NAME):
    """Read an HTML report from disk and return it as a PyQuery document.

    Args:
        file_name: Path of the HTML file to load. Defaults to ``FILE_NAME``
            so existing zero-argument callers keep their behaviour.

    Returns:
        The ``pq`` (PyQuery) object built from the file's text.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # errors="ignore" drops any bytes that are not valid UTF-8 rather than
    # raising, so a badly encoded export is still parsed on a best-effort
    # basis.
    with io.open(file_name, mode="r", encoding="utf-8", errors="ignore") as f:
        contents = f.read()
    return pq(contents)
def parse_patient_number(td):
    """Extract the patient identifiers from the first cell of a row.

    The cell text has every occurrence of "Audit" removed and is then split
    on newlines; blank lines are discarded and the rest are stripped.

    Args:
        td: The cell element (anything ``pq`` accepts).

    Returns:
        A dict that always has ``hospital_number`` (the first identifier, or
        ``""`` when the cell holds none) and additionally ``nhs_number`` (the
        second identifier) when at least two identifiers are present.
    """
    # squash_space=False is requested so the newline-separated layout of the
    # cell survives for the split below.
    raw_text = pq(td).text(squash_space=False).replace("Audit", "")
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    identifiers = [value for value in stripped_lines if value]

    parsed = {"hospital_number": identifiers[0] if identifiers else ""}
    if len(identifiers) > 1:
        parsed["nhs_number"] = identifiers[1]
    return parsed
def parse_name_dob(td):
    """Extract first name, surname and date of birth from a row's name cell.

    The cell text must split into exactly three newline-separated parts
    (name, date of birth, and a third part that is discarded), and the name
    must contain exactly one comma in ``surname, first name`` order.

    Args:
        td: The cell element (anything ``pq`` accepts).

    Returns:
        A dict with ``first_name``, ``surname`` and ``date_of_birth``. The
        date of birth is returned exactly as it appears in the cell text.

    Raises:
        ValueError: If the cell text or the name does not have the expected
            number of parts.
    """
    cell_lines = pq(td).text(squash_space=False).split("\n")
    full_name, birth_date, _ = cell_lines

    name_parts = [piece.strip() for piece in full_name.split(",")]
    surname, first_name = name_parts

    return dict(
        first_name=first_name,
        surname=surname,
        date_of_birth=birth_date,
    )
def get_alert_status(td):
    """Collect the name/date pairs listed in the rows nested inside a cell.

    Every ``tr`` found anywhere beneath ``td`` contributes one entry: the
    stripped text of its first child is the key and the stripped text of its
    second child is the value. If the same key appears more than once, the
    last occurrence wins.

    Args:
        td: The cell element (anything ``pq`` accepts).

    Returns:
        A dict mapping each field name to its date string, in document order.

    Raises:
        IndexError: If a nested row has fewer than two child elements.
    """
    result = {}
    for tr in pq(td).find("tr"):
        # list(element) is the supported way to get an lxml element's direct
        # children; Element.getchildren() is deprecated.
        cells = list(tr)
        field_name = cells[0].text_content().strip()
        date_str = cells[1].text_content().strip()
        result[field_name] = date_str
    return result
def get_comments(td):
    """Return the free-text comments held in a cell.

    Every occurrence of the text "Delete" is removed from the cell text
    (presumably the label of a delete control rendered in the same cell;
    confirm against the report), then surrounding whitespace is stripped.

    Args:
        td: The cell element (anything ``pq`` accepts).

    Returns:
        A dict with the single key ``comments``.
    """
    cell_text = pq(td).text(squash_space=False)
    without_label = cell_text.replace("Delete", "")
    return {"comments": without_label.strip()}
def get_rows():
    """Parse the report into a list of dicts, one per non-empty row.

    Each row's first cell supplies the patient identifiers, the second the
    name and date of birth, the third the alert statuses and the last the
    comments. The four partial dicts are merged in that order, so a key
    produced by a later parser overrides the same key from an earlier one.

    Returns:
        A list of dicts, in document order. Row elements that have no
        children are skipped.

    Raises:
        IndexError: If the document does not have the expected structure, or
            a non-empty row has fewer than three cells.
    """
    document = process()

    # Purely positional walk: take the second child of <body>, then that
    # element's second child, and treat its children as the data rows.
    # NOTE(review): presumably a <table> and its <tbody> - confirm against
    # the exported file before changing these indices.
    container = document.children()("body").children()[1]
    row_parent = container[1]

    result = []
    # Indexing and list(element) replace lxml's deprecated getchildren().
    for tr in list(row_parent):
        cells = list(tr)
        if not cells:
            continue
        result.append({
            **parse_patient_number(cells[0]),
            **parse_name_dob(cells[1]),
            **get_alert_status(cells[2]),
            **get_comments(cells[-1]),
        })
    return result
def dump_file():
    """Parse the report and write the resulting rows to ``ipc_rows.json``.

    The rows are parsed before the output file is opened, so a parsing
    failure does not truncate an existing output file.
    """
    parsed_rows = get_rows()
    with open('ipc_rows.json', 'w') as out_file:
        json.dump(parsed_rows, fp=out_file)


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    dump_file()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment