Last active
October 23, 2018 22:22
-
-
Save dslaw/95378b17f2a24ce9a81b9b94edc0df48 to your computer and use it in GitHub Desktop.
Extract data element numbers for Univ of Oregon solar data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Parse data element numbers as trees.""" | |
# XXX: Second spectral digit is broken - both values are 0! | |
from dataclasses import dataclass | |
from lxml import html | |
from typing import Any, Dict, List | |
# Separator between an element digit and its description inside a table cell.
# NOTE(review): U+0097 is presumably the Windows-1252 em dash byte surfacing
# through a Latin-1 decode of the page -- confirm against the live HTML.
DELIMITER = "\x97"  # —

# ``colspan`` assumed for a cell that does not declare one. A table whose
# widest last cell does not exceed this value is rejected by the parser.
MIN_COLSPAN = 1

# Page that lists the data element numbers to be parsed.
URL = "http://solardat.uoregon.edu/DataElementNumbers.html"
@dataclass(frozen=True)
class _Node:
    """One entry of a data element number tree.

    The dataclass is frozen, so its fields cannot be rebound after
    construction. The ``children`` list itself stays mutable and is
    appended to while the tree is being assembled.
    """

    # Integer parsed from the text that precedes the delimiter in a cell.
    digit: int
    # Whitespace-normalised text that follows the delimiter.
    description: str
    # Entries nested directly beneath this one, in document order.
    children: List["_Node"]
def parse_table(table: html.HtmlElement) -> List[_Node]:
    """Parse a table of data element numbers into a forest of nodes.

    Nesting is encoded by the ``colspan`` of the last cell in each row: the
    widest cells are the roots, and each unit of ``colspan`` less is one
    level deeper. A node is attached to the most recently seen node of the
    level above it.

    Args:
        table: The table element; its children are treated as rows.

    Returns:
        The top-level nodes, each holding its descendants in ``children``.

    Raises:
        RuntimeError: If no row has a last cell with a ``colspan`` greater
            than ``MIN_COLSPAN``, or if an entry appears before any entry
            of its parent level.
        ValueError: If the text before the delimiter is not an integer.
    """
    # Children without cells (comments, empty rows) cannot hold an entry and
    # would make ``row[-1]`` fail, so they are dropped up front.
    rows = [row for row in table if len(row)]

    max_colspan = max(
        (int(row[-1].get("colspan", -1)) for row in rows),
        default=-1,
    )
    if max_colspan <= MIN_COLSPAN:
        raise RuntimeError(
            f"table has no cell with colspan greater than {MIN_COLSPAN}"
        )

    roots: List[_Node] = []
    for row in rows:
        # There will be either one or two cells. If two, the first is just
        # padding, so the entry is always in the last cell of the row.
        cell = row[-1]

        # ``.text`` is None when the cell has no leading text at all.
        text = cell.text or ""
        # Split on the first delimiter only, so a description that itself
        # contains the delimiter does not break the unpacking.
        digit, found, description = text.partition(DELIMITER)
        if not found:
            # The row does not have an element number description.
            continue

        node = _Node(int(digit), " ".join(description.split()), children=[])

        colspan = int(cell.get("colspan", MIN_COLSPAN))
        depth = max_colspan - colspan
        if depth < 0:
            raise RuntimeError(
                f"cell colspan {colspan} exceeds table maximum {max_colspan}"
            )

        # Walk down the right-most branch to the list this node belongs in.
        siblings = roots
        for _ in range(depth):
            if not siblings:
                raise RuntimeError(
                    f"entry {text!r} at depth {depth} has no parent entry"
                )
            siblings = siblings[-1].children
        siblings.append(node)

    return roots
if __name__ == "__main__":
    import json

    import requests

    # Seconds to wait on the server; without a timeout ``requests`` can
    # block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def convert_node(node: _Node) -> Dict[str, Any]:
        """Convert a node and, recursively, its children to JSON-ready types."""
        return {
            "digit": node.digit,
            "description": node.description,
            "children": [convert_node(child) for child in node.children],
        }

    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    tree = html.fromstring(response.content)

    # Names are paired with the matching tables in document order.
    table_names = [
        "solar",
        "spectral",
        "meteorological",
    ]
    table_xpath = '//table[@cellpadding="4" and @cellspacing="0" and @border="1"]'
    tables = tree.xpath(table_xpath)
    if len(tables) < len(table_names):
        # ``zip`` would otherwise silently drop the unmatched names.
        raise RuntimeError(
            f"expected at least {len(table_names)} tables, found {len(tables)}"
        )

    out: Dict[str, List[_Node]] = {
        name: parse_table(table) for name, table in zip(table_names, tables)
    }

    # ``_Node`` is not JSON serialisable, so the encoder falls back to
    # ``convert_node`` for every node it meets.
    print(json.dumps(out, indent=2, default=convert_node))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse into a flat sequence of codes. | |
def parse_table(table: html.HtmlElement) -> List[Dict[str, Any]]:
    """Parse a table of data element numbers into a flat list of codes.

    Nesting is encoded by the ``colspan`` of the last cell in each row: the
    widest cells are the first digit, and each unit of ``colspan`` less is
    the next digit. One code is emitted per deepest-level entry, combining
    it with the most recent entry of every level above it.

    Args:
        table: The table element; its children are treated as rows.

    Returns:
        A list of dicts, each with ``"digits"`` (the digit of every level
        joined into one string) and ``"descriptions"`` (the description of
        every level, outermost first).

    Raises:
        RuntimeError: If no row has a last cell with a ``colspan`` greater
            than ``MIN_COLSPAN``, if a ``colspan`` falls outside the range
            implied by the table, or if a deepest-level entry is missing
            an entry for one of the levels above it.
    """
    # Children without cells (comments, empty rows) cannot hold an entry and
    # would make ``row[-1]`` fail, so they are dropped up front.
    rows = [row for row in table if len(row)]

    max_colspan = max(
        (int(row[-1].get("colspan", -1)) for row in rows),
        default=-1,
    )
    if max_colspan <= MIN_COLSPAN:
        raise RuntimeError(
            f"table has no cell with colspan greater than {MIN_COLSPAN}"
        )

    codes: List[Dict[str, Any]] = []
    # ``current[d]`` is the most recent entry seen at depth ``d``, or None.
    current: List[Any] = [None] * max_colspan
    leaf_depth = max_colspan - 1

    for row in rows:
        # The entry, when present, is always in the last cell of the row.
        cell = row[-1]

        # ``.text`` is None when the cell has no leading text at all.
        text = cell.text or ""
        # Split on the first delimiter only, so a description that itself
        # contains the delimiter does not break the unpacking.
        digit, found, description = text.partition(DELIMITER)
        if not found:
            continue

        colspan = int(cell.get("colspan", MIN_COLSPAN))
        depth = max_colspan - colspan
        if not 0 <= depth <= leaf_depth:
            raise RuntimeError(
                f"cell colspan {colspan} is outside the range 1..{max_colspan}"
            )

        current[depth] = {
            # Strip padding so the joined code contains digits only.
            "digit": digit.strip(),
            "description": " ".join(description.split()),
        }
        # Anything deeper belonged to the previous branch and must not leak
        # into codes emitted under this entry.
        for deeper in range(depth + 1, max_colspan):
            current[deeper] = None

        if depth == leaf_depth:
            if any(entry is None for entry in current):
                raise RuntimeError(
                    f"entry {text!r} is missing an entry for a level above it"
                )
            codes.append({
                "digits": "".join(entry["digit"] for entry in current),
                "descriptions": [entry["description"] for entry in current],
            })

    return codes
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment