dslaw · October 23, 2018 22:22
diff --git a/data_element_numbers.py b/data_element_numbers.py
 """Parse data element numbers as trees."""
 # XXX: Second spectral digit is broken - both values are 0!

 from dataclasses import dataclass
 from lxml import html
 from typing import Any, Dict, List


 # Character that separates the digit from its description inside a cell's
 # text; parse_table skips cells that do not contain it.
 DELIMITER = "\x97"  # &#151;
 # Colspan assumed for a cell that has no ``colspan`` attribute. parse_table
 # rejects a table whose widest cell does not exceed this value.
 MIN_COLSPAN = 1
 # Page downloaded and parsed when this file is run as a script.
 URL = "http://solardat.uoregon.edu/DataElementNumbers.html"


 @dataclass(frozen=True)
 class _Node:
     """One table entry together with the entries nested beneath it.

     ``frozen=True`` only prevents attributes from being reassigned; the
     ``children`` list itself is still appended to in place while
     parse_table builds the tree.
     """
     digit: int  # integer parsed from the text before the delimiter
     description: str  # whitespace-normalised text after the delimiter
     children: List["_Node"]  # entries nested directly under this one


 def parse_table(table: html.HtmlElement) -> List[_Node]:
     """Build a forest of ``_Node`` objects from the rows of one table.

     Only the last cell of each row is inspected (a leading cell, when
     present, is just padding). The cell's ``colspan`` encodes its nesting
     level: a cell as wide as the widest one in the table is a root, and
     each unit narrower is one level deeper. Every node is attached to the
     most recently seen node one level above it.

     Args:
         table: Iterable of rows. Each row must be indexable, and its last
             cell must provide ``.get()`` and ``.text``.

     Returns:
         The root nodes in document order, with descendants reachable
         through ``children``.

     Raises:
         RuntimeError: If no cell is wider than ``MIN_COLSPAN``, or if a
             row is nested deeper than the preceding rows allow.
         ValueError: If the table has no rows, or the text before the
             delimiter is not an integer.
     """
     max_colspan = max(int(row[-1].get("colspan", -1)) for row in table)
     if max_colspan <= MIN_COLSPAN:
         raise RuntimeError(
             f"expected a cell wider than colspan={MIN_COLSPAN}, "
             f"widest found was {max_colspan}"
         )

     roots: List[_Node] = []
     for row in table:
         # There will be either one or two cells. If two, the first
         # is just padding. So we always want the last one in the row.
         ele = row[-1]

         # ``text`` is None when the cell is empty or starts with a child
         # element; such a row carries no element number description.
         text = ele.text
         if text is None or DELIMITER not in text:
             continue

         # Split on the first delimiter only, so a description that itself
         # contains the delimiter character does not break the unpacking.
         digit, description = text.split(DELIMITER, 1)
         description = " ".join(description.split())
         node = _Node(int(digit), description, children=[])

         colspan = int(ele.get("colspan", MIN_COLSPAN))
         depth = max_colspan - colspan
         if depth < 0:
             raise RuntimeError(
                 f"colspan {colspan} exceeds the table maximum {max_colspan}"
             )

         # Walk down the right-most branch to the list this node joins.
         siblings = roots
         for _ in range(depth):
             if not siblings:
                 raise RuntimeError(
                     f"row {text!r} at depth {depth} has no parent row"
                 )
             siblings = siblings[-1].children

         siblings.append(node)

     return roots


 if __name__ == "__main__":
     # Script entry point: download the page, parse its three tables and
     # print the resulting trees as JSON.
     import json
     import requests

     def convert_node(node: _Node) -> Dict[str, Any]:
         """Turn a node, and recursively its children, into plain dicts."""
         return {
             "digit": node.digit,
             "description": node.description,
             "children": list(map(convert_node, node.children)),
         }

     # Without a timeout the request can block forever on a stalled server.
     response = requests.get(URL, timeout=30)
     response.raise_for_status()
     tree = html.fromstring(response.content)

     # Names are paired with the matching tables in document order.
     table_names = [
         "solar",
         "spectral",
         "meteorological",
     ]

     table_xpath = '//table[@cellpadding="4" and @cellspacing="0" and @border="1"]'
     tables = tree.xpath(table_xpath)
     # zip() would silently drop sections if the page layout changed and
     # fewer tables matched, so fail loudly instead.
     if len(tables) < len(table_names):
         raise RuntimeError(
             f"expected at least {len(table_names)} tables matching "
             f"{table_xpath!r}, found {len(tables)}"
         )

     out: Dict[str, List[_Node]] = {}
     for name, table in zip(table_names, tables):
         out[name] = parse_table(table)

     # ``default`` is called by json for every _Node it cannot serialise.
     print(json.dumps(out, indent=2, default=convert_node))
diff --git a/flat.py b/flat.py
 # Parse into a flat sequence of codes.
 def parse_table(table: html.HtmlElement) -> List[dict]:
     """Flatten one table into complete codes, one per deepest-level row.

     Only the last cell of each row is inspected. Its ``colspan`` encodes
     the nesting level: a cell as wide as the widest one is level 0 and
     each unit narrower is one level deeper. The most recent row seen at
     every level is remembered; whenever a row at the deepest level
     arrives, the remembered rows are combined into one code.

     Args:
         table: Iterable of rows. Each row must be indexable, and its last
             cell must provide ``.get()`` and ``.text``.

     Returns:
         A list of dicts with the keys ``"digits"`` (the digit strings of
         all levels concatenated) and ``"descriptions"`` (the description
         of each level, outermost first).

     Raises:
         RuntimeError: If no cell is wider than ``MIN_COLSPAN``, or if a
             deepest-level row appears before every level above it has
             been seen.
         ValueError: If the table has no rows.
     """
     max_colspan = max(int(row[-1].get("colspan", -1)) for row in table)
     if max_colspan <= MIN_COLSPAN:
         raise RuntimeError(
             f"expected a cell wider than colspan={MIN_COLSPAN}, "
             f"widest found was {max_colspan}"
         )

     codes: List[dict] = []
     # current[d] holds the most recent row seen at depth d.
     current: list = [None] * max_colspan
     leaf_depth = max_colspan - 1
     for row in table:
         ele = row[-1]

         # ``text`` is None when the cell is empty or starts with a child
         # element; such a row carries no element number description.
         text = ele.text
         if text is None or DELIMITER not in text:
             continue

         # Split on the first delimiter only, so a description that itself
         # contains the delimiter character does not break the unpacking.
         digit, description = text.split(DELIMITER, 1)
         description = " ".join(description.split())

         colspan = int(ele.get("colspan", MIN_COLSPAN))
         depth = max_colspan - colspan
         current[depth] = {
             # Strip surrounding whitespace so joined digits stay compact.
             "digit": digit.strip(),
             "description": description,
         }

         if depth == leaf_depth:
             if any(level is None for level in current):
                 raise RuntimeError(
                     f"row {text!r} appears before all of its parent levels"
                 )
             codes.append({
                 "digits": "".join(level["digit"] for level in current),
                 "descriptions": [level["description"] for level in current],
             })
             # The leaf slot is consumed; the next leaf row refills it.
             current[depth] = None

     return codes
	"""Parse data element numbers as trees."""
	# XXX: Second spectral digit is broken - both values are 0!

	from dataclasses import dataclass
	from lxml import html
	from typing import Any, Dict, List


	# Character that separates the digit from its description inside a cell's
	# text; parse_table skips cells that do not contain it.
	DELIMITER = "\x97" # &#151;
	# Colspan assumed for a cell that has no ``colspan`` attribute. parse_table
	# rejects a table whose widest cell does not exceed this value.
	MIN_COLSPAN = 1
	# Page downloaded and parsed when this file is run as a script.
	URL = "http://solardat.uoregon.edu/DataElementNumbers.html"


	@dataclass(frozen=True)
	class _Node:
	    """One table entry together with the entries nested beneath it.

	    ``frozen=True`` only prevents attributes from being reassigned; the
	    ``children`` list itself can still be appended to in place.
	    """

	    digit: int  # integer parsed from the text before the delimiter
	    description: str  # whitespace-normalised text after the delimiter
	    children: List["_Node"]  # entries nested directly under this one


	def parse_table(table: html.HtmlElement) -> List[_Node]:
	    """Build a forest of ``_Node`` objects from the rows of one table.

	    Only the last cell of each row is inspected (a leading cell, when
	    present, is just padding). The cell's ``colspan`` encodes its nesting
	    level: a cell as wide as the widest one in the table is a root, and
	    each unit narrower is one level deeper. Every node is attached to the
	    most recently seen node one level above it.

	    Args:
	        table: Iterable of rows. Each row must be indexable, and its last
	            cell must provide ``.get()`` and ``.text``.

	    Returns:
	        The root nodes in document order, with descendants reachable
	        through ``children``.

	    Raises:
	        RuntimeError: If no cell is wider than ``MIN_COLSPAN``, or if a
	            row is nested deeper than the preceding rows allow.
	        ValueError: If the table has no rows, or the text before the
	            delimiter is not an integer.
	    """
	    max_colspan = max(int(row[-1].get("colspan", -1)) for row in table)
	    if max_colspan <= MIN_COLSPAN:
	        raise RuntimeError(
	            f"expected a cell wider than colspan={MIN_COLSPAN}, "
	            f"widest found was {max_colspan}"
	        )

	    roots: List[_Node] = []
	    for row in table:
	        # There will be either one or two cells. If two, the first
	        # is just padding. So we always want the last one in the row.
	        ele = row[-1]

	        # ``text`` is None when the cell is empty or starts with a child
	        # element; such a row carries no element number description.
	        text = ele.text
	        if text is None or DELIMITER not in text:
	            continue

	        # Split on the first delimiter only, so a description that itself
	        # contains the delimiter character does not break the unpacking.
	        digit, description = text.split(DELIMITER, 1)
	        description = " ".join(description.split())
	        node = _Node(int(digit), description, children=[])

	        colspan = int(ele.get("colspan", MIN_COLSPAN))
	        depth = max_colspan - colspan
	        if depth < 0:
	            raise RuntimeError(
	                f"colspan {colspan} exceeds the table maximum {max_colspan}"
	            )

	        # Walk down the right-most branch to the list this node joins.
	        siblings = roots
	        for _ in range(depth):
	            if not siblings:
	                raise RuntimeError(
	                    f"row {text!r} at depth {depth} has no parent row"
	                )
	            siblings = siblings[-1].children

	        siblings.append(node)

	    return roots


	if __name__ == "__main__":
	    # Script entry point: download the page, parse its three tables and
	    # print the resulting trees as JSON.
	    import json
	    import requests

	    def convert_node(node: _Node) -> Dict[str, Any]:
	        """Turn a node, and recursively its children, into plain dicts."""
	        return {
	            "digit": node.digit,
	            "description": node.description,
	            "children": list(map(convert_node, node.children)),
	        }

	    # Without a timeout the request can block forever on a stalled server.
	    response = requests.get(URL, timeout=30)
	    response.raise_for_status()
	    tree = html.fromstring(response.content)

	    # Names are paired with the matching tables in document order.
	    table_names = [
	        "solar",
	        "spectral",
	        "meteorological",
	    ]

	    table_xpath = '//table[@cellpadding="4" and @cellspacing="0" and @border="1"]'
	    tables = tree.xpath(table_xpath)
	    # zip() would silently drop sections if the page layout changed and
	    # fewer tables matched, so fail loudly instead.
	    if len(tables) < len(table_names):
	        raise RuntimeError(
	            f"expected at least {len(table_names)} tables matching "
	            f"{table_xpath!r}, found {len(tables)}"
	        )

	    out: Dict[str, List[_Node]] = {}
	    for name, table in zip(table_names, tables):
	        out[name] = parse_table(table)

	    # ``default`` is called by json for every _Node it cannot serialise.
	    print(json.dumps(out, indent=2, default=convert_node))
	# Parse into a flat sequence of codes.
	def parse_table(table: html.HtmlElement) -> List[dict]:
	    """Flatten one table into complete codes, one per deepest-level row.

	    Only the last cell of each row is inspected. Its ``colspan`` encodes
	    the nesting level: a cell as wide as the widest one is level 0 and
	    each unit narrower is one level deeper. The most recent row seen at
	    every level is remembered; whenever a row at the deepest level
	    arrives, the remembered rows are combined into one code.

	    Args:
	        table: Iterable of rows. Each row must be indexable, and its last
	            cell must provide ``.get()`` and ``.text``.

	    Returns:
	        A list of dicts with the keys ``"digits"`` (the digit strings of
	        all levels concatenated) and ``"descriptions"`` (the description
	        of each level, outermost first).

	    Raises:
	        RuntimeError: If no cell is wider than ``MIN_COLSPAN``, or if a
	            deepest-level row appears before every level above it has
	            been seen.
	        ValueError: If the table has no rows.
	    """
	    max_colspan = max(int(row[-1].get("colspan", -1)) for row in table)
	    if max_colspan <= MIN_COLSPAN:
	        raise RuntimeError(
	            f"expected a cell wider than colspan={MIN_COLSPAN}, "
	            f"widest found was {max_colspan}"
	        )

	    codes: List[dict] = []
	    # current[d] holds the most recent row seen at depth d.
	    current: list = [None] * max_colspan
	    leaf_depth = max_colspan - 1
	    for row in table:
	        ele = row[-1]

	        # ``text`` is None when the cell is empty or starts with a child
	        # element; such a row carries no element number description.
	        text = ele.text
	        if text is None or DELIMITER not in text:
	            continue

	        # Split on the first delimiter only, so a description that itself
	        # contains the delimiter character does not break the unpacking.
	        digit, description = text.split(DELIMITER, 1)
	        description = " ".join(description.split())

	        colspan = int(ele.get("colspan", MIN_COLSPAN))
	        depth = max_colspan - colspan
	        current[depth] = {
	            # Strip surrounding whitespace so joined digits stay compact.
	            "digit": digit.strip(),
	            "description": description,
	        }

	        if depth == leaf_depth:
	            if any(level is None for level in current):
	                raise RuntimeError(
	                    f"row {text!r} appears before all of its parent levels"
	                )
	            codes.append({
	                "digits": "".join(level["digit"] for level in current),
	                "descriptions": [level["description"] for level in current],
	            })
	            # The leaf slot is consumed; the next leaf row refills it.
	            current[depth] = None

	    return codes