foolip · January 18, 2018 09:30
diff --git a/main.py b/main.py
 #!/usr/bin/env python3
 #
 # Unpacks a JSON file exported from BigQuery httparchive table, with one body
 # per line, parses the body as HTML and prints some stuff about <main>.

 import html5lib
 import json
 import os
 import sys

 # Namespace prefix that tagname() strips from element tags.
 PREFIX = '{http://www.w3.org/1999/xhtml}'
 # Path handed to findall() in process() to locate <main> elements.
 MAIN = '//{}main'.format(PREFIX)

 def ancestors(element):
    """Yield the ancestors of element, nearest parent first.

    Parents are obtained by calling getparent() repeatedly until it
    returns None. Nothing is yielded when element itself is None.
    """
    if element is None:
        return
    parent = element.getparent()
    while parent is not None:
        yield parent
        parent = parent.getparent()

 def tagname(element):
    """Return element.tag with a leading PREFIX removed.

    Tags that do not start with PREFIX are returned unchanged.
    """
    tag = element.tag
    return tag[len(PREFIX):] if tag.startswith(PREFIX) else tag

 def report(data, msg):
    """Print one tab-separated line: data['page'], data['url'], then msg."""
    fields = (data['page'], data['url'], msg)
    print('\t'.join(fields))

 def process(data):
    """Parse one record's HTML body and report how <main> appears in it.

    data is a dict with 'page', 'url' and 'body' keys. Exactly one line is
    printed through report():
      - 'PARSEERROR' if html5lib fails to parse the body;
      - the tag names of the ancestors of <main>, outermost first and
        separated by spaces, if the document has exactly one <main>;
      - 'MAINCOUNT n' otherwise, where n is the number of <main> elements.
    """
    body = data['body']

    try:
        doc = html5lib.parse(body, treebuilder='lxml')
    except Exception:
        # Report any parser failure instead of aborting the whole run, but
        # let KeyboardInterrupt and SystemExit propagate (a bare except
        # would swallow them too).
        report(data, 'PARSEERROR')
        return

    mains = doc.findall(MAIN)
    if len(mains) != 1:
        report(data, 'MAINCOUNT {}'.format(len(mains)))
        return

    # ancestors() yields the nearest parent first; reverse to get the
    # outermost element first.
    tags = [tagname(e) for e in ancestors(mains[0])]
    tags.reverse()
    report(data, ' '.join(tags))

 if __name__ == '__main__':
    # Every command-line argument is a path to a file holding one JSON
    # record per line.
    for path in sys.argv[1:]:
        with open(path, mode='rt', encoding='utf-8') as f:
            # Iterate the file object directly so records are streamed
            # instead of loading the whole file into memory at once.
            for line in f:
                # Blank lines carry no record and would make json.loads fail.
                if not line.strip():
                    continue
                process(json.loads(line))
	#!/usr/bin/env python3
	#
	# Unpacks a JSON file exported from BigQuery httparchive table, with one body
	# per line, parses the body as HTML and prints some stuff about <main>.

	import html5lib
	import json
	import os
	import sys

	# Namespace prefix that tagname() strips from element tags.
	PREFIX = '{http://www.w3.org/1999/xhtml}'
	# Path handed to findall() in process() to locate <main> elements.
	MAIN = '//{}main'.format(PREFIX)

	def ancestors(element):
	    """Yield the ancestors of element, nearest parent first.

	    Parents are obtained by calling getparent() repeatedly until it
	    returns None. Nothing is yielded when element itself is None.
	    """
	    if element is None:
	        return
	    parent = element.getparent()
	    while parent is not None:
	        yield parent
	        parent = parent.getparent()

	def tagname(element):
	    """Return element.tag with a leading PREFIX removed.

	    Tags that do not start with PREFIX are returned unchanged.
	    """
	    name = element.tag
	    if name.startswith(PREFIX):
	        return name[len(PREFIX):]
	    return name

	def report(data, msg):
	    """Print one tab-separated line: data['page'], data['url'], then msg."""
	    print('\t'.join([data['page'], data['url'], msg]))

	def process(data):
	    """Parse one record's HTML body and report how <main> appears in it.

	    data is a dict with 'page', 'url' and 'body' keys. Exactly one line is
	    printed through report():
	      - 'PARSEERROR' if html5lib fails to parse the body;
	      - the tag names of the ancestors of <main>, outermost first and
	        separated by spaces, if the document has exactly one <main>;
	      - 'MAINCOUNT n' otherwise, where n is the number of <main> elements.
	    """
	    body = data['body']

	    try:
	        doc = html5lib.parse(body, treebuilder='lxml')
	    except Exception:
	        # Report any parser failure instead of aborting the whole run, but
	        # let KeyboardInterrupt and SystemExit propagate (a bare except
	        # would swallow them too).
	        report(data, 'PARSEERROR')
	        return

	    mains = doc.findall(MAIN)
	    maincount = len(mains)
	    if maincount == 1:
	        # ancestors() yields the nearest parent first; reverse to get the
	        # outermost element first.
	        tags = [tagname(e) for e in ancestors(mains[0])]
	        tags.reverse()
	        report(data, ' '.join(tags))
	    else:
	        report(data, 'MAINCOUNT {}'.format(maincount))

	if __name__ == '__main__':
	    # Every command-line argument is a path to a file holding one JSON
	    # record per line.
	    for path in sys.argv[1:]:
	        with open(path, mode='rt', encoding='utf-8') as f:
	            # Iterate the file object directly so records are streamed
	            # instead of loading the whole file into memory at once.
	            for line in f:
	                # Blank lines carry no record and would make json.loads fail.
	                if not line.strip():
	                    continue
	                data = json.loads(line)
	                process(data)