Skip to content

Instantly share code, notes, and snippets.

@foolip
Last active January 18, 2018 09:30
Show Gist options
  • Save foolip/53182eaf8c4dc434744b3b7364a8ac55 to your computer and use it in GitHub Desktop.
Save foolip/53182eaf8c4dc434744b3b7364a8ac55 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#
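# Reads newline-delimited JSON files exported from the BigQuery httparchive
# table (one response body per line), parses each body as HTML and prints one
# tab-separated line per record: page, url, and either the ancestor tag names
# of the single <main> element, "MAINCOUNT <n>", or "PARSEERROR".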
import html5lib
import json
import os
import sys
# Namespace prefix (Clark notation) carried by the tag names of XHTML elements.
PREFIX = '{http://www.w3.org/1999/xhtml}'
# Path expression handed to findall() to locate <main> elements in that namespace.
MAIN = '//{}main'.format(PREFIX)
def ancestors(element):
    """Yield every ancestor of *element*, nearest parent first.

    The walk follows ``getparent()`` upward and ends when it returns
    ``None``. The starting element itself is never yielded, and passing
    ``None`` yields nothing.
    """
    if element is None:
        return
    parent = element.getparent()
    while parent is not None:
        yield parent
        parent = parent.getparent()
def tagname(element):
    """Return the element's tag with the leading ``PREFIX`` removed.

    Tags that do not start with ``PREFIX`` are returned unchanged.
    """
    tag = element.tag
    return tag[len(PREFIX):] if tag.startswith(PREFIX) else tag
def report(data, msg):
    """Print one tab-separated result line: page, url, then *msg*.

    ``data`` must provide the ``'page'`` and ``'url'`` keys.
    """
    fields = (data['page'], data['url'], msg)
    print('\t'.join(fields))
def process(data):
    """Parse one exported record and report on its <main> elements.

    ``data`` is a mapping providing the ``'page'``, ``'url'`` and ``'body'``
    keys. The body is parsed as HTML with html5lib (lxml tree builder) and
    exactly one line is printed through ``report``:

    * ``PARSEERROR`` if parsing raised an exception;
    * the space-separated tag names of the ancestors of ``<main>``,
      outermost first, when the document contains exactly one ``<main>``;
    * ``MAINCOUNT <n>`` when it contains zero or several.
    """
    body = data['body']
    try:
        doc = html5lib.parse(body, treebuilder='lxml')
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still abort the run instead of being logged as a parse error.
        report(data, 'PARSEERROR')
        return

    mains = doc.findall(MAIN)
    if len(mains) != 1:
        report(data, 'MAINCOUNT {}'.format(len(mains)))
        return

    # ancestors() walks from the parent up to the root; reverse the list so
    # the output reads from the root down to <main>'s parent.
    tags = [tagname(e) for e in ancestors(mains[0])]
    tags.reverse()
    report(data, ' '.join(tags))
if __name__ == '__main__':
    # Every command-line argument is a file holding one JSON object per line.
    for path in sys.argv[1:]:
        with open(path, mode='rt', encoding='utf-8') as f:
            # Iterate the file object directly so only one line (one HTML
            # body) is held in memory at a time; readlines() would load the
            # whole export up front.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines rather than crashing in json.loads.
                    continue
                process(json.loads(line))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment