Last active
August 29, 2015 14:23
-
-
Save Higgs1/e020321828ed3dc2d0a4 to your computer and use it in GitHub Desktop.
Extended JSON Parsing / Scraping in Python3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Rationale: Some websites make scraping difficult by obfuscating their otherwise-JSON data with
things such as unquoted object keys, concatenated strings, comments, etc., which are all valid JavaScript
constructs but invalid JSON. This usually forces would-be web scraper developers to emulate an entire
browser just to mine the data, which apparently is a fairly successful deterrent. This script parses the
JavaScript AST without executing any potentially malicious JavaScript DRM code, and correctly parses a
number of valid JavaScript constructs into a Python dictionary.
It also provides functions to quickly download a web page and search it for a global variable using pyquery.
"""
import operator, json, ast, os | |
# From PyPI 'pyquery' | |
from pyquery import PyQuery as pq | |
# From PyPI 'slimit' | |
from slimit.parser import Parser | |
from slimit.ast import ( | |
Array, Boolean, BinOp, Identifier, | |
Null, Object, UnaryOp, VarStatement) | |
def _js_remainder(a, b):
    """Return ``a % b`` with JavaScript semantics.

    JavaScript's remainder takes the sign of the dividend (``-5 % 3 == -2``),
    whereas Python's ``%`` takes the sign of the divisor (``-5 % 3 == 1``).
    Raises ZeroDivisionError when ``b`` is zero, like Python's ``%``.
    """
    magnitude = abs(a) % abs(b)
    return -magnitude if a < 0 else magnitude


# Maps a JavaScript arithmetic operator token to the Python callable that
# evaluates it. Also used for unary '+' / '-' by applying the operator to
# (0, operand).
jsops = {
    '+': operator.add,
    '-': operator.sub,
    '*': operator.mul,
    '/': operator.truediv,
    '%': _js_remainder,
}
def jsast2py(node):
    """Convert a JSON-like JavaScript AST node into plain Python data.

    Handles object literals and ``var`` statements (-> dict), arrays
    (-> list), binary and unary arithmetic using the operators in ``jsops``,
    booleans, identifiers (-> their name), ``null`` (-> None), and any other
    node whose ``value`` is a literal that ``ast.literal_eval`` accepts.

    Raises:
        KeyError: if a binary/unary operator is not present in ``jsops``.
        ValueError / SyntaxError: if a leaf node's ``value`` is not a literal
            that ``ast.literal_eval`` can evaluate.
    """
    if isinstance(node, (Object, VarStatement)):
        # Iterating the node yields its entries; each entry is itself
        # iterated to unpack into a (key, value) pair of child nodes.
        return {jsast2py(k): jsast2py(v) for k, v in node}
    if isinstance(node, Array):
        return [jsast2py(element) for element in node]
    if isinstance(node, BinOp):
        # Iterating a BinOp yields its operands, which are converted first.
        return jsops[node.op](*[jsast2py(operand) for operand in node])
    if isinstance(node, UnaryOp):
        # Unary +x / -x is evaluated as 0 + x / 0 - x. The operand is an AST
        # node and must be converted before the operator is applied.
        return jsops[node.op](0, jsast2py(node.value))
    if isinstance(node, Boolean):
        # The parser may store the source lexeme ('true' / 'false') rather
        # than a real bool; returning the string would make 'false' truthy.
        value = node.value
        return value if isinstance(value, bool) else value == 'true'
    if isinstance(node, Identifier):
        # Unquoted object keys (and bare names) are returned as their name.
        return node.value
    if isinstance(node, Null):
        return None
    # Remaining leaf nodes (e.g. numbers and quoted strings) carry their
    # source text in ``value``; evaluate it as a literal, never as code.
    return ast.literal_eval(node.value)
# Parser instance created once at import time and reused by scrape_var_js.
jsparser = Parser()


def scrape_var_js(script, var):
    """Find a top-level ``var`` declaration named *var* and return its value.

    The script is parsed, never executed. Only statements at the top level
    of the parsed program are examined.

    Args:
        script: JavaScript (or JSON-like) source text.
        var: Name of the variable to look for.

    Returns:
        The initializer of the first matching declaration, converted with
        ``jsast2py``, or None if no matching initialized declaration exists.
    """
    for statement in jsparser.parse(script):
        if not isinstance(statement, VarStatement):
            continue
        for declaration in statement:
            parts = list(declaration)
            if len(parts) != 2:
                # A declaration that does not unpack into (name, initializer),
                # e.g. `var x;`, has no value to return; skip it instead of
                # failing the whole search with an unpacking error.
                continue
            ident, obj = parts
            if ident.value == var:
                return jsast2py(obj)
    return None
def scrape_var_html(*args, var=None, **kwargs):
    """Search the scripts of an HTML document for a variable, without executing any JavaScript.

    All positional and remaining keyword arguments are forwarded to pyquery,
    so any form of input that pyquery takes is accepted (pq, lxml, string doc,
    url + requests...).

    Args:
        var: Keyword-only name of the JavaScript variable to look for.

    Returns:
        The value found by ``scrape_var_js`` in the first ``body script``
        element that yields one, or None if no script yields a value.
    """
    for script in pq(*args, **kwargs)('body script'):
        if not script.text:
            # Script elements without text have nothing to parse.
            continue
        ret = scrape_var_js(script.text, var)
        # Compare against None explicitly: a found value may legitimately be
        # falsy (0, "", [], {}, False) and must still be returned.
        if ret is not None:
            return ret
    return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment