Last active
August 9, 2020 18:43
-
-
Save oshoham/0ee3c81013183ce57d6c48897cd25be5 to your computer and use it in GitHub Desktop.
Parse the Book of Blaseball from the site's minified JavaScript (Python 3 version)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import esprima | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
class BookOfBlaseballVisitor(esprima.NodeVisitor):
    """Locate the function declaration that renders the Book of Blaseball.

    During the walk the visitor keeps a stack of the objects currently being
    visited.  On meeting the string literal ``'The Book of Blaseball'`` it
    searches that stack, innermost entries first, for a node whose type is
    ``FunctionDeclaration`` and records the first match.
    """

    def find_book_function_node(self, node):
        """Walk ``node`` and return the recorded function node, or ``None``."""
        self._book_function_node = None
        self._ancestors = []
        self.visit(node)
        return self._book_function_node

    def visit_Object(self, node):
        """Traverse ``node`` with the base visitor, tracking it on the stack."""
        # Do not push a node that is already the top of the stack.
        pushed = not self._ancestors or node != self._ancestors[-1]
        if pushed:
            self._ancestors.append(node)
        # The base-class traversal generator is yielded so the caller of this
        # generator drives it before execution resumes below.
        yield super().visit_Object(node)
        if pushed:
            self._ancestors.pop()

    def visit_Literal(self, node):
        """Record the enclosing function when ``node`` is the Book's title."""
        is_candidate = (
            node.value == 'The Book of Blaseball'
            and self._book_function_node is None
            and len(self._ancestors) > 1
        )
        if is_candidate:
            # The two innermost stack entries are excluded from the search;
            # the remaining ones are examined from the inside out.
            for ancestor in reversed(self._ancestors[:-2]):
                if ancestor.type == 'FunctionDeclaration':
                    self._book_function_node = ancestor
                    break
        self.generic_visit(node)
class BookParserVisitor(esprima.NodeVisitor):
    """Build plain text from the ``createElement`` calls found under a node.

    Text accumulates in ``self._text`` while calls are visited:

    * a leading ``'div'`` tag literal starts a new line, unless no text has
      been collected yet;
    * any other literal argument is appended as text;
    * a single-property ``{str: '...'}`` object contributes its string;
    * a single-property ``{className: '...'}`` object adds a newline for the
      ``TheBook-Bullet`` class or a space for ``TheBook-SubBullet``.
    """

    def __init__(self):
        super().__init__()
        self._text = ''

    def parse_book(self, node):
        """Visit ``node`` and return the text collected from it.

        The buffer is reset first, so one visitor can be reused.
        """
        self._text = ''
        self.visit(node)
        return self._text

    def _append_literal(self, value):
        """Append a literal's value to the text if it is a string or number.

        Appending a non-string directly would raise ``TypeError``, so numbers
        are converted with ``str()`` and every other value (``None``,
        booleans, anything else) is ignored.
        """
        if isinstance(value, str):
            self._text += value
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            self._text += str(value)

    @staticmethod
    def _single_string_property(argument, key_name):
        """Return the string value of ``{key_name: '<string>'}``, else ``None``.

        ``None`` is returned when ``argument`` is not an object expression
        with exactly one property, when that property's key is not named
        ``key_name``, or when its value does not carry a string.
        """
        if argument.type != 'ObjectExpression' or len(argument.properties) != 1:
            return None
        prop = argument.properties[0]
        key = getattr(prop, 'key', None)
        if key is None or getattr(key, 'name', None) != key_name:
            return None
        value = getattr(getattr(prop, 'value', None), 'value', None)
        return value if isinstance(value, str) else None

    def visit_CallExpression(self, node):
        """Collect text from ``<something>.createElement(...)`` calls.

        Calls to anything else are traversed generically.  For a matching
        call the callee is visited first, then every argument except a
        leading tag literal is inspected and visited in order, so nested
        calls contribute their text in source order.
        """
        callee = node.callee
        is_create_element = (
            callee.type == 'MemberExpression' and
            callee.property.type == 'Identifier' and
            callee.property.name == 'createElement'
        )
        if not is_create_element:
            self.generic_visit(node)
            return

        self.visit(callee)
        for index, argument in enumerate(node.arguments):
            if index == 0 and argument.type == 'Literal':  # HTML tag
                if argument.value == 'div' and self._text != '':
                    self._text += '\n'
                continue

            if argument.type == 'Literal':
                self._append_literal(argument.value)
            else:
                text = self._single_string_property(argument, 'str')
                class_name = self._single_string_property(argument, 'className')
                if text is not None:
                    self._text += text
                elif class_name is not None:
                    class_names = class_name.split(' ')
                    if 'TheBook-Bullet' in class_names:
                        self._text += '\n'
                    elif 'TheBook-SubBullet' in class_names:
                        self._text += ' '

            self.visit(argument)
def parse_book_from_javascript(base_url='https://blaseball.com', timeout=30):
    """Download the site's main JavaScript bundle and extract the Book's text.

    The landing page is fetched and searched for the single ``<script>`` tag
    whose ``src`` starts with ``/static/js/main.``.  That script is
    downloaded, parsed with esprima, searched for the function that renders
    the Book, and the function is converted to text.

    Args:
        base_url: Page to fetch; also the base against which the script's
            ``src`` is resolved.
        timeout: Timeout in seconds passed to each ``requests.get`` call, so
            an unresponsive server cannot block the call forever.

    Returns:
        The text produced by ``BookParserVisitor.parse_book``.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
        Exception: If the page does not reference exactly one main script, or
            the rendering function cannot be found in the parsed script.
    """
    html_response = requests.get(base_url, timeout=timeout)
    # Fail on HTTP errors instead of parsing an error page as if it were the site.
    html_response.raise_for_status()
    soup = BeautifulSoup(html_response.text, 'html.parser')

    # Raw string: the backslash belongs to the CSS selector (it escapes the
    # dot) and must not be read as a Python escape sequence.
    script_tags = soup.select(r'script[src^="/static/js/main\."]')
    if not script_tags:
        raise Exception('Could not find the main JS file.')
    if len(script_tags) > 1:
        raise Exception('More than one main JS files found.')

    js_url = urljoin(base_url, script_tags[0].attrs['src'])
    js_response = requests.get(js_url, timeout=timeout)
    js_response.raise_for_status()

    ast = esprima.parse(js_response.text)
    book_function_node = BookOfBlaseballVisitor().find_book_function_node(ast)
    if book_function_node is None:
        raise Exception('Could not find the FunctionDeclaration node for rendering the Book in the AST.')

    return BookParserVisitor().parse_book(book_function_node)
if __name__ == '__main__':
    # Script entry point: print the Book's text to stdout.
    try:
        book = parse_book_from_javascript()
    except Exception as error:
        # SystemExit with a string argument writes the message to stderr and
        # exits with status 1, so redirected stdout never receives an error
        # message and callers can detect the failure.
        raise SystemExit(str(error)) from error
    else:
        print(book)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4==4.9.1 | |
esprima==4.0.1 | |
requests==2.24.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment