Last active
August 5, 2020 01:35
-
-
Save oshoham/b05e81e4759e5105d0a4c947172e025c to your computer and use it in GitHub Desktop.
Parse the Book of Blaseball from the site's minified JavaScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "blaseball-book-scraper",
  "version": "1.0.0",
  "description": "",
  "main": "parse_blaseball_book.js",
  "author": "",
  "license": "ISC",
  "dependencies": {
    "acorn": "^7.4.0",
    "acorn-walk": "^7.2.0",
    "bent": "^7.3.9",
    "cheerio": "^1.0.0-rc.3"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const url = require('url'); | |
const bent = require('bent'); | |
const cheerio = require('cheerio'); | |
const acorn = require('acorn'); | |
const walk = require('acorn-walk'); | |
const getString = bent('string'); | |
/**
 * Returns the single property of an object-literal AST node when that
 * property is keyed by `keyName`, or `null` otherwise.
 *
 * Spread elements (`{...props}`) have no `key`, so they are rejected up front
 * instead of being dereferenced.
 *
 * @param {object} node - Any ESTree expression node.
 * @param {string} keyName - Property key to look for.
 * @returns {object|null} The matching `Property` node, or `null`.
 */
function getSoleProperty(node, keyName) {
  if (node.type !== 'ObjectExpression' || node.properties.length !== 1) {
    return null;
  }
  const [property] = node.properties;
  if (property.type !== 'Property') {
    return null;
  }
  // Accept both `{ str: ... }` (Identifier key) and `{ "str": ... }` (Literal key).
  const name = property.key.type === 'Identifier' ? property.key.name : property.key.value;
  return name === keyName ? property : null;
}

/**
 * Walks the function that renders the Book and concatenates the text found in
 * its `*.createElement(...)` calls.
 *
 * Layout rules applied while walking:
 *   - a `'div'` tag starts a new line (unless nothing has been emitted yet);
 *   - a `{ className }` argument containing `TheBook-Bullet` starts a new line;
 *   - a `{ className }` argument containing `TheBook-SubBullet` adds two spaces;
 *   - non-null literal arguments and literal `{ str }` values are appended as text.
 *
 * Calls that are not `createElement` are not descended into, so any
 * `createElement` nested inside their arguments is ignored.
 *
 * @param {object} bookFunctionNode - AST node of the rendering function.
 * @returns {string} The extracted text.
 */
function extractBookText(bookFunctionNode) {
  let text = '';

  walk.recursive(bookFunctionNode, null, {
    CallExpression(node, st, c) {
      const { callee } = node;
      const isCreateElement = (
        callee.type === 'MemberExpression' &&
        callee.property.type === 'Identifier' &&
        callee.property.name === 'createElement'
      );
      if (!isCreateElement) {
        return;
      }

      // Overriding CallExpression disables the default traversal, so the
      // callee and every argument have to be walked explicitly.
      c(callee, st, 'Expression');

      node.arguments.forEach((arg, index) => {
        if (index === 0 && arg.type === 'Literal') { // HTML tag
          if (arg.value === 'div' && text !== '') {
            text += '\n';
          }
          return;
        }

        const strProperty = getSoleProperty(arg, 'str');
        const classNameProperty = getSoleProperty(arg, 'className');

        if (arg.type === 'Literal' && arg.value !== null) {
          text += arg.value;
        } else if (strProperty !== null) {
          // Only literal values carry text; anything else would have been
          // appended as the string "undefined".
          const valueNode = strProperty.value;
          if (valueNode.type === 'Literal' && valueNode.value != null) {
            text += valueNode.value;
          }
        } else if (classNameProperty !== null) {
          // A computed className (conditional, template, concatenation) has
          // no string value to split, so it contributes no layout hint.
          const valueNode = classNameProperty.value;
          if (valueNode.type === 'Literal' && typeof valueNode.value === 'string') {
            const classNames = valueNode.value.split(' ');
            if (classNames.includes('TheBook-Bullet')) {
              text += '\n';
            } else if (classNames.includes('TheBook-SubBullet')) {
              text += '  ';
            }
          }
        }

        c(arg, st, 'Expression');
      });
    },
  });

  return text;
}

/**
 * Fetches the blaseball.com landing page, locates its single
 * `/static/js/main.*` bundle, parses that bundle into an AST and extracts the
 * text rendered by the function containing the 'The Book of Blaseball' literal.
 *
 * @returns {Promise<string>} The text of the Book.
 * @throws {Error} If zero or several main bundles are referenced by the page,
 *   or if no enclosing FunctionDeclaration is found for the title literal.
 *   Errors raised by the HTTP requests and by the parser propagate as well.
 */
async function parseBookFromJavaScript() {
  const siteUrl = 'https://blaseball.com';

  const html = await getString(siteUrl);
  const $ = cheerio.load(html);

  const scriptTags = $('script[src^="/static/js/main."]');
  if (scriptTags.length === 0) {
    throw new Error('Could not find the main JS file.');
  }
  if (scriptTags.length > 1) {
    throw new Error('More than one main JS files found.');
  }

  // WHATWG URL replaces the deprecated legacy url.resolve().
  const jsUrl = new URL(scriptTags.attr('src'), siteUrl).href;
  const js = await getString(jsUrl);

  // Pin the grammar explicitly rather than relying on acorn's default, which
  // differs between major versions.
  const ast = acorn.parse(js, { ecmaVersion: 2020 });

  let bookFunctionNode = null;
  walk.ancestor(ast, {
    Literal(node, ancestors) {
      if (node.value !== 'The Book of Blaseball' || bookFunctionNode !== null || ancestors.length <= 1) {
        return;
      }
      // start at the 2nd-to-last ancestor since the last ancestor is the current node
      for (let i = ancestors.length - 2; i >= 0; i--) {
        if (ancestors[i].type === 'FunctionDeclaration') {
          bookFunctionNode = ancestors[i];
          break;
        }
      }
    },
  });

  if (bookFunctionNode === null) {
    throw new Error('Could not find the FunctionDeclaration node for rendering the Book in the AST.');
  }

  return extractBookText(bookFunctionNode);
}
// Script entry point: run the scraper once and print the Book's text to
// stdout; any failure is reported on stderr instead of being rethrown.
(async function main() {
  try {
    const bookText = await parseBookFromJavaScript();
    console.log(bookText);
  } catch (err) {
    console.error(err);
  }
}());
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment