Friday Night Funkin' / HaxeFlixel HTML5 game downloader script
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Friday Night Funkin' downloader

I made this script to automate downloading of the FNF HTML5 version hosted on
Newgrounds, as well as bootlegs and mods hosted by other people. Given an empty
directory and the URL to index.html, this script downloads all assets and files
required to play the game fully offline (no patching is done, so FNF will
retain Newgrounds functionality). Running downloaded games in a browser still
requires a local web server, but you can easily use Python itself as a server
by executing "python3 -m http.server 8000" from the same folder as index.html.

This script is standalone and only depends on the requests library, which you
can install using pip ("pip3 install requests"). Feel free to copy and use it
in your mod's build pipeline or anywhere else (but please respect the license
of anything you download). It should also work with other HTML5 games built
with Lime/OpenFL/HaxeFlixel as long as they use the default asset loader.
"""
__version__ = "0.1.0"
__author__  = "spicyjpeg"

import os, re, math, json, logging
from time import perf_counter
from binascii import a2b_base64
from ast import literal_eval
from itertools import accumulate
from argparse import ArgumentParser, FileType

from requests import Session
from requests.utils import unquote, urlparse, urlunparse
USER_AGENT   = "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.7113.93 Safari/537.36"
DEFAULT_URLS = {
    "fnf":           "https://uploads.ungrounded.net/alternate/1528000/1528775_alternate_113347_r88.zip",
    "fnf_prototype": "https://v6p9d9t4.ssl.hwcdn.net/html/2876359-359162",
    "flappybalt":    "https://demos.haxeflixel.com/html5/Flappybalt"
}
## Utilities

def resolvePath(path):
    """
    Resolves a path, removing "." and ".." as well as trailing slashes. This
    always uses slashes, does not reference the filesystem at all and is safe
    to use for URL paths.
    """
    _path  = path.split("/")
    output = []

    for component in _path:
        if (not component) or (component == "."):
            continue
        elif component == "..":
            output.pop()
        else:
            output.append(component)

    return "/".join(output)
def ensureParentDir(path):
    """
    Ensures a file with the given path can be saved by creating all parent
    directories. Returns the OS-specific normalized (i.e. using backslashes on
    Windows) path.
    """
    _path = path.split("/")

    for parent in accumulate(_path[:-1], os.path.join):
        if parent and not os.path.isdir(parent):
            os.mkdir(parent)

    return os.path.join(*_path)
def cleanURL(url):
    """
    Removes query parameters, hashes, trailing slashes and "index.html" from
    any URL.
    """
    scheme, hostname, path, *_ = urlparse(url)

    if path.endswith(( "/", "/index.html" )):
        path, _ = path.rsplit("/", 1)

    return urlunparse((
        scheme,
        hostname,
        path,
        "",
        "",
        ""
    ))
## Haxe parser

HAXE_INT_REGEX   = re.compile(r"[+-]?[0-9]+")
HAXE_FLOAT_REGEX = re.compile(r"[+-]?[0-9]*(?:\.[0-9]*)?(?:[Ee][+-]?[0-9]+)?")
class HaxeParser:
    """
    Simple parser class for Haxe's built-in serialization format. Almost all
    basic data types are supported, with the exception of classes and dates.
    https://haxe.org/manual/std-serialization-format.html
    """
    def _parseInt(self, payload, positive = False):
        """
        Parses an integer and returns a ( value, length ) tuple. Used
        internally, do not call.
        """
        match = HAXE_INT_REGEX.match(payload)
        match = match.group()

        if positive and int(match) < 0:
            raise ValueError("expected positive integer")

        return int(match), len(match)
    def _parseFloat(self, payload):
        """
        Parses a floating-point value and returns a ( value, length ) tuple.
        Used internally, do not call.
        """
        match = HAXE_FLOAT_REGEX.match(payload)
        match = match.group()

        return float(match), len(match)
    def _parseString(self, payload, isBytes = False):
        """
        Parses a string (or byte string) and returns a ( value, length ) tuple.
        Used internally, do not call.
        """
        length, data = payload.split(":", 1)
        data         = data[0:int(length)]
        totalLength  = len(length) + int(length) + 1
        string       = unquote(data)

        if isBytes:
            # Pad the base64 string to ensure its length is a multiple of 4
            # characters (required since most base64 implementations decode 4
            # characters at a time, as they map to exactly 3 bytes).
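            # For instance a leftover 2-character chunk such as "QQ" (a single
            # encoded byte) becomes "QQ==" before being decoded; strings whose
            # length is already a multiple of 4 are left untouched.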
string += "=" * (4 - (len(data) % 4)) | |
return a2b_base64(string), totalLength | |
else: | |
return string, totalLength | |
    def _parseObject(self, payload):
        """
        Parses an object or map and returns a ( value, length ) tuple. Used
        internally, do not call.
        """
        obj    = {}
        offset = 0

        # "g" terminates generic objects, while "h" is for specialized built-in
        # types (maps in this case).
        while payload[offset] not in "gh":
            key, keyLength     = self._parse(payload[offset:])
            value, valueLength = self._parse(payload[offset + keyLength:])

            obj[key] = value
            offset  += keyLength + valueLength

        # Add 1 to the length to account for the terminator.
        return obj, offset + 1
    def _parseList(self, payload):
        """
        Parses a list or array and returns a ( value, length ) tuple. Used
        internally, do not call.
        """
        obj    = []
        offset = 0

        while payload[offset] != "h":
            item, length = self._parse(payload[offset:])

            # Unpack any tuple (which is currently only used for consecutive
            # nulls) into the array.
            if type(item) is tuple:
                obj.extend(item)
            else:
                obj.append(item)

            offset += length

        # Add 1 to the length to account for the terminator.
        return obj, offset + 1
    def _parse(self, data):
        """
        Parses a type identifier followed by the respective payload and returns
        a ( value, length ) tuple. This is the internal implementation of
        parse().
        """
        _type   = data[0]
        payload = data[1:]

        if _type in "Rr": # Cache back-reference
            # Haxe "compresses" serialized data by saving all strings and
            # objects into a cache, then inserting pointers to cache entries
            # instead of any duplicate value.
            value, length = self._parseInt(payload, True)
            value = {
                "R": self.stringCache,
                "r": self.objectCache
            }[_type][value]
        elif _type in "nzkmptf": # Constant
            value = {
                "n": None,
                "z": 0,
                "k": math.nan,
                "m": -math.inf,
                "p": math.inf,
                "t": True,
                "f": False
            }[_type]
            length = 0
        elif _type in "i:": # Integer (":" is used in integer maps)
            value, length = self._parseInt(payload)
        elif _type == "d": # Float
            value, length = self._parseFloat(payload)
        elif _type == "y": # String (same encoding as serialized string)
            value, length = self._parseString(payload)
            self.stringCache.append(value)
        elif _type == "s": # Byte string
            value, length = self._parseString(payload, True)
        elif _type == "u": # Consecutive nulls (used in arrays)
            value, length = self._parseInt(payload, True)
            value = ( None, ) * value
        elif _type in "obqM": # Object/struct/string map/integer map
            value, length = self._parseObject(payload)
            # TODO: is cache only used for "o" as stated in docs?
            self.objectCache.append(value)
        elif _type in "la": # List or array
            value, length = self._parseList(payload)
        #elif _type == "v": # Date (???)
            #value  = strptime(payload[0:19], "%Y-%m-%d %H:%M:%S")
            #length = 19
        else:
            raise NotImplementedError(f"unsupported Haxe type '{_type}'")

        # Add 1 to the length to account for the type identifier.
        return value, length + 1
    def parse(self, data):
        """
        Deserializes a serialized string, i.e. a type identifier followed by
        the respective payload. Unlike _parse(), this function also clears the
        cache and performs length checking to make sure the whole string gets
        parsed.
        """
        self.stringCache = []
        self.objectCache = []

        value, length = self._parse(data)
        if length != len(data):
            raise RuntimeError("extra data at end of serialized string")

        return value
## Downloader class

# I know I shouldn't parse HTML using regex if I don't want some demon from a
# parallel universe to kill me. But this works well enough.
HTML_SCRIPT_REGEX = re.compile(r"<\s*?script.+?src=\"(.+?)\"\s*?>", re.IGNORECASE)
HTML_FONT_REGEX   = re.compile(r"url\s*\(\s*(?P<q>['\"]?)(.+?\.(?:ttf|otf|woff2?))(?P=q)\s*\)")

# OpenFL/HaxeFlixel knows which assets to load by using manifest files. FNF
# takes advantage of this by having different manifests for each week (hosted
# as separate JSON files) plus a "preload" manifest that lists the assets to
# load on startup, which is embedded in the main JS file along with paths to
# all other manifests. These regexes take care of extracting everything we need
# from that script -- I hope they don't break with updates (even though we all
# know ninjamuffin is likely never going to update anything other than the
# Full-A$$ Game).
MANIFEST_REGEX     = re.compile(r"'\{\s*?\"name\"\s*?:\s*?null\s*?,.*\}'|\"\{\s*?\\\"name\\\"\s*?:\s*?null\s*?,.*\}\"")
EXT_MANIFEST_REGEX = re.compile(r"(?P<q>['\"])(manifest/.+?\.json)(?P=q)", re.IGNORECASE)
class Downloader:
    """
    This class handles pretty much everything.
    """

    def __init__(
        self,
        root,
        assetsOnly = False,
        redownload = False,
        whitelist  = None,
        blacklist  = None
    ):
        """
        Initializes the downloader with the given options.
        """
        self.root       = cleanURL(root)
        self.assetsOnly = assetsOnly
        self.redownload = redownload
        self.whitelist  = whitelist
        self.blacklist  = blacklist

        self.session   = Session()
        self.parser    = HaxeParser()
        self.numAssets = 0
        self.totalSize = 0

        logging.debug(f"Root URL: {self.root}")
    def _get(self, path):
        """
        Performs a GET request and returns the response object. If assetsOnly
        is disabled, the response's contents are also saved to a file.
        """
        response = self.session.get(f"{self.root}/{path}")
        if not response.ok:
            logging.fatal(f"{path} error (HTTP {response.status_code})")
            exit(1)

        if not self.assetsOnly:
            savePath = ensureParentDir(path)

            with open(savePath, "wb") as outputFile:
                outputFile.write(response.content)

            logging.debug(f"{path} saved")

        return response
    def _download(self, path, expectedSize = None):
        """
        Downloads a file. Unlike _get(), this function streams the file and is
        thus suitable for downloading large assets.
        """
        savePath = ensureParentDir(path)
        size     = 0

        # Skip this file if it has already been downloaded before (and its size
        # matches the expected size).
        if not self.redownload and os.path.isfile(savePath):
            if os.stat(savePath).st_size == expectedSize:
                logging.info(f"{path} skipped")
                return
            elif expectedSize is not None:
                logging.warning(f"{path} found with invalid size, redownloading")

        response = self.session.get(f"{self.root}/{path}", stream = True)
        if not response.ok:
            logging.error(f"{path} error (HTTP {response.status_code})")
            return

        with open(savePath, "wb") as outputFile:
            for chunk in response.iter_content(1024):
                outputFile.write(chunk)
                size += len(chunk)

        if expectedSize is not None and size != expectedSize:
            logging.warning(f"{path} size does not match expected size")

        self.numAssets += 1
        self.totalSize += size
        logging.info(f"{path} downloaded ({size // 1024} KB)")
    def _manifestAllowed(self, name):
        """
        Returns whether the manifest with the specified name should be
        downloaded or skipped, based on the whitelist and blacklist.
        """
        *_, _name = name.rsplit("/", 1)

        if self.whitelist is not None:
            if _name not in self.whitelist:
                return False
        if self.blacklist is not None:
            if _name in self.blacklist:
                return False

        return True
    def parseManifest(self, name, manifest):
        """
        Downloads all assets listed in the given manifest object. Returns a
        decoded copy of the manifest object.
        """
        _version = manifest["version"]
        if _version != 2:
            logging.warning(f"({name}) Manifest version {_version} is not officially supported")

        # The root path specified in the manifest is relative to the manifest's
        # own URL. Note that this prefix only applies to paths, not asset IDs
        # (which are identical to paths in FNF, but might be different in other
        # games).
        prefix = manifest.get("rootPath", None) or "."
        prefix = resolvePath(f"{name}/../{prefix}")
        if prefix:
            prefix += "/"

        # Each manifest is a JSON wrapper over a weird URL-encoded string...
        # well, after digging through OpenFL/Lime sources it turns out it's
        # actually the output of Haxe's built-in serializer. So I wrote a
        # parser for that too.
        assets = self.parser.parse(manifest["assets"])
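        # For instance, a blob such as
        #   "aoy4:pathy17:images%2Ficon.pngy4:sizei1234y2:idR1gh"
        # decodes to a single-asset list along these lines:
        #   [ { "path": "images/icon.png", "size": 1234,
        #       "id": "images/icon.png" } ]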
logging.info(f"({name}) Downloading {len(assets)} assets") | |
for asset in assets: | |
_id = asset.get("id", "") | |
_size = asset.get("size", 0) | |
# Some assets seem to use "path groups", i.e. lists of multiple | |
# paths (even though all files in FNF which use this feature have a | |
# single entry in the group). Other assets (fonts) are loaded via | |
# CSS instead and don't have an associated path, only a CSS class. | |
if "className" in asset: | |
logging.debug(f"Skipping font asset: {_id}") | |
continue | |
if "pathGroup" in asset: | |
paths = asset["pathGroup"] | |
else: | |
paths = asset["path"], | |
for path in paths: | |
self._download(prefix + path, _size) | |
# Return a shallow copy of the manifest, with the Haxe blob replaced | |
# with the deserialized object. | |
return { **manifest, "assets": assets } | |
    def downloadJS(self, url):
        """
        Downloads a JavaScript file from the given URL and searches it for
        embedded and linked manifests, then calls parseManifest() for each
        manifest found. Yields ( manifestName, manifestObject ) tuples.
        """
        name, _ = os.path.splitext(url)
        script  = self._get(url)

        # Extract the manifest(s) embedded within the script itself. These
        # are JSON blobs wrapped into string literals; thankfully Python
        # and JS literals have almost the same syntax.
        matches = MANIFEST_REGEX.findall(script.text)

        for _id, literal in enumerate(matches):
            # Add a suffix if multiple manifest blobs are present.
            manifestName = name
            if _id:
                manifestName += f"_{_id}"

            if not self._manifestAllowed(manifestName):
                continue

            logging.info(f"({name}) Found embedded manifest: {manifestName}")
            manifest = literal_eval(literal)

            yield manifestName, self.parseManifest(
                manifestName,
                json.loads(manifest)
            )

        # Download any external manifests whose paths are referenced in the
        # script.
        matches = EXT_MANIFEST_REGEX.findall(script.text)

        for _, path in matches:
            manifestName, _ = os.path.splitext(path)

            if not self._manifestAllowed(manifestName):
                continue

            logging.info(f"({name}) Found linked manifest: {manifestName}")
            manifest = self._get(path)

            yield manifestName, self.parseManifest(
                manifestName,
                manifest.json()
            )
    def downloadHTML(self, url):
        """
        Downloads an HTML file from the given URL and searches it for fonts
        and JS files, then calls downloadJS() for each script found. Yields
        ( manifestName, manifestObject ) tuples.
        """
        logging.info("Downloading HTML, fonts and JS")
        html = self._get(url)

        # Download fonts. Due to "limitations" of HTML5, these are loaded
        # directly from the main HTML page through inline CSS rather than via
        # manifests.
        if self._manifestAllowed("_fonts"):
            for _, _path in HTML_FONT_REGEX.findall(html.text):
                path = resolvePath(_path)
                self._download(path)

        # Obtain all manifest files from each script.
        for _path in HTML_SCRIPT_REGEX.findall(html.text):
            path = resolvePath(_path)

            logging.info(f"Found linked JS: {path}")
            yield from self.downloadJS(path)
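# A minimal programmatic usage sketch (the command line interface below wraps
# the same calls; "https://example.com/game" is a placeholder URL, and files
# are saved relative to the current working directory):
#
#   downloader = Downloader("https://example.com/game", assetsOnly = True)
#   for name, manifest in downloader.downloadHTML("index.html"):
#       print(name, len(manifest["assets"]))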
## Command line crap
def parserSetup():
    """
    Creates a command line parser. This is only used internally by main().
    """
    names  = ", ".join(DEFAULT_URLS)
    parser = ArgumentParser(
        description = f"Downloads a copy of any Lime/OpenFL/HaxeFlixel-based web game. The following games can be specified by name: {names}",
        epilog      = "Use '-E _fonts' to skip downloading font files referenced by CSS.",
        add_help    = False
    )

    toolsGroup = parser.add_argument_group("Tools")
    toolsGroup.add_argument(
        "-h", "--help",
        action = "help",
        help   = "Shows this help message and exits"
    )
    toolsGroup.add_argument(
        "-v", "--verbose",
        action = "count",
        help   = "Shows additional debug information (2 levels)"
    )

    manifestGroup = parser.add_argument_group("Manifest options")
    manifestGroup.add_argument(
        "-M", "--manifests",
        action  = "append",
        type    = str,
        help    = "Only downloads assets from the specified manifest(s)",
        metavar = "name"
    )
    manifestGroup.add_argument(
        "-E", "--exclude",
        action  = "append",
        type    = str,
        help    = "Skips downloading assets from the specified manifest(s)",
        metavar = "name"
    )
    manifestGroup.add_argument(
        "-S", "--save",
        type    = FileType("wt"),
        help    = "Outputs a JSON file containing all decoded manifests",
        metavar = "outputFile"
    )

    downloadGroup = parser.add_argument_group("Download options")
    downloadGroup.add_argument(
        "-a", "--assets-only",
        action = "store_true",
        help   = "Disables saving parsed HTML, JS and manifest files"
    )
    downloadGroup.add_argument(
        "-r", "--redownload",
        action = "store_true",
        help   = "Forces redownloading existing assets"
    )
    downloadGroup.add_argument(
        "-u", "--user-agent",
        type    = str,
        default = USER_AGENT,
        help    = "User agent (browser ID) to use when downloading",
        metavar = "string"
    )

    fileGroup = parser.add_argument_group("File paths")
    fileGroup.add_argument(
        "game",
        type    = str,
        nargs   = "?",
        default = "fnf",
        help    = "Game name or URL to the game's root or index.html"
    )
    fileGroup.add_argument(
        "outputPath",
        type    = str,
        nargs   = "?",
        default = ".",
        help    = "Where to store all downloaded files"
    )

    return parser
def loggerSetup(verbose = None):
    """
    Configures logging. This is only used internally by main().
    """
    if verbose:
        _level = min(verbose, 2)
    else:
        _level = 0

    logging.basicConfig(
        format = "[%(funcName)-13s %(levelname)-7s] %(message)s",
        level  = (
            logging.WARNING,
            logging.INFO,  # -v
            logging.DEBUG  # -vv
        )[_level]
    )
## Main

def main():
    """
    Main entry point of the script.
    """
    parser = parserSetup()
    args   = parser.parse_args()
    loggerSetup(args.verbose)

    logging.info(f"FNF downloader v{__version__}")

    url = DEFAULT_URLS.get(
        args.game.lower(),
        args.game
    )
    downloader = Downloader(
        url,
        args.assets_only,
        args.redownload,
        args.manifests,
        args.exclude
    )
    downloader.session.headers["User-Agent"] = args.user_agent

    startTime = perf_counter()
    manifests = {}

    outputPath = ensureParentDir(f"{args.outputPath}/.")  # lol
    os.chdir(outputPath)

    for name, manifest in downloader.downloadHTML("index.html"):
        manifests[name] = manifest

    # Save all manifests' contents to a single JSON file if -S was passed.
    if args.save:
        with args.save as outputFile:
            json.dump(
                manifests,
                outputFile,
                indent = "\t"
            )

    downloadTime = round(perf_counter() - startTime)

    logging.info(f"Files downloaded: {downloader.numAssets}, {downloader.totalSize // 1048576} MB")
    logging.info(f"Done ({downloadTime // 60}m {downloadTime % 60}s)")

if __name__ == "__main__":
    main()