Friday Night Funkin' / HaxeFlixel HTML5 game downloader script
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Friday Night Funkin' downloader
I made this script to automate downloading of the FNF HTML5 version hosted on
Newgrounds, as well as bootlegs and mods hosted by other people. Given an empty
directory and the URL to index.html, this script downloads all assets and files
required to play the game fully offline (no patching is done so FNF will retain
Newgrounds functionality). Running downloaded games in a browser still requires
a local web server, but you can easily use Python itself as a server by
executing "python3 -m http.server 8000" from the same folder as index.html.
This script is standalone and only depends on the requests library you can
download using pip ("pip3 install requests"). Feel free to copy and use it in
your mod's build pipeline or anywhere else (but please respect the license of
anything you download). It should also work with other HTML5 games built with
Lime/OpenFL/HaxeFlixel as long as they use the default asset loader.
"""
__version__ = "0.1.0"
__author__ = "spicyjpeg"
import sys, os, re, json, math, logging
from time import perf_counter
from binascii import a2b_base64
from ast import literal_eval
from itertools import accumulate
from argparse import ArgumentParser, FileType
from requests import Session
from urllib.parse import unquote, urlparse, urlunparse
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.7113.93 Safari/537.36"
DEFAULT_URLS = {
"fnf": "https://uploads.ungrounded.net/alternate/1528000/1528775_alternate_113347_r88.zip",
"fnf_prototype": "https://v6p9d9t4.ssl.hwcdn.net/html/2876359-359162",
"flappybalt": "https://demos.haxeflixel.com/html5/Flappybalt"
}
## Utilities
def resolvePath(path):
"""
Resolves a path, removing "." and ".." as well as trailing slashes. This
always uses slashes, does not reference the filesystem at all and is safe
to use for URL paths.
"""
_path = path.split("/")
output = []
for component in _path:
if (not component) or (component == "."):
continue
elif component == "..":
output.pop()
else:
output.append(component)
return "/".join(output)
def ensureParentDir(path):
"""
Ensures a file with the given path can be saved by creating all parent
directories. Returns the OS-specific normalized (i.e. using backslashes on
Windows) path.
"""
_path = path.split("/")
for parent in accumulate(_path[:-1], os.path.join):
if parent and not os.path.isdir(parent):
os.mkdir(parent)
return os.path.join(*_path)
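# For example, ensureParentDir("assets/images/icon.png") (a hypothetical path)
# creates "assets" and "assets/images" if they are missing, then returns
# "assets/images/icon.png" ("assets\\images\\icon.png" on Windows).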
def cleanURL(url):
"""
Removes query parameters, hashes, trailing slashes and "index.html" from
any URL.
"""
scheme, hostname, path, *_ = urlparse(url)
if path.endswith(( "/", "/index.html" )):
path, _ = path.rsplit("/", 1)
return urlunparse((
scheme,
hostname,
path,
"",
"",
""
))
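# For example (made-up URL):
#
#   cleanURL("https://example.com/game/index.html?foo=1#bar")
#
# returns "https://example.com/game".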
## Haxe parser
HAXE_INT_REGEX = re.compile(r"[+-]?[0-9]+")
HAXE_FLOAT_REGEX = re.compile(r"[+-]?[0-9]*(?:\.[0-9]*)?(?:[Ee][+-]?[0-9]+)?")
class HaxeParser:
"""
Simple parser class for Haxe's built-in serialization format. Almost all
basic data types are supported, with the exception of classes and dates.
https://haxe.org/manual/std-serialization-format.html
"""
    def _parseInt(self, payload, positive = False):
        """
        Parses an integer and returns a ( value, length ) tuple. Used
        internally, do not call.
        """
        match = HAXE_INT_REGEX.match(payload)
        if match is None:
            raise ValueError("expected integer value")
        digits = match.group()
        value = int(digits)
        if positive and value < 0:
            raise ValueError("expected positive integer")
        return value, len(digits)
def _parseFloat(self, payload):
"""
Parses a floating-point value and returns a ( value, length ) tuple.
Used internally, do not call.
"""
match = HAXE_FLOAT_REGEX.match(payload)
match = match.group()
return float(match), len(match)
def _parseString(self, payload, isBytes = False):
"""
Parses a string (or byte string) and returns a ( value, length ) tuple.
Used internally, do not call.
"""
length, data = payload.split(":", 1)
data = data[0:int(length)]
totalLength = len(length) + int(length) + 1
string = unquote(data)
if isBytes:
            # Pad the base64 string so its length is a multiple of 4
            # characters (required since base64 maps groups of 4 characters to
            # exactly 3 bytes); -len % 4 yields 0 when no padding is needed.
            string += "=" * (-len(data) % 4)
return a2b_base64(string), totalLength
else:
return string, totalLength
def _parseObject(self, payload):
"""
Parses an object or map and returns a ( value, length ) tuple. Used
internally, do not call.
"""
obj = {}
offset = 0
# "g" terminates generic objects, while "h" is for specialized built-in
# types (maps in this case).
while payload[offset] not in "gh":
key, keyLength = self._parse(payload[offset:])
value, valueLength = self._parse(payload[offset + keyLength:])
obj[key] = value
offset += keyLength + valueLength
# Add 1 to the length to account for the terminator.
return obj, offset + 1
def _parseList(self, payload):
"""
Parses a list or array and returns a ( value, length ) tuple. Used
internally, do not call.
"""
obj = []
offset = 0
while payload[offset] != "h":
item, length = self._parse(payload[offset:])
# Unpack any tuple (which is currently only used for consecutive
# nulls) into the array.
if type(item) is tuple:
obj.extend(item)
else:
obj.append(item)
offset += length
# Add 1 to the length to account for the terminator.
return obj, offset + 1
def _parse(self, data):
"""
Parses a type identifier followed by the respective payload and returns
a ( value, length ) tuple. This is the internal implementation of
parse().
"""
_type = data[0]
payload = data[1:]
if _type in "Rr": # Cache back-reference
# Haxe "compresses" serialized data by saving all strings and
# objects into a cache, then inserting pointers to cache entries
# instead of any duplicate value.
value, length = self._parseInt(payload, True)
value = {
"R": self.stringCache,
"r": self.objectCache
}[_type][value]
elif _type in "nzkmptf": # Constant
value = {
"n": None,
"z": 0,
#"k": math.nan,
#"m": -math.inf,
#"p": math.inf,
"t": True,
"f": False
}[_type]
length = 0
elif _type in "i:": # Integer (":" is used in integer maps)
value, length = self._parseInt(payload)
elif _type == "d": # Float
value, length = self._parseFloat(payload)
elif _type == "y": # String (same encoding as serialized string)
value, length = self._parseString(payload)
self.stringCache.append(value)
elif _type == "s": # Byte string
value, length = self._parseString(payload, True)
elif _type == "u": # Consecutive nulls (used in arrays)
value, length = self._parseInt(payload, True)
value = ( None, ) * value
elif _type in "obqM": # Object/struct/string map/integer map
value, length = self._parseObject(payload)
# TODO: is cache only used for "o" as stated in docs?
self.objectCache.append(value)
elif _type in "la": # List or array
value, length = self._parseList(payload)
#elif _type == "v": # Date (???)
#value = strptime(payload[0:19], "%Y-%m-%d %H:%M:%S")
#length = 19
else:
raise NotImplementedError(f"unsupported Haxe type '{_type}'")
# Add 1 to the length to account for the type identifier.
return value, length + 1
def parse(self, data):
"""
Deserializes a serialized string, i.e. a type identifier followed by
the respective payload. Unlike _parse(), this function also clears the
cache and performs length checking to make sure the whole string gets
parsed.
"""
self.stringCache = []
self.objectCache = []
value, length = self._parse(data)
        if length != len(data):
            raise RuntimeError("extra data at end of serialized string")
return value
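# A few hand-assembled examples based on the format documentation (not taken
# from an actual manifest) showing what the parser accepts:
#
#   parser = HaxeParser()
#   parser.parse("oy1:ai5g")    # object { a: 5 }      -> { "a": 5 }
#   parser.parse("ai1i2i3h")    # array [ 1, 2, 3 ]    -> [ 1, 2, 3 ]
#   parser.parse("ay3:fooR0h")  # cache back-reference -> [ "foo", "foo" ]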
## Downloader class
# I know I shouldn't parse HTML using regex if I don't want some demon from a
# parallel universe to kill me. But this works well enough.
HTML_SCRIPT_REGEX = re.compile(r"<\s*?script.+?src=\"(.+?)\"\s*?>", re.IGNORECASE)
HTML_FONT_REGEX = re.compile(r"url\s*\(\s*(?P<q>['\"]?)(.+?\.(?:ttf|otf|woff2?))(?P=q)\s*\)")
# OpenFL/HaxeFlixel knows which assets to load by using manifest files. FNF
# takes advantage of this by having different manifests for each week (hosted
# as separate JSON files) plus a "preload" manifest that lists the assets to
# load on startup, which is embedded in the main JS file along with paths to
# all other manifests. These regexes take care of extracting everything we need
# from that script -- I hope they don't break with updates (even though we all
# know ninjamuffin is likely never going to update anything other than the
# Full-A$$ Game).
MANIFEST_REGEX = re.compile(r"'\{\s*?\"name\"\s*?:\s*?null\s*?,.*\}'|\"\{\s*?\\\"name\\\"\s*?:\s*?null\s*?,.*\}\"")
EXT_MANIFEST_REGEX = re.compile(r"(?P<q>['\"])(manifest/.+?\.json)(?P=q)", re.IGNORECASE)
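# For reference, once the string literal is unwrapped, an embedded manifest
# blob is a JSON object along these lines (simplified and with made-up values;
# parseManifest() below only relies on the fields shown here):
#
#   {
#       "name": null,
#       "version": 2,
#       "rootPath": null,
#       "assets": "<Haxe-serialized array of asset objects>"
#   }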
class Downloader:
"""
This class handles pretty much everything.
"""
def __init__(
self,
root,
assetsOnly = False,
redownload = False,
whitelist = None,
blacklist = None
):
"""
Initializes the downloader with the given options.
"""
self.root = cleanURL(root)
self.assetsOnly = assetsOnly
self.redownload = redownload
self.whitelist = whitelist
self.blacklist = blacklist
self.session = Session()
self.parser = HaxeParser()
self.numAssets = 0
self.totalSize = 0
logging.debug(f"Root URL: {self.root}")
def _get(self, path):
"""
Performs a GET request and returns the response object. If assetsOnly
is disabled, the response's contents are also saved to a file.
"""
response = self.session.get(f"{self.root}/{path}")
        if not response.ok:
            logging.fatal(f"{path} request failed (HTTP {response.status_code})")
            sys.exit(1)
if not self.assetsOnly:
savePath = ensureParentDir(path)
with open(savePath, "wb") as outputFile:
outputFile.write(response.content)
logging.debug(f"{path} saved")
return response
def _download(self, path, expectedSize = None):
"""
        Downloads a file. Unlike _get(), this function streams the response
        body and is thus suitable for downloading large assets.
"""
savePath = ensureParentDir(path)
size = 0
# Skip this file if it has already been downloaded before (and its size
# matches the expected size).
if not self.redownload and os.path.isfile(savePath):
if os.stat(savePath).st_size == expectedSize:
logging.info(f"{path} skipped")
return
elif expectedSize is not None:
logging.warning(f"{path} found with invalid size, redownloading")
response = self.session.get(f"{self.root}/{path}", stream = True)
        if not response.ok:
            logging.error(f"{path} request failed (HTTP {response.status_code}), skipping")
            return
with open(savePath, "wb") as outputFile:
for chunk in response.iter_content(1024):
outputFile.write(chunk)
size += len(chunk)
if expectedSize is not None and size != expectedSize:
logging.warning(f"{path} size does not match expected size")
self.numAssets += 1
self.totalSize += size
logging.info(f"{path} downloaded ({size // 1024} KB)")
def _manifestAllowed(self, name):
"""
Returns whether the manifest with the specified name should be
downloaded or skipped, based on the whitelist and blacklist.
"""
*_, _name = name.rsplit("/", 1)
if self.whitelist is not None:
if _name not in self.whitelist:
return False
if self.blacklist is not None:
if _name in self.blacklist:
return False
return True
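    # For instance, running the script with -M preload (whitelist) downloads
    # only manifests whose base name is "preload", such as "manifest/preload",
    # while -E week7 (blacklist) skips "manifest/week7" and nothing else.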
def parseManifest(self, name, manifest):
"""
Downloads all assets listed in the given manifest object. Returns a
decoded copy of the manifest object.
"""
_version = manifest["version"]
if _version != 2:
logging.warning(f"({name}) Manifest version {_version} is not officially supported")
        # The root path specified in the manifest is relative to the
        # manifest's own URL. Note that this prefix only applies to paths, not
        # asset IDs (which are identical to paths in FNF, but might be
        # different in other games).
prefix = manifest.get("rootPath", None) or "."
prefix = resolvePath(f"{name}/../{prefix}")
if prefix:
prefix += "/"
# Each manifest is a JSON wrapper over a weird URL-encoded string...
# well, after digging through OpenFL/Lime sources it turns out it's
# actually the output of Haxe's built-in serializer. So I wrote a
# parser for that too.
assets = self.parser.parse(manifest["assets"])
logging.info(f"({name}) Downloading {len(assets)} assets")
for asset in assets:
_id = asset.get("id", "")
_size = asset.get("size", 0)
# Some assets seem to use "path groups", i.e. lists of multiple
# paths (even though all files in FNF which use this feature have a
# single entry in the group). Other assets (fonts) are loaded via
# CSS instead and don't have an associated path, only a CSS class.
if "className" in asset:
logging.debug(f"Skipping font asset: {_id}")
continue
if "pathGroup" in asset:
paths = asset["pathGroup"]
else:
paths = asset["path"],
for path in paths:
self._download(prefix + path, _size)
# Return a shallow copy of the manifest, with the Haxe blob replaced
# with the deserialized object.
return { **manifest, "assets": assets }
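    # For reference, a decoded asset entry (as produced by HaxeParser and
    # consumed above) looks roughly like this -- the values here are made up:
    #
    #   {
    #       "id": "assets/images/logo.png",
    #       "path": "assets/images/logo.png",
    #       "size": 12345,
    #       "type": "IMAGE"
    #   }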
def downloadJS(self, url):
"""
Downloads a JavaScript file from the given URL and searches it for
embedded and linked manifests, then calls downloadManifest() for each
manifest found. Yields ( manifestName, manifestObject ) tuples.
"""
name, _ = os.path.splitext(url)
script = self._get(url)
# Extract the manifest(s) embedded within the script itself. These
# are JSON blobs wrapped into string literals; thankfully Python
# and JS literals have almost the same syntax.
matches = MANIFEST_REGEX.findall(script.text)
for _id, literal in enumerate(matches):
# Add a suffix if multiple manifest blobs are present.
manifestName = name
if _id:
manifestName += f"_{_id}"
if not self._manifestAllowed(manifestName):
continue
logging.info(f"({name}) Found embedded manifest: {manifestName}")
manifest = literal_eval(literal)
yield manifestName, self.parseManifest(
manifestName,
json.loads(manifest)
)
# Download any external manifests whose paths are referenced in the
# script.
matches = EXT_MANIFEST_REGEX.findall(script.text)
for _, path in matches:
manifestName, _ = os.path.splitext(path)
if not self._manifestAllowed(manifestName):
continue
logging.info(f"({name}) Found linked manifest: {manifestName}")
manifest = self._get(path)
yield manifestName, self.parseManifest(
manifestName,
manifest.json()
)
def downloadHTML(self, url):
"""
        Downloads an HTML file from the given URL and searches it for fonts and
JS files, then calls downloadJS() for each script found. Yields
( manifestName, manifestObject ) tuples.
"""
logging.info("Downloading HTML, fonts and JS")
html = self._get(url)
# Download fonts. Due to "limitations" of HTML5, these are loaded
# directly from the main HTML page through inline CSS rather than via
# manifests.
if self._manifestAllowed("_fonts"):
for _, _path in HTML_FONT_REGEX.findall(html.text):
path = resolvePath(_path)
self._download(path)
# Obtain all manifest files from each script.
for _path in HTML_SCRIPT_REGEX.findall(html.text):
path = resolvePath(_path)
logging.info(f"Found linked JS: {path}")
yield from self.downloadJS(path)
## Command line crap
def parserSetup():
"""
Creates a command line parser. This is only used internally by main().
"""
names = ", ".join(DEFAULT_URLS)
parser = ArgumentParser(
description = f"Downloads a copy of any Lime/OpenFL/HaxeFlixel-based web game. The following games can be specified by name: {names}",
epilog = "Use '-E _fonts' to skip downloading font files referenced by CSS.",
add_help = False
)
toolsGroup = parser.add_argument_group("Tools")
toolsGroup.add_argument(
"-h", "--help",
action = "help",
help = "Shows this help message and exits"
)
toolsGroup.add_argument(
"-v", "--verbose",
action = "count",
help = "Shows additional debug information (2 levels)"
)
manifestGroup = parser.add_argument_group("Manifest options")
manifestGroup.add_argument(
"-M", "--manifests",
action = "append",
type = str,
help = "Only downloads assets from the specified manifest(s)",
metavar = "name"
)
manifestGroup.add_argument(
"-E", "--exclude",
action = "append",
type = str,
help = "Skips downloading assets from the specified manifest(s)",
metavar = "name"
)
manifestGroup.add_argument(
"-S", "--save",
type = FileType("wt"),
help = "Outputs a JSON file containing all decoded manifests",
metavar = "outputFile"
)
downloadGroup = parser.add_argument_group("Download options")
downloadGroup.add_argument(
"-a", "--assets-only",
action = "store_true",
help = "Disables saving parsed HTML, JS and manifest files"
)
downloadGroup.add_argument(
"-r", "--redownload",
action = "store_true",
help = "Forces redownloading existing assets"
)
downloadGroup.add_argument(
"-u", "--user-agent",
type = str,
default = USER_AGENT,
help = "User agent (browser ID) to use when downloading",
metavar = "string"
)
fileGroup = parser.add_argument_group("File paths")
fileGroup.add_argument(
"game",
type = str,
nargs = "?",
default = "fnf",
help = "Game name or URL to the game's root or index.html"
)
fileGroup.add_argument(
"outputPath",
type = str,
nargs = "?",
default = ".",
help = "Where to store all downloaded files"
)
return parser
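# Example invocations (the script file name and output directories below are
# placeholders; use whatever you saved this file as):
#
#   python3 fnf_downloader.py -v fnf ./fnf
#   python3 fnf_downloader.py -a -M preload -S manifests.json flappybalt ./out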
def loggerSetup(verbose = None):
"""
Configures logging. This is only used internally by main().
"""
if verbose:
_level = min(verbose, 2)
else:
_level = 0
logging.basicConfig(
format = "[%(funcName)-13s %(levelname)-7s] %(message)s",
level = (
logging.WARNING,
logging.INFO, # -v
logging.DEBUG # -vv
)[_level]
)
## Main
def main():
"""
Main entry point of the script.
"""
parser = parserSetup()
args = parser.parse_args()
loggerSetup(args.verbose)
logging.info(f"FNF downloader v{__version__}")
url = DEFAULT_URLS.get(
args.game.lower(),
args.game
)
downloader = Downloader(
url,
args.assets_only,
args.redownload,
args.manifests,
args.exclude
)
downloader.session.headers["User-Agent"] = args.user_agent
startTime = perf_counter()
manifests = {}
outputPath = ensureParentDir(f"{args.outputPath}/.") # lol
os.chdir(outputPath)
for name, manifest in downloader.downloadHTML("index.html"):
manifests[name] = manifest
    # Save all manifests' contents to a single JSON file if -S was passed.
if args.save:
with args.save as outputFile:
json.dump(
manifests,
outputFile,
indent = "\t"
)
downloadTime = round(perf_counter() - startTime)
logging.info(f"Files downloaded: {downloader.numAssets}, {downloader.totalSize // 1048576} MB")
logging.info(f"Done ({downloadTime // 60}m {downloadTime % 60}s)")
if __name__ == "__main__":
main()