Download individual MAME ROMs from archive.org MAME reference sets via python over HTTP/S without downloading the entire set
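Run the script under python 2 and it drops you into an interactive shell (built on cmd.Cmd) pointed at the archive configured in main(). A session might look roughly like this — the script file name and the rom name/size shown are made up for illustration:

    $ python mame_roms.py
    [INITIALIZING: http://archive.org/download/MAME0.78-MAME2003-ROMs-CHDs-Samples]
    (Cmd) find pacman
    path: pacman.zip
    name: pacman.zip
    size: 30490
    (Cmd) get pacman.zip
    [DOWNLOADING: pacman.zip]
    [SAVED: pacman.zip]
    (Cmd) quit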
import posixpath, urlparse, urllib2, re, ast, struct, cmd, os, os.path
import xml.etree.ElementTree as ET
def discover_roms(rom_url):
    # step 1: based on the URL, download the _files.xml for the details
    # I could do this a lot easier, but this is more fun
    url_parts = urlparse.urlsplit(rom_url)
    base_path, page_path = posixpath.split(url_parts.path)
    files_path = posixpath.join(url_parts.path, page_path + '_files.xml')
    new_parts = url_parts._replace(path=files_path)
    xml_url = urlparse.urlunsplit(new_parts)
    # download it
    f = urllib2.urlopen(xml_url)
    files_xml = f.read()
    f.close()
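    # the _files.xml is expected to look roughly like this (structure inferred
    # from the parsing below, not an official schema):
    #   <files>
    #     <file name="SomeSet.zip" source="original">...<size>123456</size>...</file>
    #     <file name="SomeSet.z01" source="original">...<size>123456</size>...</file>
    #   </files>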
    # step 2: parse the XML, looking for a large multi-part zip
    root = ET.fromstring(files_xml)
    zips = dict()
    for child in root:
        if child.get('source') == 'original':
            # if it's part of the real content
            m = re.match(r'^(.+)\.(?:zip|z[0-9]{2})$', child.get('name'))
            if m:
                # It's a file of the form .zip, .z01, .z02, etc.
                group_name = m.group(1)
                file_name = m.group(0)
                file_size = ast.literal_eval(child.find('size').text)
                # record the results
                zips[group_name] = zips.get(group_name, dict())
                zips[group_name]['size'] = zips[group_name].get('size', 0) + file_size
                zips[group_name]['files'] = zips[group_name].get('files', list()) + [(file_name, file_size)]
    # step 3: select the largest file group
    rom_archive, archive_info = sorted(zips.items(), key=lambda x: x[1]['size'], reverse=True)[0]
    total_size = archive_info['size']
    # This should sort into: .z01, .z02, [...], .zip order
    archive_files = sorted(archive_info['files'], key=lambda x: x[0])
    # step 4: grab the last 100 KiB of the final .zip file, which should be enough
    # to contain the end of central directory record, since the zip comment that
    # can trail it maxes out at a 2 byte / 16-bit length (65535)
    eocd_zip, eocd_size = archive_files[-1]
    # byte ranges are zero-based; don't let the start go negative on tiny files
    tail_start = max(0, eocd_size - 100*1024)
    # build the url
    eocd_path = posixpath.join(url_parts.path, eocd_zip)
    new_parts = url_parts._replace(path=eocd_path)
    eocd_url = urlparse.urlunsplit(new_parts)
    # build the request
    req = urllib2.Request(eocd_url)
    req.add_header("Range", "bytes=%s-" % (tail_start))
    # grab that EOCD!
    f = urllib2.urlopen(req)
    eocd_raw = f.read()
    f.close()
    # step 5: search for the last EOCD signature
    # (DOTALL so the 16 payload bytes can include newline bytes)
    EOCD = re.findall(r'PK\x05\x06.{16}', eocd_raw, re.DOTALL)[-1]
    # unpack it
    magic, d_count, d_cd, ignore, records_total, cd_size, cd_offset = struct.unpack('<4sHHHHLL', EOCD)
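    # For reference, the EOCD record unpacked above is, per the zip spec
    # (little-endian): 4-byte signature 'PK\x05\x06', number of this disk (H),
    # disk where the central directory starts (H), CD entries on this disk (H),
    # total CD entries (H), CD size in bytes (L), and the CD's offset on its
    # starting disk (L); a 2-byte comment length (plus the comment) follows,
    # but isn't needed here.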
    # step 6: grab the real central directory
    # (it's probably on the same zip as the eocd, but whee, let's follow the spec)
    cd_zip = archive_files[d_cd][0]
    # build the url
    cd_path = posixpath.join(url_parts.path, cd_zip)
    new_parts = url_parts._replace(path=cd_path)
    cd_url = urlparse.urlunsplit(new_parts)
    # build the request
    req = urllib2.Request(cd_url)
    req.add_header("Range", "bytes=%s-%s" % (cd_offset, cd_offset + cd_size - 1))
    # grab that CD!
    f = urllib2.urlopen(req)
    cd_raw = f.read()
    f.close()
    # step 7: parse the central directory!
    # ... probably not safe to parse like this, but it's fun!
    # split on the CD file header signature, then glue each signature back
    # onto the header bytes that followed it
    raw_entries = re.split(r'(PK\x01\x02)', cd_raw)[1:]
    raw_entries = map(lambda x: ''.join(x), zip(raw_entries[::2], raw_entries[1::2]))
    entries = []
    # parse out the entries and where they are in the archive
    for x in raw_entries:
        magic, ignore_16, c_size, u_size, fn_len, e_len, fc_len, f_disk, ignore_6, f_offset = struct.unpack('<4s16sLLHHHH6sL', x[:46])
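        # The 46-byte fixed CD header is: signature, 16 skipped bytes
        # (versions, flags, compression method, mod time/date, CRC-32),
        # compressed size, uncompressed size, file name / extra field /
        # file comment lengths, disk number where the entry starts, 6 skipped
        # bytes (internal and external attributes), and the local header's
        # offset on that disk.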
        f_name = x[46:46+fn_len]
        # only remember compressed files!
        if c_size:
            d = dict()
            d['path'] = f_name
            d['name'] = posixpath.split(f_name)[-1]
            d['zip'] = f_disk
            d['size'] = c_size
            d['offset'] = f_offset
            entries.append(d)
    result = dict()
    result['source'] = url_parts
    result['roms'] = entries
    result['zips'] = archive_files
    return result
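# discover_roms hands back everything the other helpers need: 'source' is the
# split item URL, 'roms' is one dict per archive member (path, name, zip part
# index, compressed size, local header offset), and 'zips' is the (name, size)
# list of the archive parts in disk order.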
def find_rom(rom_info, rom_name):
    return [x for x in rom_info['roms'] if rom_name in x['name']]
def get_byte_range(rom_info, first_zip, start, num_bytes):
    url_parts = rom_info['source']
    zip_name, zip_size = rom_info['zips'][first_zip]
    # now the tricky part: the requested range may not live in this zip part at all
    if start >= zip_size:
        # Whoops! We're trying to start beyond the end of this file, so retry
        # against the next part with an adjusted offset (recursing also covers
        # entries that begin two or more parts later)
        return get_byte_range(rom_info, first_zip + 1, start - zip_size, num_bytes)
    range_start = start
    range_end = start + num_bytes - 1
    # build the url
    f_path = posixpath.join(url_parts.path, zip_name)
    new_parts = url_parts._replace(path=f_path)
    f_url = urlparse.urlunsplit(new_parts)
    # build the request
    req = urllib2.Request(f_url)
    if range_end >= zip_size:
        # We need to download from more than one zip
        # So let's get what this one holds
        req.add_header("Range", "bytes=%s-" % (range_start))
        # grab part 1
        f = urllib2.urlopen(req)
        part_1 = f.read()
        f.close()
        # However many bytes we got, get the rest from the following zip file(s)
        remaining_bytes = num_bytes - len(part_1)
        return part_1 + get_byte_range(rom_info, first_zip + 1, 0, remaining_bytes)
    # Yay, it's the simple case and we can download from one zip
    # (HTTP byte ranges are inclusive on both ends, hence the -1 above)
    req.add_header("Range", "bytes=%s-%s" % (range_start, range_end))
    # grab those bytes!
    f = urllib2.urlopen(req)
    bytes_raw = f.read()
    f.close()
    return bytes_raw
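# A zip local file header is a fixed 30 bytes (signature, version, flags,
# method, mod time/date, CRC-32, sizes, name length, extra field length)
# followed by the variable-length file name and extra field; the member's
# data begins immediately after those, which is why download_rom below skips
# 30 + fn_len + e_len bytes before reading.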
def download_rom(rom_info, rom_record, save_path):
    # first we need to get the local header for the file to see how many total bytes to skip
    local_header = get_byte_range(rom_info, rom_record['zip'], rom_record['offset'], 30)
    # parse the local header
    magic, ignore_22, fn_len, e_len = struct.unpack('<4s22sHH', local_header)
    real_offset = rom_record['offset'] + 30 + fn_len + e_len
    # now we can get the actual rom file!
    # (this writes the member's raw bytes as-is, which only gives back the
    # original file because these reference sets appear to store their
    # members uncompressed)
    raw_rom = get_byte_range(rom_info, rom_record['zip'], real_offset, rom_record['size'])
    f = open(save_path, 'wb')
    f.write(raw_rom)
    f.close()
class romArchive(cmd.Cmd):
    def __init__(self, archiveorg_rom_url):
        cmd.Cmd.__init__(self)
        self.rom_url = archiveorg_rom_url
        print "[INITIALIZING: %s]" % self.rom_url
        self.rom_info = discover_roms(self.rom_url)
    def do_pwd(self, line):
        "Print current directory"
        print os.getcwd()
    def do_cd(self, line):
        "Change current directory"
        if os.path.isdir(line):
            os.chdir(line)
        else:
            print "Error: No such dir: %s" % line
    def do_ls(self, line):
        "Print contents of current directory"
        dirpath, dirnames, filenames = os.walk(os.getcwd()).next()
        for x in sorted(dirnames):
            print x + '/'
        for x in sorted(filenames):
            print x
    def do_quit(self, line):
        "Exit"
        return True
    def do_find(self, line):
        "find <part of file name>"
        if line:
            results = find_rom(self.rom_info, line)
            if not results:
                print "No results."
            else:
                for x in sorted(results, key=lambda y: y['path']):
                    print 'path:', x['path']
                    print 'name:', x['name']
                    print 'size:', x['size']
                    print ''
    def do_get(self, line):
        "get <file name>"
        if line:
            results = sorted(find_rom(self.rom_info, line), key=lambda y: y['path'])
            if not results:
                print "No results."
                return
            target = results[0]
            print "[DOWNLOADING: %s]" % target['path']
            download_rom(self.rom_info, target, target['name'])
            print "[SAVED: %s]" % target['name']
    do_q = do_quit
    do_EOF = do_quit
    do_exit = do_quit
# Keep to HTTP URLs to avoid issues with SNI/TLS/HTTPS requirements on python 2
# (though archive.org seems to be friendly/compatible, if you do want to do HTTPS)
def main():
    rom_url = "http://archive.org/download/MAME0.78-MAME2003-ROMs-CHDs-Samples"
    # rom_url = "https://archive.org/download/MAME0.37b5-MAME2000-ROMSet"
    # and https://archive.org/download/MAME0.106-Reference-Set-ROMs-CHDs-Samples, etc.
    # (make sure you're using the '/download/' form of the URL)
    ra = romArchive(rom_url)
    ra.cmdloop()

if __name__ == '__main__':
    main()