
@heygrady
Created March 19, 2025 15:31
Download ROMS
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, unquote
import json
import re
# Substrings that mark a ROM filename as unwanted (multi-game packs, betas, demos,
# protos, pirate/bad dumps, non-USA video standards, re-releases, collection ports, etc.)
exclude_words = [
    "Pack",
    "Pak",
    "Great Games",
    " in 1",
    " on 1",
    "-in-1",
    "-on-1",
    "in One",
    "on One",
    "Superpack",
    "(LodgeNet)",
    "(Sega",
    "(Arcade)",
    "(Beta",
    "Beta)",
    "(Demo",
    "Demo)",
    "(Unl)",
    "(Proto",
    "Proto)",
    "(PAL)",
    "(MPAL)",
    "(SECAM)",
    "(Pirate)",
    "(Competition",
    "(Virtual Console)",
    "(Virtual Console",
    "(Promo",
    "Promo)",
    "(Sample)",
    "(iam8bit)",
    "(e-Reader Edition)",
    "(Capcom Town)",
    "(Disney Classic Games)",
    "(Re-release)",
    "(Final Cut)",
    "(Capcom Classics",
    "(Genesis Mini)",
    "(Lock-on Combination)",
    "(Switch",
    "Switch)",
    "(Steam)",
    "(Pt)",
    "(De)",
    "(Fr)",
    "(Es)",
    "(Ja)",
    "(Alt",
    "(Wii",
    "(GameCube",
    "(Kiosk)",
    "(Wi-Fi Kiosk)",
    "(Enhancement Chip)",
    "(Collection of Mana)",
    "(Collection of SaGa)",
    "(Cowabunga Collection, The)",
    "Advance Collection)",
    "(Atari Anthology)",
    "(Activision Anthology",
    "(Enhanced)",
    "(Piko Interactive)",
    "(QUByte Classics)",
    "(Aftermarket)",
    "(Retro",
    "Collection)",
    "(Strictly",
    "(SNS-XM)",
    "[b]",
    "(Test Program)",
    "(Program)",
    "(Audio Tapes)",
    "(Test",
    "(Limited Run Games)",
    "[BIOS]",
]

def parse_filename_info(filename):
    """Return (revision, region priority) parsed from a ROM filename.

    Lower region values are preferred; -1 means no recognized region tag.
    """
    region_patterns = {
        "(USA)": 0,
        "(USA, Canada)": 1,
        "(USA, Australia)": 2,
        "(USA, Europe)": 3,
        "(USA, Europe, Brazil)": 4,
        "(USA, Brazil)": 5,
        "(USA, Europe, Korea)": 6,
        "(USA, Korea)": 7,
        "(Japan, USA)": 8,
        "(World)": 9,
    }
    region = -1
    for pattern, region_priority in region_patterns.items():
        if pattern in filename:
            region = region_priority
            break

    # Handles digits or letters
    revision_match = re.search(r"\(Rev ([0-9A-Za-z]+)\)", filename, re.IGNORECASE)
    revision = 0  # Default revision if not found
    if revision_match:
        revision_str = revision_match.group(1).upper()
        if revision_str.isdigit():
            revision = int(revision_str)
        elif revision_str.isalpha():
            revision = ord(revision_str) - ord('A') + 1  # Convert letter to number (A=1, B=2, ...)
        else:
            revision = 0
    return revision, region

cookie_string = ''  # Paste your browser's Cookie header value here (needed for archive.org downloads)

cookies = {}
for cookie_pair in cookie_string.split('; '):
    key_value = cookie_pair.split('=', 1)  # Split only once at the first '='
    if len(key_value) == 2:
        key, value = key_value
        cookies[key] = value

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'priority': 'u=0, i',
    'referer': 'https://r-roms.github.io/',
    'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
}

def download_files_from_webpage(url, download_dir):
    """
    Downloads files from a webpage based on specific criteria.

    Args:
        url (str): The URL of the webpage to scrape.
        download_dir (str): The directory to save downloaded files to.
    """
    download_headers = headers.copy()
    download_headers['referer'] = url  # Set the referer for downloads to the webpage URL

    try:
        response = requests.get(url, headers=headers, cookies=cookies)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching webpage: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Create download directory if it doesn't exist
    os.makedirs(download_dir, exist_ok=True)

    file_links = soup.select('table tr > td:nth-child(1) > a:nth-child(1)')
    if not file_links:
        print("No file links found on the webpage using the provided selector.")
        return

    print("Found file links, filtering...")
    game_links = {}
    links_to_download = []
    for link_element in file_links:
        link_text = link_element.text
        href = link_element['href']
        if href.startswith("//"):
            download_url = urljoin("https://archive.org", href)  # Ensure absolute URL
        elif not href.startswith("http"):
            download_url = urljoin(url, href)
        else:
            download_url = href  # Already an absolute URL
        if ("(USA)" in link_text or "(World)" in link_text or "(USA," in link_text or ", USA)" in link_text) and not any(exclude_word in link_text for exclude_word in exclude_words):
            if not (link_text.endswith(".zip") or link_text.endswith(".7z") or link_text.endswith(".chd")):
                print(f"Skipping non-standard file: {link_text}")
                continue
            url_encoded_filename = os.path.basename(download_url)
            filename = unquote(url_encoded_filename)  # URL-decode the filename
            filepath = os.path.join(download_dir, filename)
            revision, region = parse_filename_info(filename)
            game_base_name = filename.split(" (", maxsplit=1)[0].strip().lower()
            if game_base_name not in game_links:
                game_links[game_base_name] = []
            game_links[game_base_name].append({
                "link": download_url,
                "filename": filename,
                "revision": revision,
                "region": region,
                "full_path": filepath
            })

    # For each game, keep only the best region and its highest revision
    for game_name, file_list in game_links.items():
        max_region_revision = {}
        best_region = None
        if len(file_list) == 1:
            links_to_download.append((file_list[0]["link"], file_list[0]["filename"], file_list[0]["full_path"]))
            continue
        for file_info in file_list:
            if max_region_revision.get(file_info["region"]) is None:
                max_region_revision[file_info["region"]] = -1
            max_revision = max_region_revision[file_info["region"]]
            if file_info["revision"] is not None:
                if file_info["revision"] > max_revision:
                    max_region_revision[file_info["region"]] = file_info["revision"]
            if file_info["region"] is not None:
                if best_region is None or file_info["region"] < best_region:
                    best_region = file_info["region"]
        for file_info in file_list:
            if file_info["revision"] >= max_region_revision[file_info["region"]] and file_info["region"] == best_region:
                links_to_download.append((file_info["link"], file_info["filename"], file_info["full_path"]))

    print("Downloading...")
    for link, link_text, filepath in links_to_download:
        if os.path.exists(filepath):
            print(f"Skipping existing file: {link_text}")
            continue
        print(f"Downloading: {link_text} to {filepath}")
        try:
            with requests.get(link, headers=download_headers, cookies=cookies, stream=True) as download_response:
                download_response.raise_for_status()
                with open(filepath, 'wb') as outfile:
                    for chunk in download_response.iter_content(chunk_size=8192):  # 8KB chunks
                        outfile.write(chunk)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {link_text}: {e}")

if __name__ == "__main__":
    webpage_url = ""
    download_directory = "roms"  # Directory to save downloaded files
    download_files_from_webpage(webpage_url, download_directory)
    print("Script finished.")
@heygrady (Author)

You need to edit webpage_url in the __main__ block at the bottom of the script and, if you are trying to download from archive.org, you will need to add your cookie string to cookie_string (defined just above the headers dict).

You can get your cookie string using the Chrome Inspector.
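
For example (a rough sketch with made-up values — your cookie names and the listing URL will differ), copy the value of the Cookie request header from any archive.org request in the DevTools Network tab, and the two edits might look like this:

cookie_string = 'logged-in-user=you%40example.com; logged-in-sig=abcdef123456'  # pasted Cookie header value, near the top of the script

webpage_url = "https://archive.org/download/your-collection-name/"  # the file-listing page to scrape, in the __main__ block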
