Download ROMS
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, unquote
import json
import re
exclude_words = [
    "Pack",
    "Pak",
    "Great Games",
    " in 1",
    " on 1",
    "-in-1",
    "-on-1",
    "in One",
    "on One",
    "Superpack",
    "(LodgeNet)",
    "(Sega",
    "(Arcade)",
    "(Beta",
    "Beta)",
    "(Demo",
    "Demo)",
    "(Unl)",
    "(Proto",
    "Proto)",
    "(PAL)",
    "(MPAL)",
    "(SECAM)",
    "(Pirate)",
    "(Competition",
    "(Virtual Console)",
    "(Virtual Console",
    "(Promo",
    "Promo)",
    "(Sample)",
    "(iam8bit)",
    "(e-Reader Edition)",
    "(Capcom Town)",
    "(Disney Classic Games)",
    "(Re-release)",
    "(Final Cut)",
    "(Capcom Classics",
    "(Genesis Mini)",
"(Lock-on Combination)" | |
"(Switch", | |
"Switch)", | |
"(Steam)", | |
"(Pt)", | |
"(De)", | |
"(Fr)", | |
"(Es)", | |
"(Ja)", | |
"(Alt", | |
"(Wii", | |
"(GameCube", | |
"(Kiosk)", | |
"(Wi-Fi Kiosk)", | |
"(Enhancement Chip)", | |
"(Collection of Mana)", | |
"(Collection of SaGa)", | |
"(Cowabunga Collection, The)", | |
"Advance Collection)", | |
"(Atari Anthology)", | |
"(Activision Anthology", | |
"(Enhanced)", | |
"(Piko Interactive)", | |
"(QUByte Classics)", | |
"(Aftermarket)", | |
"(Retro", | |
"Collection)", | |
"(Strictly", | |
"(SNS-XM)", | |
"[b]", | |
"(Test Program)", | |
"(Program)", | |
"(Audio Tapes)", | |
"(Test", | |
"(Limited Run Games)", | |
"[BIOS]", | |
] | |
def parse_filename_info(filename):
    region_patterns = {
        "(USA)": 0,
        "(USA, Canada)": 1,
        "(USA, Australia)": 2,
        "(USA, Europe)": 3,
        "(USA, Europe, Brazil)": 4,
        "(USA, Brazil)": 5,
        "(USA, Europe, Korea)": 6,
        "(USA, Korea)": 7,
        "(Japan, USA)": 8,
        "(World)": 9,
    }
    region = -1
    for pattern, region_rank in region_patterns.items():
        if pattern in filename:
            region = region_rank
            break
    # Handles digits or letters
    revision_match = re.search(r"\(Rev ([0-9A-Za-z]+)\)", filename, re.IGNORECASE)
    revision = 0  # Default revision if not found
    if revision_match:
        revision_str = revision_match.group(1).upper()
        if revision_str.isdigit():
            revision = int(revision_str)
        elif revision_str.isalpha():
            revision = ord(revision_str) - ord('A') + 1  # Convert letter to number (A=1, B=2, ...)
        else:
            revision = 0
    return revision, region
cookie_string = ''
cookies = {}
for cookie_pair in cookie_string.split('; '):
    key_value = cookie_pair.split('=', 1)  # Split only once at the first '='
    if len(key_value) == 2:
        key, value = key_value
        cookies[key] = value
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'priority': 'u=0, i',
    'referer': 'https://r-roms.github.io/',
    'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
}
def download_files_from_webpage(url, download_dir):
    """
    Downloads files from a webpage based on specific criteria.

    Args:
        url (str): The URL of the webpage to scrape.
        download_dir (str): The directory to save downloaded files to.
    """
    download_headers = headers.copy()
    download_headers['referer'] = url  # Set the referer for downloads to the webpage URL
    try:
        response = requests.get(url, headers=headers, cookies=cookies)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching webpage: {e}")
        return
    soup = BeautifulSoup(response.content, 'html.parser')
    # Create download directory if it doesn't exist
    os.makedirs(download_dir, exist_ok=True)
    file_links = soup.select('table tr > td:nth-child(1) > a:nth-child(1)')
    if not file_links:
        print("No file links found on the webpage using the provided selector.")
        return
    print("Found file links, filtering...")
    game_links = {}
    links_to_download = []
    for link_element in file_links:
        link_text = link_element.text
        href = link_element['href']
if href.startswith("//"): | |
download_url = urljoin("https://archive.org", href) # Ensure absolute URL | |
elif not href.startswith("http"): | |
download_url = urljoin(url, href) | |
if ("(USA)" in link_text or "(World)" in link_text or "(USA," in link_text or ", USA)" in link_text) and not any(exclude_word in link_text for exclude_word in exclude_words): | |
if not (link_text.endswith(".zip") or link_text.endswith(".7z") or link_text.endswith(".chd")): | |
print(f"Skipping non-standard file: {link_text}") | |
continue | |
url_encoded_filename = os.path.basename(download_url) | |
filename = unquote(url_encoded_filename) # **URL DECODE FILENAME HERE** | |
filepath = os.path.join(download_dir, filename) | |
revision, region = parse_filename_info(filename) | |
game_base_name = filename.split(" (", maxsplit=1)[0].strip().lower() | |
if game_base_name not in game_links: | |
game_links[game_base_name] = [] | |
game_links[game_base_name].append({ | |
"link": download_url, | |
"filename": filename, | |
"revision": revision, | |
"region": region, | |
"full_path": filepath | |
}) | |
    for game_name, file_list in game_links.items():
        # Process revisions
        max_region_revision = {}
        best_region = None
        if len(file_list) == 1:
            links_to_download.append((file_list[0]["link"], file_list[0]["filename"], file_list[0]["full_path"]))
            continue
        for file_info in file_list:
            if max_region_revision.get(file_info["region"]) is None:
                max_region_revision[file_info["region"]] = -1
            max_revision = max_region_revision[file_info["region"]]
            if file_info["revision"] is not None:
                if file_info["revision"] > max_revision:
                    max_region_revision[file_info["region"]] = file_info["revision"]
            if file_info["region"] is not None:
                if best_region is None or file_info["region"] < best_region:
                    best_region = file_info["region"]
        # Keep only files from the preferred (lowest-ranked) region at that region's highest revision
        for file_info in file_list:
            if file_info["revision"] >= max_region_revision[file_info["region"]] and file_info["region"] == best_region:
                links_to_download.append((file_info["link"], file_info["filename"], file_info["full_path"]))
print("Downloading...") | |
for link, link_text, filepath in links_to_download: | |
if os.path.exists(filepath): | |
print(f"Skipping existing file: {link_text}") | |
continue | |
print(f"Downloading: {link_text} to {filepath}") | |
try: | |
with requests.get(link, headers=download_headers, cookies=cookies, stream=True) as download_response: | |
download_response.raise_for_status() | |
with open(filepath, 'wb') as outfile: | |
for chunk in download_response.iter_content(chunk_size=8192): # 8KB chunks | |
outfile.write(chunk) | |
except requests.exceptions.RequestException as e: | |
print(f"Error downloading {link_text}: {e}") | |
if __name__ == "__main__": | |
webpage_url = "" | |
download_directory = "roms" # Directory to save downloaded files | |
download_files_from_webpage(webpage_url, download_directory) | |
print("Script finished.") |
You need to edit webpage_url (in the __main__ block at the bottom of the script) and, if you are trying to download from archive.org, add your cookie string to cookie_string. You can get your cookie string using the Chrome Inspector.
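For example, open Chrome DevTools, reload the archive.org page with the Network tab recording, select the document request, and copy the value of the Cookie request header. The sketch below shows the shape that assignment might take; the cookie names and values are placeholders, not real credentials. The parser near the top of the script splits on '; ' and then on the first '=', so the pasted string should keep the exact name=value; name=value form of the header.

# Hypothetical values for illustration only; paste the real Cookie header value copied from DevTools.
cookie_string = 'logged-in-user=you%40example.com; logged-in-sig=abcdef123456'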