
@heygrady
Created March 19, 2025 15:31
Download ROMS
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, unquote
import json
import re
# Substrings that mark a ROM filename as unwanted (multi-game packs, betas, demos,
# protos, pirate/bad dumps, non-USA video standards, re-releases, collection ports, etc.)
exclude_words = [
    "Pack",
    "Pak",
    "Great Games",
    " in 1",
    " on 1",
    "-in-1",
    "-on-1",
    "in One",
    "on One",
    "Superpack",
    "(LodgeNet)",
    "(Sega",
    "(Arcade)",
    "(Beta",
    "Beta)",
    "(Demo",
    "Demo)",
    "(Unl)",
    "(Proto",
    "Proto)",
    "(PAL)",
    "(MPAL)",
    "(SECAM)",
    "(Pirate)",
    "(Competition",
    "(Virtual Console)",
    "(Virtual Console",
    "(Promo",
    "Promo)",
    "(Sample)",
    "(iam8bit)",
    "(e-Reader Edition)",
    "(Capcom Town)",
    "(Disney Classic Games)",
    "(Re-release)",
    "(Final Cut)",
    "(Capcom Classics",
    "(Genesis Mini)",
    "(Lock-on Combination)",
    "(Switch",
    "Switch)",
    "(Steam)",
    "(Pt)",
    "(De)",
    "(Fr)",
    "(Es)",
    "(Ja)",
    "(Alt",
    "(Wii",
    "(GameCube",
    "(Kiosk)",
    "(Wi-Fi Kiosk)",
    "(Enhancement Chip)",
    "(Collection of Mana)",
    "(Collection of SaGa)",
    "(Cowabunga Collection, The)",
    "Advance Collection)",
    "(Atari Anthology)",
    "(Activision Anthology",
    "(Enhanced)",
    "(Piko Interactive)",
    "(QUByte Classics)",
    "(Aftermarket)",
    "(Retro",
    "Collection)",
    "(Strictly",
    "(SNS-XM)",
    "[b]",
    "(Test Program)",
    "(Program)",
    "(Audio Tapes)",
    "(Test",
    "(Limited Run Games)",
    "[BIOS]",
]

def parse_filename_info(filename):
    """Return (revision, region priority) parsed from a ROM filename.

    Lower region values are preferred; -1 means no recognized region tag.
    """
    region_patterns = {
        "(USA)": 0,
        "(USA, Canada)": 1,
        "(USA, Australia)": 2,
        "(USA, Europe)": 3,
        "(USA, Europe, Brazil)": 4,
        "(USA, Brazil)": 5,
        "(USA, Europe, Korea)": 6,
        "(USA, Korea)": 7,
        "(Japan, USA)": 8,
        "(World)": 9,
    }
    region = -1
    for pattern, region_priority in region_patterns.items():
        if pattern in filename:
            region = region_priority
            break

    # Handles digits or letters
    revision_match = re.search(r"\(Rev ([0-9A-Za-z]+)\)", filename, re.IGNORECASE)
    revision = 0  # Default revision if not found
    if revision_match:
        revision_str = revision_match.group(1).upper()
        if revision_str.isdigit():
            revision = int(revision_str)
        elif revision_str.isalpha():
            revision = ord(revision_str) - ord('A') + 1  # Convert letter to number (A=1, B=2, ...)
        else:
            revision = 0
    return revision, region

cookie_string = ''  # Paste your browser's Cookie header value here (needed for archive.org downloads)

cookies = {}
for cookie_pair in cookie_string.split('; '):
    key_value = cookie_pair.split('=', 1)  # Split only once at the first '='
    if len(key_value) == 2:
        key, value = key_value
        cookies[key] = value

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'priority': 'u=0, i',
    'referer': 'https://r-roms.github.io/',
    'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
}

def download_files_from_webpage(url, download_dir):
    """
    Downloads files from a webpage based on specific criteria.

    Args:
        url (str): The URL of the webpage to scrape.
        download_dir (str): The directory to save downloaded files to.
    """
    download_headers = headers.copy()
    download_headers['referer'] = url  # Set the referer for downloads to the webpage URL

    try:
        response = requests.get(url, headers=headers, cookies=cookies)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching webpage: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Create download directory if it doesn't exist
    os.makedirs(download_dir, exist_ok=True)

    file_links = soup.select('table tr > td:nth-child(1) > a:nth-child(1)')
    if not file_links:
        print("No file links found on the webpage using the provided selector.")
        return

    print("Found file links, filtering...")
    game_links = {}
    links_to_download = []
    for link_element in file_links:
        link_text = link_element.text
        href = link_element['href']
        if href.startswith("//"):
            download_url = urljoin("https://archive.org", href)  # Ensure absolute URL
        elif not href.startswith("http"):
            download_url = urljoin(url, href)
        else:
            download_url = href  # Already an absolute URL
        if ("(USA)" in link_text or "(World)" in link_text or "(USA," in link_text or ", USA)" in link_text) and not any(exclude_word in link_text for exclude_word in exclude_words):
            if not (link_text.endswith(".zip") or link_text.endswith(".7z") or link_text.endswith(".chd")):
                print(f"Skipping non-standard file: {link_text}")
                continue
            url_encoded_filename = os.path.basename(download_url)
            filename = unquote(url_encoded_filename)  # URL-decode the filename
            filepath = os.path.join(download_dir, filename)
            revision, region = parse_filename_info(filename)
            game_base_name = filename.split(" (", maxsplit=1)[0].strip().lower()
            if game_base_name not in game_links:
                game_links[game_base_name] = []
            game_links[game_base_name].append({
                "link": download_url,
                "filename": filename,
                "revision": revision,
                "region": region,
                "full_path": filepath
            })

    # For each game, keep only the best region and its highest revision
    for game_name, file_list in game_links.items():
        max_region_revision = {}
        best_region = None
        if len(file_list) == 1:
            links_to_download.append((file_list[0]["link"], file_list[0]["filename"], file_list[0]["full_path"]))
            continue
        for file_info in file_list:
            if max_region_revision.get(file_info["region"]) is None:
                max_region_revision[file_info["region"]] = -1
            max_revision = max_region_revision[file_info["region"]]
            if file_info["revision"] is not None:
                if file_info["revision"] > max_revision:
                    max_region_revision[file_info["region"]] = file_info["revision"]
            if file_info["region"] is not None:
                if best_region is None or file_info["region"] < best_region:
                    best_region = file_info["region"]
        for file_info in file_list:
            if file_info["revision"] >= max_region_revision[file_info["region"]] and file_info["region"] == best_region:
                links_to_download.append((file_info["link"], file_info["filename"], file_info["full_path"]))

    print("Downloading...")
    for link, link_text, filepath in links_to_download:
        if os.path.exists(filepath):
            print(f"Skipping existing file: {link_text}")
            continue
        print(f"Downloading: {link_text} to {filepath}")
        try:
            with requests.get(link, headers=download_headers, cookies=cookies, stream=True) as download_response:
                download_response.raise_for_status()
                with open(filepath, 'wb') as outfile:
                    for chunk in download_response.iter_content(chunk_size=8192):  # 8KB chunks
                        outfile.write(chunk)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {link_text}: {e}")

if __name__ == "__main__":
    webpage_url = ""
    download_directory = "roms"  # Directory to save downloaded files
    download_files_from_webpage(webpage_url, download_directory)
    print("Script finished.")
@heygrady (Author)

You need to edit webpage_url in the __main__ block at the bottom of the script and, if you are trying to download from archive.org, you will need to add your cookie string to cookie_string (defined just above the headers dict).

You can get your cookie string using the Chrome Inspector.
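
For example (a rough sketch with made-up values — your cookie names and the listing URL will differ), copy the value of the Cookie request header from any archive.org request in the DevTools Network tab, and the two edits might look like this:

cookie_string = 'logged-in-user=you%40example.com; logged-in-sig=abcdef123456'  # pasted Cookie header value, near the top of the script

webpage_url = "https://archive.org/download/your-collection-name/"  # the file-listing page to scrape, in the __main__ block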
