Last active
May 9, 2025 11:48
-
-
Save shriyanss/2c5a8d4706a8a1efc5d4370c2a47728f to your computer and use it in GitHub Desktop.
Download the JS (and HTML) files that are loaded by each webpage in a list of sites
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from playwright.sync_api import sync_playwright | |
from urllib.parse import urlparse | |
import os | |
import requests | |
# Read the list of target URLs, one per line. Surrounding whitespace is
# stripped and blank lines are skipped so that stray empty lines in the file
# are not later handed to the browser as URLs.
with open('comet_urls.txt') as f:
    urls = [line.strip() for line in f if line.strip()]
def get_site_directory(url):
    """Return the output directory for the site that `url` belongs to.

    The directory name is derived from the URL's hostname: a leading
    ``www.`` label is dropped and the remaining dots are replaced with
    underscores, e.g. ``https://www.example.com/x`` ->
    ``downloaded_files/example_com``.

    Args:
        url: The page URL being crawled.

    Returns:
        A path under ``downloaded_files``; the bare ``downloaded_files``
        directory when the URL has no hostname.
    """
    domain = urlparse(url).hostname
    if not domain:
        return 'downloaded_files'
    # Strip the "www." label *before* replacing dots. Doing it afterwards
    # (on "www_") would also eat the start of hosts whose first label
    # genuinely begins with "www_", such as "www_static.example.com".
    if domain.startswith('www.'):
        domain = domain[4:]
    return os.path.join('downloaded_files', domain.replace('.', '_'))
# Make sure the root output folder exists up front; per-site folders are
# created beneath it later. An already existing folder is not an error.
os.makedirs('downloaded_files', exist_ok=True)
def is_desired_type(url):
    """Return True when `url` points at a ``.js`` or ``.html`` resource.

    The extension is checked on the URL's path component only, so query
    strings and fragments (``app.js?v=123``, ``index.html#top``) do not hide
    the extension, and a query value that merely ends in ``.js``/``.html``
    does not count as a match.

    Args:
        url: Absolute URL of a requested resource.

    Returns:
        True if the path ends with ``.js`` or ``.html``, otherwise False.
    """
    return urlparse(url).path.endswith(('.js', '.html'))
def is_same_domain(base_url, target_url):
    """Return True when both URLs have the same hostname.

    Args:
        base_url: URL of the page being crawled.
        target_url: URL of a resource requested by that page.

    Returns:
        True only if both URLs have a hostname and the hostnames are equal.
        URLs without a hostname (``data:``, ``about:blank``, scheme-less
        strings) never match, so two such URLs are not treated as belonging
        to the same domain.
    """
    base_domain = urlparse(base_url).hostname
    target_domain = urlparse(target_url).hostname
    # Without this guard two host-less URLs would compare None == None.
    if base_domain is None or target_domain is None:
        return False
    return base_domain == target_domain
def _collect_resource_urls(context, url):
    """Load `url` in a fresh page and collect the resource URLs it requests.

    Only requests accepted by both `is_desired_type` and `is_same_domain`
    (relative to `url`) are kept.

    Args:
        context: Playwright browser context used to open the page.
        url: Page URL to visit.

    Returns:
        A set of matching request URLs, or None when the page could not be
        loaded (navigation error or a non-200 status).
    """
    page = context.new_page()
    collected_urls = set()

    # Intercept all requests issued while the page loads.
    def handle_request(request):
        req_url = request.url
        try:
            if is_desired_type(req_url) and is_same_domain(url, req_url):
                collected_urls.add(req_url)
        except Exception as e:
            print(f"[-] Error in request handler: {e}")

    try:
        page.on("request", handle_request)
        # Load the page and wait until all requests are done.
        try:
            response = page.goto(url, wait_until="networkidle", timeout=30000)
        except Exception as e:
            print(f"[-] Error loading {url}: {e}")
            return None
        if response and response.status != 200:
            print(f"[-] Failed to load {url}: Status {response.status}")
            return None
        return collected_urls
    finally:
        # Always release the page, including on the failure paths above;
        # otherwise every site that fails to load leaves a page open.
        page.close()


def _record_base_url(site_dir, url, filename):
    """Append `url` to ``base_urls.txt`` in `site_dir` unless already listed."""
    base_url_path = os.path.join(site_dir, "base_urls.txt")
    # Read existing URLs to avoid duplicates.
    existing_urls = set()
    if os.path.exists(base_url_path):
        with open(base_url_path, "r", encoding="utf-8") as f:
            existing_urls = {line.strip() for line in f if line.strip()}
    if url in existing_urls:
        print(f"[✓] Base URL already exists for: {filename}")
        return
    with open(base_url_path, "a", encoding="utf-8") as f:
        f.write(f"{url}\n")
    print(f"[✓] Added base URL for: {filename}")


def _save_resource(resource_url, site_dir, url):
    """Download `resource_url` into `site_dir`.

    For ``.js`` files the page URL `url` is also recorded in the site's
    ``base_urls.txt``. Exceptions from the download or from file I/O
    propagate to the caller.
    """
    filename = os.path.basename(urlparse(resource_url).path)
    if not filename:
        filename = "index.html" if resource_url.endswith(".html") else "file.js"
    os.makedirs(site_dir, exist_ok=True)
    filepath = os.path.join(site_dir, filename)

    res = requests.get(resource_url, timeout=10)
    if res.status_code != 200:
        print(f"[-] Failed to download {resource_url}: Status {res.status_code}")
        return

    # Write the raw bytes. Going through res.text and re-encoding as UTF-8
    # corrupts non-ASCII characters whenever the server omits the charset
    # and requests guesses the wrong one.
    with open(filepath, "wb") as f:
        f.write(res.content)

    # Save base URL for JS files.
    if filename.endswith('.js'):
        _record_base_url(site_dir, url, filename)
    print(f"[✓] Saved: {filepath}")


with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        context = browser.new_context()
        for url in urls:
            try:
                print(f"\n[+] Visiting {url}")
                collected_urls = _collect_resource_urls(context, url)
                if collected_urls is None:
                    continue
                # The target directory depends only on the page URL, so it
                # is computed once per site rather than once per resource.
                site_dir = get_site_directory(url)
                # Download each collected JS/HTML file; one failed download
                # must not abort the remaining ones.
                for resource_url in collected_urls:
                    try:
                        _save_resource(resource_url, site_dir, url)
                    except Exception as e:
                        print(f"[-] Failed to download {resource_url}: {e}")
            except Exception as e:
                print(f"[-] Error processing {url}: {e}")
    finally:
        browser.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the Python packages the script imports.
python3 -m pip install playwright requests
# The script only launches Chromium, so only that browser is downloaded.
python3 -m playwright install chromium
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment