Skip to content

Instantly share code, notes, and snippets.

@shriyanss
Last active May 9, 2025 11:48
Show Gist options
  • Save shriyanss/2c5a8d4706a8a1efc5d4370c2a47728f to your computer and use it in GitHub Desktop.
Download JS files that are being loaded on the webpage from a list of sites
from playwright.sync_api import sync_playwright
from urllib.parse import urlparse
import os
import requests
# Load the target sites to crawl, one URL per line.
with open('comet_urls.txt') as url_file:
    urls = url_file.read().splitlines()
def get_site_directory(url):
    """Return the per-site output directory under 'downloaded_files'.

    The directory name is the URL's hostname with dots replaced by
    underscores and a leading 'www' component dropped. Falls back to
    the bare 'downloaded_files' root when the URL has no hostname.
    """
    host = urlparse(url).hostname
    if not host:
        return 'downloaded_files'
    safe_name = host.replace('.', '_')
    # "www.example.com" becomes "www_example_com"; drop the "www_" prefix.
    if safe_name.startswith('www_'):
        safe_name = safe_name[4:]
    return os.path.join('downloaded_files', safe_name)
# Create base directory
# Ensure the shared output root exists before any per-site subdirectories
# are created inside it (exist_ok makes reruns safe).
os.makedirs('downloaded_files', exist_ok=True)
def is_desired_type(url):
    """Return True when the URL's path points at a .js or .html file.

    Tests the parsed path rather than the raw URL so that query strings
    and fragments (e.g. "app.js?v=3") do not defeat the suffix check —
    the previous raw-URL test silently missed versioned script URLs.
    """
    return urlparse(url).path.endswith(('.js', '.html'))
def is_same_domain(base_url, target_url):
    """Return True when both URLs resolve to the same hostname."""
    return urlparse(base_url).hostname == urlparse(target_url).hostname
with sync_playwright() as p:
    # One headless Chromium instance and one context shared by all sites.
    browser = p.chromium.launch(headless=True)
    context = browser.new_context()

    for url in urls:
        try:
            print(f"\n[+] Visiting {url}")
            page = context.new_page()
            collected_urls = set()

            # Record every same-domain .js/.html request the page makes.
            # url=url pins the current loop value as a default argument
            # (guards against the late-binding-closure pitfall).
            def handle_request(request, url=url):
                req_url = request.url
                try:
                    if is_desired_type(req_url) and is_same_domain(url, req_url):
                        collected_urls.add(req_url)
                except Exception as e:
                    print(f"[-] Error in request handler: {e}")

            page.on("request", handle_request)

            try:
                # Wait for network idle so lazily-loaded scripts are captured.
                try:
                    response = page.goto(url, wait_until="networkidle", timeout=30000)
                    if response and response.status != 200:
                        print(f"[-] Failed to load {url}: Status {response.status}")
                        continue
                except Exception as e:
                    print(f"[-] Error loading {url}: {e}")
                    continue

                # Download each collected JS/HTML file.
                for resource_url in collected_urls:
                    try:
                        filename = os.path.basename(urlparse(resource_url).path)
                        if not filename:
                            # Bare directory URLs get a synthetic name.
                            filename = "index.html" if resource_url.endswith(".html") else "file.js"

                        site_dir = get_site_directory(url)
                        os.makedirs(site_dir, exist_ok=True)
                        filepath = os.path.join(site_dir, filename)

                        res = requests.get(resource_url, timeout=10)
                        if res.status_code == 200:
                            with open(filepath, "w", encoding="utf-8") as f:
                                f.write(res.text)

                            # Track which base URL produced each JS file, so
                            # downloads can be traced back to their site.
                            if filename.endswith('.js'):
                                base_url_path = os.path.join(site_dir, "base_urls.txt")
                                # Read existing entries to avoid duplicates.
                                existing_urls = set()
                                if os.path.exists(base_url_path):
                                    with open(base_url_path, "r", encoding="utf-8") as f:
                                        existing_urls = set(line.strip() for line in f if line.strip())
                                if url not in existing_urls:
                                    with open(base_url_path, "a", encoding="utf-8") as f:
                                        f.write(f"{url}\n")
                                    # BUGFIX: these messages previously printed a
                                    # literal placeholder instead of the URL.
                                    print(f"[✓] Added base URL for: {url}")
                                else:
                                    print(f"[✓] Base URL already exists for: {url}")
                            print(f"[✓] Saved: {filepath}")
                        else:
                            print(f"[-] Failed to download {resource_url}: Status {res.status_code}")
                    except Exception as e:
                        print(f"[-] Failed to download {resource_url}: {e}")
            finally:
                # BUGFIX: close the page even when `continue` fires on a
                # failed load — previously those pages were leaked for the
                # remainder of the run.
                page.close()
        except Exception as e:
            print(f"[-] Error processing {url}: {e}")
            continue

    browser.close()
Install the dependencies before running the script:

python3 -m pip install playwright
python3 -m pip install requests
python3 -m playwright install
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment