shriyanss · May 9, 2025 11:48
diff --git a/download_js_files.py b/download_js_files.py
 from playwright.sync_api import sync_playwright
 from urllib.parse import urlparse
 import os
 import requests

 # Read the list of page URLs to crawl, one per line.
 # Surrounding whitespace is stripped and blank lines are dropped so that an
 # empty entry (e.g. a trailing newline pair) is never visited as a URL.
 with open('comet_urls.txt') as f:
     urls = [line.strip() for line in f if line.strip()]

 def get_site_directory(url):
     """Return the output directory for the site that *url* belongs to.

     The hostname has its dots turned into underscores and a leading
     ``www_`` dropped; the result is placed under ``downloaded_files``.
     A URL without a hostname maps to ``downloaded_files`` itself.
     """
     host = urlparse(url).hostname
     if not host:
         return 'downloaded_files'

     # "www.example.com" -> "www_example_com" -> "example_com"
     folder = '_'.join(host.split('.'))
     if folder[:4] == 'www_':
         folder = folder[len('www_'):]
     return os.path.join('downloaded_files', folder)

 # Create the root output directory up front; get_site_directory() places
 # every per-site directory beneath it. exist_ok=True keeps reruns from
 # failing when the directory already exists.
 os.makedirs('downloaded_files', exist_ok=True)

 def is_desired_type(url):
     """Return True when *url* points at a ``.js`` or ``.html`` resource.

     Only the path component is inspected, so a query string or fragment
     (for example the cache-busting ``app.js?v=3``) does not hide the
     extension, and an extension-looking value inside the query string is
     not mistaken for one.
     """
     return urlparse(url).path.endswith(('.js', '.html'))

 def is_same_domain(base_url, target_url):
     """Tell whether both URLs have exactly the same hostname.

     Hostnames come from ``urlparse`` and are compared for equality, so a
     subdomain counts as a different host than its parent domain.
     """
     hosts = [urlparse(link).hostname for link in (base_url, target_url)]
     return hosts[0] == hosts[1]

 # Crawl every URL in ``urls`` with headless Chromium, record the same-host
 # .js/.html requests each page makes, then download those resources into the
 # site's directory. Failures are reported and the crawl moves on to the next
 # URL/resource.
 with sync_playwright() as p:
     browser = p.chromium.launch(headless=True)
     context = browser.new_context()

     for url in urls:
         page = None
         try:
             print(f"\n[+] Visiting {url}")
             page = context.new_page()
             collected_urls = set()

             # Intercept all requests. The handler reads the loop's ``url``
             # and ``collected_urls``; the page is always closed in the
             # ``finally`` below, before the next iteration rebinds them.
             def handle_request(request):
                 req_url = request.url
                 try:
                     if is_desired_type(req_url) and is_same_domain(url, req_url):
                         collected_urls.add(req_url)
                 except Exception as e:
                     print(f"[-] Error in request handler: {e}")

             page.on("request", handle_request)

             # Load the page and wait until all requests are done
             try:
                 response = page.goto(url, wait_until="networkidle", timeout=30000)
                 if response and response.status != 200:
                     print(f"[-] Failed to load {url}: Status {response.status}")
                     continue
             except Exception as e:
                 print(f"[-] Error loading {url}: {e}")
                 continue

             # Both paths depend only on the page URL, so compute them once.
             site_dir = get_site_directory(url)
             base_url_path = os.path.join(site_dir, "base_urls.txt")

             # Download each collected JS/HTML file
             for resource_url in collected_urls:
                 try:
                     # NOTE(review): only the basename is kept, so two
                     # resources with the same file name in different paths
                     # overwrite each other inside site_dir.
                     filename = os.path.basename(urlparse(resource_url).path)
                     if not filename:
                         filename = "index.html" if resource_url.endswith(".html") else "file.js"

                     os.makedirs(site_dir, exist_ok=True)
                     filepath = os.path.join(site_dir, filename)

                     res = requests.get(resource_url, timeout=10)
                     if res.status_code != 200:
                         print(f"[-] Failed to download {resource_url}: Status {res.status_code}")
                         continue

                     # Write the raw bytes: decoding via res.text relies on a
                     # guessed charset and can corrupt the saved file.
                     with open(filepath, "wb") as f:
                         f.write(res.content)

                     # Save base URL for JS files
                     if filename.endswith('.js'):
                         # Read existing URLs to avoid duplicates
                         existing_urls = set()
                         if os.path.exists(base_url_path):
                             with open(base_url_path, "r", encoding="utf-8") as f:
                                 existing_urls = {line.strip() for line in f if line.strip()}

                         if url not in existing_urls:
                             with open(base_url_path, "a", encoding="utf-8") as f:
                                 f.write(f"{url}\n")
                             print(f"[✓] Added base URL for: {filename}")
                         else:
                             print(f"[✓] Base URL already exists for: {filename}")

                     print(f"[✓] Saved: {filepath}")

                 except Exception as e:
                     print(f"[-] Failed to download {resource_url}: {e}")

         except Exception as e:
             print(f"[-] Error processing {url}: {e}")
         finally:
             # Close the page on every path (including the early ``continue``
             # after a failed load) so pages do not pile up in the context.
             if page is not None:
                 try:
                     page.close()
                 except Exception as e:
                     print(f"[-] Error closing page for {url}: {e}")

     browser.close()
diff --git a/install_tools.sh b/install_tools.sh
 # Install the Python packages that download_js_files.py imports.
 for pkg in playwright requests; do
     python3 -m pip install "$pkg"
 done
 # Fetch the browsers Playwright drives (the script launches Chromium).
 python3 -m playwright install
	from playwright.sync_api import sync_playwright
	from urllib.parse import urlparse
	import os
	import requests

	# Read the list of page URLs to crawl, one per line.
	# Surrounding whitespace is stripped and blank lines are dropped so that an
	# empty entry (e.g. a trailing newline pair) is never visited as a URL.
	with open('comet_urls.txt') as f:
	    urls = [line.strip() for line in f if line.strip()]

	def get_site_directory(url):
	    """Create a directory name based on the site's domain.

	    Dots in the hostname become underscores and a leading ``www_`` is
	    removed; the name is joined onto ``downloaded_files``. When the URL has
	    no hostname, ``downloaded_files`` itself is returned.
	    """
	    domain = urlparse(url).hostname
	    if domain:
	        # Replace dots with underscores and remove www prefix if present
	        dir_name = domain.replace('.', '_')
	        if dir_name.startswith('www_'):
	            dir_name = dir_name[4:]
	        return os.path.join('downloaded_files', dir_name)
	    return 'downloaded_files'

	# Create the root output directory up front; get_site_directory() places
	# every per-site directory beneath it. exist_ok=True keeps reruns from
	# failing when the directory already exists.
	os.makedirs('downloaded_files', exist_ok=True)

	def is_desired_type(url):
	    """Return True when *url* points at a ``.js`` or ``.html`` resource.

	    Only the path component is inspected, so a query string or fragment
	    (for example the cache-busting ``app.js?v=3``) does not hide the
	    extension, and an extension-looking value inside the query string is
	    not mistaken for one.
	    """
	    return urlparse(url).path.endswith(('.js', '.html'))

	def is_same_domain(base_url, target_url):
	    """Return True when both URLs have exactly the same hostname.

	    Hostnames come from ``urlparse`` and are compared for equality, so a
	    subdomain counts as a different host than its parent domain.
	    """
	    base_domain = urlparse(base_url).hostname
	    target_domain = urlparse(target_url).hostname
	    return base_domain == target_domain

	# Crawl every URL in ``urls`` with headless Chromium, record the same-host
	# .js/.html requests each page makes, then download those resources into the
	# site's directory. Failures are reported and the crawl moves on to the next
	# URL/resource.
	with sync_playwright() as p:
	    browser = p.chromium.launch(headless=True)
	    context = browser.new_context()

	    for url in urls:
	        page = None
	        try:
	            print(f"\n[+] Visiting {url}")
	            page = context.new_page()
	            collected_urls = set()

	            # Intercept all requests. The handler reads the loop's ``url``
	            # and ``collected_urls``; the page is always closed in the
	            # ``finally`` below, before the next iteration rebinds them.
	            def handle_request(request):
	                req_url = request.url
	                try:
	                    if is_desired_type(req_url) and is_same_domain(url, req_url):
	                        collected_urls.add(req_url)
	                except Exception as e:
	                    print(f"[-] Error in request handler: {e}")

	            page.on("request", handle_request)

	            # Load the page and wait until all requests are done
	            try:
	                response = page.goto(url, wait_until="networkidle", timeout=30000)
	                if response and response.status != 200:
	                    print(f"[-] Failed to load {url}: Status {response.status}")
	                    continue
	            except Exception as e:
	                print(f"[-] Error loading {url}: {e}")
	                continue

	            # Both paths depend only on the page URL, so compute them once.
	            site_dir = get_site_directory(url)
	            base_url_path = os.path.join(site_dir, "base_urls.txt")

	            # Download each collected JS/HTML file
	            for resource_url in collected_urls:
	                try:
	                    # NOTE(review): only the basename is kept, so two
	                    # resources with the same file name in different paths
	                    # overwrite each other inside site_dir.
	                    filename = os.path.basename(urlparse(resource_url).path)
	                    if not filename:
	                        filename = "index.html" if resource_url.endswith(".html") else "file.js"

	                    os.makedirs(site_dir, exist_ok=True)
	                    filepath = os.path.join(site_dir, filename)

	                    res = requests.get(resource_url, timeout=10)
	                    if res.status_code != 200:
	                        print(f"[-] Failed to download {resource_url}: Status {res.status_code}")
	                        continue

	                    # Write the raw bytes: decoding via res.text relies on a
	                    # guessed charset and can corrupt the saved file.
	                    with open(filepath, "wb") as f:
	                        f.write(res.content)

	                    # Save base URL for JS files
	                    if filename.endswith('.js'):
	                        # Read existing URLs to avoid duplicates
	                        existing_urls = set()
	                        if os.path.exists(base_url_path):
	                            with open(base_url_path, "r", encoding="utf-8") as f:
	                                existing_urls = {line.strip() for line in f if line.strip()}

	                        if url not in existing_urls:
	                            with open(base_url_path, "a", encoding="utf-8") as f:
	                                f.write(f"{url}\n")
	                            print(f"[✓] Added base URL for: {filename}")
	                        else:
	                            print(f"[✓] Base URL already exists for: {filename}")

	                    print(f"[✓] Saved: {filepath}")

	                except Exception as e:
	                    print(f"[-] Failed to download {resource_url}: {e}")

	        except Exception as e:
	            print(f"[-] Error processing {url}: {e}")
	        finally:
	            # Close the page on every path (including the early ``continue``
	            # after a failed load) so pages do not pile up in the context.
	            if page is not None:
	                try:
	                    page.close()
	                except Exception as e:
	                    print(f"[-] Error closing page for {url}: {e}")

	    browser.close()
	# Install the Python packages that download_js_files.py imports.
	for pkg in playwright requests; do
	    python3 -m pip install "$pkg"
	done
	# Fetch the browsers Playwright drives (the script launches Chromium).
	python3 -m playwright install