This is a video downloader for https://www.meta.ai/ posts. It uses Playwright to capture the page's network traffic, extract the video links, and download them.
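How it works: the script runs headless Chromium via Playwright and records every network request the post page makes into a HAR file (`record_har_path`). It then parses that HAR for responses whose URL contains `.mp4` and whose MIME type contains `video`, and downloads each unique match with `urllib`. Setup is the standard Playwright install (stated here as an assumption about your environment): `pip install playwright`, then `playwright install chromium`.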
```python
import json
import os
import re
import sys
import time
import urllib.request

from playwright.sync_api import sync_playwright

# --- Configuration ---
HAR_FILE = "meta.har"
OUTPUT_DIR = "downloads"
RETRY_LIMIT = 3
TARGET_URL = "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"
WAIT_TIME_MS = 0  # e.g. 5000: time to wait after page load to capture dynamic content

# --- Helper Functions ---
def clean_filename(name):
    """Cleans a string to be a safe filename."""
    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)

def download_file(url, path):
    """Downloads a file from a URL with retry and progress reporting."""
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            print(f"URL: {url}")
            with urllib.request.urlopen(url) as response, open(path, "wb") as f:
                total = int(response.headers.get("content-length", 0))
                downloaded = 0
                chunk_size = 8192
                start_time = time.time()
                sys.stdout.write(f"  Starting download (Size: {total/1024/1024:.2f} MB)...")
                sys.stdout.flush()
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total > 0:
                        percent = downloaded * 100 / total
                        # Calculate speed, adding a small epsilon to the denominator to prevent division by zero
                        speed = downloaded / (1024 * (time.time() - start_time + 0.001))
                        sys.stdout.write(f"\r  {percent:5.1f}% | {speed:6.1f} KB/s")
                        sys.stdout.flush()
                sys.stdout.write("\n")
            print(f"✅ Saved: {path}")
            return True
        except Exception as e:
            print(f"\n⚠️ Attempt {attempt}/{RETRY_LIMIT} failed: {e}")
            time.sleep(2)
    print(f"❌ Failed to download: {url}")
    return False

# --- Main Logic ---
def scrape_and_capture():
    """Uses Playwright to navigate, capture HAR, and close."""
    print(f"🚀 Starting Playwright to scrape: {TARGET_URL}")
    print(f"💾 Network traffic will be saved to: {HAR_FILE}")
    try:
        with sync_playwright() as p:
            # Use 'headless=True' for silent operation, 'headless=False' to see the browser
            browser = p.chromium.launch(headless=True)
            # This is the key part: record_har_path saves all network traffic to the file
            context = browser.new_context(record_har_path=HAR_FILE)
            page = context.new_page()
            page.goto(TARGET_URL, wait_until="networkidle")
            # Wait to ensure all dynamic elements (like videos) have loaded/started
            print(f"  Waiting {WAIT_TIME_MS/1000} seconds for content to load...")
            page.wait_for_timeout(WAIT_TIME_MS)
            context.close()
            browser.close()
        print("✅ Playwright session complete. HAR file created.")
    except Exception as e:
        print(f"❌ Playwright error: {e}")
        sys.exit(1)

def parse_har_and_download():
    """Parses the HAR file for video URLs and downloads them."""
    if not os.path.exists(HAR_FILE):
        print(f"❌ Missing HAR file: {HAR_FILE}. Did the scrape fail?")
        sys.exit(1)
    print(f"\n🎬 Parsing HAR file: {HAR_FILE}")
    with open(HAR_FILE, "r", encoding="utf-8") as f:
        har = json.load(f)
    entries = har.get("log", {}).get("entries", [])
    media_entries = []
    seen_urls = set()
    for entry in entries:
        url = entry.get("request", {}).get("url", "")
        mime = entry.get("response", {}).get("content", {}).get("mimeType", "")
        # Check for MP4 in the URL and 'video' in the MIME type, and ensure it's not a duplicate
        if ".mp4" in url and "video" in mime and url not in seen_urls:
            seen_urls.add(url)
            # bodySize is not reliable for streamed content, but kept for context/size estimate
            size = int(entry.get("response", {}).get("bodySize", 0))
            media_entries.append((url, size))
    if not media_entries:
        print("❌ No MP4 media URLs found in the HAR file.")
        return
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"📦 Found {len(media_entries)} video files. Starting downloads to './{OUTPUT_DIR}/'")
    for i, (url, size) in enumerate(media_entries, 1):
        # Create a clean filename from the last part of the URL (before any query params)
        url_part = url.split('/')[-1].split('?')[0]
        filename = f"{i:02d}_{clean_filename(url_part) or 'video.mp4'}"
        output_path = os.path.join(OUTPUT_DIR, filename)
        print(f"\n▶️ [{i}/{len(media_entries)}] Downloading {filename}")
        download_file(url, output_path)
    print("\n✅ All downloads completed!")
    # Optional: Clean up the generated HAR file
    # os.remove(HAR_FILE)
    # print(f"🗑️ Cleaned up HAR file: {HAR_FILE}")

if __name__ == "__main__":
    scrape_and_capture()
    parse_har_and_download()
```
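If the script reports that no MP4 URLs were found, the HAR file is still useful for diagnosis. Here is a minimal, illustrative sketch (not part of the gist) that lists the MIME type of every response captured in `meta.har`, so you can see what the page actually served:

```python
# inspect_har.py -- illustrative helper, not part of the original script.
# Prints the MIME type and URL of every response recorded in meta.har.
import json

with open("meta.har", encoding="utf-8") as f:
    har = json.load(f)

for entry in har.get("log", {}).get("entries", []):
    url = entry.get("request", {}).get("url", "")
    mime = entry.get("response", {}).get("content", {}).get("mimeType", "")
    print(f"{mime or '(no mimeType)':30} {url[:100]}")
```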
Below is an updated variant of the same script that accepts the target URL as a command-line argument, falling back to `DEFAULT_TARGET_URL` when none is given.
```python
import json
import os
import re
import sys
import time
import urllib.request

from playwright.sync_api import sync_playwright

# --- Configuration ---
HAR_FILE = "meta.har"
OUTPUT_DIR = "downloads"
RETRY_LIMIT = 3
# Default URL will be used if no argument is provided
DEFAULT_TARGET_URL = "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"
TARGET_URL = DEFAULT_TARGET_URL  # Will be updated in the __main__ block
WAIT_TIME_MS = 0

# --- Helper Functions ---
def clean_filename(name):
    """Cleans a string to be a safe filename."""
    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)

def download_file(url, path):
    """Downloads a file from a URL with retry and progress reporting."""
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            print(f"URL: {url}")
            with urllib.request.urlopen(url) as response, open(path, "wb") as f:
                total = int(response.headers.get("content-length", 0))
                downloaded = 0
                chunk_size = 8192
                start_time = time.time()
                sys.stdout.write(f"  Starting download (Size: {total/1024/1024:.2f} MB)...")
                sys.stdout.flush()
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total > 0:
                        percent = downloaded * 100 / total
                        # Calculate speed, adding a small epsilon to the denominator to prevent division by zero
                        speed = downloaded / (1024 * (time.time() - start_time + 0.001))
                        sys.stdout.write(f"\r  {percent:5.1f}% | {speed:6.1f} KB/s")
                        sys.stdout.flush()
                sys.stdout.write("\n")
            print(f"✅ Saved: {path}")
            return True
        except Exception as e:
            print(f"\n⚠️ Attempt {attempt}/{RETRY_LIMIT} failed: {e}")
            time.sleep(2)
    print(f"❌ Failed to download: {url}")
    return False

# --- Main Logic ---
def scrape_and_capture():
    """Uses Playwright to navigate, capture HAR, and close."""
    # TARGET_URL is now a global variable, which is updated in __main__
    global TARGET_URL
    print(f"🚀 Starting Playwright to scrape: {TARGET_URL}")
    print(f"💾 Network traffic will be saved to: {HAR_FILE}")
    try:
        with sync_playwright() as p:
            # Use 'headless=True' for silent operation, 'headless=False' to see the browser
            browser = p.chromium.launch(headless=True)
            # This is the key part: record_har_path saves all network traffic to the file
            context = browser.new_context(record_har_path=HAR_FILE)
            page = context.new_page()
            page.goto(TARGET_URL, wait_until="networkidle")
            # Wait to ensure all dynamic elements (like videos) have loaded/started
            print(f"  Waiting {WAIT_TIME_MS/1000} seconds for content to load...")
            page.wait_for_timeout(WAIT_TIME_MS)
            context.close()
            browser.close()
        print("✅ Playwright session complete. HAR file created.")
    except Exception as e:
        print(f"❌ Playwright error: {e}")
        sys.exit(1)

def parse_har_and_download():
    """Parses the HAR file for video URLs and downloads them."""
    if not os.path.exists(HAR_FILE):
        print(f"❌ Missing HAR file: {HAR_FILE}. Did the scrape fail?")
        sys.exit(1)
    print(f"\n🎬 Parsing HAR file: {HAR_FILE}")
    with open(HAR_FILE, "r", encoding="utf-8") as f:
        har = json.load(f)
    entries = har.get("log", {}).get("entries", [])
    media_entries = []
    seen_urls = set()
    for entry in entries:
        url = entry.get("request", {}).get("url", "")
        mime = entry.get("response", {}).get("content", {}).get("mimeType", "")
        # Check for MP4 in the URL and 'video' in the MIME type, and ensure it's not a duplicate
        if ".mp4" in url and "video" in mime and url not in seen_urls:
            seen_urls.add(url)
            # bodySize is not reliable for streamed content, but kept for context/size estimate
            size = int(entry.get("response", {}).get("bodySize", 0))
            media_entries.append((url, size))
    if not media_entries:
        print("❌ No MP4 media URLs found in the HAR file.")
        return
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"📦 Found {len(media_entries)} video files. Starting downloads to './{OUTPUT_DIR}/'")
    for i, (url, size) in enumerate(media_entries, 1):
        # Create a clean filename from the last part of the URL (before any query params)
        url_part = url.split('/')[-1].split('?')[0]
        filename = f"{i:02d}_{clean_filename(url_part) or 'video.mp4'}"
        output_path = os.path.join(OUTPUT_DIR, filename)
        print(f"\n▶️ [{i}/{len(media_entries)}] Downloading {filename}")
        download_file(url, output_path)
    print("\n✅ All downloads completed!")
    # Optional: Clean up the generated HAR file
    # os.remove(HAR_FILE)
    # print(f"🗑️ Cleaned up HAR file: {HAR_FILE}")

if __name__ == "__main__":
    # Check if a command-line argument for the URL was provided
    if len(sys.argv) > 1:
        # sys.argv[0] is the script name; sys.argv[1] is the first real argument
        new_url = sys.argv[1]
        print(f"✅ Using command-line URL: {new_url}")
        TARGET_URL = new_url
    else:
        print(f"ℹ️ No URL provided. Using default target URL: {TARGET_URL}")
    scrape_and_capture()
    parse_har_and_download()
```
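To use the variant, pass the post URL as the first argument (the filename `meta_ai_downloader.py` is hypothetical; save the script under any name): `python meta_ai_downloader.py "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"`. Run without an argument, it falls back to `DEFAULT_TARGET_URL`.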