@BoQsc
Last active October 21, 2025 19:26
This is a https://www.meta.ai/ video downloader. It uses Playwright to capture the media URLs requested by the page (via a HAR recording of the network traffic) and then downloads them.
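Typical setup and run, as a rough sketch (the script filename below is illustrative and not part of the gist; the two install commands are the standard Playwright setup steps, not something this gist specifies):

    pip install playwright
    playwright install chromium
    python meta_ai_video_downloader.py

Downloaded MP4 files are written to a ./downloads directory, and the network capture is saved as meta.har in the working directory.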
import json
import os
import re
import sys
import time
import urllib.request
from playwright.sync_api import sync_playwright
# --- Configuration ---
HAR_FILE = "meta.har"
OUTPUT_DIR = "downloads"
RETRY_LIMIT = 3
TARGET_URL = "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"
WAIT_TIME_MS = 0  # Time in ms to wait after page load to capture dynamic content (e.g. 5000)
# --- Helper Functions ---
def clean_filename(name):
    """Cleans a string to be a safe filename."""
    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)

def download_file(url, path):
    """Downloads a file from a URL with retry and progress reporting."""
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            print(f"URL: {url}")
            with urllib.request.urlopen(url) as response, open(path, "wb") as f:
                total = int(response.headers.get("content-length", 0))
                downloaded = 0
                chunk_size = 8192
                start_time = time.time()
                sys.stdout.write(f" Starting download (Size: {total/1024/1024:.2f} MB)...")
                sys.stdout.flush()
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total > 0:
                        percent = downloaded * 100 / total
                        # Calculate speed, adding a small epsilon to the denominator to prevent division by zero
                        speed = downloaded / (1024 * (time.time() - start_time + 0.001))
                        sys.stdout.write(f"\r {percent:5.1f}% | {speed:6.1f} KB/s")
                        sys.stdout.flush()
                sys.stdout.write("\n")
            print(f"✅ Saved: {path}")
            return True
        except Exception as e:
            print(f"\n⚠️ Attempt {attempt}/{RETRY_LIMIT} failed: {e}")
            time.sleep(2)
    print(f"❌ Failed to download: {url}")
    return False
# --- Main Logic ---
def scrape_and_capture():
    """Uses Playwright to navigate, capture HAR, and close."""
    print(f"🚀 Starting Playwright to scrape: {TARGET_URL}")
    print(f"💾 Network traffic will be saved to: {HAR_FILE}")
    try:
        with sync_playwright() as p:
            # Use 'headless=True' for silent operation, 'headless=False' to see the browser
            browser = p.chromium.launch(headless=True)
            # This is the key part: record_har_path saves all network traffic to the file
            context = browser.new_context(record_har_path=HAR_FILE)
            page = context.new_page()
            page.goto(TARGET_URL, wait_until="networkidle")
            # Wait a few seconds to ensure all dynamic elements (like videos) have loaded/started
            print(f" Waiting {WAIT_TIME_MS/1000} seconds for content to load...")
            page.wait_for_timeout(WAIT_TIME_MS)
            context.close()
            browser.close()
        print("✅ Playwright session complete. HAR file created.")
    except Exception as e:
        print(f"❌ Playwright error: {e}")
        sys.exit(1)
def parse_har_and_download():
    """Parses the HAR file for video URLs and downloads them."""
    if not os.path.exists(HAR_FILE):
        print(f"❌ Missing HAR file: {HAR_FILE}. Did the scrape fail?")
        sys.exit(1)
    print(f"\n🔬 Parsing HAR file: {HAR_FILE}")
    with open(HAR_FILE, "r", encoding="utf-8") as f:
        har = json.load(f)
    entries = har.get("log", {}).get("entries", [])
    media_entries = []
    seen_urls = set()
    for entry in entries:
        url = entry.get("request", {}).get("url", "")
        mime = entry.get("response", {}).get("content", {}).get("mimeType", "")
        # Check for MP4 in the URL and 'video' in the MIME type, and ensure it's not a duplicate
        if ".mp4" in url and "video" in mime and url not in seen_urls:
            seen_urls.add(url)
            # bodySize is not reliable for streamed content, but kept for context/size estimate
            size = int(entry.get("response", {}).get("bodySize", 0))
            media_entries.append((url, size))
    if not media_entries:
        print("❌ No MP4 media URLs found in the HAR file.")
        return
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"📦 Found {len(media_entries)} video files. Starting downloads to './{OUTPUT_DIR}/'")
    for i, (url, size) in enumerate(media_entries, 1):
        # Create a clean filename from the last part of the URL (before any query params)
        url_part = url.split('/')[-1].split('?')[0]
        filename = f"{i:02d}_{clean_filename(url_part) or 'video.mp4'}"
        output_path = os.path.join(OUTPUT_DIR, filename)
        print(f"\n▶️ [{i}/{len(media_entries)}] Downloading {filename}")
        download_file(url, output_path)
    print("\n✅ All downloads completed!")
    # Optional: Clean up the generated HAR file
    # os.remove(HAR_FILE)
    # print(f"🗑️ Cleaned up HAR file: {HAR_FILE}")
if __name__ == "__main__":
    scrape_and_capture()
    parse_har_and_download()
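The listing below is a second variant of the same script; the only difference is that the target post URL can be passed as the first command-line argument instead of editing TARGET_URL in the source.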
import json
import os
import re
import sys
import time
import urllib.request
from playwright.sync_api import sync_playwright
# --- Configuration ---
HAR_FILE = "meta.har"
OUTPUT_DIR = "downloads"
RETRY_LIMIT = 3
# Default URL will be used if no argument is provided
DEFAULT_TARGET_URL = "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"
TARGET_URL = DEFAULT_TARGET_URL # Will be updated in the __main__ block
WAIT_TIME_MS = 0
# --- Helper Functions ---
def clean_filename(name):
    """Cleans a string to be a safe filename."""
    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)

def download_file(url, path):
    """Downloads a file from a URL with retry and progress reporting."""
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            print(f"URL: {url}")
            with urllib.request.urlopen(url) as response, open(path, "wb") as f:
                total = int(response.headers.get("content-length", 0))
                downloaded = 0
                chunk_size = 8192
                start_time = time.time()
                sys.stdout.write(f" Starting download (Size: {total/1024/1024:.2f} MB)...")
                sys.stdout.flush()
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total > 0:
                        percent = downloaded * 100 / total
                        # Calculate speed, adding a small epsilon to the denominator to prevent division by zero
                        speed = downloaded / (1024 * (time.time() - start_time + 0.001))
                        sys.stdout.write(f"\r {percent:5.1f}% | {speed:6.1f} KB/s")
                        sys.stdout.flush()
                sys.stdout.write("\n")
            print(f"✅ Saved: {path}")
            return True
        except Exception as e:
            print(f"\n⚠️ Attempt {attempt}/{RETRY_LIMIT} failed: {e}")
            time.sleep(2)
    print(f"❌ Failed to download: {url}")
    return False
# --- Main Logic ---
def scrape_and_capture():
    """Uses Playwright to navigate, capture HAR, and close."""
    # TARGET_URL is a global variable, updated in the __main__ block
    global TARGET_URL
    print(f"🚀 Starting Playwright to scrape: {TARGET_URL}")
    print(f"💾 Network traffic will be saved to: {HAR_FILE}")
    try:
        with sync_playwright() as p:
            # Use 'headless=True' for silent operation, 'headless=False' to see the browser
            browser = p.chromium.launch(headless=True)
            # This is the key part: record_har_path saves all network traffic to the file
            context = browser.new_context(record_har_path=HAR_FILE)
            page = context.new_page()
            page.goto(TARGET_URL, wait_until="networkidle")
            # Wait a few seconds to ensure all dynamic elements (like videos) have loaded/started
            print(f" Waiting {WAIT_TIME_MS/1000} seconds for content to load...")
            page.wait_for_timeout(WAIT_TIME_MS)
            context.close()
            browser.close()
        print("✅ Playwright session complete. HAR file created.")
    except Exception as e:
        print(f"❌ Playwright error: {e}")
        sys.exit(1)
def parse_har_and_download():
    """Parses the HAR file for video URLs and downloads them."""
    if not os.path.exists(HAR_FILE):
        print(f"❌ Missing HAR file: {HAR_FILE}. Did the scrape fail?")
        sys.exit(1)
    print(f"\n🔬 Parsing HAR file: {HAR_FILE}")
    with open(HAR_FILE, "r", encoding="utf-8") as f:
        har = json.load(f)
    entries = har.get("log", {}).get("entries", [])
    media_entries = []
    seen_urls = set()
    for entry in entries:
        url = entry.get("request", {}).get("url", "")
        mime = entry.get("response", {}).get("content", {}).get("mimeType", "")
        # Check for MP4 in the URL and 'video' in the MIME type, and ensure it's not a duplicate
        if ".mp4" in url and "video" in mime and url not in seen_urls:
            seen_urls.add(url)
            # bodySize is not reliable for streamed content, but kept for context/size estimate
            size = int(entry.get("response", {}).get("bodySize", 0))
            media_entries.append((url, size))
    if not media_entries:
        print("❌ No MP4 media URLs found in the HAR file.")
        return
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"📦 Found {len(media_entries)} video files. Starting downloads to './{OUTPUT_DIR}/'")
    for i, (url, size) in enumerate(media_entries, 1):
        # Create a clean filename from the last part of the URL (before any query params)
        url_part = url.split('/')[-1].split('?')[0]
        filename = f"{i:02d}_{clean_filename(url_part) or 'video.mp4'}"
        output_path = os.path.join(OUTPUT_DIR, filename)
        print(f"\n▶️ [{i}/{len(media_entries)}] Downloading {filename}")
        download_file(url, output_path)
    print("\n✅ All downloads completed!")
    # Optional: Clean up the generated HAR file
    # os.remove(HAR_FILE)
    # print(f"🗑️ Cleaned up HAR file: {HAR_FILE}")
if __name__ == "__main__":
    # Check whether a command-line argument for the URL was provided
    if len(sys.argv) > 1:
        # sys.argv[0] is the script name; sys.argv[1] is the first user-supplied argument
        new_url = sys.argv[1]
        print(f"✅ Using command-line URL: {new_url}")
        TARGET_URL = new_url
    else:
        print(f"ℹ️ No URL provided. Using default target URL: {TARGET_URL}")
    scrape_and_capture()
    parse_har_and_download()
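Example invocation of the argument-aware variant, using the post URL that appears as the default in the script (the filename is again illustrative):

    python meta_ai_video_downloader.py "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"

If no URL is given, the script falls back to DEFAULT_TARGET_URL.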