@BoQsc
Last active October 21, 2025 19:26
This is a https://www.meta.ai/ video downloader. It uses Playwright to capture the media URLs requested by the page (via a HAR recording of the network traffic) and then downloads them.
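Typical setup and run, as a rough sketch (the script filename below is illustrative and not part of the gist; the two install commands are the standard Playwright setup steps, not something this gist specifies):

    pip install playwright
    playwright install chromium
    python meta_ai_video_downloader.py

Downloaded MP4 files are written to a ./downloads directory, and the network capture is saved as meta.har in the working directory.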
import json
import os
import re
import sys
import time
import urllib.request
from playwright.sync_api import sync_playwright
# --- Configuration ---
HAR_FILE = "meta.har"
OUTPUT_DIR = "downloads"
RETRY_LIMIT = 3
TARGET_URL = "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"
WAIT_TIME_MS = 0  # Time in ms to wait after page load to capture dynamic content (e.g. 5000)
# --- Helper Functions ---
def clean_filename(name):
    """Cleans a string to be a safe filename."""
    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)

def download_file(url, path):
    """Downloads a file from a URL with retry and progress reporting."""
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            print(f"URL: {url}")
            with urllib.request.urlopen(url) as response, open(path, "wb") as f:
                total = int(response.headers.get("content-length", 0))
                downloaded = 0
                chunk_size = 8192
                start_time = time.time()
                sys.stdout.write(f" Starting download (Size: {total/1024/1024:.2f} MB)...")
                sys.stdout.flush()
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total > 0:
                        percent = downloaded * 100 / total
                        # Calculate speed, adding a small epsilon to the denominator to prevent division by zero
                        speed = downloaded / (1024 * (time.time() - start_time + 0.001))
                        sys.stdout.write(f"\r {percent:5.1f}% | {speed:6.1f} KB/s")
                        sys.stdout.flush()
                sys.stdout.write("\n")
            print(f"✅ Saved: {path}")
            return True
        except Exception as e:
            print(f"\n⚠️ Attempt {attempt}/{RETRY_LIMIT} failed: {e}")
            time.sleep(2)
    print(f"❌ Failed to download: {url}")
    return False
# --- Main Logic ---
def scrape_and_capture():
    """Uses Playwright to navigate, capture HAR, and close."""
    print(f"🚀 Starting Playwright to scrape: {TARGET_URL}")
    print(f"💾 Network traffic will be saved to: {HAR_FILE}")
    try:
        with sync_playwright() as p:
            # Use 'headless=True' for silent operation, 'headless=False' to see the browser
            browser = p.chromium.launch(headless=True)
            # This is the key part: record_har_path saves all network traffic to the file
            context = browser.new_context(record_har_path=HAR_FILE)
            page = context.new_page()
            page.goto(TARGET_URL, wait_until="networkidle")
            # Wait a few seconds to ensure all dynamic elements (like videos) have loaded/started
            print(f" Waiting {WAIT_TIME_MS/1000} seconds for content to load...")
            page.wait_for_timeout(WAIT_TIME_MS)
            context.close()
            browser.close()
        print("✅ Playwright session complete. HAR file created.")
    except Exception as e:
        print(f"❌ Playwright error: {e}")
        sys.exit(1)
def parse_har_and_download():
    """Parses the HAR file for video URLs and downloads them."""
    if not os.path.exists(HAR_FILE):
        print(f"❌ Missing HAR file: {HAR_FILE}. Did the scrape fail?")
        sys.exit(1)
    print(f"\n🔬 Parsing HAR file: {HAR_FILE}")
    with open(HAR_FILE, "r", encoding="utf-8") as f:
        har = json.load(f)
    entries = har.get("log", {}).get("entries", [])
    media_entries = []
    seen_urls = set()
    for entry in entries:
        url = entry.get("request", {}).get("url", "")
        mime = entry.get("response", {}).get("content", {}).get("mimeType", "")
        # Check for MP4 in the URL and 'video' in the MIME type, and ensure it's not a duplicate
        if ".mp4" in url and "video" in mime and url not in seen_urls:
            seen_urls.add(url)
            # bodySize is not reliable for streamed content, but kept for context/size estimate
            size = int(entry.get("response", {}).get("bodySize", 0))
            media_entries.append((url, size))
    if not media_entries:
        print("❌ No MP4 media URLs found in the HAR file.")
        return
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"📦 Found {len(media_entries)} video files. Starting downloads to './{OUTPUT_DIR}/'")
    for i, (url, size) in enumerate(media_entries, 1):
        # Create a clean filename from the last part of the URL (before any query params)
        url_part = url.split('/')[-1].split('?')[0]
        filename = f"{i:02d}_{clean_filename(url_part) or 'video.mp4'}"
        output_path = os.path.join(OUTPUT_DIR, filename)
        print(f"\n▶️ [{i}/{len(media_entries)}] Downloading {filename}")
        download_file(url, output_path)
    print("\n✅ All downloads completed!")
    # Optional: Clean up the generated HAR file
    # os.remove(HAR_FILE)
    # print(f"🗑️ Cleaned up HAR file: {HAR_FILE}")
if __name__ == "__main__":
    scrape_and_capture()
    parse_har_and_download()
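The listing below is a second variant of the same script; the only difference is that the target post URL can be passed as the first command-line argument instead of editing TARGET_URL in the source.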
import json
import os
import re
import sys
import time
import urllib.request
from playwright.sync_api import sync_playwright
# --- Configuration ---
HAR_FILE = "meta.har"
OUTPUT_DIR = "downloads"
RETRY_LIMIT = 3
# Default URL will be used if no argument is provided
DEFAULT_TARGET_URL = "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"
TARGET_URL = DEFAULT_TARGET_URL # Will be updated in the __main__ block
WAIT_TIME_MS = 0
# --- Helper Functions ---
def clean_filename(name):
    """Cleans a string to be a safe filename."""
    return re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)

def download_file(url, path):
    """Downloads a file from a URL with retry and progress reporting."""
    for attempt in range(1, RETRY_LIMIT + 1):
        try:
            print(f"URL: {url}")
            with urllib.request.urlopen(url) as response, open(path, "wb") as f:
                total = int(response.headers.get("content-length", 0))
                downloaded = 0
                chunk_size = 8192
                start_time = time.time()
                sys.stdout.write(f" Starting download (Size: {total/1024/1024:.2f} MB)...")
                sys.stdout.flush()
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total > 0:
                        percent = downloaded * 100 / total
                        # Calculate speed, adding a small epsilon to the denominator to prevent division by zero
                        speed = downloaded / (1024 * (time.time() - start_time + 0.001))
                        sys.stdout.write(f"\r {percent:5.1f}% | {speed:6.1f} KB/s")
                        sys.stdout.flush()
                sys.stdout.write("\n")
            print(f"✅ Saved: {path}")
            return True
        except Exception as e:
            print(f"\n⚠️ Attempt {attempt}/{RETRY_LIMIT} failed: {e}")
            time.sleep(2)
    print(f"❌ Failed to download: {url}")
    return False
# --- Main Logic ---
def scrape_and_capture():
    """Uses Playwright to navigate, capture HAR, and close."""
    # TARGET_URL is a global variable, updated in the __main__ block
    global TARGET_URL
    print(f"🚀 Starting Playwright to scrape: {TARGET_URL}")
    print(f"💾 Network traffic will be saved to: {HAR_FILE}")
    try:
        with sync_playwright() as p:
            # Use 'headless=True' for silent operation, 'headless=False' to see the browser
            browser = p.chromium.launch(headless=True)
            # This is the key part: record_har_path saves all network traffic to the file
            context = browser.new_context(record_har_path=HAR_FILE)
            page = context.new_page()
            page.goto(TARGET_URL, wait_until="networkidle")
            # Wait a few seconds to ensure all dynamic elements (like videos) have loaded/started
            print(f" Waiting {WAIT_TIME_MS/1000} seconds for content to load...")
            page.wait_for_timeout(WAIT_TIME_MS)
            context.close()
            browser.close()
        print("✅ Playwright session complete. HAR file created.")
    except Exception as e:
        print(f"❌ Playwright error: {e}")
        sys.exit(1)
def parse_har_and_download():
    """Parses the HAR file for video URLs and downloads them."""
    if not os.path.exists(HAR_FILE):
        print(f"❌ Missing HAR file: {HAR_FILE}. Did the scrape fail?")
        sys.exit(1)
    print(f"\n🔬 Parsing HAR file: {HAR_FILE}")
    with open(HAR_FILE, "r", encoding="utf-8") as f:
        har = json.load(f)
    entries = har.get("log", {}).get("entries", [])
    media_entries = []
    seen_urls = set()
    for entry in entries:
        url = entry.get("request", {}).get("url", "")
        mime = entry.get("response", {}).get("content", {}).get("mimeType", "")
        # Check for MP4 in the URL and 'video' in the MIME type, and ensure it's not a duplicate
        if ".mp4" in url and "video" in mime and url not in seen_urls:
            seen_urls.add(url)
            # bodySize is not reliable for streamed content, but kept for context/size estimate
            size = int(entry.get("response", {}).get("bodySize", 0))
            media_entries.append((url, size))
    if not media_entries:
        print("❌ No MP4 media URLs found in the HAR file.")
        return
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"📦 Found {len(media_entries)} video files. Starting downloads to './{OUTPUT_DIR}/'")
    for i, (url, size) in enumerate(media_entries, 1):
        # Create a clean filename from the last part of the URL (before any query params)
        url_part = url.split('/')[-1].split('?')[0]
        filename = f"{i:02d}_{clean_filename(url_part) or 'video.mp4'}"
        output_path = os.path.join(OUTPUT_DIR, filename)
        print(f"\n▶️ [{i}/{len(media_entries)}] Downloading {filename}")
        download_file(url, output_path)
    print("\n✅ All downloads completed!")
    # Optional: Clean up the generated HAR file
    # os.remove(HAR_FILE)
    # print(f"🗑️ Cleaned up HAR file: {HAR_FILE}")
if __name__ == "__main__":
    # Check whether a command-line argument for the URL was provided
    if len(sys.argv) > 1:
        # sys.argv[0] is the script name; sys.argv[1] is the first user-supplied argument
        new_url = sys.argv[1]
        print(f"✅ Using command-line URL: {new_url}")
        TARGET_URL = new_url
    else:
        print(f"ℹ️ No URL provided. Using default target URL: {TARGET_URL}")
    scrape_and_capture()
    parse_har_and_download()
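Example invocation of the argument-aware variant, using the post URL that appears as the default in the script (the filename is again illustrative):

    python meta_ai_video_downloader.py "https://www.meta.ai/@pizzaslime/post/QK3IIkGOmpA/daily-reminder/"

If no URL is given, the script falls back to DEFAULT_TARGET_URL.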