Created
July 18, 2024 06:44
-
-
Save apeckham/1a21776b8610a536c700137e9ef88140 to your computer and use it in GitHub Desktop.
Extract the contents of a HAR file to disk, requesting each URL whose content is not included in the HAR.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import base64 | |
import os | |
import sys | |
from urllib.parse import urlparse | |
import requests | |
def fetch_url(url):
    """Download *url* with an HTTP GET and return the raw response body.

    The request uses a 10-second timeout. Any ``requests.RequestException``
    (including the ``HTTPError`` raised by ``raise_for_status`` for bad
    responses) is reported on stdout and turned into a ``None`` return value
    instead of propagating to the caller.
    """
    try:
        reply = requests.get(url, timeout=10)
        # Turn bad HTTP status codes into an exception so they are reported below.
        reply.raise_for_status()
        body = reply.content
        return body
    except requests.RequestException as exc:
        print(f"Error fetching {url}: {exc!s}")
        return None
def _har_entry_path(url):
    """Map *url* to a relative file path rooted in the current directory.

    The path is ``<netloc>/<url path>``. Empty, ``.`` and ``..`` segments are
    dropped so that a crafted URL cannot climb out of the working directory,
    and URLs that name a directory (empty path or trailing ``/``) are mapped
    to an ``index.html`` file inside that directory.
    """
    parsed_url = urlparse(url)
    raw_parts = parsed_url.path.split('/')
    # ''.split('/') == [''], so an empty path also counts as a directory URL.
    names_directory = raw_parts[-1] in ('', '.', '..')
    segments = [
        part
        for part in [parsed_url.netloc, *raw_parts]
        if part not in ('', '.', '..')
    ]
    if names_directory or not segments:
        segments.append('index.html')
    return os.path.join(*segments)


def extract_files_from_har(har_data):
    """Write the response body of every HAR entry to disk.

    For each entry in ``har_data['log']['entries']`` the request URL is mapped
    to ``<netloc>/<path>`` below the current working directory. Entries whose
    target already exists are skipped. The body is taken from
    ``response.content.text`` (base64-decoded when ``encoding`` is
    ``'base64'``, UTF-8 encoded otherwise); when the HAR holds no body the URL
    is fetched with ``fetch_url`` instead.

    Progress and per-entry errors are printed to stdout; a failure on one
    entry never aborts the remaining entries.

    Args:
        har_data: Parsed HAR document (the dict produced by ``json.load``).
    """
    for entry in har_data['log']['entries']:
        url = entry['request']['url']
        file_path = _har_entry_path(url)

        # Check if file already exists
        if os.path.exists(file_path):
            print(f"Skipping (file exists): {file_path}")
            continue

        # Get the file content; tolerate entries without a response/content.
        body = (entry.get('response') or {}).get('content') or {}
        content = body.get('text')
        if content:
            if body.get('encoding') == 'base64':
                try:
                    file_content = base64.b64decode(content)
                except ValueError as e:  # binascii.Error is a ValueError
                    print(f"Skipping (invalid base64): {url}: {str(e)}")
                    continue
            else:
                file_content = content.encode('utf-8')
        else:
            print(f"Fetching content for: {url}")
            file_content = fetch_url(url)
            if file_content is None:
                print(f"Skipping (fetch failed): {url}")
                continue

        # Create the directory and write the file. Both steps share one
        # handler so a file/directory name collision (e.g. "/a" then "/a/b")
        # is reported instead of aborting the whole extraction.
        parent_dir = os.path.dirname(file_path)
        try:
            if parent_dir:  # os.makedirs('') raises FileNotFoundError
                os.makedirs(parent_dir, exist_ok=True)
            with open(file_path, 'wb') as file:
                file.write(file_content)
        except OSError as e:
            print(f"Error writing file {file_path}: {str(e)}")
        else:
            print(f"Extracted: {file_path}")
if __name__ == "__main__":
    # The HAR document arrives as JSON on standard input; parse it and
    # hand the resulting structure straight to the extractor.
    extract_files_from_har(json.load(sys.stdin))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment