-
-
Save FurloSK/0477e01024f701db42341fc3223a5d8c to your computer and use it in GitHub Desktop.
Python 3 script to extract images from HTTP Archive (HAR) files (Tested & working on Python 3.11)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Python 3 script to extract images from HTTP Archive (HAR) files (Tested & working on Python 3.11)
#
# Original code: kafran
# https://gist.github.com/kafran/0257c13b3d0a79620695b73062334930
# Updated code: Lewiscowles1986
# https://gist.github.com/Lewiscowles1986/645e79295efa84698f4e45cd06d610ea
# This code from: MrCheatEugene
# https://gist.github.com/MrCheatEugene/46ad8173e83efb70cf6543cb36629403
# Updated & tweaked by: FurloSK [originally developed 2023-11-03]
# https://gist.github.com/FurloSK/0477e01024f701db42341fc3223a5d8c
#
# Changes / Release notes:
# 2023-11-20
# Prevents overwriting files by appending number to them + displays warning in such case.
# 2023-11-03
# First working version.
import base64
import json
import os
import sys
from urllib.parse import urlsplit
# BEGIN config part

# Allowed mimetypes/extensions to be parsed:
#   Determines which mimetypes will be saved when parsing the *.har file.
#   The value is the extension appended to a file name that does not already
#   end with a recognised image extension.
mimetypes = {
    "image/webp": ".webp",
    "image/jpeg": ".jpeg",  # official extension, used when one has to be added
    "image/png": ".png",
    "image/svg+xml": ".svg",
}

# Additional extensions that are accepted as-is but never appended.
#   A dict cannot hold two values for the same key (the later one silently
#   replaces the earlier one), so the alternative *.jpg spelling is listed
#   here instead of as a second "image/jpeg" entry.
extraExtensions = (".jpg",)

# Output file path creation rules:
#   Determines whether to create subfolders for domain/path when exporting files.
#   When <outputPath> is True, <outputPathDepth> determines how many url path
#   parts (delimited by /) will be used, and from which end of url to start.
outputDomain = True
outputPath = True
outputPathDepth = 0  # 0=all, 2=first two path parts, -2=last two path parts

# END config part

# Characters replaced by "_" in generated file/folder names: both path
# separators plus the characters Windows does not allow in file names.
_UNSAFE_CHARS = '\\/:*?"<>|'

# File name used when the URL has no file name part (e.g. "https://host/").
_FALLBACK_NAME = "index"


def _normalize_mimetype(raw):
    """Return the bare, lower-cased media type of *raw* (parameters removed).

    ``"image/svg+xml; charset=utf-8"`` becomes ``"image/svg+xml"``; a missing
    value (``None``) becomes the empty string.
    """
    return (raw or "").split(";", 1)[0].strip().lower()


def _safe_component(name):
    """Return *name* made safe for use as a single file or folder name.

    ``.`` and ``..`` yield an empty string so a URL cannot climb out of the
    destination folder; unsafe and control characters are replaced by ``_``.
    """
    if name in (".", ".."):
        return ""
    return "".join(
        "_" if (ch in _UNSAFE_CHARS or ord(ch) < 32) else ch for ch in name
    )


def _split_url(url):
    """Split *url* into ``(domain, path_parts, filename)``.

    The query string and fragment are ignored. ``path_parts`` holds the
    sanitised directory segments and *filename* the sanitised last segment,
    which is empty when the URL has no path or ends with ``/``.
    """
    parts = urlsplit(url)
    if parts.scheme == "data":
        # A data: URL carries its payload inline and has no usable path.
        return "data", [], ""
    # Fall back to the scheme when there is no host (e.g. "file:///x.png").
    domain = _safe_component(parts.netloc or parts.scheme)
    rawSegments = parts.path.split("/")
    pathParts = [p for p in map(_safe_component, rawSegments[:-1]) if p]
    return domain, pathParts, _safe_component(rawSegments[-1])


def _ensure_extension(filename, mimetype, knownExtensions):
    """Return *filename* ending in an image extension.

    An empty name is replaced by a fallback name. A name that already ends
    (case-insensitively) with one of *knownExtensions* is returned unchanged;
    otherwise the extension configured for *mimetype* is appended.
    """
    if not filename:
        filename = _FALLBACK_NAME
    if filename.lower().endswith(knownExtensions):
        return filename
    return filename + mimetypes[mimetype]


def _relative_folder(domain, pathParts):
    """Build the subfolder (relative to the base folder) per the config.

    Returns an empty string when no subfolder applies, which also covers the
    case of an enabled <outputPath> with a URL that has no directory part.
    """
    components = []
    if outputDomain and domain:
        components.append(domain)
    if outputPath:
        if outputPathDepth > 0:
            components.extend(pathParts[:outputPathDepth])
        else:
            # 0 keeps everything, a negative depth keeps the trailing parts
            components.extend(pathParts[outputPathDepth:])
    return os.path.join(*components) if components else ''


def _decode_content(content):
    """Return the response body of a HAR ``content`` object as bytes.

    The text is base64-decoded only when the ``encoding`` field says so;
    otherwise it is plain text and is encoded as UTF-8. Returns ``None`` when
    there is no ``text`` or it is not valid base64.
    """
    text = content.get("text")
    if text is None:
        return None
    if (content.get("encoding") or "").lower() == "base64":
        try:
            return base64.b64decode(text)
        except ValueError:  # binascii.Error is a subclass of ValueError
            return None
    return text.encode("utf-8")


def _next_free_path(folder, filename):
    """Return ``(path, name)`` for the first unused "name (N).ext" in *folder*."""
    stem, ext = os.path.splitext(filename)
    counter = 1
    while True:
        candidate = stem + " (" + str(counter) + ")" + ext
        path = os.path.join(folder, candidate)
        if not os.path.exists(path):
            return path, candidate
        counter += 1


def extract_images(har, folder):
    """Write every allowed image found in *har* below *folder*.

    *har* is the parsed *.har document (a dict with ``log.entries``).
    Existing files are never overwritten; a numbered name is used instead.
    Entries without decodable content are reported and skipped.

    Returns ``(count_extracted, count_total)``: the number of files actually
    written and the number of entries inspected.
    """
    baseFolder = os.path.basename(os.path.normpath(folder))
    knownExtensions = tuple(mimetypes.values()) + tuple(extraExtensions)
    count_total = 0
    count_extracted = 0
    for entry in har["log"]["entries"]:
        count_total += 1
        content = entry["response"]["content"]
        # only parse the entry if it is one of the wanted mimetypes
        mimetype = _normalize_mimetype(content.get("mimeType"))
        if mimetype not in mimetypes:
            continue
        url = entry["request"]["url"]
        image = _decode_content(content)
        if image is None:
            shownUrl = url if len(url) <= 80 else url[:77] + '...'
            print(' ⚠️ WARNING: no decodable content for ' + shownUrl + ', skipping')
            continue
        # parse entry url and derive the target location
        urlDomain, urlPath, urlFilename = _split_url(url)
        urlFilename = _ensure_extension(urlFilename, mimetype, knownExtensions)
        pathStr = _relative_folder(urlDomain, urlPath)
        subFolder = os.path.join(folder, pathStr) if pathStr else folder
        os.makedirs(subFolder, exist_ok=True)
        outFile = os.path.join(subFolder, urlFilename)
        print(' ' + urlFilename + ' [' + str(len(image)) + ' bytes]:')
        print(' extracting to: ' + (pathStr if len(pathStr) else baseFolder))
        # never overwrite: pick a numbered name when the file already exists
        if os.path.exists(outFile):
            print(' ⚠️ WARNING: file ' + urlFilename + ' already exists!')
            outFile, fixFile = _next_free_path(subFolder, urlFilename)
            print(' ℹ️ Creating file ' + fixFile + '...')
        # save data to extracted file
        with open(outFile, "wb") as f:
            f.write(image)
        count_extracted += 1
    return count_extracted, count_total


def _print_settings():
    """Print the active output settings from the config part."""
    print('\nExtraction output settings:')
    if not outputDomain and not outputPath:
        print(' do not create any directory structure - extract images directly to base folder')
        return
    print(' create subfolders for domain: ' + str(outputDomain))
    if outputPath and outputPathDepth != 0:
        print(' create subfolders for URL path: ' + str(outputPath)
              + ' (only for ' + ('first ' if outputPathDepth > 0 else 'last ')
              + str(abs(outputPathDepth)) + ' parts)')
    else:
        print(' create subfolders for URL path: ' + str(outputPath))


def main(argv=None):
    """Run the extractor; *argv* defaults to ``sys.argv``.

    Returns the process exit code: 0 on success or after printing the usage
    text, 1 when the given *.har file does not exist.
    """
    argv = sys.argv if argv is None else argv
    # check cmd arguments
    if len(argv) < 2 or len(argv) > 3:
        print('extract_har: A simple script to extract all picture files from *.har file.')
        print('Usage: extract_har.py <input.har> [<output_dir>]')
        return 0
    # get *.har file to work with
    harPath = argv[1]
    if not os.path.exists(harPath):
        print('Specified *.har file (' + harPath + ') does not exist, exiting...')
        return 1
    # "utf-8-sig" reads plain UTF-8 and also skips a leading byte order mark,
    # which the json module would otherwise reject
    with open(harPath, "r", encoding="utf-8-sig") as f:
        print('Loading *.har file: ' + harPath)
        har = json.load(f)
    # get base directory for extraction
    if len(argv) == 3:
        folder = argv[2]
    else:
        folder = os.path.join(os.getcwd(), harPath + '_extract')
    print('Destination folder: ' + os.path.basename(os.path.normpath(folder)))
    if not os.path.isdir(folder):
        print(' Creating folder structure for extraction...')
        os.makedirs(folder)
    _print_settings()
    # start parsing the individual entries
    print('\nStarting extraction...')
    count_extracted, count_total = extract_images(har, folder)
    print('\nFinished extracting ' + str(count_extracted) + ' (out of total ' + str(count_total) + ') files.')
    return 0


#=============================================================================
# Start program
#=============================================================================
if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Lol, I love you and this. It's basically a different program to the original fork now.
I just updated my fork to contain more mime-types to image formats, using MDN and searching for
image/
https://gist.github.com/Lewiscowles1986/645e79295efa84698f4e45cd06d610ea#file-extract_har-py-L9-L21