Skip to content

Instantly share code, notes, and snippets.

@FurloSK
Forked from MrCheatEugene/extract_har.py
Last active November 20, 2023 10:36
Show Gist options
  • Save FurloSK/0477e01024f701db42341fc3223a5d8c to your computer and use it in GitHub Desktop.
Save FurloSK/0477e01024f701db42341fc3223a5d8c to your computer and use it in GitHub Desktop.
Python 3 script to extract images from HTTP Archive (HAR) files (Tested & working on Python 3.11)
#!/usr/bin/env python3
# Python 3 script to extract images from HTTP Archive (HAR) files (Tested & working on Python 3.11)
#
# Original code: kafran
# https://gist.github.com/kafran/0257c13b3d0a79620695b73062334930
# Updated code: Lewiscowles1986
# https://gist.github.com/Lewiscowles1986/645e79295efa84698f4e45cd06d610ea
# This code from: MrCheatEugene
# https://gist.github.com/MrCheatEugene/46ad8173e83efb70cf6543cb36629403
# Updated & tweaked by: FurloSK [originally developed 2023-11-03]
# https://gist.github.com/FurloSK/0477e01024f701db42341fc3223a5d8c
#
# Changes / Release notes:
# 2023-11-20
# Prevents overwriting files by appending number to them + displays warning in such case.
# 2023-11-03
# First working version.
import base64
import json
import os
import sys
from urllib.parse import urlsplit
# BEGIN config part
# Allowed mimetypes/extensions to be parsed:
#   Determines which mimetypes will be saved when parsing the *.har file.
#   Each mimetype maps to a tuple of accepted file extensions; the FIRST one is
#   the preferred extension, appended when the URL filename carries none of the
#   known extensions. (A plain string value is accepted as a single extension.)
#   NOTE: a dict cannot hold the same key twice - listing "image/jpeg" on two
#   lines silently drops the first one, hence the tuple.
mimetypes = {
    "image/webp": (".webp",),
    "image/jpeg": (".jpeg", ".jpg"),  # .jpeg is official and thus preferred
    "image/png": (".png",),
    "image/svg+xml": (".svg",),
}
# Output file path creation rules:
#   Determines whether to create subfolders for domain/path when exporting files.
#   When <outputPath> is True, <outputPathDepth> determines how many url path
#   parts (delimited by /) will be used, and from which end of url to start.
outputDomain = True
outputPath = True
outputPathDepth = 0  # 0=all, 2=first two path parts , -2=last two path parts
# END config part


def _extensions_for(mimetype):
    """Return the tuple of extensions configured for *mimetype* (preferred first)."""
    exts = mimetypes[mimetype]
    return (exts,) if isinstance(exts, str) else tuple(exts)


def _all_extensions():
    """Return a tuple with every extension configured in <mimetypes>."""
    return tuple(ext for mimetype in mimetypes for ext in _extensions_for(mimetype))


def _safe_part(part):
    """Return *part* made safe to use as a single path component.

    Empty parts and the special names "." / ".." yield '' so that a crafted
    URL cannot make the script write outside of the extraction folder.
    """
    part = part.replace("\\", "_")
    return '' if part in ('', '.', '..') else part


def _parse_url(url):
    """Split *url* into (domain, list of directory parts, filename).

    The query string and fragment are not part of the filename. URLs without
    a network location (e.g. data: URLs) use the scheme as "domain".
    """
    try:
        parts = urlsplit(url)
    except ValueError:  # malformed URL, e.g. unbalanced IPv6 brackets
        return 'unknown', [], ''
    domain = _safe_part(parts.netloc) or _safe_part(parts.scheme) or 'unknown'
    if parts.scheme == 'data':
        # the "path" of a data: URL is the payload itself, not a location
        return domain, [], ''
    segments = parts.path.split("/")
    urlFilename = _safe_part(segments[-1])
    urlPath = [safe for safe in map(_safe_part, segments[:-1]) if safe]
    return domain, urlPath, urlFilename


def _build_subpath(urlDomain, urlPath):
    """Return the relative subfolder for a file according to config settings."""
    parts = []
    if outputDomain:
        parts.append(urlDomain)
    if outputPath:
        if outputPathDepth > 0:
            parts.extend(urlPath[:outputPathDepth])
        else:
            parts.extend(urlPath[outputPathDepth:])
    # os.path.join() needs at least one argument, so guard the empty case
    return os.path.join(*parts) if parts else ''


def _decode_content(content):
    """Return the response body of a HAR <content> object as bytes.

    Returns None when no body was stored in the archive. Per the HAR format
    the <text> field is base64 only when <encoding> says so; otherwise it is
    plain text (typical for SVG images). Raises ValueError on broken base64.
    """
    text = content.get("text")
    if text is None:
        return None
    if content.get("encoding") == "base64":
        return base64.b64decode(text)
    return text.encode("utf-8")


def _unique_outfile(subFolder, urlFilename):
    """Return a path in *subFolder* for *urlFilename* that does not exist yet.

    When the plain name is taken, " (N)" is inserted before the extension and
    a warning is printed.
    """
    outFile = os.path.join(subFolder, urlFilename)
    if os.path.exists(outFile):
        print(' ⚠️ WARNING: file ' + urlFilename + ' already exists!')
        fixFilename, fixExtension = os.path.splitext(urlFilename)
        counter = 1
        while os.path.exists(outFile):
            fixFile = fixFilename + " (" + str(counter) + ")" + fixExtension
            outFile = os.path.join(subFolder, fixFile)
            counter += 1
        print(' ℹ️ Creating file ' + fixFile + '...')
    return outFile


def extract_entries(entries, folder, baseFolder=None):
    """Extract all images found in HAR *entries* into *folder*.

    *baseFolder* is only the display name printed for files that land directly
    in *folder*. Returns a tuple (count_total, count_extracted), where
    count_extracted counts only the files that were actually written.
    """
    if baseFolder is None:
        baseFolder = os.path.basename(os.path.normpath(folder))
    extensions = _all_extensions()
    count_total = 0
    count_extracted = 0
    for entry in entries:
        count_total += 1
        content = (entry.get("response") or {}).get("content") or {}
        # mimeType may carry parameters, e.g. "image/svg+xml; charset=utf-8"
        mimetype = (content.get("mimeType") or "").split(";", 1)[0].strip().lower()
        # only parse the entry if it is one of the wanted mimetypes
        if mimetype not in mimetypes:
            continue
        # parse entry url
        url = (entry.get("request") or {}).get("url") or ""
        urlDomain, urlPath, urlFilename = _parse_url(url)
        if not urlFilename:
            urlFilename = 'unnamed'
        # if filename does not contain expected extension, append preferred one
        if not urlFilename.lower().endswith(extensions):
            urlFilename = urlFilename + _extensions_for(mimetype)[0]
        # parse raw data to image data
        try:
            image = _decode_content(content)
        except ValueError as err:  # binascii.Error is a ValueError
            print(' ' + urlFilename + ': cannot decode stored data (' + str(err) + '), skipping')
            continue
        if image is None:
            print(' ' + urlFilename + ': no response body stored in *.har file, skipping')
            continue
        # determine subfolders structure according to config settings
        pathStr = _build_subpath(urlDomain, urlPath)
        subFolder = os.path.join(folder, pathStr) if pathStr else folder
        # prepare subfolder
        os.makedirs(subFolder, exist_ok=True)
        print(' ' + urlFilename + ' [' + str(len(image)) + ' bytes]:')
        print(' extracting to: ' + (pathStr if len(pathStr) else baseFolder))
        # check if file exists and pick a free name if it does
        outFile = _unique_outfile(subFolder, urlFilename)
        # save data to extracted file
        with open(outFile, "wb") as f:
            f.write(image)
        count_extracted += 1
    return count_total, count_extracted


def _print_settings():
    """Print the directory-structure settings in effect for this run."""
    print('\nExtraction output settings:')
    if not outputDomain and not outputPath:
        print(' do not create any directory structure - extract images directly to base folder')
        return
    print(' create subfolders for domain: ' + str(outputDomain))
    if outputPath and outputPathDepth != 0:
        print(' create subfolders for URL path: ' + str(outputPath)
              + ' (only for ' + ('first ' if outputPathDepth > 0 else 'last ')
              + str(abs(outputPathDepth)) + ' parts)')
    else:
        print(' create subfolders for URL path: ' + str(outputPath))


def main(argv=None):
    """Run the extraction for command line *argv* (defaults to sys.argv).

    Returns the process exit code: 0 on success or when only the usage text
    was shown, 1 when the *.har file is missing or cannot be parsed.
    """
    argv = sys.argv if argv is None else argv
    # check cmd arguments
    if len(argv) < 2 or len(argv) > 3:
        print('extract_har: A simple script to extract all picture files from *.har file.')
        print('Usage: extract_har.py <input.har> [<output_dir>]')
        return 0
    # get *.har file to work with
    harPath = argv[1]
    if not os.path.exists(harPath):
        print('Specified *.har file (' + harPath + ') does not exist, exiting...')
        return 1
    try:
        # utf-8-sig also accepts files saved with a byte order mark
        with open(harPath, "r", encoding="utf-8-sig") as f:
            print('Loading *.har file: ' + harPath)
            har = json.load(f)
        entries = har["log"]["entries"]
    except (ValueError, KeyError, TypeError) as err:
        print('Specified *.har file (' + harPath + ') is not a valid HTTP Archive: ' + repr(err))
        return 1
    # get base directory for extraction
    if len(argv) == 3:
        folder = argv[2]
    else:
        folder = os.path.join(os.getcwd(), harPath + '_extract')
    baseFolder = os.path.basename(os.path.normpath(folder))
    print('Destination folder: ' + baseFolder)
    if not os.path.isdir(folder):
        print(' Creating folder structure for extraction...')
        os.makedirs(folder)
    # print some info
    _print_settings()
    # start parsing the individual entries
    print('\nStarting extraction...')
    count_total, count_extracted = extract_entries(entries, folder, baseFolder)
    print('\nFinished extracting ' + str(count_extracted) + ' (out of total ' + str(count_total) + ') files.')
    return 0


#=============================================================================
# Start program
#=============================================================================
if __name__ == "__main__":
    sys.exit(main())
@MrCheatEugene
Copy link

hey, thanks for an update!

@Lewiscowles1986
Copy link

Lewiscowles1986 commented Nov 4, 2023

Lol, I love you and this. It's basically a different program to the original fork now.

I just updated my fork to contain more mime-types to image formats, using MDN and searching for image/

https://gist.github.com/Lewiscowles1986/645e79295efa84698f4e45cd06d610ea#file-extract_har-py-L9-L21

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment