Last active
March 16, 2019 15:34
-
-
Save pedramamini/54df2648a1b73adf9a0d6d0b1a75ca0a to your computer and use it in GitHub Desktop.
Carve files out of a blob.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# source: https://gist.github.com/pedramamini/54df2648a1b73adf9a0d6d0b1a75ca0a | |
import os | |
import re | |
import sys | |
import errno | |
import string | |
# debug output, switched on by the "-v" command line flag.
DEBUG = False

# positional indexes into each CARVE_PAIRINGS entry.
START     = 0
END       = 1
EXTENSION = 2

# add start/end markings to this datastructure.
#   - each label maps to: [start marker, end marker, file extension].
#   - don't use None for blank entries, use "". a blank end marker means "carve to the end of the data".
#   - markers are in regular expression format!
CARVE_PAIRINGS = {
    "JPG-NORMAL": ["\xff\xd8\xff(\xe0|\xe1|\xfe)", "\xff\xd9", "jpg"],
    "JPG-WIDE":   ["\xff\xd8\xff",                 "\xff\xd9", "jpg"],
    "JPG-END":    ["\xff\xd8\xff",                 "",         "jpg"],
    "PNG":        ["\x89PNG",                      "",         "png"],
    "GIF":        ["GIF8(9|7)a",                   "",         "gif"],
    "CDF-OLE":    ["\xD0\xCF\x11\xE0",             "",         "ole"],
    "ZIP":        ["PK\x03\x04",                   "",         "zip"],
}
######################################################################################################################## | |
def usage (msg=None):
    """
    Write the command line synopsis and the table of supported carve types to stderr, optionally followed by an
    error message, then terminate the process with exit status 1.

    @type  msg: str
    @param msg: (Optional, def=None) error message to append to the usage text.
    """

    emit = sys.stderr.write

    emit("USAGE: %s </path/to/input/file> [</path/to/output/directory>] [-v]\n" % sys.argv[0])
    emit("\nCarve start/end marker pairings out of a file. Supported types:\n")

    # one row per carve type; a blank end marker is displayed as None.
    for label, markers in CARVE_PAIRINGS.items():
        start_text = hexify(markers[START], True)

        if markers[END]:
            end_text = hexify(markers[END], True)
        else:
            end_text = None

        emit("%20s start: %-20s end:%5s\n" % (label, start_text, end_text))

    if msg:
        emit("\nError: %s\n" % msg)

    sys.exit(1)
######################################################################################################################## | |
def mkdir_p (path):
    """
    Create a directory along with any missing parent directories, in the spirit of "mkdir -p".

    @type  path: str
    @param path: directory path to create.

    @raise OSError: if creation fails for any reason other than the directory already existing. this includes
                    the case where 'path' already exists but is not a directory.
    """

    try:
        os.makedirs(path)
    except OSError as e:
        # an already existing directory is fine. an existing regular file at 'path' also reports EEXIST, but
        # callers are about to write files *into* this path, so that case must not be silently ignored.
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise
######################################################################################################################## | |
def hexify (s, preserve_printables=False):
    """
    Render a string as a run of two-digit lowercase hex values.

    @type  s:                   str, bytes or bytearray
    @param s:                   data to render.
    @type  preserve_printables: bool
    @param preserve_printables: (Optional, def=False) emit characters found in string.printable as-is instead of
                                as hex.

    @rtype:  str
    @return: hex representation of 's'.
    """

    pieces = []

    for item in s:
        # iterating bytes / bytearray yields integers, iterating a string yields one-character strings.
        if isinstance(item, int):
            code, char = item, chr(item)
        else:
            code, char = ord(item), item

        if preserve_printables and char in string.printable:
            pieces.append(char)
        else:
            pieces.append("%02x" % code)

    return "".join(pieces)
######################################################################################################################## | |
def hex_dump (data, addr=0, prefix=""):
    """
    Produce a classic hex dump of 'data': 16 bytes per row, each row showing the address, the hex values and the
    printable ASCII rendering (non-printables shown as ".").

    @type  data:   str, bytes or bytearray
    @param data:   data to dump.
    @type  addr:   int
    @param addr:   (Optional, def=0) address of the first byte. a new row starts whenever the address is a
                   multiple of 16.
    @type  prefix: str
    @param prefix: (Optional, def="") string to place at the start of every row.

    @rtype:  str
    @return: newline terminated hex dump.
    """

    def printable (codes):
        # ASCII column for one row.
        return "".join(chr(code) if 32 <= code <= 126 else "." for code in codes)

    pieces = [prefix]
    row    = []

    for item in data:
        if addr % 16 == 0:
            # close out the previous row with its ASCII column, then open a new row with its address.
            pieces.append(" " + printable(row))
            pieces.append("\n%s%04x: " % (prefix, addr))
            row = []

        # iterating bytes / bytearray yields integers, iterating a string yields one-character strings.
        code = item if isinstance(item, int) else ord(item)

        pieces.append("%02x " % code)
        row.append(code)
        addr += 1

    remainder = addr % 16

    if remainder != 0:
        # every missing byte occupies three columns ("xx "), pad accordingly so the ASCII column lines up.
        pieces.append("   " * (16 - remainder) + " ")
    elif row:
        # a full final row still needs the separator that the loop only adds when the next row begins.
        pieces.append(" ")

    pieces.append(printable(row))

    return "".join(pieces) + "\n"
######################################################################################################################## | |
def find_all (needle, haystack, include_marker=False):
    """
    Locate every non-overlapping match of a regular expression within a blob.

    @type  needle:         str
    @param needle:         marker to search for, in regular expression format.
    @type  haystack:       str
    @param haystack:       data to search.
    @type  include_marker: bool
    @param include_marker: (Optional, def=False) report the offset just past each match instead of the offset
                           at which the match starts.

    @rtype:  list
    @return: offsets, in ascending order.
    """

    indexes = []

    for match in re.finditer(needle, haystack):
        # NOTE: the needle is a regular expression, so the pattern length says nothing about how many bytes
        #       were matched. ask the match object where it ends instead of adding len(needle).
        found = match.end() if include_marker else match.start()

        indexes.append(found)

        if DEBUG:
            print("found needle '%s' in haystack at offset %04x" % (hexify(needle), found))

    return indexes
######################################################################################################################## | |
def commify (number):
    """
    Insert thousands separators into the leading run of digits of a number.

    @type  number: int (anything str() accepts)
    @param number: value to format.

    @rtype:  str
    @return: string form of 'number' with commas added, ex: 1234567 becomes "1,234,567".
    """

    text     = str(number)
    grouping = re.compile(r"^(-?\d+)(\d{3})")

    # each pass splits one more group of three digits off the leading run, stop once nothing changes.
    while True:
        text, replaced = grouping.subn(r"\1,\2", text)

        if not replaced:
            return text
######################################################################################################################## | |
def carve_helper (starters, enders, data, output_dir, marker_kind):
    """
    Write one file per (start, end) offset pairing where the start lies before the end.

    @type  starters:    list
    @param starters:    start offsets into 'data'.
    @type  enders:      list
    @param enders:      end offsets (exclusive) into 'data'.
    @type  data:        str
    @param data:        blob to carve from.
    @type  output_dir:  str
    @param output_dir:  directory to write carvings into.
    @type  marker_kind: str
    @param marker_kind: CARVE_PAIRINGS label, used for the file name and to look up the file extension.
    """

    # enumerate() hands us each offset's position directly, no need for a list.index() scan per pairing.
    for start_index, s in enumerate(starters):
        for end_index, e in enumerate(enders):
            # start must be before end.
            if not s < e:
                continue

            # carvings are tagged by their type followed by their start/end index.
            carving_name  = "%s-%02d-%02d.%s"
            carving_name %= marker_kind, start_index, end_index, CARVE_PAIRINGS[marker_kind][EXTENSION]
            carving_path  = os.path.join(output_dir, carving_name)

            # 'e' is an exclusive bound. callers pass end offsets that already sit past the end marker, so
            # the full length of the end marker is included.
            chunk = data[s:e]

            print("writing %s bytes from %08x to %08x to %s" % (commify(len(chunk)), s, e, carving_path))

            with open(carving_path, "wb+") as fh:
                fh.write(chunk)
######################################################################################################################## | |
def carve_all (path=None, data=None, output_dir=None):
    """
    Carve every type described in CARVE_PAIRINGS out of a file or an in-memory blob.

    @type  path:       str
    @param path:       (Optional, def=None) file to read the data from. takes precedence over 'data'.
    @type  data:       str
    @param data:       (Optional, def=None) blob to carve, used when no 'path' is given.
    @type  output_dir: str
    @param output_dir: (Optional, def=None) directory to write carvings into. when omitted, the module level
                       'output_dir' set by the command line entry point is used, or "./" if there is none.

    @raise Exception: if neither 'path' nor 'data' is specified.
    """

    if path is None and data is None:
        raise Exception("carve_all() either 'path' or 'data' optargs must be specified.")

    # stay compatible with callers that set the module level global instead of passing the directory in.
    if output_dir is None:
        output_dir = globals().get("output_dir", "./")

    if path:
        # read from the 'path' argument, not from a global that only exists when run as a script.
        with open(path, "rb") as fh:
            data = fh.read()

    # walk the carving pairing datastructure and carve away.
    for kind, marker in CARVE_PAIRINGS.items():
        print(">>>>> carving for %s" % kind)

        # there's always a start.
        starters = find_all(marker[START], data)

        # use end of file, if no end marker is defined.
        if marker[END]:
            enders = find_all(marker[END], data, include_marker=True)
        else:
            enders = [len(data)]

        if DEBUG:
            print("\n>>> %s START MARKERS (dump from marker)" % kind)

            for index in starters:
                print("%s %s %s" % (index, hex(index), hex_dump(data[index : index + 64])))

            print("\n>>> %s END MARKERS (dump to marker)" % kind)

            marker_length = len(marker[END])

            for index in enders:
                # clamp at zero, a negative slice start would wrap around to the end of the data.
                window_start = max(0, index - 64 - marker_length)
                window       = data[window_start : index + marker_length]

                print("%s %s %s" % (index, hex(index), hex_dump(window)))

        # carve out the markers.
        carve_helper(starters, enders, data, output_dir, kind)
######################################################################################################################## | |
if __name__ == "__main__":
    # command line entry point: <input file> [<output directory>] [-v]
    # NOTE: DEBUG, input_file and output_dir are deliberately module level, the functions above read them.

    # toggle on verbose outputs.
    if "-v" in sys.argv:
        sys.argv.remove("-v")
        DEBUG = True

    # we at least need an input file, the output directory defaults to the current directory.
    if len(sys.argv) == 3:
        input_file, output_dir = sys.argv[1:]
    elif len(sys.argv) == 2:
        input_file = sys.argv[1]
        output_dir = "./"
    else:
        usage()

    # ensure the input exists and can be opened for reading. a directory passes an existence check but can
    # not be read as a file, so reject it here rather than crashing later. device nodes remain acceptable.
    if not os.path.exists(input_file) or os.path.isdir(input_file):
        usage("file not found: %s" % input_file)

    # ensure the output directory exists.
    mkdir_p(output_dir)

    # call the carver.
    carve_all(path=input_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment