Skip to content

Instantly share code, notes, and snippets.

@williballenthin
Last active November 30, 2023 13:27
Show Gist options
  • Save williballenthin/52debe05295266186cd2673ebf169967 to your computer and use it in GitHub Desktop.
Save williballenthin/52debe05295266186cd2673ebf169967 to your computer and use it in GitHub Desktop.
Carve PE files from binary data.
#!/usr/bin/env python2
'''
Carve PE files from binary data.
Write them into the current directory, named after their hash.
Example::
$ python carvepe.py unallocated.bin
INFO:__main__:found pe at 0x0, length: 0xd8000
INFO:__main__:writing pe file to 273ed32b617fd79ed1b88ebd4521a441.bin
$ ls
595f44fec1e92a71d3e9e77456ba80d1.bin
71f920fa275127a7b60fa4d4d41432a3.bin
43c191bf6d6c3f263a8cd0efd4a058ab.bin
author: Willi Ballenthin
'''
import sys
import mmap
import hashlib
import logging
import contextlib
from collections import namedtuple
import pefile
import argparse
# module-wide logger; its level is configured by logging.basicConfig() in main().
logger = logging.getLogger(__name__)

# describes one carved PE file: where it starts in the input and how many bytes it spans.
Match = namedtuple('Match', 'offset size')
def carve(data):
    """
    Find things that look like PE files in arbitrary binary data.

    Every occurrence of the two-byte ``MZ`` magic is treated as a candidate
    and handed to pefile; candidates that pefile rejects with a
    ``PEFormatError`` are skipped and the scan resumes one byte later.

    Args:
      data (bytes-like): arbitrary binary data that supports ``find``,
        ``len`` and slicing, such as a byte string or a read-only ``mmap``.

    Yields:
      Match: one Match instance per identified PE file. ``offset`` is the
        position of the ``MZ`` magic within ``data``; ``size`` is the end of
        the furthest raw section (0 when the file has no sections).
    """
    offset = 0
    while True:
        # search with a bytes literal: on Python 2 this is a plain str, and on
        # Python 3 both bytes.find and mmap.find reject text strings.
        offset = data.find(b'MZ', offset)
        if offset == -1:
            break

        logger.debug('found MZ: 0x%x', offset)

        # grab a bunch of data that should include the entire binary.
        # assume less than 10mb.
        max_offset = min(len(data), offset + 10 * 1024 * 1024)
        payload = data[offset:max_offset]

        try:
            pe = pefile.PE(data=payload)
        except pefile.PEFormatError:
            logger.debug('not actually a PE, sorry.')
        else:
            logger.debug('yup, this looks ok.')

            # try to compute the size of the PE file.
            # we take the end of whichever section reaches furthest into the file.
            # this should work for most binaries, unless there is an overlay.
            # the PE file format does not have a true "file length" field, unfortunately.
            max_addr = 0
            for section in pe.sections:
                max_addr = max(max_addr, section.PointerToRawData + section.SizeOfRawData)

            # informational only: a checksum mismatch does not reject the candidate.
            if pe.OPTIONAL_HEADER.CheckSum == pe.generate_checksum():
                logger.debug('checksum verified')

            yield Match(offset, max_addr)

        # step past this magic so the next search finds the following candidate,
        # including ones embedded inside the file we just reported.
        offset += 1
def main(argv=None):
    """
    Command-line entry point: carve PE files out of one input file.

    Each carved file is written to the current working directory as
    ``<md5 of its bytes>.bin``.

    Args:
      argv (list of str): command-line arguments, excluding the program name.
        Defaults to ``sys.argv[1:]`` when None.

    Returns:
      None, which ``sys.exit`` treats as a successful exit status.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description="Carve PE files from binary data.")
    parser.add_argument("input", type=str,
                        help="Path to input file")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable debug logging")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Disable all output but errors")
    # parse the arguments we were handed; calling parse_args() with no
    # arguments would silently fall back to sys.argv and ignore `argv`.
    args = parser.parse_args(args=argv)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.quiet:
        logging.basicConfig(level=logging.ERROR)
    else:
        logging.basicConfig(level=logging.INFO)

    with open(args.input, 'rb') as f:
        # we're using a memory map here.
        # it lets us read from a large file as if it were entirely in memory.
        # (but it's not, actually)
        with contextlib.closing(mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)) as m:
            for match in carve(m):
                pe = m[match.offset:match.offset + match.size]
                logger.info('found pe at 0x%x, length: 0x%x', match.offset, match.size)

                # the digest doubles as the output file name.
                digest = hashlib.md5(pe).hexdigest()
                logger.debug('md5sum: %s', digest)

                outpath = digest + '.bin'
                logger.info('writing pe file to %s', outpath)
                with open(outpath, 'wb') as g:
                    g.write(pe)
# only run the carver when this file is executed as a script, not when imported.
if __name__ == "__main__":
    # main()'s return value is handed to sys.exit as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
@tbarabosch
Copy link

I ran into a scoping problem: if there are multiple PEs in the stream then the variable m will be of type hashlib.HASH. To solve this issue, just use a different name (quick fix: rename 'm' to 'n').

DEBUG:main:found MZ: 0x0
DEBUG:main:yup, this looks ok.
HERE
<type 'mmap.mmap'>
INFO:main:found pe at 0x0, length: 0xde00
DEBUG:main:md5sum: 7ec82fb43c704d56d1a8dfa3cf52b684
INFO:main:writing pe file to 7ec82fb43c704d56d1a8dfa3cf52b684.bin
DEBUG:main:found MZ: 0x4a64
DEBUG:main:not actually a PE, sorry.
DEBUG:main:found MZ: 0xb3e9
DEBUG:main:not actually a PE, sorry.
DEBUG:main:found MZ: 0xb5f6
DEBUG:main:not actually a PE, sorry.
DEBUG:main:found MZ: 0xba88
DEBUG:main:not actually a PE, sorry.
DEBUG:main:found MZ: 0xbfc0
DEBUG:main:yup, this looks ok.
HERE
<type '_hashlib.HASH'>
Traceback (most recent call last):
File "../../../tools/carvepe/carvepe.py", line 120, in <module>
sys.exit(main())
File "../../../tools/carvepe/carvepe.py", line 106, in main
pe = m[match.offset:match.offset + match.size]
TypeError: '_hashlib.HASH' object has no attribute '__getitem__'

@williballenthin
Copy link
Author

thanks! fixed!

@tbarabosch
Copy link

tbarabosch commented Dec 23, 2019

thanks! fixed!

Thanks, but it still crashes because of "outpath = m.hexdigest() + '.bin'", which should be now "outpath = md5.hexdigest() + '.bin'".

@williballenthin
Copy link
Author

(facepalm) of course, thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment