Last active
November 30, 2023 13:27
-
-
Save williballenthin/52debe05295266186cd2673ebf169967 to your computer and use it in GitHub Desktop.
Carve PE files from binary data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2
'''
Carve PE files from binary data.

Write them into the current directory named after their hash.

Example::

    $ python carvepe.py unallocated.bin
    INFO:__main__:found pe at 0x0, length: 0xd8000
    INFO:__main__:writing pe file to 273ed32b617fd79ed1b88ebd4521a441.bin

    $ ls
    595f44fec1e92a71d3e9e77456ba80d1.bin
    71f920fa275127a7b60fa4d4d41432a3.bin
    43c191bf6d6c3f263a8cd0efd4a058ab.bin

author: Willi Ballenthin
'''
# standard library
import sys
import mmap
import hashlib
import logging
import argparse
import contextlib
from collections import namedtuple

# third-party (pip install pefile)
import pefile

logger = logging.getLogger(__name__)

# One carved PE: byte offset into the scanned data and its size on disk.
Match = namedtuple('Match', ['offset', 'size'])
def carve(data):
    """
    Find things that look like PE files in arbitrary binary data.

    Args:
        data (bytes or mmap.mmap): arbitrary byte string to scan.

    Yields:
        Match: one Match instance per identified PE file.
    """
    offset = 0
    while True:
        # bug fix: search for the bytes literal b'MZ'. On Python 3,
        # bytes/mmap .find() rejects a str pattern with TypeError;
        # on Python 2, b'MZ' is identical to 'MZ', so behavior is unchanged.
        offset = data.find(b'MZ', offset)
        if offset == -1:
            break
        logger.debug('found MZ: 0x%x', offset)

        # grab a bunch of data that should include the entire binary.
        # assume less than 10mb.
        max_offset = min(len(data), offset + 10 * 1024 * 1024)
        payload = data[offset:max_offset]

        try:
            pe = pefile.PE(data=payload)
        except pefile.PEFormatError:
            logger.debug('not actually a PE, sorry.')
        else:
            logger.debug('yup, this looks ok.')

            # try to compute the size of the PE file.
            # find the on-disk end of the section that extends furthest.
            # this should work for most binaries, unless there is an overlay.
            # the PE file format does not have a true "file length" field,
            # unfortunately.  (no need to sort: we only want the maximum.)
            max_addr = 0
            for section in pe.sections:
                section_end = section.PointerToRawData + section.SizeOfRawData
                if section_end > max_addr:
                    max_addr = section_end

            # a matching checksum raises confidence, but many legitimate
            # binaries ship without one, so it only affects logging.
            if pe.OPTIONAL_HEADER.CheckSum == pe.generate_checksum():
                logger.debug('checksum verified')

            yield Match(offset, max_addr)

        # advance one byte (not past the PE) so overlapping/embedded
        # candidates are still discovered.
        offset += 1
def main(argv=None):
    """
    Command-line entry point: carve PEs from the given file and write each
    one to the current directory as <md5>.bin.

    Args:
        argv (list of str, optional): argument vector (without the program
            name); defaults to sys.argv[1:].
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description="Carve PE files from binary data.")
    parser.add_argument("input", type=str,
                        help="Path to input file")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable debug logging")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Disable all output but errors")
    # bug fix: pass argv through.  parse_args() with no argument always
    # reads sys.argv, silently ignoring the explicit `argv` parameter.
    args = parser.parse_args(argv)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.quiet:
        logging.basicConfig(level=logging.ERROR)
    else:
        logging.basicConfig(level=logging.INFO)

    with open(args.input, 'rb') as f:
        # we're using a memory map here.
        # it lets us read from a large file as if it were entirely in memory.
        # (but its not, actually)
        with contextlib.closing(mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)) as m:
            for match in carve(m):
                pe = m[match.offset:match.offset + match.size]
                logger.info('found pe at 0x%x, length: 0x%x', match.offset, match.size)

                # name the output after the carved payload's MD5 so repeated
                # runs and duplicate PEs collapse to one file.
                md5 = hashlib.md5()
                md5.update(pe)
                logger.debug('md5sum: %s', md5.hexdigest())

                outpath = md5.hexdigest() + '.bin'
                logger.info('writing pe file to %s', outpath)
                with open(outpath, 'wb') as g:
                    g.write(pe)
thanks! fixed!
thanks! fixed!
Thanks, but it still crashes because of "outpath = m.hexdigest() + '.bin'", which should be now "outpath = md5.hexdigest() + '.bin'".
(facepalm) of course, thanks!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I ran into a scoping problem: if there are multiple PEs in the stream then the variable m will be of type hashlib.HASH. To solve this issue, just use a different name (quick fix: rename 'm' to 'n').