williballenthin · November 30, 2023 13:27 · tbarabosch · Nov 28, 2019 · williballenthin · Dec 19, 2019
diff --git a/carvepe.py b/carvepe.py
 #!/usr/bin/env python2
 '''
 Carve PE files from binary data.
 Write them into the current directory, named after their hash.

 Example::

    $ python carvepe.py unallocated.bin
      INFO:__main__:found pe at 0x0, length: 0xd8000
      INFO:__main__:writing pe file to 273ed32b617fd79ed1b88ebd4521a441.bin

    $ ls
      595f44fec1e92a71d3e9e77456ba80d1.bin
      71f920fa275127a7b60fa4d4d41432a3.bin
      43c191bf6d6c3f263a8cd0efd4a058ab.bin

 author: Willi Ballenthin
 '''
 import sys
 import mmap
 import hashlib
 import logging
 import contextlib
 from collections import namedtuple

 import pefile
 import argparse


 # module-level logger; its level and handler are set up by main() through
 # logging.basicConfig.
 logger = logging.getLogger(__name__)


 # one carved PE candidate:
 #   offset: position of the 'MZ' magic within the scanned data.
 #   size: number of bytes, counted from `offset`, that belong to the PE.
 Match = namedtuple('Match', 'offset size')


 def carve(data):
    """
    Find things that look like PE files in arbitrary binary data.

    Every occurrence of the 'MZ' magic is handed to pefile; each one that
    parses is reported. Scanning resumes one byte after every 'MZ', so PE
    files embedded inside an already reported PE are reported as well.

    Args:
      data (bytes): arbitrary byte string. Any object offering `find`,
        `len` and slicing works, for example an `mmap.mmap` instance.

    Yields:
      Match: one Match instance per identified PE file. `size` is the end of
        the last section's raw data as declared by the section headers: it
        excludes any overlay, is 0 when there are no sections, and is not
        clamped to the amount of data actually available.
    """
    offset = 0

    while True:
        # search with a bytes literal: python 3 refuses to look for a text
        # string inside bytes/mmap data, and on python 2 b'MZ' is just 'MZ'.
        offset = data.find(b'MZ', offset)
        if offset == -1:
            break

        logger.debug('found MZ: 0x%x', offset)

        # grab a bunch of data that should include the entire binary.
        # assume less than 10mb.
        max_offset = min(len(data), offset + 10 * 1024 * 1024)
        payload = data[offset:max_offset]

        try:
            pe = pefile.PE(data=payload)
        except pefile.PEFormatError:
            logger.debug('not actually a PE, sorry.')
        else:
            logger.debug('yup, this looks ok.')

            # try to compute the size of the PE file.
            # take the end of whichever section reaches furthest into the file.
            # this should work for most binaries, unless there is an overlay.
            # the PE file format does not have a true "file length" field, unfortunately.
            # the leading 0 keeps max() happy when there are no sections.
            max_addr = max([0] + [section.PointerToRawData + section.SizeOfRawData
                                  for section in pe.sections])

            # the checksum comparison only ever produces a debug message,
            # so don't pay for computing it unless that message will be emitted.
            if logger.isEnabledFor(logging.DEBUG):
                if pe.OPTIONAL_HEADER.CheckSum == pe.generate_checksum():
                    logger.debug('checksum verified')

            yield Match(offset, max_addr)

        offset += 1


 def main(argv=None):
    """
    Command line entry point: carve PE files out of a single input file.

    Every carved PE is written into the current directory as `<md5>.bin`,
    where `<md5>` is the hex md5 digest of the carved bytes.

    Args:
      argv (list of str): command line arguments without the program name.
        defaults to `sys.argv[1:]`.

    Returns:
      None, which `sys.exit` reports as success.

    Raises:
      SystemExit: raised by argparse for `--help` or invalid arguments.
    """
    # local import: only needed here, to ask for the input file's size.
    import os

    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description="Carve PE files from binary data.")
    parser.add_argument("input", type=str,
                        help="Path to input file")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable debug logging")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Disable all output but errors")
    # hand over the arguments we were given; without them argparse falls back
    # to sys.argv and a caller-supplied argv would be silently ignored.
    args = parser.parse_args(argv)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.quiet:
        logging.basicConfig(level=logging.ERROR)
    else:
        logging.basicConfig(level=logging.INFO)

    with open(args.input, 'rb') as f:
        # mmap refuses to map a zero-length file (it raises ValueError),
        # and an empty input cannot contain a PE file anyway.
        if os.fstat(f.fileno()).st_size == 0:
            logger.warning('input file is empty, nothing to carve: %s', args.input)
            return

        # we're using a memory map here.
        # it lets us read from a large file as if it were entirely in memory.
        # (but its not, actually)
        with contextlib.closing(mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)) as m:
            for match in carve(m):
                # slicing stops at the end of the map, so a match whose size
                # runs past the end of the input yields a shorter blob.
                pe = m[match.offset:match.offset + match.size]
                logger.info('found pe at 0x%x, length: 0x%x', match.offset, match.size)

                digest = hashlib.md5(pe).hexdigest()
                logger.debug('md5sum: %s', digest)

                outpath = digest + '.bin'
                logger.info('writing pe file to %s', outpath)
                with open(outpath, 'wb') as g:
                    g.write(pe)


 if __name__ == "__main__":
    # run the tool only when executed as a script, and hand whatever main()
    # returns to the interpreter as the process exit status.
    status = main()
    sys.exit(status)
	#!/usr/bin/env python2
	'''
	Carve PE files from binary data.
	Write them into the current directory, named after their hash.

	Example::

	$ python carvepe.py unallocated.bin
	INFO:__main__:found pe at 0x0, length: 0xd8000
	INFO:__main__:writing pe file to 273ed32b617fd79ed1b88ebd4521a441.bin

	$ ls
	595f44fec1e92a71d3e9e77456ba80d1.bin
	71f920fa275127a7b60fa4d4d41432a3.bin
	43c191bf6d6c3f263a8cd0efd4a058ab.bin

	author: Willi Ballenthin
	'''
	import sys
	import mmap
	import hashlib
	import logging
	import contextlib
	from collections import namedtuple

	import pefile
	import argparse


	# module-level logger; its level and handler are set up by main() through
	# logging.basicConfig.
	logger = logging.getLogger(__name__)


	# one carved PE candidate:
	#   offset: position of the 'MZ' magic within the scanned data.
	#   size: number of bytes, counted from `offset`, that belong to the PE.
	Match = namedtuple('Match', 'offset size')


	def carve(data):
	    """
	    Find things that look like PE files in arbitrary binary data.

	    Every occurrence of the 'MZ' magic is handed to pefile; each one that
	    parses is reported. Scanning resumes one byte after every 'MZ', so PE
	    files embedded inside an already reported PE are reported as well.

	    Args:
	      data (bytes): arbitrary byte string. Any object offering `find`,
	        `len` and slicing works, for example an `mmap.mmap` instance.

	    Yields:
	      Match: one Match instance per identified PE file. `size` is the end of
	        the last section's raw data as declared by the section headers: it
	        excludes any overlay, is 0 when there are no sections, and is not
	        clamped to the amount of data actually available.
	    """
	    offset = 0

	    while True:
	        # search with a bytes literal: python 3 refuses to look for a text
	        # string inside bytes/mmap data, and on python 2 b'MZ' is just 'MZ'.
	        offset = data.find(b'MZ', offset)
	        if offset == -1:
	            break

	        logger.debug('found MZ: 0x%x', offset)

	        # grab a bunch of data that should include the entire binary.
	        # assume less than 10mb.
	        max_offset = min(len(data), offset + 10 * 1024 * 1024)
	        payload = data[offset:max_offset]

	        try:
	            pe = pefile.PE(data=payload)
	        except pefile.PEFormatError:
	            logger.debug('not actually a PE, sorry.')
	        else:
	            logger.debug('yup, this looks ok.')

	            # try to compute the size of the PE file.
	            # take the end of whichever section reaches furthest into the file.
	            # this should work for most binaries, unless there is an overlay.
	            # the PE file format does not have a true "file length" field, unfortunately.
	            # the leading 0 keeps max() happy when there are no sections.
	            max_addr = max([0] + [section.PointerToRawData + section.SizeOfRawData
	                                  for section in pe.sections])

	            # the checksum comparison only ever produces a debug message,
	            # so don't pay for computing it unless that message will be emitted.
	            if logger.isEnabledFor(logging.DEBUG):
	                if pe.OPTIONAL_HEADER.CheckSum == pe.generate_checksum():
	                    logger.debug('checksum verified')

	            yield Match(offset, max_addr)

	        offset += 1


	def main(argv=None):
	    """
	    Command line entry point: carve PE files out of a single input file.

	    Every carved PE is written into the current directory as `<md5>.bin`,
	    where `<md5>` is the hex md5 digest of the carved bytes.

	    Args:
	      argv (list of str): command line arguments without the program name.
	        defaults to `sys.argv[1:]`.

	    Returns:
	      None, which `sys.exit` reports as success.

	    Raises:
	      SystemExit: raised by argparse for `--help` or invalid arguments.
	    """
	    # local import: only needed here, to ask for the input file's size.
	    import os

	    if argv is None:
	        argv = sys.argv[1:]

	    parser = argparse.ArgumentParser(description="Carve PE files from binary data.")
	    parser.add_argument("input", type=str,
	                        help="Path to input file")
	    parser.add_argument("-v", "--verbose", action="store_true",
	                        help="Enable debug logging")
	    parser.add_argument("-q", "--quiet", action="store_true",
	                        help="Disable all output but errors")
	    # hand over the arguments we were given; without them argparse falls back
	    # to sys.argv and a caller-supplied argv would be silently ignored.
	    args = parser.parse_args(argv)

	    if args.verbose:
	        logging.basicConfig(level=logging.DEBUG)
	    elif args.quiet:
	        logging.basicConfig(level=logging.ERROR)
	    else:
	        logging.basicConfig(level=logging.INFO)

	    with open(args.input, 'rb') as f:
	        # mmap refuses to map a zero-length file (it raises ValueError),
	        # and an empty input cannot contain a PE file anyway.
	        if os.fstat(f.fileno()).st_size == 0:
	            logger.warning('input file is empty, nothing to carve: %s', args.input)
	            return

	        # we're using a memory map here.
	        # it lets us read from a large file as if it were entirely in memory.
	        # (but its not, actually)
	        with contextlib.closing(mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)) as m:
	            for match in carve(m):
	                # slicing stops at the end of the map, so a match whose size
	                # runs past the end of the input yields a shorter blob.
	                pe = m[match.offset:match.offset + match.size]
	                logger.info('found pe at 0x%x, length: 0x%x', match.offset, match.size)

	                digest = hashlib.md5(pe).hexdigest()
	                logger.debug('md5sum: %s', digest)

	                outpath = digest + '.bin'
	                logger.info('writing pe file to %s', outpath)
	                with open(outpath, 'wb') as g:
	                    g.write(pe)


	if __name__ == "__main__":
	    # run the tool only when executed as a script, and hand whatever main()
	    # returns to the interpreter as the process exit status.
	    sys.exit(main())