danlmyers · March 7, 2015 03:18
diff --git a/magic_numbers.py b/magic_numbers.py
 # Signature table used to recognise file types from their "magic number" bytes.
 # Each entry maps a short type name to:
 #   'numbers' - the accepted signatures for that type
 #   'offset'  - the position in the file at which the signature is expected
 # A partial listing is available at: http://en.wikipedia.org/wiki/List_of_file_signatures
 MAGIC_NUMBERS = {
    'zip': {
        'numbers': ['\x50\x4B\x03\x04'],  # "PK" 03 04
        'offset': 0,
    },
    'gz': {
        'numbers': ['\x1F\x8B\x08'],
        'offset': 0,
    },
    'bz2': {
        'numbers': ['\x42\x5A\x68'],  # "BZh"
        'offset': 0,
    },
    'tar': {
        'numbers': [
            '\x75\x73\x74\x61\x72\x00\x30\x30',  # "ustar" NUL "00"
            '\x75\x73\x74\x61\x72\x20\x20\x00',  # "ustar" two spaces NUL
        ],
        'offset': 257,
    },
    'rar': {
        'numbers': [
            '\x52\x61\x72\x21\x1A\x07\x00',  # "Rar!" 1A 07 00
            '\x52\x61\x72\x21\x1A\x07\x01\x00',  # "Rar!" 1A 07 01 00
        ],
        'offset': 0,
    },
    '7z': {
        'numbers': ['\x37\x7A\xBC\xAF\x27\x1C'],  # "7z" BC AF 27 1C
        'offset': 0,
    },
    'Z': {
        'numbers': ['\x1F\x9D'],
        'offset': 0,
    }
 }


 def determine_filetype(target_file):
    """
    Determine a file's type by matching its leading bytes against the signatures in MAGIC_NUMBERS.

    The file is read in binary mode so that the raw header bytes are compared; a text-mode read can fail to
    decode, or silently alter, binary headers. Signatures stored as text strings are converted to bytes
    one-to-one (latin-1) before the comparison.

    :param target_file: File to check what the file type is
    :return: False if target_file is not a regular file or if no signature matches. Otherwise the short name
             of the matching type as configured in MAGIC_NUMBERS (zip, gz, bz2, tar, rar, 7z, Z). When the
             zip signature matches and the file's extension is one of apk, docx, jar, odp, ods, odt, pptx,
             xlsx, zipx, that extension is returned instead of zip. No inference is made about what an
             archive contains: a tar.gz file is reported as gz, without knowing there is a tar inside it.
    """
    if not os.path.isfile(target_file):
        # Not a regular file, don't bother.
        return False

    alternate_zips = ('apk', 'docx', 'jar', 'odp', 'ods', 'odt', 'pptx', 'xlsx', 'zipx')

    # Flatten the table into (type, offset, signature bytes) triples, keeping the table's order.
    signatures = []
    for file_type, spec in MAGIC_NUMBERS.items():
        for magic in spec['numbers']:
            if not isinstance(magic, bytes):
                # Text signatures hold one character per byte value; latin-1 maps them 1:1.
                magic = magic.encode('latin-1')
            signatures.append((file_type, spec['offset'], magic))

    if not signatures:
        return False  # Nothing configured to match against.

    # Read just enough of the file to cover the furthest-reaching signature.
    header_length = max(offset + len(magic) for _, offset, magic in signatures)

    # Binary mode: the header must not be decoded or newline-translated.
    with open(target_file, 'rb') as raw_file:
        headers = raw_file.read(header_length)

    for file_type, offset, magic in signatures:
        if not headers.startswith(magic, offset):
            continue
        if file_type == 'zip':
            # Several formats are zip containers; tell them apart by file extension.
            file_extension = os.path.splitext(target_file)[1][1:]
            if file_extension in alternate_zips:
                return file_extension
        return file_type
    return False  # No filetypes matched.
	# Signature table used to recognise file types from their "magic number" bytes.
	# Each entry maps a short type name to:
	#   'numbers' - the accepted signatures for that type
	#   'offset'  - the position in the file at which the signature is expected
	# A partial listing is available at: http://en.wikipedia.org/wiki/List_of_file_signatures
	MAGIC_NUMBERS = {
	    'zip': {
	        'numbers': ['\x50\x4B\x03\x04'],  # "PK" 03 04
	        'offset': 0,
	    },
	    'gz': {
	        'numbers': ['\x1F\x8B\x08'],
	        'offset': 0,
	    },
	    'bz2': {
	        'numbers': ['\x42\x5A\x68'],  # "BZh"
	        'offset': 0,
	    },
	    'tar': {
	        'numbers': [
	            '\x75\x73\x74\x61\x72\x00\x30\x30',  # "ustar" NUL "00"
	            '\x75\x73\x74\x61\x72\x20\x20\x00',  # "ustar" two spaces NUL
	        ],
	        'offset': 257,
	    },
	    'rar': {
	        'numbers': [
	            '\x52\x61\x72\x21\x1A\x07\x00',  # "Rar!" 1A 07 00
	            '\x52\x61\x72\x21\x1A\x07\x01\x00',  # "Rar!" 1A 07 01 00
	        ],
	        'offset': 0,
	    },
	    '7z': {
	        'numbers': ['\x37\x7A\xBC\xAF\x27\x1C'],  # "7z" BC AF 27 1C
	        'offset': 0,
	    },
	    'Z': {
	        'numbers': ['\x1F\x9D'],
	        'offset': 0,
	    }
	}


	def determine_filetype(target_file):
	    """
	    Determine a file's type by matching its leading bytes against the signatures in MAGIC_NUMBERS.

	    The file is read in binary mode so that the raw header bytes are compared; a text-mode read can fail to
	    decode, or silently alter, binary headers. Signatures stored as text strings are converted to bytes
	    one-to-one (latin-1) before the comparison.

	    :param target_file: File to check what the file type is
	    :return: False if target_file is not a regular file or if no signature matches. Otherwise the short name
	             of the matching type as configured in MAGIC_NUMBERS (zip, gz, bz2, tar, rar, 7z, Z). When the
	             zip signature matches and the file's extension is one of apk, docx, jar, odp, ods, odt, pptx,
	             xlsx, zipx, that extension is returned instead of zip. No inference is made about what an
	             archive contains: a tar.gz file is reported as gz, without knowing there is a tar inside it.
	    """
	    if not os.path.isfile(target_file):
	        # Not a regular file, don't bother.
	        return False

	    alternate_zips = ('apk', 'docx', 'jar', 'odp', 'ods', 'odt', 'pptx', 'xlsx', 'zipx')

	    # Flatten the table into (type, offset, signature bytes) triples, keeping the table's order.
	    signatures = []
	    for file_type, spec in MAGIC_NUMBERS.items():
	        for magic in spec['numbers']:
	            if not isinstance(magic, bytes):
	                # Text signatures hold one character per byte value; latin-1 maps them 1:1.
	                magic = magic.encode('latin-1')
	            signatures.append((file_type, spec['offset'], magic))

	    if not signatures:
	        return False  # Nothing configured to match against.

	    # Read just enough of the file to cover the furthest-reaching signature.
	    header_length = max(offset + len(magic) for _, offset, magic in signatures)

	    # Binary mode: the header must not be decoded or newline-translated.
	    with open(target_file, 'rb') as raw_file:
	        headers = raw_file.read(header_length)

	    for file_type, offset, magic in signatures:
	        if not headers.startswith(magic, offset):
	            continue
	        if file_type == 'zip':
	            # Several formats are zip containers; tell them apart by file extension.
	            file_extension = os.path.splitext(target_file)[1][1:]
	            if file_extension in alternate_zips:
	                return file_extension
	        return file_type
	    return False  # No filetypes matched.