Created
March 7, 2015 03:18
-
-
Save danlmyers/73a0e6aa42d987437d7e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MAGIC_NUMBERS = { | |
# List of magic numbers to determine file types. | |
# A partial Listing is available at: http://en.wikipedia.org/wiki/List_of_file_signatures | |
'zip': {'numbers': ['\x50\x4B\x03\x04'], 'offset': 0}, | |
'gz': {'numbers': ['\x1F\x8B\x08'], 'offset': 0}, | |
'bz2': {'numbers': ['\x42\x5A\x68'], 'offset': 0}, | |
'tar': {'numbers': ['\x75\x73\x74\x61\x72\x00\x30\x30', '\x75\x73\x74\x61\x72\x20\x20\x00'], 'offset': 257}, | |
'rar': {'numbers': ['\x52\x61\x72\x21\x1A\x07\x00', '\x52\x61\x72\x21\x1A\x07\x01\x00'], 'offset': 0}, | |
'7z': {'numbers': ['\x37\x7A\xBC\xAF\x27\x1C'], 'offset': 0}, | |
'Z': {'numbers': ['\x1F\x9D'], 'offset': 0} | |
} | |
def determine_filetype(target_file): | |
""" | |
Reads the headers of a file and determines the file type based on the headers. | |
:param target_file: File to check what the file type is | |
:return: Short name of the type of file, like gz for gzipped archives, bz2 for bzipped archives. Doesn't make any | |
inferences to what is contained in the file in cases of archives, for example a tar.gz file will return | |
that it is a gzip archive, but won't know that there is a tar inside of it. Possible Returns: False, | |
apk, docx, jar, odp, ods, odt, pptx, xlsx, zip, gz, bz2, tar, rar, 7z, Z (as configured in MAGIC_NUMBERS) | |
""" | |
if not os.path.isfile(target_file): | |
# Not a regular file, don't bother. | |
return False | |
alternate_zips = ['apk', 'docx', 'jar', 'odp', 'ods', 'odt', 'pptx', 'xlsx', 'zipx'] | |
magic_number_lengths = [] | |
header_offsets = [] | |
for file_type in MAGIC_NUMBERS: | |
header_offsets.append(MAGIC_NUMBERS[file_type]['offset']) | |
for number in MAGIC_NUMBERS[file_type]['numbers']: | |
magic_number_lengths.append(len(number)) | |
header_length = max(magic_number_lengths) + max(header_offsets) | |
with open(target_file) as raw_file: | |
headers = raw_file.read(header_length) | |
for file_type in MAGIC_NUMBERS: | |
for magic in MAGIC_NUMBERS[file_type]['numbers']: | |
if headers[MAGIC_NUMBERS[file_type]['offset']:].startswith(magic): | |
if file_type == 'zip': | |
file_extension = os.path.splitext(target_file)[1][1:] | |
if file_extension in alternate_zips: | |
return file_extension | |
return file_type | |
return False # No filetypes matched. | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment