Created
December 21, 2017 17:46
-
-
Save sagar03d/32d8b0cef9d99bf41e690991c09f3552 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python2 | |
import pefile | |
import os | |
import array | |
import math | |
import pickle | |
from sklearn.externals import joblib | |
import sys | |
import argparse | |
def get_entropy(data):
    """Return the Shannon entropy, in bits per symbol, of *data*.

    *data* is iterated symbol by symbol; each symbol is either an int
    (used directly as a bucket index) or a one-character string
    (converted with ``ord``).  Symbols are counted in 256 buckets, so
    values outside that range raise ``IndexError``.

    Returns 0.0 for empty input.
    """
    total = len(data)
    if total == 0:
        return 0.0

    # One unsigned counter per possible byte value.
    histogram = array.array('L', [0]) * 256
    for symbol in data:
        index = symbol if isinstance(symbol, int) else ord(symbol)
        histogram[index] += 1

    result = 0
    for count in histogram:
        if not count:
            # Unused symbols contribute nothing (and log(0) is undefined).
            continue
        probability = float(count) / total
        result -= probability * math.log(probability, 2)
    return result
def get_resources(pe):
    """Extract resources :
    [entropy, size]

    Walks the three directory levels under ``pe.DIRECTORY_ENTRY_RESOURCE``
    and returns one ``[entropy, size]`` pair per innermost entry.  An
    object without that attribute yields an empty list.  Any exception
    raised during the walk stops it and returns the pairs collected so far.
    """
    collected = []
    if not hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        return collected

    try:
        for type_entry in pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(type_entry, 'directory'):
                continue
            for id_entry in type_entry.directory.entries:
                if not hasattr(id_entry, 'directory'):
                    continue
                for lang_entry in id_entry.directory.entries:
                    info = lang_entry.data.struct
                    blob = pe.get_data(info.OffsetToData, info.Size)
                    collected.append([get_entropy(blob), info.Size])
    except Exception:
        # Best effort: keep the entries gathered before the failure.
        pass
    return collected
def get_version_info(pe):
    """Return version infos

    Builds a flat dict from a parsed PE object:

    * every key/value pair of every string table in ``StringFileInfo``
      entries of ``pe.FileInfo``;
    * the first key/value pair of each ``Var`` in ``VarFileInfo`` entries
      (a ``Var`` with an empty ``entry`` is skipped);
    * when ``pe.VS_FIXEDFILEINFO`` exists, the fixed fields under the keys
      ``flags``, ``os``, ``type``, ``file_version``, ``product_version``,
      ``signature`` and ``struct_version``.

    Raises ``AttributeError`` when ``pe`` has no ``FileInfo`` attribute (or
    an entry lacks an expected attribute); ``extract_infos`` relies on that
    to report a version-information size of 0.
    """
    res = {}
    for fileinfo in pe.FileInfo:
        # NOTE(review): pefile appears to return bytes keys on Python 3 and
        # text keys on Python 2, so both spellings are accepted -- confirm
        # against the pefile version in use.
        if fileinfo.Key in ('StringFileInfo', b'StringFileInfo'):
            for st in fileinfo.StringTable:
                for key, value in st.entries.items():
                    res[key] = value
        if fileinfo.Key in ('VarFileInfo', b'VarFileInfo'):
            for var in fileinfo.Var:
                # ``items()`` is not indexable on Python 3; take the first
                # pair through an iterator, which works on both versions.
                first = next(iter(var.entry.items()), None)
                if first is not None:
                    res[first[0]] = first[1]
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        fixed = pe.VS_FIXEDFILEINFO
        res['flags'] = fixed.FileFlags
        res['os'] = fixed.FileOS
        res['type'] = fixed.FileType
        res['file_version'] = fixed.FileVersionLS
        res['product_version'] = fixed.ProductVersionLS
        res['signature'] = fixed.Signature
        res['struct_version'] = fixed.StrucVersion
    return res
def _summarize(values):
    """Return ``(mean, minimum, maximum)`` of *values*; all zeros when empty."""
    values = list(values)
    if not values:
        return 0, 0, 0
    return sum(values) / float(len(values)), min(values), max(values)


def extract_infos(fpath):
    """Parse the PE file at *fpath* and return its features as a dict.

    The dict holds raw FILE_HEADER / OPTIONAL_HEADER fields plus summary
    statistics (count, mean, min, max) for sections and resources, import
    and export counts, the load-configuration size and the number of
    version-information entries.  Features whose source structure is
    missing are reported as 0.

    Exceptions raised by ``pefile.PE(fpath)`` propagate to the caller.
    """
    res = {}
    pe = pefile.PE(fpath)

    for field in ('Machine', 'SizeOfOptionalHeader', 'Characteristics'):
        res[field] = getattr(pe.FILE_HEADER, field)

    optional = pe.OPTIONAL_HEADER
    for field in (
            'MajorLinkerVersion',
            'MinorLinkerVersion',
            'SizeOfCode',
            'SizeOfInitializedData',
            'SizeOfUninitializedData',
            'AddressOfEntryPoint',
            'BaseOfCode',
    ):
        res[field] = getattr(optional, field)

    # The optional header does not always expose BaseOfData; fall back to 0.
    try:
        res['BaseOfData'] = optional.BaseOfData
    except AttributeError:
        res['BaseOfData'] = 0

    for field in (
            'ImageBase',
            'SectionAlignment',
            'FileAlignment',
            'MajorOperatingSystemVersion',
            'MinorOperatingSystemVersion',
            'MajorImageVersion',
            'MinorImageVersion',
            'MajorSubsystemVersion',
            'MinorSubsystemVersion',
            'SizeOfImage',
            'SizeOfHeaders',
            'CheckSum',
            'Subsystem',
            'DllCharacteristics',
            'SizeOfStackReserve',
            'SizeOfStackCommit',
            'SizeOfHeapReserve',
            'SizeOfHeapCommit',
            'LoaderFlags',
            'NumberOfRvaAndSizes',
    ):
        res[field] = getattr(optional, field)

    # Sections.  Lists are built explicitly (map() is lazy on Python 3) and
    # a file without sections yields zeros instead of a ZeroDivisionError,
    # mirroring the handling of files without resources.
    res['SectionsNb'] = len(pe.sections)

    mean, low, high = _summarize(s.get_entropy() for s in pe.sections)
    res['SectionsMeanEntropy'] = mean
    res['SectionsMinEntropy'] = low
    res['SectionsMaxEntropy'] = high

    mean, low, high = _summarize(s.SizeOfRawData for s in pe.sections)
    res['SectionsMeanRawsize'] = mean
    res['SectionsMinRawsize'] = low
    res['SectionsMaxRawsize'] = high

    mean, low, high = _summarize(s.Misc_VirtualSize for s in pe.sections)
    res['SectionsMeanVirtualsize'] = mean
    res['SectionsMinVirtualsize'] = low
    # NOTE(review): this key is spelled "Section..." (no "s") unlike its
    # siblings.  It is kept as is because the pickled feature list
    # presumably refers to it by this exact name -- confirm before renaming.
    res['SectionMaxVirtualsize'] = high

    # Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
        imports = [
            imported
            for entry in pe.DIRECTORY_ENTRY_IMPORT
            for imported in entry.imports
        ]
        res['ImportsNb'] = len(imports)
        # Imports without a name are counted as imports by ordinal.
        res['ImportsNbOrdinal'] = sum(
            1 for imported in imports if imported.name is None)
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export
        res['ExportNb'] = 0

    # Resources: each item is an [entropy, size] pair.
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)

    mean, low, high = _summarize(item[0] for item in resources)
    res['ResourcesMeanEntropy'] = mean
    res['ResourcesMinEntropy'] = low
    res['ResourcesMaxEntropy'] = high

    mean, low, high = _summarize(item[1] for item in resources)
    res['ResourcesMeanSize'] = mean
    res['ResourcesMinSize'] = low
    res['ResourcesMaxSize'] = high

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Version configuration size
    try:
        version_infos = get_version_info(pe)
        res['VersionInformationSize'] = len(version_infos)
    except AttributeError:
        res['VersionInformationSize'] = 0

    return res
# Command-line entry point: classify one file as malicious or legitimate.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Detect malicious files')
    parser.add_argument('FILE', help='File to be tested')
    args = parser.parse_args()

    # The model files are looked up relative to this script's real location.
    base_dir = os.path.dirname(os.path.realpath(__file__))

    # Load classifier
    clf = joblib.load(os.path.join(base_dir, 'classifier/classifier.pkl'))

    # Pickles are binary data: open in 'rb' and close the file promptly.
    # NOTE(review): unpickling executes arbitrary code; only ship model
    # files from a trusted source.
    with open(os.path.join(base_dir, 'classifier/features.pkl'), 'rb') as handle:
        features = pickle.load(handle)

    data = extract_infos(args.FILE)
    # Build a real list: the classifier needs a sequence, and map() is lazy
    # on Python 3.
    pe_features = [data[name] for name in features]

    res = clf.predict([pe_features])[0]
    # Index 0 is reported as malicious and index 1 as legitimate.
    print('The file %s is %s' % (
        os.path.basename(args.FILE),
        ['malicious', 'legitimate'][res])
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment