Skip to content

Instantly share code, notes, and snippets.

@fanbyprinciple
Created February 15, 2022 11:45
Show Gist options
  • Save fanbyprinciple/8d7895119f36441912a7af8d34426bcf to your computer and use it in GitHub Desktop.
Save fanbyprinciple/8d7895119f36441912a7af8d34426bcf to your computer and use it in GitHub Desktop.
Getting MD5 and SHA-1 hashes of files
import hashlib
import os
import shutil
import sys

import numpy as np
import pandas as pd
# Module-level accumulators used by the rest of the script:
#   all_files              - paths of the files found under the scanned tree
#   all_files_without_path - presumably meant for the bare file names
#   all_hashes             - one {"MD5": ..., "SHA1": ...} dict per entry in all_files
all_files, all_files_without_path, all_hashes = [], [], []
BUF_SIZE = 65536  # read files in 64 KiB chunks so a large file is never loaded whole


# Adapted from: https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
def find_hashes(file_name):
    """Compute the MD5 and SHA-1 hex digests of a file.

    The file is opened in binary mode and fed to both hash objects in
    ``BUF_SIZE``-byte chunks. Each digest and its length are also printed.

    Args:
        file_name: Path of the file to hash.

    Returns:
        A dict ``{"MD5": <hex digest>, "SHA1": <hex digest>}``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    with open(file_name, 'rb') as f:
        # iter(callable, sentinel) keeps calling read() until it returns b''
        # (end of file), replacing the manual while/break loop.
        for data in iter(lambda: f.read(BUF_SIZE), b''):
            md5.update(data)
            sha1.update(data)
    # Compute each digest once and reuse it for both the report and the result.
    md5_result = md5.hexdigest()
    sha1_result = sha1.hexdigest()
    print("MD5: {0} : {1}".format(md5_result, len(md5_result)))
    print("SHA1: {0} : {1}".format(sha1_result, len(sha1_result)))
    return {"MD5": md5_result, "SHA1": sha1_result}
# Extensions (lower-case, without the dot) that are recorded but never hashed.
_SKIPPED_EXTENSIONS = {'zip', '7z', 'rar', 'png'}

# Walk ./files recursively. Every file's path is appended to all_files and a
# matching entry is appended to all_hashes, so the two lists stay index-aligned.
for path, currentDirectory, files in os.walk("./files"):
    for file in files:
        mypath = os.path.join(path, file)
        all_files.append(mypath)
        all_files_without_path.append(file)
        # Take the extension from the file name alone. Splitting the whole
        # path on '.' picks the wrong piece as soon as a directory or the
        # file name itself contains an extra dot (e.g. "sample.v1.zip").
        isitarchive = os.path.splitext(file)[1].lstrip('.') or "noext"
        print(isitarchive)
        if isitarchive.lower().strip() not in _SKIPPED_EXTENSIONS:
            print("find hashes.")
            all_hashes.append(find_hashes(mypath))
        else:
            print(mypath, "\n")
            print("this was a zip.")
            # Placeholder keeps all_hashes aligned with all_files for skipped files.
            all_hashes.append({"MD5": "NOP", "SHA1": "NOP"})
# Report how many entries each accumulator holds.
print(len(all_files))
print(len(all_files_without_path))
print(len(all_hashes))

# Build one [file name, MD5, SHA1] row per discovered file.
simple_array = []
for file_path, hashes in zip(all_files, all_hashes):
    print([file_path, hashes["MD5"], hashes["SHA1"]])
    # basename() handles whichever separator os.path.join produced,
    # unlike splitting on '/'.
    file_name = os.path.basename(file_path)
    simple_array.append([file_name, hashes["MD5"], hashes["SHA1"]])

# reshape(-1, 3) keeps the array two-dimensional even when no files were
# found, so the three-column DataFrame below can always be constructed.
np_array = np.array(simple_array, dtype=str).reshape(-1, 3)
print(np_array.shape)

df = pd.DataFrame(np_array, columns=['file names', 'MD5', 'SHA1'])
df = df.sort_values(['file names'])
# Keep a single row per (file name, MD5) pair.
df = df.drop_duplicates(subset=["file names", "MD5"], keep='first')
# Rows carrying the "NOP" placeholder are excluded from the output.
df = df[df.MD5 != "NOP"]
df = df.reset_index(drop=True)
df.to_csv('ioc.csv')
# Copy every discovered file into ./onlyfiles/<file name>/ (the originals stay
# in place). Uses os.makedirs/shutil.copy rather than shell commands so that
# file names containing quotes or shell metacharacters cannot be interpreted
# by a shell.
for i in all_files:
    print(i)
    # i is the path itself (not an index), so derive the name directly from it.
    isplit = os.path.basename(i)
    target_dir = os.path.join("./onlyfiles", isplit)
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(i, target_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment