fanbyprinciple · February 15, 2022 11:45
diff --git a/get_hash.py b/get_hash.py
 import os
 import hashlib
 import sys
 import pandas as pd
 import numpy as np

 # Full path of every file found while walking ./files, in discovery order.
 all_files = []
 # NOTE(review): nothing in the visible script appends to this list; only its
 # length is printed later, so it reports 0.
 all_files_without_path = []

 # One {"MD5": ..., "SHA1": ...} dict per entry of all_files, at the same index.
 all_hashes = []
 BUF_SIZE = 65536  # bytes per read (64 KiB), so large files are hashed in chunks

 # from : https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
 def find_hashes(file_name):
    """Compute the MD5 and SHA-1 hex digests of a file in one pass.

    The file is opened in binary mode and consumed in BUF_SIZE-byte chunks,
    feeding each chunk to both hashers so the whole file never has to be held
    in memory. Each digest is printed together with its length.

    Args:
        file_name: Path of the file to hash.

    Returns:
        A dict with the keys "MD5" and "SHA1" mapped to the hex digests.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5_hasher, sha1_hasher = hashlib.md5(), hashlib.sha1()

    with open(file_name, 'rb') as stream:
        # read() returns b"" at end of file, which stops the iteration.
        for chunk in iter(lambda: stream.read(BUF_SIZE), b""):
            md5_hasher.update(chunk)
            sha1_hasher.update(chunk)

    digests = {"MD5": md5_hasher.hexdigest(), "SHA1": sha1_hasher.hexdigest()}
    print("MD5: {0} : {1}".format(digests["MD5"], len(digests["MD5"])))
    print("SHA1: {0} : {1}".format(digests["SHA1"], len(digests["SHA1"])))

    return digests

 # Extensions (lower-case, without the dot) that are listed but never hashed.
 SKIPPED_EXTENSIONS = frozenset({'zip', '7z', 'rar', 'png'})


 def file_extension(file_path):
    """Return the extension of file_path without its dot, or "noext".

    Only the text after the last dot of the final path component counts, so
    dots in directory names or earlier in the file name are ignored
    ("./files/my.dir/a.b.zip" gives "zip"). Splitting the whole path on "."
    and taking a fixed index only worked for paths with exactly two dots.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The extension with its original letter case, or "noext" when the
        file name has none.
    """
    suffix = os.path.splitext(file_path)[1]
    return suffix[1:] or "noext"


 # Walk ./files recursively. Every file is recorded in all_files and gets one
 # entry in all_hashes at the same index: real digests, or "NOP" placeholders
 # for the skipped extensions.
 for path, currentDirectory, files in os.walk("./files"):
    for file in files:
        mypath = os.path.join(path, file)
        all_files.append(mypath)

        isitarchive = file_extension(mypath)
        print(isitarchive)

        if isitarchive.lower().strip() not in SKIPPED_EXTENSIONS:
            print("find hashes.")
            all_hashes.append(find_hashes(mypath))
        else:
            print(mypath, "\n")
            print("this was a zip.")
            all_hashes.append({"MD5": "NOP", "SHA1": "NOP"})

 # Sanity check: all_files and all_hashes are expected to have equal lengths.
 print(len(all_files))
 print(len(all_files_without_path))
 print(len(all_hashes))

 # One row per discovered file: [bare file name, MD5, SHA1].
 simple_array = []

 for file_path, hashes in zip(all_files, all_hashes):
    print([file_path, hashes["MD5"], hashes["SHA1"]])

    # basename() handles the platform's path separator, unlike split('/').
    file_name = os.path.basename(file_path)
    simple_array.append([file_name, hashes["MD5"], hashes["SHA1"]])

 np_array = np.array(simple_array)

 print(np_array.shape)
 # Build the frame from the row list rather than np_array: an empty list
 # becomes a 1-D array, which does not fit the three named columns and made
 # the constructor raise when no files were found.
 df = pd.DataFrame(simple_array, columns=['file names', 'MD5', 'SHA1'])

 df = df.sort_values(['file names'])

 # The same file name with the same MD5 is reported only once.
 df = df.drop_duplicates(subset=["file names", "MD5"], keep='first')

 # Drop the placeholder rows of files that were skipped instead of hashed.
 df = df[df.MD5 != "NOP"]


 df = df.reset_index(drop=True)

 df.to_csv('ioc.csv')

 # Copy every discovered file into its own directory ./onlyfiles/<file name>/.
 # shutil is imported here because only this step needs it.
 import shutil

 for i in all_files:
    print(i)
    # i is already the path string; indexing all_files with it raised TypeError.
    isplit = os.path.basename(i)
    target_dir = os.path.join("./onlyfiles", isplit)
    # makedirs also creates ./onlyfiles itself and tolerates reruns, and using
    # library calls instead of shell strings keeps quotes in names harmless.
    os.makedirs(target_dir, exist_ok=True)
    # Copy into the directory created above, not one named after the full path.
    shutil.copy(i, target_dir)
	import os
	import hashlib
	import sys
	import pandas as pd
	import numpy as np

	# Full path of every file found while walking ./files, in discovery order.
	all_files = []
	# NOTE(review): nothing in the visible script appends to this list; only its
	# length is printed later, so it reports 0.
	all_files_without_path = []

	# One {"MD5": ..., "SHA1": ...} dict per entry of all_files, at the same index.
	all_hashes = []
	BUF_SIZE = 65536 # bytes per read (64 KiB), so large files are hashed in chunks

	# from : https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
	def find_hashes(file_name):
	    """Compute the MD5 and SHA-1 hex digests of a file in one pass.

	    The file is opened in binary mode and consumed in BUF_SIZE-byte chunks,
	    feeding each chunk to both hashers so the whole file never has to be
	    held in memory. Each digest is printed together with its length.

	    Args:
	        file_name: Path of the file to hash.

	    Returns:
	        A dict with the keys "MD5" and "SHA1" mapped to the hex digests.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    md5_hasher, sha1_hasher = hashlib.md5(), hashlib.sha1()

	    with open(file_name, 'rb') as stream:
	        # read() returns b"" at end of file, which stops the iteration.
	        for chunk in iter(lambda: stream.read(BUF_SIZE), b""):
	            md5_hasher.update(chunk)
	            sha1_hasher.update(chunk)

	    digests = {"MD5": md5_hasher.hexdigest(), "SHA1": sha1_hasher.hexdigest()}
	    print("MD5: {0} : {1}".format(digests["MD5"], len(digests["MD5"])))
	    print("SHA1: {0} : {1}".format(digests["SHA1"], len(digests["SHA1"])))

	    return digests

	# Extensions (lower-case, without the dot) that are listed but never hashed.
	SKIPPED_EXTENSIONS = frozenset({'zip', '7z', 'rar', 'png'})


	def file_extension(file_path):
	    """Return the extension of file_path without its dot, or "noext".

	    Only the text after the last dot of the final path component counts, so
	    dots in directory names or earlier in the file name are ignored
	    ("./files/my.dir/a.b.zip" gives "zip"). Splitting the whole path on "."
	    and taking a fixed index only worked for paths with exactly two dots.

	    Args:
	        file_path: Path of the file to inspect.

	    Returns:
	        The extension with its original letter case, or "noext" when the
	        file name has none.
	    """
	    suffix = os.path.splitext(file_path)[1]
	    return suffix[1:] or "noext"


	# Walk ./files recursively. Every file is recorded in all_files and gets one
	# entry in all_hashes at the same index: real digests, or "NOP" placeholders
	# for the skipped extensions.
	for path, currentDirectory, files in os.walk("./files"):
	    for file in files:
	        mypath = os.path.join(path, file)
	        all_files.append(mypath)

	        isitarchive = file_extension(mypath)
	        print(isitarchive)

	        if isitarchive.lower().strip() not in SKIPPED_EXTENSIONS:
	            print("find hashes.")
	            all_hashes.append(find_hashes(mypath))
	        else:
	            print(mypath, "\n")
	            print("this was a zip.")
	            all_hashes.append({"MD5": "NOP", "SHA1": "NOP"})

	# Sanity check: all_files and all_hashes are expected to have equal lengths.
	print(len(all_files))
	print(len(all_files_without_path))
	print(len(all_hashes))

	# One row per discovered file: [bare file name, MD5, SHA1].
	simple_array = []

	for file_path, hashes in zip(all_files, all_hashes):
	    print([file_path, hashes["MD5"], hashes["SHA1"]])

	    # basename() handles the platform's path separator, unlike split('/').
	    file_name = os.path.basename(file_path)
	    simple_array.append([file_name, hashes["MD5"], hashes["SHA1"]])

	np_array = np.array(simple_array)

	print(np_array.shape)
	# Build the frame from the row list rather than np_array: an empty list
	# becomes a 1-D array, which does not fit the three named columns and made
	# the constructor raise when no files were found.
	df = pd.DataFrame(simple_array, columns=['file names', 'MD5', 'SHA1'])

	df = df.sort_values(['file names'])

	# The same file name with the same MD5 is reported only once.
	df = df.drop_duplicates(subset=["file names", "MD5"], keep='first')

	# Drop the placeholder rows of files that were skipped instead of hashed.
	df = df[df.MD5 != "NOP"]


	df = df.reset_index(drop=True)

	df.to_csv('ioc.csv')

	# Copy every discovered file into its own directory ./onlyfiles/<file name>/.
	# shutil is imported here because only this step needs it.
	import shutil

	for i in all_files:
	    print(i)
	    # i is already the path string; indexing all_files with it raised TypeError.
	    isplit = os.path.basename(i)
	    target_dir = os.path.join("./onlyfiles", isplit)
	    # makedirs also creates ./onlyfiles itself and tolerates reruns, and using
	    # library calls instead of shell strings keeps quotes in names harmless.
	    os.makedirs(target_dir, exist_ok=True)
	    # Copy into the directory created above, not one named after the full path.
	    shutil.copy(i, target_dir)