Skip to content

Instantly share code, notes, and snippets.

@kohnakagawa
Created November 25, 2018 06:39
Show Gist options
  • Save kohnakagawa/d772102d8f08143b1d8e276f3b70c125 to your computer and use it in GitHub Desktop.
Save kohnakagawa/d772102d8f08143b1d8e276f3b70c125 to your computer and use it in GitHub Desktop.
fuzzy hashを計算し、簡易的にクラスタリングを行うためのツール
import ssdeep
import pyimpfuzzy as pyimf
import sys
import pandas as pd
from hashlib import sha256
from itertools import combinations
from pylouvain.pylouvain import PyLouvain
# Similarity cut-off used by do_clustering: a pair keeps its comparison score
# as the edge weight only when the score is strictly greater than this value;
# otherwise the edge weight is set to 0.
SS_THRESHOLD = 30
def calc_ssdeep_relation(df, hash_type):
    """Compare every unordered pair of rows by their fuzzy hash.

    Args:
        df: DataFrame holding one fuzzy hash per row in column *hash_type*.
        hash_type: Name of the column whose hashes are compared
            (``main`` passes ``"ssdeep"`` or ``"impfuzzy"``).

    Returns:
        A list of ``(index_a, index_b, score)`` tuples, one per pair of rows,
        where ``score`` is the value returned by ``ssdeep.compare``. The list
        is empty when *df* has fewer than two rows.
    """
    # Pair each index label with its hash so the labels survive the pairing.
    labelled_hashes = list(zip(df.index.values, df[hash_type].values))
    return [
        (i0, i1, ssdeep.compare(h0, h1))
        for (i0, h0), (i1, h1) in combinations(labelled_hashes, 2)
    ]
def do_clustering(nodes, relation, threshold=None):
    """Partition *nodes* with PyLouvain using thresholded similarity edges.

    Args:
        nodes: Node identifiers, passed unchanged to ``PyLouvain``.
        relation: Iterable of ``(node_a, node_b, score)`` triples, such as
            the output of ``calc_ssdeep_relation``.
        threshold: A score must be strictly greater than this value to be
            kept as an edge weight. Defaults to ``SS_THRESHOLD``.

    Returns:
        The first element of the pair returned by
        ``PyLouvain.apply_method()`` (the partition).
    """
    if threshold is None:
        threshold = SS_THRESHOLD
    # Pairs at or below the threshold are kept as zero-weight edges rather
    # than dropped, so the edge list still covers every pair in *relation*.
    edges = [
        [[node_a, node_b], score if score > threshold else 0]
        for node_a, node_b, score in relation
    ]
    pyl = PyLouvain(nodes, edges)
    partition, _ = pyl.apply_method()
    return partition
def calc_fuzzy_hashes(row):
    """Compute the ssdeep, impfuzzy and SHA-256 hashes of one sample file.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) whose ``"path"`` entry
            is the path of the file to hash.

    Returns:
        A ``pandas.Series`` with the entries ``"ssdeep"``, ``"impfuzzy"`` and
        ``"sha256"`` (hex digest), in that order.
    """
    fname = row["path"]
    result = {
        "ssdeep": ssdeep.hash_from_file(fname),
        "impfuzzy": pyimf.get_impfuzzy(fname),
    }
    # Feed the file to SHA-256 in fixed-size chunks so a large sample is
    # never held in memory all at once; the digest is the same as hashing
    # the whole content in one call.
    digest = sha256()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    result["sha256"] = digest.hexdigest()
    return pd.Series(result)
def main():
    """Cluster the samples listed in a label CSV by impfuzzy similarity.

    Expects exactly one command-line argument: the path of a CSV file. Each
    row is passed to ``calc_fuzzy_hashes``, which reads the sample location
    from the row's ``"path"`` entry. Prints a usage message and exits with
    status 1 when the argument count is wrong.

    Returns:
        The partition returned by ``do_clustering``; each cluster is also
        printed on its own line.
    """
    if len(sys.argv) != 2:
        print("Usage: {} label.csv".format(sys.argv[0]))
        sys.exit(1)

    label_file = sys.argv[1]
    df = pd.read_csv(label_file)
    # Append the per-sample hash columns next to the original CSV columns.
    df = pd.concat([df, df.apply(calc_fuzzy_hashes, axis=1)], axis=1)

    # Only the impfuzzy relation feeds the clustering, so the all-pairs
    # ssdeep comparison (whose result was never used) is not computed here.
    rel_impfuzzy = calc_ssdeep_relation(df, "impfuzzy")
    part = do_clustering(df.index.values, rel_impfuzzy)

    # Report the result; previously it was computed and then discarded.
    for cluster_id, members in enumerate(part):
        print("cluster {}: {}".format(cluster_id, members))
    return part
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment