Created
November 25, 2018 06:39
-
-
Save kohnakagawa/d772102d8f08143b1d8e276f3b70c125 to your computer and use it in GitHub Desktop.
fuzzy hashを計算し、簡易的にクラスタリングを行うためのツール
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ssdeep | |
| import pyimpfuzzy as pyimf | |
| import sys | |
| import pandas as pd | |
| from hashlib import sha256 | |
| from itertools import combinations | |
| from pylouvain.pylouvain import PyLouvain | |
# Similarity cut-off used by do_clustering: a pair keeps its comparison score
# as edge weight only when the score is strictly greater than this value;
# otherwise the edge is given weight 0.
| SS_THRESHOLD = 30 | |
def calc_ssdeep_relation(df, hash_type):
    """Score every unordered pair of rows in *df* with ``ssdeep.compare``.

    Args:
        df: DataFrame with one hash string per row in column ``hash_type``.
        hash_type: Name of the column whose hashes are compared pairwise.

    Returns:
        A list of ``(label_a, label_b, score)`` tuples, one per pair of rows.
        The labels come from ``df.index`` and ``score`` is the value returned
        by ``ssdeep.compare`` for the two rows' hashes.
    """
    hash_values = df[hash_type].values
    row_labels = df.index.values
    labelled_hashes = zip(row_labels, hash_values)

    scored_pairs = []
    for (label_a, hash_a), (label_b, hash_b) in combinations(labelled_hashes, 2):
        scored_pairs.append((label_a, label_b, ssdeep.compare(hash_a, hash_b)))
    return scored_pairs
def do_clustering(nodes, relation):
    """Cluster *nodes* with PyLouvain using thresholded similarity edges.

    Args:
        nodes: Node collection handed unchanged to ``PyLouvain``.
            NOTE(review): presumably these must be the same labels that
            appear in ``relation`` -- confirm against pylouvain.
        relation: Iterable of indexable ``(node_a, node_b, score)`` triples.

    Returns:
        The first element of the pair returned by
        ``PyLouvain.apply_method()``.
    """
    weighted_edges = []
    for triple in relation:
        score = triple[2]
        # Pairs scoring at or below the threshold are not dropped; they stay
        # in the edge list with weight 0.
        weight = score if score > SS_THRESHOLD else 0
        weighted_edges.append([[triple[0], triple[1]], weight])

    louvain = PyLouvain(nodes, weighted_edges)
    partition, _ = louvain.apply_method()
    return partition
def calc_fuzzy_hashes(row):
    """Compute the ssdeep, impfuzzy and SHA-256 digests of one file.

    Args:
        row: Mapping-like record whose ``"path"`` entry is the file to hash.

    Returns:
        A ``pandas.Series`` with the entries ``"ssdeep"``, ``"impfuzzy"`` and
        ``"sha256"`` (hex digest of the file's raw bytes), in that order.
    """
    path = row["path"]
    ssdeep_digest = ssdeep.hash_from_file(path)
    impfuzzy_digest = pyimf.get_impfuzzy(path)

    # Binary mode: the SHA-256 must cover the exact bytes on disk.
    with open(path, "rb") as stream:
        contents = stream.read()

    return pd.Series({
        "ssdeep": ssdeep_digest,
        "impfuzzy": impfuzzy_digest,
        "sha256": sha256(contents).hexdigest(),
    })
def main():
    """Hash every file listed in a label CSV and cluster the files.

    Expects exactly one command-line argument: the path of a CSV file that
    has a ``path`` column (read by ``calc_fuzzy_hashes``). The hash columns
    are appended to the table, all pairs of rows are compared on their
    ``impfuzzy`` hash, and the rows are clustered from those scores.

    Returns:
        The partition produced by ``do_clustering``.

    Exits with status 1 after printing a usage line when the argument count
    is wrong.
    """
    if len(sys.argv) != 2:
        # Usage errors go to stderr so they never mix with regular output.
        print("Usage: {} label.csv".format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)

    label_file = sys.argv[1]
    df = pd.read_csv(label_file)
    df = pd.concat([df, df.apply(calc_fuzzy_hashes, axis=1)], axis=1)

    # Only the impfuzzy relation feeds the clustering, so the pairwise ssdeep
    # comparison (quadratic in the number of rows) is not computed; the
    # "ssdeep" column itself is still present in df.
    rel_impfuzzy = calc_ssdeep_relation(df, "impfuzzy")

    # Hand the partition back to the caller instead of discarding it.
    return do_clustering(df.index.values, rel_impfuzzy)
# Run main() only when this file is executed as a script, not when imported.
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment