Created
August 11, 2019 02:45
-
-
Save kohnakagawa/ecc0dfdf3e81c0a69531fb1602101bdf to your computer and use it in GitHub Desktop.
Malware Data Science chapter 8の内容をEmberで使われている特徴量で実施した場合の結果
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sys\n", | |
"import os\n", | |
"import glob\n", | |
"import sklearn\n", | |
"import ember\n", | |
"import numpy as np\n", | |
"import yara\n", | |
"import pickle" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Collect the sample paths. glob returns matches in arbitrary (filesystem)\n", | |
"# order, so sort them: the later train/test split uses a fixed seed, which is\n", | |
"# only reproducible if the sample order is the same on every run.\n", | |
"b_paths = sorted(glob.glob(os.path.join(\"data\", \"benignware\", \"*\")))\n", | |
"m_paths = sorted(glob.glob(os.path.join(\"data\", \"malware\", \"*\")))\n", | |
"# Label convention: 0 for every benignware path, 1 for every malware path.\n", | |
"labels = [0] * len(b_paths) + [1] * len(m_paths)\n", | |
"data_paths = b_paths + m_paths" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# YARA rule that matches when the file starts with \"MZ\" and the 32-bit value\n", | |
"# found at the offset stored at 0x3C equals 0x4550.\n", | |
"pe_rule_source = 'rule IsPeFile {strings:$mz = \"MZ\"condition:$mz at 0 and uint32(uint32(0x3C)) == 0x4550}'\n", | |
"rule = yara.compile(source=pe_rule_source)\n", | |
"\n", | |
"# Keep only the (path, label) pairs whose file matches the rule.\n", | |
"path_labels = [\n", | |
"    (sample_path, sample_label)\n", | |
"    for sample_path, sample_label in zip(data_paths, labels)\n", | |
"    if rule.match(sample_path)\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_bytes(path):\n", | |
"    \"\"\"Return the full contents of the file at `path` as bytes.\n", | |
"\n", | |
"    The file is opened in a `with` block so the handle is closed right away\n", | |
"    instead of staying open until garbage collection (one handle per sample).\n", | |
"    \"\"\"\n", | |
"    with open(path, \"rb\") as sample_file:\n", | |
"        return sample_file.read()\n", | |
"\n", | |
"\n", | |
"# One feature vector per PE sample, in the same order as path_labels.\n", | |
"fextractor = ember.PEFeatureExtractor()\n", | |
"fvector = np.array([fextractor.feature_vector(bytez=read_bytes(p)) for p, _ in path_labels])\n", | |
"# NOTE: this rebinds `labels` to the labels of the PE-filtered samples only.\n", | |
"# Re-running the PE filter cell after this one would zip the unfiltered\n", | |
"# data_paths with these filtered labels; re-run from the path cell instead.\n", | |
"labels = np.array([l for _, l in path_labels])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Save the feature matrix to disk (only fvector is written, not the labels).\n", | |
"with open(\"fvector.pickle\", \"wb\") as pickle_file:\n", | |
"    pickle.dump(fvector, pickle_file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"# Hold out 20% of the samples as a test split; the seed is fixed at 42.\n", | |
"X, y = fvector, labels\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
"    X, y, test_size=0.2, random_state=42\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/dist-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", | |
" \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from sklearn.metrics import accuracy_score\n", | |
"from sklearn.model_selection import KFold\n", | |
"\n", | |
"# Pin the forest size and the seed. The library default for n_estimators\n", | |
"# differs between scikit-learn versions (see the FutureWarning), and an\n", | |
"# unseeded forest gives different scores on every run.\n", | |
"RF_PARAMS = dict(n_estimators=100, random_state=42)\n", | |
"\n", | |
"# 4-fold cross-validation on the training split. Each fold gets its own\n", | |
"# freshly constructed model; `results` collects the per-fold accuracies.\n", | |
"kf = KFold(n_splits=4)\n", | |
"results = []\n", | |
"for train_idx, test_idx in kf.split(X_train, y_train):\n", | |
"    fold_clf = RandomForestClassifier(**RF_PARAMS)\n", | |
"    fold_clf.fit(X_train[train_idx], y_train[train_idx])\n", | |
"    y_pred = fold_clf.predict(X_train[test_idx])\n", | |
"    results.append(accuracy_score(y_train[test_idx], y_pred))\n", | |
"\n", | |
"# Final model: fit on the whole training split. Reusing one estimator inside\n", | |
"# the loop would leave `clf` trained on only the last fold's 3/4 of X_train.\n", | |
"clf = RandomForestClassifier(**RF_PARAMS).fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.978494623655914\n" | |
] | |
} | |
], | |
"source": [ | |
"# Accuracy of the fitted classifier on the held-out test split.\n", | |
"y_pred = clf.predict(X_test)\n", | |
"test_accuracy = accuracy_score(y_test, y_pred)\n", | |
"print(test_accuracy)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics import roc_curve, auc\n", | |
"\n", | |
"# Use column 1 of the predicted probabilities as the score fed to the ROC\n", | |
"# curve (presumably the malware class, given the 0/1 labels; check clf.classes_).\n", | |
"class_probs = clf.predict_proba(X_test)\n", | |
"y_pred_prob = class_probs[:, 1]\n", | |
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)\n", | |
"roc_auc = auc(fpr, tpr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<Figure size 640x480 with 1 Axes>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"# Draw the ROC curve on an explicit Axes, with the AUC shown in the legend.\n", | |
"fig, ax = plt.subplots()\n", | |
"ax.plot(fpr, tpr, 'b', label='AUC = %0.5f' % roc_auc)\n", | |
"ax.set_title('Receiver Operating Characteristic')\n", | |
"ax.set_xlabel('False Positive Rate')\n", | |
"ax.set_ylabel('True Positive Rate')\n", | |
"# Restrict the x-axis to the low false-positive region, 0 to 0.03.\n", | |
"ax.set_xlim([0, 0.03])\n", | |
"ax.set_ylim([0, 1])\n", | |
"ax.legend(loc='lower right')\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<matplotlib.axes._subplots.AxesSubplot at 0x7f7908ba6668>" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import seaborn as sns\n", | |
"\n", | |
"fti = clf.feature_importances_\n", | |
"\n", | |
"# Each feature group in fextractor.features occupies `dim` consecutive slots\n", | |
"# of the importance vector; sum every group's slice to get one value per group.\n", | |
"importances = []\n", | |
"names = []\n", | |
"offset = 0\n", | |
"for feature in fextractor.features:\n", | |
"    group_end = offset + feature.dim\n", | |
"    importances.append(np.sum(fti[offset:group_end]))\n", | |
"    names.append(feature.name)\n", | |
"    offset = group_end\n", | |
"\n", | |
"sns.barplot(x=importances, y=names)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
AUCを比較すると、Emberのほうが優れていた。
Malware Data Scienceのモデルだと AUC が0.9951
Emberだと AUC が0.9972