Skip to content

Instantly share code, notes, and snippets.

@kohnakagawa
Created August 11, 2019 03:28
Show Gist options
  • Select an option

  • Save kohnakagawa/bb1994cdd1fbde10e338f66d81d01deb to your computer and use it in GitHub Desktop.

Select an option

Save kohnakagawa/bb1994cdd1fbde10e338f66d81d01deb to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import glob\n",
"import sklearn\n",
"import ember\n",
"import numpy as np\n",
"import yara\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Collect the sample paths. glob returns entries in arbitrary, filesystem-dependent\n",
"# order, so sort them to keep the later random_state=42 split reproducible, and\n",
"# keep regular files only so stray sub-directories are never handed to open().\n",
"b_paths = sorted(p for p in glob.glob(os.path.join(\"data\", \"benignware\", \"*\")) if os.path.isfile(p))\n",
"m_paths = sorted(p for p in glob.glob(os.path.join(\"data\", \"malware\", \"*\")) if os.path.isfile(p))\n",
"\n",
"# Label convention used throughout the notebook: 0 = benignware, 1 = malware.\n",
"labels = [0] * len(b_paths) + [1] * len(m_paths)\n",
"data_paths = b_paths + m_paths"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# YARA rule: the file starts with \"MZ\" and the dword found at the offset stored\n",
"# at 0x3C equals 0x4550. Only (path, label) pairs whose file matches are kept.\n",
"rule = yara.compile(\n",
"    source='rule IsPeFile {strings:$mz = \"MZ\"condition:$mz at 0 and uint32(uint32(0x3C)) == 0x4550}'\n",
")\n",
"path_labels = [pair for pair in zip(data_paths, labels) if rule.match(pair[0])]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def read_bytes(path):\n",
"    \"\"\"Return the raw contents of *path*, closing the file handle right away.\"\"\"\n",
"    with open(path, \"rb\") as fh:\n",
"        return fh.read()\n",
"\n",
"\n",
"fextractor = ember.PEFeatureExtractor()\n",
"# Only the first 256 entries of each ember feature vector are used as features\n",
"# (presumably the byte-histogram group -- confirm against the ember version in use).\n",
"fvector = np.array([fextractor.feature_vector(bytez=read_bytes(p))[0:256] for p, _ in path_labels])\n",
"labels = np.array([l for _, l in path_labels])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Persist the extracted feature matrix to fvector.pickle.\n",
"with open(\"fvector.pickle\", \"wb\") as pickle_file:\n",
"    pickle.dump(fvector, pickle_file)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the samples as the test split, using a fixed seed.\n",
"X, y = fvector, labels\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
" \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import KFold\n",
"\n",
"# 4-fold cross-validation on the training split. Every fold gets its own\n",
"# classifier; `results` collects the per-fold validation accuracies.\n",
"kf = KFold(n_splits=4)\n",
"results = []\n",
"for train_idx, test_idx in kf.split(X_train, y_train):\n",
"    fold_clf = RandomForestClassifier()\n",
"    fold_clf.fit(X_train[train_idx], y_train[train_idx])\n",
"    y_pred = fold_clf.predict(X_train[test_idx])\n",
"    results.append(accuracy_score(y_train[test_idx], y_pred))\n",
"\n",
"# The model evaluated by the following cells is fitted on the whole training\n",
"# split. Reusing one classifier inside the loop would leave `clf` trained on\n",
"# just the last fold's 3/4 subset.\n",
"clf = RandomForestClassifier().fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.96415770609319\n"
]
}
],
"source": [
"# Accuracy of the fitted classifier on the held-out test split.\n",
"y_pred = clf.predict(X_test)\n",
"test_accuracy = accuracy_score(y_test, y_pred)\n",
"print(test_accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9974856321839081\n"
]
}
],
"source": [
"from sklearn.metrics import auc, roc_curve\n",
"\n",
"# Use the second predict_proba column as the score, build the ROC curve on the\n",
"# test split and print the area under it.\n",
"y_pred_prob = clf.predict_proba(X_test)[:, 1]\n",
"fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_prob)\n",
"roc_auc = auc(x=fpr, y=tpr)\n",
"print(roc_auc)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# ROC curve with the x-axis restricted to false-positive rates in [0, 0.03].\n",
"plt.plot(fpr, tpr, 'b', label='AUC = %0.5f' % roc_auc)\n",
"plt.title('Receiver Operating Characteristic')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.xlim([0, 0.03])\n",
"plt.ylim([0, 1])\n",
"plt.legend(loc='lower right')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment