{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n",
    "\n",
    "import numpy as np\n",
    "import json\n",
    "import pandas as pd\n",
    "import tensorflow.keras as keras\n",
    "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from tensorflow.keras.models import Model, load_model, Sequential\n",
    "from tensorflow.keras.layers import Dense, BatchNormalization, Input, Dropout, Activation\n",
    "import tensorflow as tf  # 1.11.0\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import matplotlib\n",
    "matplotlib.rcParams['figure.figsize'] = (16, 9)\n",
    "sns.set(font_scale=1.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let the GPU allocator grow on demand instead of claiming all memory up front (TF 1.x API).\n",
    "config = tf.ConfigProto()\n",
    "config.gpu_options.allow_growth = True\n",
    "sess = tf.Session(config=config)\n",
    "tf.keras.backend.set_session(sess)\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "banned = [\n",
    "    'WRITE_CALL_LOG',\n",
    "    'WRITE_EXTERNAL_STORAGE',\n",
    "    'READ_CALL_LOG',\n",
    "    'READ_EXTERNAL_STORAGE',\n",
    "    'READ_PHONE_STATE',\n",
    "    'WRITE_SETTINGS',\n",
    "    'GET_ACCOUNTS',\n",
    "    'SYSTEM_ALERT_WINDOW',\n",
    "    'READ_SETTINGS',\n",
    "    'PERMISSIONS',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One JSON record per line.\n",
    "dataset = list()\n",
    "with open('dataset281118.dat') as f:\n",
    "    for line in f:\n",
    "        dataset.append(json.loads(line.strip()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('features.json') as f:\n",
    "    features = json.load(f)['config']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one flat record per sample, clipping each feature to its configured bounds.\n",
    "d = list()\n",
    "for i in dataset:\n",
    "    s = dict()\n",
    "    for f in features:\n",
    "        value = i['features'][f['name']]\n",
    "        if f['clipUpper'] is not None:\n",
    "            value = min(value, f['clipUpper'])\n",
    "        if f['clipLower'] is not None:\n",
    "            value = max(value, f['clipLower'])\n",
    "        s[f['name']] = value\n",
    "    s['verdict'] = 0 if i['verdict'] == 'clean' else 1\n",
    "    d.append(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(d, columns=d[0].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df.drop('verdict', axis='columns')\n",
    "Y = df.verdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, Y_train = X[:800000].values.astype(np.float32), Y[:800000].values\n",
    "X_valid, Y_valid = X[800000:900000].values.astype(np.float32), Y[800000:900000].values\n",
    "X_test, Y_test = X[900000:].values.astype(np.float32), Y[900000:].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_scaled = X_train.copy()\n",
    "X_valid_scaled = X_valid.copy()\n",
    "X_test_scaled = X_test.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardize numeric features, fitting the scaler on the training split only;\n",
    "# record each mean/scale so the transform can be reproduced outside this notebook.\n",
    "for idx in range(len(features)):\n",
    "    if features[idx]['type'] in ('number', 'count'):\n",
    "        scaler = StandardScaler()\n",
    "        scaler.fit(X_train[:, idx:idx+1])\n",
    "        scaler.mean_ = np.float32(scaler.mean_)\n",
    "        scaler.scale_ = np.float32(scaler.scale_)\n",
    "        features[idx]['mean'] = float(scaler.mean_[0])\n",
    "        features[idx]['scale'] = float(scaler.scale_[0])\n",
    "        X_train_scaled[:, idx:idx+1] = scaler.transform(X_train_scaled[:, idx:idx+1])\n",
    "        X_valid_scaled[:, idx:idx+1] = scaler.transform(X_valid_scaled[:, idx:idx+1])\n",
    "        X_test_scaled[:, idx:idx+1] = scaler.transform(X_test_scaled[:, idx:idx+1])\n",
    "    else:\n",
    "        features[idx]['mean'] = None\n",
    "        features[idx]['scale'] = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conf = dict(config=features)\n",
    "with open('features.json', 'w') as f:\n",
    "    json.dump(conf, f, separators=(',', ':'))"
   ]
  },
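  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The clipping bounds and the mean/scale values written to `features.json` above let a client reproduce the exact preprocessing at inference time. The cell below is a minimal sketch of that, not part of the original pipeline; `raw` stands for a hypothetical dict mapping feature names to unscaled values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(raw, features):\n",
    "    # Clip, then standardize, exactly as done on the training data above.\n",
    "    x = np.empty(len(features), dtype=np.float32)\n",
    "    for idx, f in enumerate(features):\n",
    "        value = raw[f['name']]\n",
    "        if f['clipUpper'] is not None:\n",
    "            value = min(value, f['clipUpper'])\n",
    "        if f['clipLower'] is not None:\n",
    "            value = max(value, f['clipLower'])\n",
    "        if f['mean'] is not None:  # only 'number'/'count' features were standardized\n",
    "            value = (value - f['mean']) / f['scale']\n",
    "        x[idx] = value\n",
    "    return x"
   ]
  },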
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train():\n",
    "    nn = Sequential([\n",
    "        Dense(256, input_dim=len(features)),\n",
    "        Activation('relu'),\n",
    "        BatchNormalization(),\n",
    "        Dropout(0.2),\n",
    "        Dense(256),\n",
    "        Activation('relu'),\n",
    "        BatchNormalization(),\n",
    "        Dropout(0.5),\n",
    "        Dense(128),\n",
    "        Activation('relu'),\n",
    "        BatchNormalization(),\n",
    "        Dropout(0.5),\n",
    "        Dense(64),\n",
    "        Activation('relu'),\n",
    "        BatchNormalization(),\n",
    "        Dense(64),\n",
    "        Activation('relu'),\n",
    "        BatchNormalization(),\n",
    "        Dense(1),\n",
    "        Activation('sigmoid')\n",
    "    ])\n",
    "    nn.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'], weighted_metrics=['accuracy'])\n",
    "    callbacks = list()\n",
    "    # Stop when validation loss plateaus; keep the checkpoint with the best validation accuracy.\n",
    "    callbacks.append(EarlyStopping(monitor='val_loss', min_delta=0, patience=3))\n",
    "    callbacks.append(ModelCheckpoint('earl_best.h5', monitor='val_acc', save_weights_only=False, save_best_only=True))\n",
    "\n",
    "    nn.fit(X_train_scaled, Y_train, epochs=100, batch_size=32, validation_data=(X_valid_scaled, Y_valid), callbacks=callbacks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train()\n",
    "nn = load_model('earl_best.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nn.evaluate(X_valid_scaled, Y_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nn.evaluate(X_train_scaled, Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nn.evaluate(X_test_scaled, Y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row-normalized confusion matrix: percentages per true class.\n",
    "label = ['clean', 'trojan']\n",
    "matrix = confusion_matrix(Y_test, np.round(nn.predict(X_test_scaled).flatten()))\n",
    "percent = np.zeros((2, 2))\n",
    "for i in range(2):\n",
    "    s = sum(matrix[i])\n",
    "    for j in range(2):\n",
    "        percent[i][j] = matrix[i][j].item() * 100.0 / s\n",
    "cm = pd.DataFrame(percent, columns=label, index=label)\n",
    "sns.heatmap(cm, annot=True, fmt='.2f')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score, roc_curve\n",
    "# Use the raw predicted probabilities: rounding them first would collapse the\n",
    "# ROC curve to a single 0.5-threshold operating point.\n",
    "probs = nn.predict(X_test_scaled).flatten()\n",
    "auc = roc_auc_score(Y_test, probs)\n",
    "fpr, tpr, thresholds = roc_curve(Y_test, probs)\n",
    "plt.plot([0, 1], [0, 1], linestyle='--')\n",
    "plt.plot(fpr, tpr, marker='.')\n",
    "plt.title('AUC = %.4f' % auc)\n",
    "plt.show()"
   ]
  },
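  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With raw probabilities available, the ROC data can also suggest an operating threshold. A minimal sketch (not in the original notebook) using Youden's J statistic, i.e. the threshold maximizing TPR - FPR, reusing `fpr`, `tpr` and `thresholds` from the cell above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick the ROC point furthest above the diagonal (Youden's J = TPR - FPR).\n",
    "best = np.argmax(tpr - fpr)\n",
    "print('threshold=%.4f  tpr=%.4f  fpr=%.4f' % (thresholds[best], tpr[best], fpr[best]))"
   ]
  },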
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TF 1.11 TFLite converter; in later releases this moved to tf.lite.TFLiteConverter.\n",
    "converter = tf.contrib.lite.TocoConverter.from_keras_model_file('earl_best.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tflite_model = converter.convert()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('earl_model.tflite', 'wb') as f:\n",
    "    f.write(tflite_model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = tf.contrib.lite.Interpreter(model_path='earl_model.tflite')\n",
    "model.allocate_tensors()\n",
    "model_in = model.get_input_details()[0]\n",
    "model_out = model.get_output_details()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(model, i, o, x):\n",
    "    # Feed one sample through the TFLite interpreter and return its output tensor.\n",
    "    model.set_tensor(i['index'], np.float32(x.reshape((1, len(features)))))\n",
    "    model.invoke()\n",
    "    y = model.get_tensor(o['index'])\n",
    "    return y"
   ]
  },
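  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a sanity check (not in the original notebook), the interpreter's outputs can be compared against the Keras model on a few test samples; the two should agree to within float32 rounding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare TFLite vs. Keras predictions on the first few test samples.\n",
    "for i in range(5):\n",
    "    lite = float(predict(model, model_in, model_out, X_test_scaled[i])[0][0])\n",
    "    full = float(nn.predict(X_test_scaled[i:i+1]).flatten()[0])\n",
    "    print('sample %d: tflite=%.6f  keras=%.6f  diff=%.2e' % (i, lite, full, abs(lite - full)))"
   ]
  },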
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_input_details()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Dump the first few samples, with their expected model outputs, as test vectors.\n",
    "q = []\n",
    "with open('dataset281118.dat') as x:\n",
    "    for i in range(4):\n",
    "        s = json.loads(x.readline().strip())\n",
    "        l = [float(j) for j in X_train_scaled[i]]\n",
    "        d = dict(expected=float(predict(model, model_in, model_out, X_train_scaled[i])[0][0]), md5=s['md5'], verdict=float(Y_train[i]), features=l)\n",
    "        q.append(d)\n",
    "with open('test.json', 'w') as f:\n",
    "    json.dump(q, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train[idx].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_scaled[idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predict(model, model_in, model_out, X_train_scaled[idx])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}