{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'\n",
"\n",
"import numpy as np\n",
"import json\n",
"import pandas as pd\n",
"import tensorflow.keras as keras\n",
"from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
"from tensorflow.keras.models import Model, load_model, Sequential\n",
"from tensorflow.keras.layers import Dense, BatchNormalization, Input, Dropout, Activation\n",
"from tensorflow.keras.models import load_model\n",
"import tensorflow as tf # 1.11.0\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import confusion_matrix\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import matplotlib\n",
"matplotlib.rcParams['figure.figsize'] = (16, 9)\n",
"sns.set(font_scale=1.5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config = tf.ConfigProto()\n",
"config.gpu_options.allow_growth = True\n",
"sess = tf.Session(config=config)\n",
"tf.keras.backend.set_session(sess)\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"banned = [\n",
"    'WRITE_CALL_LOG',\n",
"    'WRITE_EXTERNAL_STORAGE',\n",
"    'READ_CALL_LOG',\n",
"    'READ_EXTERNAL_STORAGE',\n",
"    'READ_PHONE_STATE',\n",
"    'WRITE_SETTINGS',\n",
"    'GET_ACCOUNTS',\n",
"    'SYSTEM_ALERT_WINDOW',\n",
"    'READ_SETTINGS',\n",
"    'PERMISSIONS',\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = list()\n",
"with open('dataset281118.dat') as f:\n",
"    for line in f:\n",
"        dataset.append(json.loads(line.strip()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features = json.load(open('features.json'))['config']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"d = list()\n",
"for i in dataset:\n",
"    s = dict()\n",
"    for f in features:\n",
"        value = i['features'][f['name']]\n",
"        if f['clipUpper'] is not None:\n",
"            value = min(value, f['clipUpper'])\n",
"        if f['clipLower'] is not None:\n",
"            value = max(value, f['clipLower'])\n",
"        s[f['name']] = value\n",
"    s['verdict'] = 0 if i['verdict'] == 'clean' else 1\n",
"    d.append(s)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(d, columns=d[0].keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = df.drop('verdict', axis='columns')\n",
"Y = df.verdict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train, Y_train = X[:800000].values.astype(np.float32), Y[:800000].values\n",
"X_valid, Y_valid = X[800000:900000].values.astype(np.float32), Y[800000:900000].values\n",
"X_test, Y_test = X[900000:].values.astype(np.float32), Y[900000:].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train_scaled = X_train.copy()\n",
"X_valid_scaled = X_valid.copy()\n",
"X_test_scaled = X_test.copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for idx in range(len(features)):\n",
"    if features[idx]['type'] in ('number', 'count'):\n",
"        scaler = StandardScaler()\n",
"        scaler.fit(X_train[:, idx:idx+1])\n",
"        scaler.mean_ = np.float32(scaler.mean_)\n",
"        scaler.scale_ = np.float32(scaler.scale_)\n",
"        features[idx]['mean'] = float(scaler.mean_[0])\n",
"        features[idx]['scale'] = float(scaler.scale_[0])\n",
"        X_train_scaled[:, idx:idx+1] = scaler.transform(X_train_scaled[:, idx:idx+1])\n",
"        X_valid_scaled[:, idx:idx+1] = scaler.transform(X_valid_scaled[:, idx:idx+1])\n",
"        X_test_scaled[:, idx:idx+1] = scaler.transform(X_test_scaled[:, idx:idx+1])\n",
"    else:\n",
"        features[idx]['mean'] = None\n",
"        features[idx]['scale'] = None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conf = dict(config=features)\n",
"with open('features.json', 'w') as f:\n",
"    json.dump(conf, f, separators=(',', ':'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train():\n",
"    nn = Sequential([\n",
"        Dense(256, input_dim=len(features)),\n",
"        Activation('relu'),\n",
"        BatchNormalization(),\n",
"        Dropout(0.2),\n",
"        Dense(256),\n",
"        Activation('relu'),\n",
"        BatchNormalization(),\n",
"        Dropout(0.5),\n",
"        Dense(128),\n",
"        Activation('relu'),\n",
"        BatchNormalization(),\n",
"        Dropout(0.5),\n",
"        Dense(64),\n",
"        Activation('relu'),\n",
"        BatchNormalization(),\n",
"        Dense(64),\n",
"        Activation('relu'),\n",
"        BatchNormalization(),\n",
"        Dense(1),\n",
"        Activation('sigmoid')\n",
"    ])\n",
"    nn.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'], weighted_metrics=['accuracy'])\n",
"    callbacks = list()\n",
"    callbacks.append(EarlyStopping(monitor='val_loss', min_delta=0, patience=3))\n",
"    callbacks.append(ModelCheckpoint('earl_best.h5', monitor='val_acc', save_weights_only=False, save_best_only=True))\n",
"\n",
"    nn.fit(X_train_scaled, Y_train, epochs=100, batch_size=32, validation_data=(X_valid_scaled, Y_valid), callbacks=callbacks)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train()\n",
"nn = load_model('earl_best.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nn.evaluate(X_valid_scaled, Y_valid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nn.evaluate(X_train_scaled, Y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nn.evaluate(X_test_scaled, Y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"label = ['clean', 'trojan']\n",
"matrix = confusion_matrix(Y_test, np.round(nn.predict(X_test_scaled).flatten()))\n",
"percent = np.zeros((2,2))\n",
"for i in range(2):\n",
"    s = sum(matrix[i])\n",
"    for j in range(2):\n",
"        percent[i][j] = matrix[i][j].item() * 100.0 / s\n",
"cm = pd.DataFrame(percent, columns=label, index=label)\n",
"sns.heatmap(cm, annot=True, fmt='.2f')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import roc_auc_score, roc_curve\n",
"auc = roc_auc_score(Y_test, np.round(nn.predict(X_test_scaled).flatten()))\n",
"fpr, tpr, thresholds = roc_curve(Y_test, np.round(nn.predict(X_test_scaled).flatten()))\n",
"plt.plot([0, 1], [0, 1], linestyle='--')\n",
"plt.plot(fpr, tpr, marker='.')\n",
"plt.title(auc)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Convert"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"converter = tf.contrib.lite.TocoConverter.from_keras_model_file('earl_best.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tflite_model = converter.convert()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"open('earl_model.tflite', \"wb\").write(tflite_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = tf.contrib.lite.Interpreter(f'earl_model.tflite')\n",
"model.allocate_tensors()\n",
"model_in = model.get_input_details()[0]\n",
"model_out = model.get_output_details()[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def predict(model, i, o, x):\n",
"    model.set_tensor(i['index'], np.float32(x.reshape((1, len(features)))))\n",
"    model.invoke()\n",
"    y = model.get_tensor(o['index'])\n",
"    return y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.get_input_details()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"x = open('dataset281118.dat')\n",
"q = []\n",
"for i in range(4):\n",
" s = json.loads(x.readline().strip())\n",
" l = []\n",
" for j in X_train_scaled[i]:\n",
" l.append(float(j))\n",
" d = dict(expected=float(predict(model, model_in, model_out, X_train_scaled[i])[0][0]), md5=s['md5'], verdict=float(Y_train[i]), features=l)\n",
" q.append(d)\n",
"with open('test.json', 'w') as f:\n",
" json.dump(q, f)\n",
"x.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idx = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train[idx].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train_scaled[idx]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predict(model, model_in, model_out, X_train_scaled[idx])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}