Skip to content

Instantly share code, notes, and snippets.

@ctivanovich
Last active May 9, 2018 01:14
Show Gist options
  • Save ctivanovich/061ede7481a9e8ae32148cf911ae4ded to your computer and use it in GitHub Desktop.
Save ctivanovich/061ede7481a9e8ae32148cf911ae4ded to your computer and use it in GitHub Desktop.
Predictive modeling of trading data from a cryptocurrency platform
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import HTML, display\n",
"\n",
"# Centre PNG outputs in the notebook. A bare HTML(...) expression is only rendered\n",
"# when it is the last expression of a cell, which it is not here, so it has to be\n",
"# passed to display() explicitly for the stylesheet to take effect.\n",
"display(HTML(\"\"\"\n",
"<style>\n",
".output_png {\n",
" display: table-cell;\n",
" text-align: center;\n",
" vertical-align: middle;\n",
"}\n",
"</style>\n",
"\"\"\"))\n",
"\n",
"%matplotlib inline\n",
"\n",
"import csv\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.utils import resample\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.metrics import precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from xgboost import XGBClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.svm import SVC\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def dimensioncounter(fileobj):\n",
"    \"\"\"Count CSV records and columns by streaming through the file once.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    fileobj : iterable of str\n",
"        An open text file (or any iterable of lines); counting starts at its\n",
"        current position.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple of (int, int)\n",
"        ``(nrows, ncols)``: the number of CSV records read, including the first\n",
"        one, and the number of fields in that first record. Empty input gives\n",
"        ``(0, 0)``.\n",
"    \"\"\"\n",
"    reader = csv.reader(fileobj)\n",
"    first_row = next(reader, None)\n",
"    if first_row is None:\n",
"        # Empty input: previously ncols was never bound and the return raised\n",
"        # UnboundLocalError.\n",
"        return 0, 0\n",
"    ncols = len(first_row)\n",
"    nrows = 1 + sum(1 for _ in reader)  # the 1 is the row consumed to count columns\n",
"\n",
"    return nrows, ncols\n",
"\n",
"# with open('Test_Full.csv') as f:\n",
"# print(dimensioncounter(f))\n",
"#1228 columns, 520561 rows\n",
"# with open('Training_Full.csv') as f: \n",
"# print(dimensioncounter(f))\n",
"#1229 columns, 1214641 rows, column 0 is labels column"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def sampler(name, m, n):\n",
"    \"\"\"Copy every m-th row of ``name + '.csv'`` into ``name + '_sample.csv'``.\n",
"\n",
"    Rows are numbered from 0, so the first row is always kept.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    name : str\n",
"        Path of the input file without its ``.csv`` extension.\n",
"    m : int\n",
"        Sampling stride; rows whose index is a multiple of ``m`` are written.\n",
"    n : object\n",
"        Unused; kept so that existing calls keep working.\n",
"    \"\"\"\n",
"    # The csv module expects its files to be opened with newline=''; without it\n",
"    # the writer produces an extra blank line after every row on Windows.\n",
"    with open(name+'.csv', 'r', newline='') as csv_in, open(name+'_sample.csv', 'w', newline='') as csv_out:\n",
"        reader = csv.reader(csv_in, delimiter=',')\n",
"        writer = csv.writer(csv_out)\n",
"        writer.writerows(row for i, row in enumerate(reader) if i % m == 0)\n",
"# sampler(\"Training_Full\", 100, None)\n",
"# sampler(\"Test_Full\", 1, None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#Extracting labelled data\n",
"# training_chunker = pd.read_csv(\"Training_Full.csv\", chunksize = 10000)\n",
"# train_df = pd.concat([chunk[(chunk[\"I_VOLUME_BJ_CLASS\"] == 1) | (chunk[\"I_VOLUME_BJ_CLASS\"] == -1)] for chunk in training_chunker])\n",
"# train_df.to_csv(\"Training_subset.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"train_df = pd.read_csv(\"Training_subset.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 'Unnamed: 0' is presumably the row index that was saved along with the CSV.\n",
"# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell re-runnable:\n",
"# a second execution no longer raises KeyError for the already-removed column.\n",
"train_df = train_df.drop('Unnamed: 0', axis=1, errors='ignore')\n",
"# Number of distinct values in the first column (the label column).\n",
"train_df.iloc[:, 0].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Column 0 is the target; columns 1 through 200 are taken as the feature matrix.\n",
"y = train_df.iloc[:, 0]\n",
"X = train_df.iloc[:, 1:201]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Hold out 30% of the rows. The seed is fixed so the split, and therefore every\n",
"# score reported below, is reproducible across kernel restarts.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.30, random_state=42\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Some out-of-the-box comparisons of ML classifiers"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.69441162060120476"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 3-nearest-neighbours baseline; the cell's value is the hold-out accuracy.\n",
"knn_model = KNeighborsClassifier(n_neighbors=3)\n",
"knn_model.fit(X_train, y_train)\n",
"pred = knn_model.predict(X_test)\n",
"(pred == y_test).mean()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.70392252932530186"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Gradient-boosted trees baseline; the cell's value is the hold-out accuracy.\n",
"xgb_model = XGBClassifier(learning_rate=0.03, objective='binary:logistic')\n",
"xgb_model.fit(X_train, y_train)\n",
"pred = xgb_model.predict(X_test)\n",
"(pred == y_test).mean()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7016456754186241"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# AdaBoost with default settings; the cell's value is the hold-out accuracy.\n",
"ada_model = AdaBoostClassifier()\n",
"ada_model.fit(X_train, y_train)\n",
"pred = ada_model.predict(X_test)\n",
"(pred == y_test).mean()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.66783871804478778"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Random forest with default settings; the cell's value is the hold-out accuracy.\n",
"forest_model = RandomForestClassifier()\n",
"forest_model.fit(X_train, y_train)\n",
"pred = forest_model.predict(X_test)\n",
"(pred == y_test).mean()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.57079862812346893"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"# Gaussian naive Bayes baseline; score() gives the mean accuracy on the hold-out set.\n",
"bayes = GaussianNB()\n",
"bayes.fit(X_train, y_train)\n",
"bayes.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x1f8abcf0ba8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Explained variance of the leading principal components of the training features.\n",
"# Only the fitted attributes are needed, so fit() replaces fit_transform(), whose\n",
"# transformed output was computed and then thrown away.\n",
"pca = PCA()\n",
"pca.fit(X_train)\n",
"leading_variance = pca.explained_variance_[:10]\n",
"# Derive the x-axis from the data so the plot still works with fewer than 10 components.\n",
"plt.plot(range(1, len(leading_variance) + 1), leading_variance)\n",
"plt.xlabel(\"Nth component\")\n",
"plt.ylabel(\"Explained variance\");"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Hyper-parameter values searched for the 'forest' step of the pipeline.\n",
"forest_grid = {\n",
"    'forest__criterion': ['entropy', 'gini'],\n",
"    'forest__min_samples_split': [15],\n",
"    'forest__min_samples_leaf': [50, 100],\n",
"}\n",
"# Commented-out alternatives for other pipeline steps:\n",
"#   'pca__n_components': [7, 8]\n",
"#   'ada__n_estimators': [25, 50]\n",
"#   'xgb__learning_rate': [0.03, 0.3]\n",
"#   'xgb__gamma': [0.01, 0.1]\n",
"#   'xgb__objective': ['binary:logistic']\n",
"param_grid = [forest_grid]\n",
"\n",
"# Commented-out alternative step: ('xgb', XGBClassifier())\n",
"pipeline_steps = [('forest', RandomForestClassifier())]\n",
"pipe = Pipeline(steps=pipeline_steps)\n",
"clf = GridSearchCV(pipe, param_grid)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.70251030348445109"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fit() returns the fitted search object, so the hold-out score can be chained onto it.\n",
"clf.fit(X_train, y_train).score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.69840263691683568, 0.66445543360270176, 0.68100624266023857, None)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Precision, recall and F-score of the tuned model on the hold-out set\n",
"# (the support slot of the result is None with average='binary').\n",
"holdout_pred = clf.predict(X_test)\n",
"precision_recall_fscore_support(y_test, holdout_pred, average='binary')"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Pipeline(memory=None,\n",
" steps=[('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=100, min_samples_split=15,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False))]), array([-1, 1], dtype=int64))"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.best_estimator_, clf.classes_"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"test_data = pd.read_csv(\"Test_Full.csv\", chunksize = 1000)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# One predict_proba array per chunk, computed from the chunk's first 200 columns.\n",
"predictions = [clf.predict_proba(chunk.iloc[:, 0:200]) for chunk in test_data]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(520560, 2)"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stack the per-chunk probability arrays into one frame. ignore_index=True gives every\n",
"# row a unique label; without it each chunk restarts at 0 and the labels repeat.\n",
"pred_df = pd.concat(\n",
"    [pd.DataFrame(chunk_proba) for chunk_proba in predictions],\n",
"    ignore_index=True,\n",
")\n",
"pred_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.4845007791752725"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pred_df[1].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"# Write the class-1 probability of every test row under a single 'Label' header.\n",
"# newline='' is what the csv module expects for its output file; without it each\n",
"# row is followed by a blank line on Windows.\n",
"with open('predictions.csv', 'w', newline='') as outfile:\n",
"    writer = csv.writer(outfile)\n",
"    writer.writerow(['Label'])\n",
"    for chunk in predictions:\n",
"        # row[1] is the second predict_proba column, i.e. the probability of class 1\n",
"        writer.writerows([row[1]] for row in chunk)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"probs = pd.read_csv('predictions.csv')\n",
"full_test = pd.read_csv('Test_Full.csv')\n",
"# Column assignment aligns on the index, so a length mismatch would silently\n",
"# produce NaNs; fail loudly instead.\n",
"assert len(probs) == len(full_test), 'predictions.csv and Test_Full.csv differ in row count'\n",
"# Assign the 'Label' column as a Series rather than the whole one-column DataFrame.\n",
"full_test['BUY_PROBA'] = probs['Label']"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"full_test.to_csv('labelled_test.csv')"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 518560.000000\n",
"mean 0.491775\n",
"std 0.211203\n",
"min 0.038531\n",
"25% 0.322517\n",
"50% 0.493593\n",
"75% 0.658734\n",
"max 0.975740\n",
"Name: prob_buy, dtype: float64"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The probability column is created as 'BUY_PROBA'; no cell in this notebook defines\n",
"# 'prob_buy', so the attribute access failed on a fresh Restart & Run All.\n",
"full_test['BUY_PROBA'].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment