Last active
July 24, 2020 01:56
-
-
Save sneakers-the-rat/6d39442931d88d0dd730f524005787d9 to your computer and use it in GitHub Desktop.
Kip - loading data pythonically
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"from hmmlearn import hmm\n", | |
"import warnings" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create a function to load data. Load the whole file first as a list of split strings, then convert it to a pandas DataFrame. This function can then be used to load both data files." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def load_data(fname: str, column_names: list, split_sep:str = ' ') -> pd.DataFrame:\n", | |
" \"\"\"\n", | |
" Load data to train a hidden markov model\n", | |
"\n", | |
" Args:\n", | |
" fname (str): path to the data to load\n", | |
" column_names (list): list of strings to use for column names.\n", | |
" split_sep (str): separator to use to split lines of file (default: ' ')\n", | |
"\n", | |
" Returns:\n", | |
" pandas.DataFrame loaded data\n", | |
" \"\"\"\n", | |
"\n", | |
" # load data to a list of a list of numbers as strings\n", | |
" # eg [['1.0', '2.0', '3.0\\n'], ['4.0', ... ]]\n", | |
" with open(fname, 'r') as fx:\n", | |
" data = [line.split(split_sep) for line in fx]\n", | |
" \n", | |
" # convert to pandas dataframe while casting as float\n", | |
" df = pd.DataFrame(data, columns=column_names, dtype='float')\n", | |
"\n", | |
" return df\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now use it to load both data files. If you're working in a script context it's a good idea to force yourself to make a good directory structure by building your paths programmatically." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"base_dir = \"/Users/jonny/Dropbox/code/kip\"\n", | |
"fname_x = os.path.join(base_dir, 'X.txt')\n", | |
"#fname_y = os.path.join(base_dir, 'Y.txt')\n", | |
"\n", | |
"column_names = ['speed', 'cricketspeed', 'myrange', 'azimuth']\n", | |
"\n", | |
"X = load_data(fname_x, column_names)\n", | |
"#Y = load_data(fname_y, column_names)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>speed</th>\n", | |
" <th>cricketspeed</th>\n", | |
" <th>myrange</th>\n", | |
" <th>azimuth</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.116482</td>\n", | |
" <td>112.876960</td>\n", | |
" <td>646.952990</td>\n", | |
" <td>109.681069</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.506245</td>\n", | |
" <td>100.155766</td>\n", | |
" <td>707.283616</td>\n", | |
" <td>119.768465</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0.621848</td>\n", | |
" <td>82.521869</td>\n", | |
" <td>770.370649</td>\n", | |
" <td>127.505750</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.430308</td>\n", | |
" <td>61.800171</td>\n", | |
" <td>825.746663</td>\n", | |
" <td>132.654799</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0.725347</td>\n", | |
" <td>43.819538</td>\n", | |
" <td>869.389031</td>\n", | |
" <td>135.660915</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" speed cricketspeed myrange azimuth\n", | |
"0 0.116482 112.876960 646.952990 109.681069\n", | |
"1 0.506245 100.155766 707.283616 119.768465\n", | |
"2 0.621848 82.521869 770.370649 127.505750\n", | |
"3 0.430308 61.800171 825.746663 132.654799\n", | |
"4 0.725347 43.819538 869.389031 135.660915" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Also make a function to train the model. Wrapping operations in functions is useful both to reduce repeated labor and to formalize the workflow." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def train_hmm(data: pd.DataFrame, \n", | |
" n_components:int = 2, \n", | |
" covariance_type:str =\"full\", \n", | |
" n_iter:int = 100):\n", | |
" \"\"\"\n", | |
" Train a hmm on cricket data!\n", | |
" \n", | |
" Args:\n", | |
" data (pandas.DataFrame): data to train on \n", | |
" n_components (int): n components for HMM model \n", | |
" covariance_type (str): type of covariance in HMM model \n", | |
" n_iter (int): number of iterations to train model \n", | |
"\n", | |
" Returns:\n", | |
" trained model\n", | |
" \"\"\"\n", | |
" model = hmm.GaussianHMM(\n", | |
" n_components=n_components, \n", | |
" covariance_type=covariance_type, \n", | |
" n_iter=n_iter)\n", | |
" \n", | |
" \n", | |
" print(f'Training model for {n_iter} iterations')\n", | |
" model.fit(data)\n", | |
" \n", | |
" print('converged? ', end=' ')\n", | |
" print(model.monitor_.converged)\n", | |
" \n", | |
" return model\n", | |
"\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Training model for 100 iterations\n", | |
"converged? True\n" | |
] | |
} | |
], | |
"source": [ | |
"model = train_hmm(X)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Ideally, functions should do one well-defined thing, so keep training and prediction in separate functions." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def save_array(x, out_fn):\n", | |
" \"\"\"\n", | |
" Args:\n", | |
" x (numpy.ndarray): array to save\n", | |
" out_fn (str): filename to save array, \n", | |
" extension should one of 'csv', 'pck', 'npy', or 'txt'\n", | |
" \"\"\"\n", | |
" if out_fn.endswith('.txt'):\n", | |
" with open(out_fn, 'w') as fz:\n", | |
" for row in x:\n", | |
" fz.write(f'{row}\\n')\n", | |
"\n", | |
" elif out_fn.endswith('.csv'):\n", | |
" np.savetxt(out_fn, x, delimiter=\",\")\n", | |
"\n", | |
" else:\n", | |
" if not out_fn.endswith('.pck') and not out_fn.endswith('.npy'):\n", | |
" warnings.warn(f'Extension of {out_fn} not recognized, using np.save to save as .npy')\n", | |
" \n", | |
" np.save(out_fn, x)\n", | |
" \n", | |
"\n", | |
"def predict_hmm(model, Y, out_fn = None) -> np.ndarray:\n", | |
"\n", | |
" z = model.predict(Y)\n", | |
" \n", | |
" if out_fn is not None:\n", | |
" print(f'Saving results to {out_fn}')\n", | |
" save_array(z, out_fn)\n", | |
" \n", | |
" return z\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Saving results to /Users/jonny/Dropbox/code/kip/z.csv\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 0, 0, ..., 0, 0, 0])" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"out_fn = os.path.join(base_dir, 'z.csv')\n", | |
"\n", | |
"predict_hmm(model, X, out_fn=out_fn)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment