sneakers-the-rat · July 24, 2020 01:56
diff --git a/kip_hmm.ipynb b/kip_hmm.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from hmmlearn import hmm\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a function to load data. Load the whole file first as a list of split strings, then convert it to a pandas DataFrame. This function can then be used to load both data files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_data(fname: str, column_names: list, split_sep:str = ' ') -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Load delimited numeric data to train a hidden markov model\n",
    "\n",
    "    Every non-blank line of the file becomes one row; its fields are split on\n",
    "    ``split_sep`` and converted to float.\n",
    "\n",
    "    Args:\n",
    "        fname (str): path to the data to load\n",
    "        column_names (list): list of strings to use for column names.\n",
    "        split_sep (str): separator to use to split lines of file (default: ' ')\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame: loaded data, cast to float\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a field cannot be converted to float\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    with open(fname, 'r') as fx:\n",
    "        for line in fx:\n",
    "            # drop the line terminator so it never ends up inside the last field\n",
    "            line = line.rstrip('\\r\\n')\n",
    "            # blank lines (e.g. a trailing newline at the end of the file) hold no data\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            rows.append([float(field) for field in line.split(split_sep)])\n",
    "\n",
    "    # convert to pandas dataframe, keeping every column as float\n",
    "    df = pd.DataFrame(rows, columns=column_names, dtype='float')\n",
    "\n",
    "    return df\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now use it to load both data files. If you're working in a script context it's a good idea to force yourself to make a good directory structure by building your paths programmatically."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the data files; set KIP_DATA_DIR to point somewhere else\n",
    "# without editing the notebook (defaults to the path used when this notebook was run).\n",
    "base_dir = os.environ.get('KIP_DATA_DIR', \"/Users/jonny/Dropbox/code/kip\")\n",
    "fname_x = os.path.join(base_dir, 'X.txt')\n",
    "#fname_y = os.path.join(base_dir, 'Y.txt')\n",
    "\n",
    "# names are assigned to the file's columns in this order\n",
    "column_names = ['speed', 'cricketspeed', 'myrange', 'azimuth']\n",
    "\n",
    "X = load_data(fname_x, column_names)\n",
    "#Y = load_data(fname_y, column_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>speed</th>\n",
       "      <th>cricketspeed</th>\n",
       "      <th>myrange</th>\n",
       "      <th>azimuth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.116482</td>\n",
       "      <td>112.876960</td>\n",
       "      <td>646.952990</td>\n",
       "      <td>109.681069</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.506245</td>\n",
       "      <td>100.155766</td>\n",
       "      <td>707.283616</td>\n",
       "      <td>119.768465</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.621848</td>\n",
       "      <td>82.521869</td>\n",
       "      <td>770.370649</td>\n",
       "      <td>127.505750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.430308</td>\n",
       "      <td>61.800171</td>\n",
       "      <td>825.746663</td>\n",
       "      <td>132.654799</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.725347</td>\n",
       "      <td>43.819538</td>\n",
       "      <td>869.389031</td>\n",
       "      <td>135.660915</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      speed  cricketspeed     myrange     azimuth\n",
       "0  0.116482    112.876960  646.952990  109.681069\n",
       "1  0.506245    100.155766  707.283616  119.768465\n",
       "2  0.621848     82.521869  770.370649  127.505750\n",
       "3  0.430308     61.800171  825.746663  132.654799\n",
       "4  0.725347     43.819538  869.389031  135.660915"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# preview the first rows of the loaded data\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Also make a function to train the model. Wrapping operations in functions is useful both to reduce repeated labor and to formalize the workflow."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_hmm(data: pd.DataFrame,\n",
    "              n_components: int = 2,\n",
    "              covariance_type: str = \"full\",\n",
    "              n_iter: int = 100,\n",
    "              random_state=None):\n",
    "    \"\"\"\n",
    "    Train a hmm on cricket data!\n",
    "\n",
    "    Args:\n",
    "        data (pandas.DataFrame): data to train on\n",
    "        n_components (int): n components for HMM model\n",
    "        covariance_type (str): type of covariance in HMM model\n",
    "        n_iter (int): maximum number of iterations to train model\n",
    "        random_state: seed or random state forwarded to ``hmm.GaussianHMM``.\n",
    "            Pass an int to make training repeatable; the default (None)\n",
    "            leaves the model unseeded, as before.\n",
    "\n",
    "    Returns:\n",
    "        trained ``hmm.GaussianHMM`` model\n",
    "    \"\"\"\n",
    "    model = hmm.GaussianHMM(\n",
    "        n_components=n_components,\n",
    "        covariance_type=covariance_type,\n",
    "        n_iter=n_iter,\n",
    "        random_state=random_state)\n",
    "\n",
    "    print(f'Training model for {n_iter} iterations')\n",
    "    model.fit(data)\n",
    "\n",
    "    # report the convergence flag tracked by the model's monitor\n",
    "    print('converged? ', end=' ')\n",
    "    print(model.monitor_.converged)\n",
    "\n",
    "    return model\n",
    "\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training model for 100 iterations\n",
      "converged?  True\n"
     ]
    }
   ],
   "source": [
    "# fit the HMM to the loaded data using train_hmm's default settings\n",
    "model = train_hmm(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ideally, functions should do one well-defined thing, so keep training and prediction separate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_array(x, out_fn):\n",
    "    \"\"\"\n",
    "    Save an array to disk in a format chosen from the filename extension\n",
    "\n",
    "    Args:\n",
    "        x (numpy.ndarray): array to save\n",
    "        out_fn (str): filename to save array,\n",
    "            extension should be one of 'csv', 'pck', 'npy', or 'txt'\n",
    "\n",
    "            * ``.txt`` - one row of ``x`` per line\n",
    "            * ``.csv`` - comma-delimited text via ``np.savetxt``\n",
    "            * ``.pck`` - pickled to exactly ``out_fn``\n",
    "            * ``.npy`` - numpy binary format via ``np.save``\n",
    "\n",
    "            Any other extension triggers a warning and falls back to ``np.save``.\n",
    "    \"\"\"\n",
    "    import pickle\n",
    "\n",
    "    if out_fn.endswith('.txt'):\n",
    "        with open(out_fn, 'w') as fz:\n",
    "            fz.writelines(f'{row}\\n' for row in x)\n",
    "\n",
    "    elif out_fn.endswith('.csv'):\n",
    "        np.savetxt(out_fn, x, delimiter=\",\")\n",
    "\n",
    "    elif out_fn.endswith('.pck'):\n",
    "        # np.save would append '.npy' and write numpy's format, so a '.pck'\n",
    "        # request is pickled explicitly to land at the requested path\n",
    "        with open(out_fn, 'wb') as fz:\n",
    "            pickle.dump(x, fz)\n",
    "\n",
    "    else:\n",
    "        if not out_fn.endswith('.npy'):\n",
    "            warnings.warn(f'Extension of {out_fn} not recognized, using np.save to save as .npy')\n",
    "\n",
    "        np.save(out_fn, x)\n",
    "    \n",
    "\n",
    "def predict_hmm(model, Y, out_fn = None) -> np.ndarray:\n",
    "    \"\"\"\n",
    "    Predict with a trained model, optionally saving the result\n",
    "\n",
    "    Args:\n",
    "        model: object with a ``predict`` method (e.g. the model returned by train_hmm)\n",
    "        Y: data passed to ``model.predict``\n",
    "        out_fn (str): if given, the prediction is also written there with save_array\n",
    "\n",
    "    Returns:\n",
    "        whatever ``model.predict(Y)`` returns\n",
    "    \"\"\"\n",
    "    states = model.predict(Y)\n",
    "\n",
    "    # nothing to write: hand the prediction straight back\n",
    "    if out_fn is None:\n",
    "        return states\n",
    "\n",
    "    print(f'Saving results to {out_fn}')\n",
    "    save_array(states, out_fn)\n",
    "    return states\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving results to /Users/jonny/Dropbox/code/kip/z.csv\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, ..., 0, 0, 0])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# write the prediction into the same directory as the input data\n",
    "out_fn = os.path.join(base_dir, 'z.csv')\n",
    "\n",
    "# prediction is run on X, the same data the model was trained on\n",
    "predict_hmm(model, X, out_fn=out_fn)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"from hmmlearn import hmm\n",
	"import warnings"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create a function to load data. Load the whole file first as a list of split strings, then convert it to a pandas DataFrame. This function can then be used to load both data files."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def load_data(fname: str, column_names: list, split_sep:str = ' ') -> pd.DataFrame:\n",
	"    \"\"\"\n",
	"    Load delimited numeric data to train a hidden markov model\n",
	"\n",
	"    Every non-blank line of the file becomes one row; its fields are split on\n",
	"    ``split_sep`` and converted to float.\n",
	"\n",
	"    Args:\n",
	"        fname (str): path to the data to load\n",
	"        column_names (list): list of strings to use for column names.\n",
	"        split_sep (str): separator to use to split lines of file (default: ' ')\n",
	"\n",
	"    Returns:\n",
	"        pandas.DataFrame: loaded data, cast to float\n",
	"\n",
	"    Raises:\n",
	"        ValueError: if a field cannot be converted to float\n",
	"    \"\"\"\n",
	"    rows = []\n",
	"    with open(fname, 'r') as fx:\n",
	"        for line in fx:\n",
	"            # drop the line terminator so it never ends up inside the last field\n",
	"            line = line.rstrip('\\r\\n')\n",
	"            # blank lines (e.g. a trailing newline at the end of the file) hold no data\n",
	"            if not line.strip():\n",
	"                continue\n",
	"            rows.append([float(field) for field in line.split(split_sep)])\n",
	"\n",
	"    # convert to pandas dataframe, keeping every column as float\n",
	"    df = pd.DataFrame(rows, columns=column_names, dtype='float')\n",
	"\n",
	"    return df\n",
	"\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now use it to load both data files. If you're working in a script context it's a good idea to force yourself to make a good directory structure by building your paths programmatically."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Directory holding the data files; set KIP_DATA_DIR to point somewhere else\n",
	"# without editing the notebook (defaults to the path used when this notebook was run).\n",
	"base_dir = os.environ.get('KIP_DATA_DIR', \"/Users/jonny/Dropbox/code/kip\")\n",
	"fname_x = os.path.join(base_dir, 'X.txt')\n",
	"#fname_y = os.path.join(base_dir, 'Y.txt')\n",
	"\n",
	"# names are assigned to the file's columns in this order\n",
	"column_names = ['speed', 'cricketspeed', 'myrange', 'azimuth']\n",
	"\n",
	"X = load_data(fname_x, column_names)\n",
	"#Y = load_data(fname_y, column_names)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>speed</th>\n",
	" <th>cricketspeed</th>\n",
	" <th>myrange</th>\n",
	" <th>azimuth</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0.116482</td>\n",
	" <td>112.876960</td>\n",
	" <td>646.952990</td>\n",
	" <td>109.681069</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>0.506245</td>\n",
	" <td>100.155766</td>\n",
	" <td>707.283616</td>\n",
	" <td>119.768465</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>0.621848</td>\n",
	" <td>82.521869</td>\n",
	" <td>770.370649</td>\n",
	" <td>127.505750</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>0.430308</td>\n",
	" <td>61.800171</td>\n",
	" <td>825.746663</td>\n",
	" <td>132.654799</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>0.725347</td>\n",
	" <td>43.819538</td>\n",
	" <td>869.389031</td>\n",
	" <td>135.660915</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" speed cricketspeed myrange azimuth\n",
	"0 0.116482 112.876960 646.952990 109.681069\n",
	"1 0.506245 100.155766 707.283616 119.768465\n",
	"2 0.621848 82.521869 770.370649 127.505750\n",
	"3 0.430308 61.800171 825.746663 132.654799\n",
	"4 0.725347 43.819538 869.389031 135.660915"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# preview the first rows of the loaded data\n",
	"X.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Also make a function to train the model. Wrapping operations in functions is useful both to reduce repeated labor and to formalize the workflow."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"def train_hmm(data: pd.DataFrame,\n",
	"              n_components: int = 2,\n",
	"              covariance_type: str = \"full\",\n",
	"              n_iter: int = 100,\n",
	"              random_state=None):\n",
	"    \"\"\"\n",
	"    Train a hmm on cricket data!\n",
	"\n",
	"    Args:\n",
	"        data (pandas.DataFrame): data to train on\n",
	"        n_components (int): n components for HMM model\n",
	"        covariance_type (str): type of covariance in HMM model\n",
	"        n_iter (int): maximum number of iterations to train model\n",
	"        random_state: seed or random state forwarded to ``hmm.GaussianHMM``.\n",
	"            Pass an int to make training repeatable; the default (None)\n",
	"            leaves the model unseeded, as before.\n",
	"\n",
	"    Returns:\n",
	"        trained ``hmm.GaussianHMM`` model\n",
	"    \"\"\"\n",
	"    model = hmm.GaussianHMM(\n",
	"        n_components=n_components,\n",
	"        covariance_type=covariance_type,\n",
	"        n_iter=n_iter,\n",
	"        random_state=random_state)\n",
	"\n",
	"    print(f'Training model for {n_iter} iterations')\n",
	"    model.fit(data)\n",
	"\n",
	"    # report the convergence flag tracked by the model's monitor\n",
	"    print('converged? ', end=' ')\n",
	"    print(model.monitor_.converged)\n",
	"\n",
	"    return model\n",
	"\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Training model for 100 iterations\n",
	"converged? True\n"
	]
	}
	],
	"source": [
	"# fit the HMM to the loaded data using train_hmm's default settings\n",
	"model = train_hmm(X)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Ideally, functions should do one well-defined thing, so keep training and prediction separate."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"def save_array(x, out_fn):\n",
	"    \"\"\"\n",
	"    Save an array to disk in a format chosen from the filename extension\n",
	"\n",
	"    Args:\n",
	"        x (numpy.ndarray): array to save\n",
	"        out_fn (str): filename to save array,\n",
	"            extension should be one of 'csv', 'pck', 'npy', or 'txt'\n",
	"\n",
	"            * ``.txt`` - one row of ``x`` per line\n",
	"            * ``.csv`` - comma-delimited text via ``np.savetxt``\n",
	"            * ``.pck`` - pickled to exactly ``out_fn``\n",
	"            * ``.npy`` - numpy binary format via ``np.save``\n",
	"\n",
	"            Any other extension triggers a warning and falls back to ``np.save``.\n",
	"    \"\"\"\n",
	"    import pickle\n",
	"\n",
	"    if out_fn.endswith('.txt'):\n",
	"        with open(out_fn, 'w') as fz:\n",
	"            fz.writelines(f'{row}\\n' for row in x)\n",
	"\n",
	"    elif out_fn.endswith('.csv'):\n",
	"        np.savetxt(out_fn, x, delimiter=\",\")\n",
	"\n",
	"    elif out_fn.endswith('.pck'):\n",
	"        # np.save would append '.npy' and write numpy's format, so a '.pck'\n",
	"        # request is pickled explicitly to land at the requested path\n",
	"        with open(out_fn, 'wb') as fz:\n",
	"            pickle.dump(x, fz)\n",
	"\n",
	"    else:\n",
	"        if not out_fn.endswith('.npy'):\n",
	"            warnings.warn(f'Extension of {out_fn} not recognized, using np.save to save as .npy')\n",
	"\n",
	"        np.save(out_fn, x)\n",
	" \n",
	"\n",
	"def predict_hmm(model, Y, out_fn = None) -> np.ndarray:\n",
	"    \"\"\"\n",
	"    Predict with a trained model, optionally saving the result\n",
	"\n",
	"    Args:\n",
	"        model: object with a ``predict`` method (e.g. the model returned by train_hmm)\n",
	"        Y: data passed to ``model.predict``\n",
	"        out_fn (str): if given, the prediction is also written there with save_array\n",
	"\n",
	"    Returns:\n",
	"        whatever ``model.predict(Y)`` returns\n",
	"    \"\"\"\n",
	"    z = model.predict(Y)\n",
	"\n",
	"    if out_fn is not None:\n",
	"        print(f'Saving results to {out_fn}')\n",
	"        save_array(z, out_fn)\n",
	"\n",
	"    return z\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Saving results to /Users/jonny/Dropbox/code/kip/z.csv\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"array([0, 0, 0, ..., 0, 0, 0])"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# write the prediction into the same directory as the input data\n",
	"out_fn = os.path.join(base_dir, 'z.csv')\n",
	"\n",
	"# prediction is run on X, the same data the model was trained on\n",
	"predict_hmm(model, X, out_fn=out_fn)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}