Skip to content

Instantly share code, notes, and snippets.

@cstrap
Forked from kidpixo/data-normalisation.ipynb
Created May 15, 2019 10:22
Show Gist options
  • Save cstrap/6edb161015514166af6ef6aca4bef320 to your computer and use it in GitHub Desktop.
Save cstrap/6edb161015514166af6ef6aca4bef320 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-15T08:21:39.632487Z",
"start_time": "2019-05-15T08:21:38.401500Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"# Pandas display options applied for the rest of the notebook.\n",
"opts = {\n",
"    'display.max_columns': None,         # None: no limit on the number of columns shown\n",
"    'display.max_colwidth': 50,          # maximum width (characters) of a column in the repr\n",
"    'display.expand_frame_repr': False,  # don't wrap wide frames across multiple blocks\n",
"    'display.max_rows': 14,              # maximum number of rows pandas should output\n",
"    'display.max_seq_items': 50,         # max length of printed sequence\n",
"    'display.precision': 4,              # floating point output precision\n",
"    'display.show_dimensions': True,     # print the '[rows x columns]' footer\n",
"    'display.width': 200,                # display width in characters; None is only usable in a terminal, not in notebook or qtconsole\n",
"}\n",
"\n",
"# Apply the options with a plain loop: a list comprehension used only for its\n",
"# side effects would build a throwaway list of None values.\n",
"for option_name, option_value in opts.items():\n",
"    pd.set_option(option_name, option_value)\n",
"\n",
"# plotting\n",
"import matplotlib\n",
"from matplotlib import pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"# useful for displaying html, images and so on\n",
"from IPython.display import display\n"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-15T09:02:04.945010Z",
"start_time": "2019-05-15T09:02:04.929769Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>data</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.6611</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.7424</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.8250</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2.6117</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2.6216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.7763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2.8110</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3.1272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4.1058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>4.8813</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.8847</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>5.1660</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>12 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" data\n",
"0 0.6611\n",
"1 1.7424\n",
"3 1.8250\n",
"9 2.6117\n",
"5 2.6216\n",
"4 2.7763\n",
"7 2.8110\n",
"6 3.1272\n",
"10 4.1058\n",
"8 4.8813\n",
"2 4.8847\n",
"11 5.1660\n",
"\n",
"[12 rows x 1 columns]"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Twelve raw sample values. Their position in this list becomes the row label\n",
"# of the frame built below.\n",
"data = [\n",
"    0.6610893966935933,\n",
"    1.7423882770775445,\n",
"    4.884723708070242,\n",
"    1.8249901622170461,\n",
"    2.7763139747735814,\n",
"    2.621601355004263,\n",
"    3.127249155066913,\n",
"    2.810962943840614,\n",
"    4.881331167897244,\n",
"    2.611656683144541,\n",
"    4.105819626936935,\n",
"    5.165968850726315,\n",
"]\n",
"\n",
"# Single-column frame ordered by ascending value; the original row labels are kept.\n",
"df = pd.DataFrame({'data': data}).sort_values(by='data')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-15T09:02:06.709649Z",
"start_time": "2019-05-15T09:02:06.705908Z"
}
},
"outputs": [],
"source": [
"from sklearn import preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-15T09:04:05.849513Z",
"start_time": "2019-05-15T09:04:05.564313Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5.675149533470947 2.5077768681686736\n",
"{0: 0.0, 1: 0.08333333333333333, 2: 0.0, 3: 0.16666666666666666, 4: 0.16666666666666666, 5: 0.25, 6: 0.0, 7: 0.08333333333333333, 8: 0.08333333333333333, 9: 0.16666666666666666}\n"
]
},
{
"data": {
"text/plain": [
"count 12.0000\n",
"mean 5.6751\n",
"std 2.6193\n",
"min 1.1259\n",
"25% 4.3958\n",
"50% 5.1018\n",
"75% 7.9096\n",
"max 9.5247\n",
"Name: scaled, Length: 8, dtype: float64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAD8CAYAAABkbJM/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAE9xJREFUeJzt3X+sX3d93/HnC7sWjReWrTE/5h+1Sy1Sq8LDuwTWZFShBMVkxY26DVMGGi31rMWlbKtWF1WsUjWJSKylqCmul2UbtJlVIEZeY+KEbmr+oBTfQJTEIe6ujIcvBsVJWcOv4Ti898f3GH25+dr3c6997vfr+PmQrnzO53ze5/u+X+y8OOd7vuekqpAkaT4vGHcDkqRLg4EhSWpiYEiSmhgYkqQmBoYkqYmBIUlqYmBIkpoYGJKkJgaGJKnJ8nE3cDFdffXVtX79+nG3IUmXjAcffPDJqlrVMvd5FRjr169nenp63G1I0iUjyf9pnespKUlSEwNDktTEwJAkNek1MJLclORokpkku88z79VJnk3yTxZaK0laGr0FRpJlwO3AVmAT8NYkm84x7zbg0EJrJUlLp88jjGuBmao6VlWngX3AthHzfgX4BPDEImolSUukz8BYDZwYWp/txr4vyWrgFmDPQmslSUurz8DIiLG5z4P9IPDrVfXsImoHE5MdSaaTTJ86dWoRbUqSWvT5xb1ZYO3Q+hrg5Jw5U8C+JABXA29KcqaxFoCq2gvsBZiamvIB5ZLUkz4D4zCwMckG4CvAduAXhidU1Yazy0n+K/CnVfXJJMvnq5UWY/3ue8byusfff/NYXle6mHoLjKo6k2QXg6uflgF3VtWRJDu77XM/t5i3tq9eJUnz6/VeUlV1EDg4Z2xkUFTVv5ivVpI0Pn7TW5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ16TUwktyU5GiSmSS7R2zfluThJA8lmU5y/dC240keObutzz4lSfPr7ZneSZYBtwM3ArPA4SQHquqxoWl/BhyoqkrySuBPgGuGtt9QVU/21aMkqV2fRxjXAjNVdayqTgP7gG3DE6rqm1VV3epKoJAkTaQ+A2M1cGJofbYb+wFJbknyOHAP8ItDmwq4L8mDSXac60WS7OhOZ02fOnXqIrUuSZqrz8DIiLHnHEFU1f6qugb4OeC3hzZdV1VbgK3ArUleN+pFqmpvVU1V1dSqVasuRt+SpBH6DIxZYO3Q+hrg5LkmV9UDwMuTXN2tn+z+fALYz+AUlyRpTPoMjMPAxiQbkqwAtgMHhick+fEk6Za3ACuAp5KsTHJlN74SeCPwaI+9SpLm0dtVUlV1Jsku4BCwDLizqo4k2dlt3wP8PPCOJM8A3wHe0l0x9RJgf5cly4G7qurevnqVJM2vt8AAqKqDwME5Y3uGlm8DbhtRdwzY3GdvkqSF8ZvekqQmBoYkqYmBIUlqYmBIkpoYGJKkJgaGJKmJgSFJamJgSJKaGBiSpCYGhiSpiYEhSWpiYEiSmhgYkqQmBoYkqYmBIUlqYmBIkpoYGJKkJr0GRpKbkhxNMpNk94jt25I8nOShJNNJrm+tlSQtrd4CI8ky4HZgK7AJeGuSTXOm/Rmwuar+PvCLwB0LqJUkLaE+jzCuBWaq6lhVnQb2AduGJ1TVN6uqutWVQLXWSpKWVp+BsRo4MbQ+2439gCS3JHkcuIfBUUZzbVe/ozudNX3q1KmL0rgk6bmW97jvjBir5wxU7Qf2J3kd8NvAG1pru/q9wF6AqampkXMkaSms333PWF73+PtvXpLX6fMIYxZYO7S+Bjh5rslV9QDw8iRXL7RWktS/PgPjMLAxyYYkK4DtwIHhCUl+PEm65S3ACuCpllpJ0tLq7ZRUVZ1Jsgs4BCwD7qyqI0
l2dtv3AD8PvCPJM8B3gLd0H4KPrO2rV0nS/Pr8DIOqOggcnDO2Z2j5NuC21lpJ0vj4TW9JUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVKTXgMjyU1JjiaZSbJ7xPa3JXm4+/lMks1D244neSTJQ0mm++xTkjS/pke0JvnJqnp0ITtOsgy4HbgRmAUOJzlQVY8NTfsS8NNV9fUkW4G9wGuGtt9QVU8u5HUlSf1oPcLYk+RzSf5Vkqsaa64FZqrqWFWdBvYB24YnVNVnqurr3epngTWN+5YkLbGmwKiq64G3AWuB6SR3JblxnrLVwImh9dlu7Fx+CfjU8MsC9yV5MMmOcxUl2ZFkOsn0qVOn5mlJkrRYTaekAKrqfyf5TWAa+BDwqiQB3ltVd48oyajdjNp3khsYBMb1Q8PXVdXJJC8G7k/yeFU9MKKvvQxOZTE1NTVy/5KkC9d0hJHklUl+F/gi8HrgZ6vqJ7rl3z1H2SyDI5Kz1gAnR+0buAPYVlVPnR2vqpPdn08A+xmc4pIkjUnrZxi/D3we2FxVt1bV5+H7/1H/zXPUHAY2JtmQZAWwHTgwPCHJOuBu4O1V9VdD4yuTXHl2GXgjsKAP3SVJF1frKak3Ad+pqmcBkrwAeGFVfbuqPjqqoKrOJNkFHAKWAXdW1ZEkO7vte4D3AT8C/MHg7BZnqmoKeAmwvxtbDtxVVfcu9peUJF241sD4NPAG4Jvd+hXAfcBPna+oqg4CB+eM7RlafhfwrhF1x4DNc8clSePTekrqhVV1Nizolq/opyVJ0iRqDYxvJdlydiXJPwC+009LkqRJ1HpK6j3Ax5KcvcrpZcBb+mlJkjSJmgKjqg4nuQZ4BYPvVzxeVc/02pkkaaI0f3EPeDWwvqt5VRKq6iO9dCVJmjitNx/8KPBy4CHg2W64AANDki4TrUcYU8CmqvLWG5J0mWq9SupR4KV9NiJJmmytRxhXA48l+Rzw3bODVfXmXrqSJE2c1sD4rT6bkCRNvtbLav88yY8CG6vq00muYHB/KEnSZaL19ua/DHwc+MNuaDXwyb6akiRNntYPvW8FrgOehsHDlIAX99WUJGnytAbGd7vncgOQZDnneHqeJOn5qTUw/jzJe4Ef7p7l/THgf/TXliRp0rQGxm7gFPAI8C8ZPOPiXE/akyQ9D7VeJfU94D91P5Kky1DrVVJfSnJs7k9D3U1JjiaZSbJ7xPa3JXm4+/lMks2ttZKkpbWQe0md9ULgnwJ/93wFSZYBtwM3ArPA4SQHquqxoWlfAn66qr6eZCuwF3hNY60kaQk1HWFU1VNDP1+pqg8Cr5+n7FpgpqqOdVdY7QO2zdnvZ6rq693qZ4E1rbWSpKXVenvzLUOrL2BwxHHlPGWrgRND67PAa84z/5eATy20NskOYAfAunXr5mlJkrRYraek/uPQ8hngOPDP5qnJiLGR391IcgODwLh+obVVtZfBqSympqb8bsgCrN99z1he9/j7bx7L60q6MK1XSd2wiH3PAmuH1tcAJ+dOSvJK4A5ga1U9tZBaSdLSaT0l9W/Ot72qfmfE8GFgY5INwFeA7cAvzNnvOuBu4O1V9VcLqZUkLa2FXCX1auBAt/6zwAP84OcMP6CqziTZBRxicGfbO6vqSJKd3fY9wPuAHwH+IAnAmaqaOlftgn87SdJFs5AHKG2pqm8AJPkt4GNV9a7zFVXVQQbfCh8e2zO0/C5g5D5G1UqSxqf11iDrgNND66eB9Re9G0nSxGo9wvgo8Lkk+xlcrXQL8JHeupIkTZzWq6T+Q5JPAf+oG3pnVX2hv7YkSZOm9ZQUwBXA01X1e8BsdwWTJOky0XrzwX8P/DrwG93QDwF/1FdTkqTJ03qEcQvwZuBbAFV1kvlvDSJJeh5pDYzTVVV0t+dIsrK/liRJk6g1MP4kyR8CVyX5ZeDT+D
AlSbqstF4l9YHuWd5PA68A3ldV9/famSRposwbGN3DjA5V1RsAQ0KSLlPznpKqqmeBbyf520vQjyRpQrV+0/v/AY8kuZ/uSimAqnp3L11JkiZOa2Dc0/1Iki5T5w2MJOuq6stV9d+WqiFJ0mSa7zOMT55dSPKJnnuRJE2w+QJj+NnaP9ZnI5KkyTZfYNQ5liVJl5n5AmNzkqeTfAN4Zbf8dJJvJHl6vp0nuSnJ0SQzSXaP2H5Nkr9I8t0kvzZn2/EkjyR5KMn0wn4tSdLFdt4Pvatq2WJ33H3h73bgRmAWOJzkQFU9NjTtr4F3Az93jt3cUFVPLrYHSdLFs5DnYSzUtcBMVR2rqtPAPmDb8ISqeqKqDgPP9NiHJOki6DMwVgMnhtZnu7FWBdyX5MEkOy5qZ5KkBWv94t5iZMTYQj44v66qTiZ5MXB/kser6oHnvMggTHYArFu3bnGdSpLm1ecRxiywdmh9DXCytbh7SBNV9QSwn8EprlHz9lbVVFVNrVq16gLalSSdT5+BcRjYmGRDkhXAduBAS2GSlUmuPLsMvBF4tLdOJUnz6u2UVFWdSbILOAQsA+6sqiNJdnbb9yR5KTANvAj4XpL3AJuAq4H9Sc72eFdV3dtXr5Kk+fX5GQZVdRA4OGdsz9Dy1xicqprraWBzn71Jkhamz1NSkqTnEQNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUpNeAyPJTUmOJplJsnvE9muS/EWS7yb5tYXUSpKWVm+BkWQZcDuwlcFzut+aZNOcaX8NvBv4wCJqJUlLqM8jjGuBmao6VlWngX3AtuEJVfVEVR0GnllorSRpafUZGKuBE0Prs91Y37WSpB4s73HfGTFWF7s2yQ5gB8C6desad/9c63ffs+jaC3H8/TeP5XW1tC7Hv1+X4+/8fNfnEcYssHZofQ1w8mLXVtXeqpqqqqlVq1YtqlFJ0vz6DIzDwMYkG5KsALYDB5agVpLUg95OSVXVmSS7gEPAMuDOqjqSZGe3fU+SlwLTwIuA7yV5D7Cpqp4eVdtXr5Kk+fX5GQZVdRA4OGdsz9Dy1xicbmqqlSSNj9/0liQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNek1MJLclORokpkku0dsT5IPddsfTrJlaNvxJI8keSjJdJ99SpLm19sjWpMsA24HbgRmgcNJDlTVY0PTtgIbu5/XAB/u/jzrhqp6sq8eJUnt+jzCuBaYqapjVXUa2AdsmzNnG/CRGvgscFWSl/XYkyRpkfoMjNXAiaH12W6sdU4B9yV5MMmO3rqUJDXp7ZQUkBFjtYA511XVySQvBu5P8nhVPfCcFxmEyQ6AdevWXUi/kqTz6PMIYxZYO7S+BjjZOqeqzv75BLCfwSmu56iqvVU1VVVTq1atukitS5Lm6jMwDgMbk2xIsgLYDhyYM+cA8I7uaqnXAn9TVV9NsjLJlQBJVgJvBB7tsVdJ0jx6OyVVVWeS7AIOAcuAO6vqSJKd3fY9wEHgTcAM8G3gnV35S4D9Sc72eFdV3dtXr5Kk+fX5GQZVdZBBKAyP7RlaLuDWEXXHgM199iZJWhi/6S1JamJgSJKaGBiSpCYGhiSpiYEhSWpiYEiSmhgYkqQmBoYkqYmBIUlqYmBIkpoYGJKkJgaGJKmJgSFJamJgSJKaGBiSpCYGhiSpiYEhSWrSa2AkuSnJ0SQzSXaP2J4kH+q2P5xkS2utJGlp9RYYSZYBtwNbgU3AW5NsmjNtK7Cx+9kBfHgBtZKkJdTnEca1wExVHauq08A+YNucOduAj9TAZ4GrkryssVaStIT6DIzVwImh9dlurGVOS60kaQkt73HfGTFWjXNaagc7SHYwOJ0F8M
0kR5s7nAC5DYCrgSfH28nS6X7nhbqs3qNFGPn+LPK9vqSd53d+3v4dusD/nX+0dWKfgTELrB1aXwOcbJyzoqEWgKraC+y90GbHKcl0VU2Nu49J5nt0fr4/8/M9unB9npI6DGxMsiHJCmA7cGDOnAPAO7qrpV4L/E1VfbWxVpK0hHo7wqiqM0l2AYeAZcCdVXUkyc5u+x7gIPAmYAb4NvDO89X21askaX6pGvnRgJZQkh3dqTWdg+/R+fn+zM/36MIZGJKkJt4aRJLUxMAYoyRrk/yvJF9MciTJr467p0mUZFmSLyT503H3MomSXJXk40ke7/4u/cNx9zRJkvzr7t/Xo0n+e5IXjrunS5WBMV5ngH9bVT8BvBa41VugjPSrwBfH3cQE+z3g3qq6BtiM79X3JVkNvBuYqqqfZHARzfbxdnXpMjDGqKq+WlWf75a/weAfut9oH5JkDXAzcMe4e5lESV4EvA74zwBVdbqq/u94u5o4y4EfTrIcuIJzfKdL8zMwJkSS9cCrgL8cbycT54PAvwO+N+5GJtSPAaeA/9KdtrsjycpxNzUpquorwAeALwNfZfBdr/vG29Wly8CYAEn+FvAJ4D1V9fS4+5kUSf4x8ERVPTjuXibYcmAL8OGqehXwLcDHAXSS/B0GNy7dAPw9YGWSfz7eri5dBsaYJfkhBmHxx1V197j7mTDXAW9OcpzBHYtfn+SPxtvSxJkFZqvq7JHpxxkEiAbeAHypqk5V1TPA3cBPjbmnS5aBMUZJwuDc8xer6nfG3c+kqarfqKo1VbWewQeV/7Oq/H+HQ6rqa8CJJK/ohn4GeGyMLU2aLwOvTXJF9+/tZ/CigEXr8+aDmt91wNuBR5I81I29t6oOjrEnXXp+Bfjj7r5rx+husSOoqr9M8nHg8wyuSvwCl/jNSsfJb3pLkpp4SkqS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUpP/D5m9RfAg4MsNAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"scaler = preprocessing.StandardScaler()\n",
"# Target spread and centre: the raw sample statistics multiplied by fixed factors.\n",
"# NOTE(review): the notebook does not record where 1.785 and 1.83 come from - confirm.\n",
"std = df.data.std()*1.785\n",
"mean = df.data.mean()*1.83\n",
"\n",
"# Fit on the 'data' column only. Passing the whole frame works on the first run,\n",
"# but once 'scaled' exists a re-run would standardise both columns and the\n",
"# two-column result could no longer be assigned to the single 'scaled' column.\n",
"# NOTE: StandardScaler divides by the population std (ddof=0) while Series.std()\n",
"# uses ddof=1, so describe() below reports a std slightly larger than `std`.\n",
"standardised = scaler.fit_transform(df[['data']]).ravel()\n",
"df['scaled'] = standardised*std+mean\n",
"\n",
"print(mean,std)\n",
"# Unit-width bins with edges 0..10; np.histogram ignores values outside that range.\n",
"freq, bins = np.histogram(df['scaled'], bins=range(11),density=True)\n",
"\n",
"# zip stops at the shorter sequence, so each key is the left edge of its bin.\n",
"print(dict(zip(bins,freq)))\n",
"display(df.scaled.describe())\n",
"# This plot uses 10 equal-width bins over the data range, not the 0..10 edges above.\n",
"ax = df['scaled'].plot(kind='hist',bins=10,density=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment