mickypaganini · December 18, 2015 10:08
diff --git a/non_param_fit.ipynb b/non_param_fit.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def non_param_fit(hist, kernel = 'gaussian', bandwidth = 12):\n",
    "    '''\n",
    "    Function:\n",
    "    ---------\n",
    "        used to calculate a non-parametric curve fit to a histogram via Kernel Density Estimation in scikit-learn \n",
    "        (http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity)\n",
    "    Args:\n",
    "    -----\n",
    "        hist: a list containing all the x-values usually plotted in a histogram we want to fit\n",
    "        kernel: (optional) the type of kernel to use. Options:'gaussian', 'cosine', 'tophat', 'epanechnikov', 'exponential', 'linear'\n",
    "        bandwidth: (optional) width of the kernel used in the fit. If this number is too small, the fit will appear noisy and with a lot of \n",
    "                              bumps due to local behaviors. If this number is too big, the curve will be overly smoothed, and won't capture \n",
    "                              much of the underlying data structure.\n",
    "    Returns:\n",
    "    --------\n",
    "        log_dense: density model of the input data 'hist'; need to take the np.exp(log_dense) to actually plot the fit\n",
    "    '''\n",
    "    from sklearn.neighbors import KernelDensity\n",
    "    \n",
    "    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(hist[:, np.newaxis])\n",
    "    log_dense = kde.score_samples(hist[:, np.newaxis])\n",
    "    return log_dense "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_peak(hist, log_dense):\n",
    "    '''\n",
    "    Function:\n",
    "    ---------\n",
    "        used to find the peak in the fit of a histogram\n",
    "    Args:\n",
    "    -----\n",
    "        hist: a list containing all the x-values usually plotted in a histogram we want to fit and find the peak of\n",
    "        log_dense: density model of the input data 'hist' (the output of the function above)\n",
    "    Returns:\n",
    "    --------\n",
    "        the x location of the peak, according to the fit\n",
    "    '''\n",
    "    return hist[np.argmax(log_dense)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def plotting_example(hist):\n",
    "    '''\n",
    "    quick example on how to use the functions above\n",
    "    '''\n",
    "    plt.hist(hist, histtype = 'stepfilled')                   # plot histogram\n",
    "    ixa = np.argsort(hist[:1000])                             # sort histogram indices and only take first 1000 samples for the fit\n",
    "    log_dense_a = non_param_fit(hist[:1000])                  # calculate log density\n",
    "    plt.plot(hist[:1000][ixa], np.exp(log_dense_a)[ixa], '-') # plot fit \n",
    "    plt.show()\n",
    "    print 'Peak at : {}'.format(get_peak(hist, log_dense_a))\n",
    "    return"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def non_param_fit(hist, kernel = 'gaussian', bandwidth = 12):\n",
	" '''\n",
	" Function:\n",
	" ---------\n",
	" used to calculate a non-parametric curve fit to a histogram via Kernel Density Estimation in scikit-learn \n",
	" (http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html#sklearn.neighbors.KernelDensity)\n",
	" Args:\n",
	" -----\n",
	" hist: a list containing all the x-values usually plotted in a histogram we want to fit\n",
	" kernel: (optional) the type of kernel to use. Options:'gaussian', 'cosine', 'tophat', 'epanechnikov', 'exponential', 'linear'\n",
	" bandwidth: (optional) width of the kernel used in the fit. If this number is too small, the fit will appear noisy and with a lot of \n",
	" bumps due to local behaviors. If this number is too big, the curve will be overly smoothed, and won't capture \n",
	" much of the underlying data structure.\n",
	" Returns:\n",
	" --------\n",
	" log_dense: density model of the input data 'hist'; need to take the np.exp(log_dense) to actually plot the fit\n",
	" '''\n",
	" from sklearn.neighbors import KernelDensity\n",
	" \n",
	" kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(hist[:, np.newaxis])\n",
	" log_dense = kde.score_samples(hist[:, np.newaxis])\n",
	" return log_dense "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def get_peak(hist, log_dense):\n",
	" '''\n",
	" Function:\n",
	" ---------\n",
	" used to find the peak in the fit of a histogram\n",
	" Args:\n",
	" -----\n",
	" hist: a list containing all the x-values usually plotted in a histogram we want to fit and find the peak of\n",
	" log_dense: density model of the input data 'hist' (the output of the function above)\n",
	" Returns:\n",
	" --------\n",
	" the x location of the peak, according to the fit\n",
	" '''\n",
	" return hist[np.argmax(log_dense)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def plotting_example(hist):\n",
	" '''\n",
	" quick example on how to use the functions above\n",
	" '''\n",
	" plt.hist(hist, histtype = 'stepfilled') # plot histogram\n",
	" ixa = np.argsort(hist[:1000]) # sort histogram indices and only take first 1000 samples for the fit\n",
	" log_dense_a = non_param_fit(hist[:1000]) # calculate log density\n",
	" plt.plot(hist[:1000][ixa], np.exp(log_dense_a)[ixa], '-') # plot fit \n",
	" plt.show()\n",
	" print 'Peak at : {}'.format(get_peak(hist, log_dense_a))\n",
	" return"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}