import numpy as np def hist_pdf(x, data, n_bins=10, minv=None, maxv=None): """Histogram density estimator [minv, minv+delta, , minv+delta*2, ..., 1] for any x in B_l=[minv+delta*j, minv+delta*(j+1)] density is estimated in the following way p(x) = p(x | x \in B_l) * p(x \in B_l) = (1/delta) * (\sum_{x_j}{I(x_j \in B_l)} / N) where N - number of points in dataset See lecture notes for details: http://faculty.washington.edu/yenchic/18W_425/Lec6_hist_KDE.pdf Parameters ---------- x: float point to estimate density at data: numpy array data points used to construct the density n_bins: int number of bins minv: float or None minimum value of the domain. If None, estimated from data maxv: float or None maximum value of the domain. If None, estimated from data Returns ------- pdf: float computed density at point x given data """ if minv is None: minv = np.min(data) if maxv is None: maxv = np.max(data) delta = (maxv-minv) / n_bins bins = np.arange(minv, maxv, delta) bin_id = int((x-minv)/delta) bin_minv = minv+delta*bin_id bin_maxv = minv+delta*(bin_id+1) n_data = len(data) n_data_in_bin = len(data[np.where((data>bin_minv) & (data<bin_maxv))]) pdf = (1.0/delta) * (n_data_in_bin / n_data) return pdf # Demo from sklearn.datasets import load_boston import matplotlib.pyplot as plt ds = load_boston() data = ds['target'] xvals = np.arange(min(data), max(data), 1) n_bins=15 pdf = [hist_pdf(x, data, n_bins=n_bins) for x in xvals] plt.plot(xvals, pdf)