-
-
Save joferkington/d95101a61a02e0ba63e5 to your computer and use it in GitHub Desktop.
import numpy as np | |
import scipy.sparse | |
import scipy.ndimage | |
import scipy.stats | |
import scipy.signal | |
import matplotlib.pyplot as plt | |
def main():
    """Demo: run the fast KDE on a large random dataset and show two plots."""
    xs, ys = generate_data(int(1e7))
    kde_grid, kde_extents, point_density = fast_kde(xs, ys, sample=True)
    image_example(kde_grid, kde_extents)
    scatter_example(xs, ys, point_density)
    plt.show()
def generate_data(num):
    """Return *num* random (x, y) samples: a parabolic trend plus squared noise."""
    xs = np.random.random(num) * 10
    noise = np.random.normal(0, 5, num)
    return xs, xs ** 2 + noise ** 2
def image_example(grid, extents):
    """Display the KDE grid as an image, with a colorbar for the density scale."""
    fig, ax = plt.subplots()
    artist = ax.imshow(grid, cmap='gist_earth_r', origin='lower',
                       extent=extents, aspect='auto')
    fig.colorbar(artist)
def scatter_example(x, y, density, num_points=10000):
    """Scatter-plot a subsample of (x, y), colored by the estimated density.

    Points are drawn with probability inversely proportional to their
    estimated density, so sparse regions remain visible in the subsample.

    Input:
        x, y: array-like
            The full set of data coordinates.
        density: array-like
            Estimated density at each (x_i, y_i), same size as x & y.
        num_points: int, optional
            Maximum number of points to draw. Defaults to 10000.
    """
    # Randomly draw a subset based on the _inverse_ of the estimated density
    prob = 1.0 / density
    prob /= prob.sum()
    # np.random.choice with replace=False raises if the requested sample is
    # larger than the population, so clamp to the number of available points.
    num_points = min(num_points, x.size)
    subset = np.random.choice(np.arange(x.size), num_points, False, prob)
    x, y, density = x[subset], y[subset], density[subset]

    fig, ax = plt.subplots()
    ax.scatter(x, y, c=density, cmap='gist_earth_r')
    ax.axis('tight')
def fast_kde(x, y, gridsize=(400, 400), extents=None, weights=None,
             sample=False):
    """
    Performs a gaussian kernel density estimate over a regular grid using a
    convolution of the gaussian kernel with a 2D histogram of the data.

    This function is typically several orders of magnitude faster than
    scipy.stats.kde.gaussian_kde for large (>1e7) numbers of points and
    produces an essentially identical result.

    Input:
        x: array-like
            The x-coords of the input data points
        y: array-like
            The y-coords of the input data points
        gridsize: tuple, optional
            An (nx, ny) tuple of the size of the output
            grid. Defaults to (400, 400).
        extents: tuple, optional
            A (xmin, xmax, ymin, ymax) tuple of the extents of output grid.
            Defaults to min/max of x & y input.
        weights: array-like or None, optional
            An array of the same shape as x & y that weighs each sample (x_i,
            y_i) by each value in weights (w_i). Defaults to an array of ones
            the same size as x & y.
        sample: boolean
            Whether or not to return the estimated density at each location.
            Defaults to False

    Output:
        density : 2D array of shape *gridsize*
            The estimated probability distribution function on a regular grid
        extents : tuple
            xmin, xmax, ymin, ymax
        sampled_density : 1D array of len(*x*)
            Only returned if *sample* is True. The estimated density at each
            point.
    """
    #---- Setup --------------------------------------------------------------
    # Coerce to flat float arrays up front: integer input would otherwise
    # break the in-place pixel-coordinate arithmetic (ij -= / ij /=) below.
    x = np.asarray(x, dtype=float).reshape(-1)
    y = np.asarray(y, dtype=float).reshape(-1)
    if x.size != y.size:
        raise ValueError('Input x & y arrays must be the same size!')

    nx, ny = gridsize
    n = x.size

    if weights is None:
        # Default: Weight all points equally
        weights = np.ones(n)
    else:
        weights = np.squeeze(np.asarray(weights))
        if weights.size != x.size:
            raise ValueError('Input weights must be an array of the same size'
                             ' as input x & y arrays!')

    # Default extents are the extent of the data
    if extents is None:
        xmin, xmax = x.min(), x.max()
        ymin, ymax = y.min(), y.max()
    else:
        xmin, xmax, ymin, ymax = map(float, extents)
    extents = xmin, xmax, ymin, ymax

    dx = (xmax - xmin) / (nx - 1)
    dy = (ymax - ymin) / (ny - 1)

    #---- Preliminary Calculations -------------------------------------------
    # Most of this is a hack to re-implement np.histogram2d using `coo_matrix`
    # for better memory/speed performance with huge numbers of points.

    # First convert x & y over to pixel coordinates
    # (Avoiding np.digitize due to excessive memory usage!)
    ij = np.column_stack((y, x))
    ij -= [ymin, xmin]
    ij /= [dy, dx]
    ij = np.floor(ij, ij).T

    # Next, make a 2D histogram of x & y.
    # Avoiding np.histogram2d due to excessive memory usage with many points.
    # coo_matrix sums duplicate (row, col) entries, which is exactly binning.
    grid = scipy.sparse.coo_matrix((weights, ij), shape=(ny, nx)).toarray()

    # Calculate the covariance matrix (in pixel coords)
    cov = image_cov(grid)

    # Scaling factor for bandwidth (Scott's rule for 2D data)
    scotts_factor = np.power(n, -1.0 / 6)

    #---- Make the gaussian kernel -------------------------------------------
    # First, determine how big the kernel needs to be.  Clamp each dimension
    # to >= 1 so a near-degenerate distribution can't round down to an empty
    # (zero-size) kernel and break the convolution below.
    std_devs = np.diag(np.sqrt(cov))
    kern_nx, kern_ny = (max(int(size), 1) for size in
                        np.round(scotts_factor * 2 * np.pi * std_devs))

    # Determine the bandwidth to use for the gaussian kernel
    inv_cov = np.linalg.inv(cov * scotts_factor**2)

    # x & y (pixel) coords of the kernel grid, with <x,y> = <0,0> in center.
    # (The builtin float replaces np.float, which was removed from numpy.)
    xx = np.arange(kern_nx, dtype=float) - kern_nx / 2.0
    yy = np.arange(kern_ny, dtype=float) - kern_ny / 2.0
    xx, yy = np.meshgrid(xx, yy)

    # Then evaluate the gaussian function on the kernel grid
    kernel = np.vstack((xx.flatten(), yy.flatten()))
    kernel = np.dot(inv_cov, kernel) * kernel
    kernel = np.sum(kernel, axis=0) / 2.0
    kernel = np.exp(-kernel)
    kernel = kernel.reshape((kern_ny, kern_nx))

    #---- Produce the kernel density estimate --------------------------------
    # Convolve the gaussian kernel with the 2D histogram, producing a gaussian
    # kernel density estimate on a regular grid.
    # Big kernel, use fft...  (np.prod replaces np.product, removed in
    # numpy 2.0)
    if kern_nx * kern_ny > np.prod(gridsize) / 4.0:
        grid = scipy.signal.fftconvolve(grid, kernel, mode='same')
    # Small kernel, use ndimage
    else:
        grid = scipy.ndimage.convolve(grid, kernel, mode='constant', cval=0)

    # Normalization factor to divide result by so that units are in the same
    # units as scipy.stats.kde.gaussian_kde's output.
    norm_factor = 2 * np.pi * cov * scotts_factor**2
    norm_factor = np.linalg.det(norm_factor)
    norm_factor = n * dx * dy * np.sqrt(norm_factor)

    # Normalize the result
    grid /= norm_factor

    if sample:
        # ij already holds floored pixel indices; look up each point's density
        i, j = ij.astype(int)
        return grid, extents, grid[i, j]
    else:
        return grid, extents
def image_cov(data):
    """Efficiently calculate the 2x2 covariance matrix of an image's mass."""
    nrows, ncols = data.shape
    grid_y, grid_x = np.mgrid[:nrows, :ncols]

    def moment(px, py):
        # Raw image moment of order (px, py) about the origin.
        return (data * grid_x**px * grid_y**py).sum()

    total = data.sum()
    m10, m01 = moment(1, 0), moment(0, 1)
    x_mean, y_mean = m10 / total, m01 / total
    # Central second moments via the raw-moment identity u_pq = m_pq - mean*m.
    var_x = (moment(2, 0) - x_mean * m10) / total
    var_y = (moment(0, 2) - y_mean * m01) / total
    cov_xy = (moment(1, 1) - x_mean * m01) / total
    return np.array([[var_x, cov_xy], [cov_xy, var_y]])
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
@MikeDacre - Sorry I didn't notice your comment earlier!
In that case, there's the correct way and then there's a shortcut.
The shortcut is to just plot the log of the values. If you're not displaying a colorbar, that's the route you'll want to take. For example, plot ax.scatter(x, y, c=np.log(density), cmap='gist_earth_r')
and you should get something that looks similar to the R plot.
If you want to display a colorbar alongside it and still have the correct values, you'd want to change the Normalize
type used for the color plotting. Something like ax.scatter(x, y, c=density, cmap='gist_earth_r', norm=LogNorm(density.min(), density.max()))
.
Hi @joferkington , my query is not related to the above case — apologies. I have computed the KDE values on grid points. Now I want to plot the 1-sigma and 2-sigma contours based on those KDE values, whose distribution does not necessarily come from a bivariate normal distribution. In other words, the contours may not be perfect ellipses. How should I go about that?
This is amazing! Thanks for sharing.
Do you have an idea of how to make this code show a more smooth plot even when the vast majority of the points are in a small area?
Specifically, I have some R code that uses densCols ([cisVar](https://github.com/TheFraserLab/cisVar/blob/31e5c8c92643c6ccc7efd0ac6d5b50e1cef252c5/scripts/plot_fun.R)) to plot some frequency data and produces this:
Whereas when I use this fast kde method, I end up with this:
I would really like to be able to adapt this code to do more general density plots even in highly asymmetric data but I am not sure how to do that. Maybe by adjusting the weights or the cmap?