Last active
September 23, 2024 11:18
-
-
Save joferkington/d95101a61a02e0ba63e5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.sparse | |
import scipy.ndimage | |
import scipy.stats | |
import scipy.signal | |
import matplotlib.pyplot as plt | |
def main(): | |
x, y = generate_data(int(1e7)) | |
grid, extents, density = fast_kde(x, y, sample=True) | |
image_example(grid, extents) | |
scatter_example(x, y, density) | |
plt.show() | |
def generate_data(num): | |
x = 10 * np.random.random(num) | |
y = x**2 + np.random.normal(0, 5, num)**2 | |
return x, y | |
def image_example(grid, extents): | |
fig, ax = plt.subplots() | |
im = ax.imshow(grid, origin='lower', extent=extents, aspect='auto', | |
cmap='gist_earth_r') | |
fig.colorbar(im) | |
def scatter_example(x, y, density, num_points=10000): | |
# Randomly draw a subset based on the _inverse_ of the estimated density | |
prob = 1.0 / density | |
prob /= prob.sum() | |
subset = np.random.choice(np.arange(x.size), num_points, False, prob) | |
x, y, density = x[subset], y[subset], density[subset] | |
fig, ax = plt.subplots() | |
ax.scatter(x, y, c=density, cmap='gist_earth_r') | |
ax.axis('tight') | |
def fast_kde(x, y, gridsize=(400, 400), extents=None, weights=None, | |
sample=False): | |
""" | |
Performs a gaussian kernel density estimate over a regular grid using a | |
convolution of the gaussian kernel with a 2D histogram of the data. | |
This function is typically several orders of magnitude faster than | |
scipy.stats.kde.gaussian_kde for large (>1e7) numbers of points and | |
produces an essentially identical result. | |
Input: | |
x: array-like | |
The x-coords of the input data points | |
y: array-like | |
The y-coords of the input data points | |
gridsize: tuple, optional | |
An (nx,ny) tuple of the size of the output | |
grid. Defaults to (400, 400). | |
extents: tuple, optional | |
A (xmin, xmax, ymin, ymax) tuple of the extents of output grid. | |
Defaults to min/max of x & y input. | |
weights: array-like or None, optional | |
An array of the same shape as x & y that weighs each sample (x_i, | |
y_i) by each value in weights (w_i). Defaults to an array of ones | |
the same size as x & y. | |
sample: boolean | |
Whether or not to return the estimated density at each location. | |
Defaults to False | |
Output: | |
density : 2D array of shape *gridsize* | |
The estimated probability distribution function on a regular grid | |
extents : tuple | |
xmin, xmax, ymin, ymax | |
sampled_density : 1D array of len(*x*) | |
Only returned if *sample* is True. The estimated density at each | |
point. | |
""" | |
#---- Setup -------------------------------------------------------------- | |
x, y = np.atleast_1d([x, y]) | |
x, y = x.reshape(-1), y.reshape(-1) | |
if x.size != y.size: | |
raise ValueError('Input x & y arrays must be the same size!') | |
nx, ny = gridsize | |
n = x.size | |
if weights is None: | |
# Default: Weight all points equally | |
weights = np.ones(n) | |
else: | |
weights = np.squeeze(np.asarray(weights)) | |
if weights.size != x.size: | |
raise ValueError('Input weights must be an array of the same size' | |
' as input x & y arrays!') | |
# Default extents are the extent of the data | |
if extents is None: | |
xmin, xmax = x.min(), x.max() | |
ymin, ymax = y.min(), y.max() | |
else: | |
xmin, xmax, ymin, ymax = map(float, extents) | |
extents = xmin, xmax, ymin, ymax | |
dx = (xmax - xmin) / (nx - 1) | |
dy = (ymax - ymin) / (ny - 1) | |
#---- Preliminary Calculations ------------------------------------------- | |
# Most of this is a hack to re-implment np.histogram2d using `coo_matrix` | |
# for better memory/speed performance with huge numbers of points. | |
# First convert x & y over to pixel coordinates | |
# (Avoiding np.digitize due to excessive memory usage!) | |
ij = np.column_stack((y, x)) | |
ij -= [ymin, xmin] | |
ij /= [dy, dx] | |
ij = np.floor(ij, ij).T | |
# Next, make a 2D histogram of x & y | |
# Avoiding np.histogram2d due to excessive memory usage with many points | |
grid = scipy.sparse.coo_matrix((weights, ij), shape=(ny, nx)).toarray() | |
# Calculate the covariance matrix (in pixel coords) | |
cov = image_cov(grid) | |
# Scaling factor for bandwidth | |
scotts_factor = np.power(n, -1.0 / 6) # For 2D | |
#---- Make the gaussian kernel ------------------------------------------- | |
# First, determine how big the kernel needs to be | |
std_devs = np.diag(np.sqrt(cov)) | |
kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs) | |
kern_nx, kern_ny = int(kern_nx), int(kern_ny) | |
# Determine the bandwidth to use for the gaussian kernel | |
inv_cov = np.linalg.inv(cov * scotts_factor**2) | |
# x & y (pixel) coords of the kernel grid, with <x,y> = <0,0> in center | |
xx = np.arange(kern_nx, dtype=np.float) - kern_nx / 2.0 | |
yy = np.arange(kern_ny, dtype=np.float) - kern_ny / 2.0 | |
xx, yy = np.meshgrid(xx, yy) | |
# Then evaluate the gaussian function on the kernel grid | |
kernel = np.vstack((xx.flatten(), yy.flatten())) | |
kernel = np.dot(inv_cov, kernel) * kernel | |
kernel = np.sum(kernel, axis=0) / 2.0 | |
kernel = np.exp(-kernel) | |
kernel = kernel.reshape((kern_ny, kern_nx)) | |
#---- Produce the kernel density estimate -------------------------------- | |
# Convolve the gaussian kernel with the 2D histogram, producing a gaussian | |
# kernel density estimate on a regular grid | |
# Big kernel, use fft... | |
if kern_nx * kern_ny > np.product(gridsize) / 4.0: | |
grid = scipy.signal.fftconvolve(grid, kernel, mode='same') | |
# Small kernel, use ndimage | |
else: | |
grid = scipy.ndimage.convolve(grid, kernel, mode='constant', cval=0) | |
# Normalization factor to divide result by so that units are in the same | |
# units as scipy.stats.kde.gaussian_kde's output. | |
norm_factor = 2 * np.pi * cov * scotts_factor**2 | |
norm_factor = np.linalg.det(norm_factor) | |
norm_factor = n * dx * dy * np.sqrt(norm_factor) | |
# Normalize the result | |
grid /= norm_factor | |
if sample: | |
i, j = ij.astype(int) | |
return grid, extents, grid[i, j] | |
else: | |
return grid, extents | |
def image_cov(data): | |
"""Efficiently calculate the cov matrix of an image.""" | |
def raw_moment(data, ix, iy, iord, jord): | |
data = data * ix**iord * iy**jord | |
return data.sum() | |
ni, nj = data.shape | |
iy, ix = np.mgrid[:ni, :nj] | |
data_sum = data.sum() | |
m10 = raw_moment(data, ix, iy, 1, 0) | |
m01 = raw_moment(data, ix, iy, 0, 1) | |
x_bar = m10 / data_sum | |
y_bar = m01 / data_sum | |
u11 = (raw_moment(data, ix, iy, 1, 1) - x_bar * m01) / data_sum | |
u20 = (raw_moment(data, ix, iy, 2, 0) - x_bar * m10) / data_sum | |
u02 = (raw_moment(data, ix, iy, 0, 2) - y_bar * m01) / data_sum | |
cov = np.array([[u20, u11], [u11, u02]]) | |
return cov | |
if __name__ == '__main__': | |
main() |
Hi @joferkington , My query is not related to the above case, apologies. I have determined the kde_values on grid points. Now, I want to plot the 1 sigma and 2 sigma contours based on the kde_values, the distribution of which is not necessarily have to be coming from a bivariate normal distribution. Basically, the contours may not be a perfect ellipse. How to go about that?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@MikeDacre - Sorry I didn't notice your comment earlier!
In that case, there's the correct way and then there's a shortcut.
The shortcut is to just plot the log of the values. If you're not displaying a colorbar, that's the route you'll want to take. For example, plot
ax.scatter(x, y, c=np.log(density), cmap='gist_earth_r')
and you should get something that looks similar to the R plot.If you want to display a colorbar alongside it and still have the correct values, you'd want to change the
Normalize
type used for the color plotting. Something likeax.scatter(x, y, c=density, cmap='gist_earth_r', norm=LogNorm(density.min(), density.max()))
.