-
-
Save bnordgren/9d49556f7586dd715c61d3f72e17690e to your computer and use it in GitHub Desktop.
Python implementation of zonal statistics function. Optimized for dense polygon layers, uses numpy, GDAL and OGR to rival the speed of starspan.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import pandas as pd | |
import zonal_stats as z | |
def trends(shpfile, years, rasterpattern, histobins=10, historange=None) : | |
rasterfiles = [ glob.glob(rasterpattern.format(y))[0] for y in years ] | |
if historange is None : | |
historange = z.calc_range_many_rasters(shpfile,rasterfiles) | |
trend = {} | |
for i_y in range(len(years)) : | |
stats = z.zonal_histogram(shpfile, rasterfiles[i_y], | |
histobins, historange) | |
trend[years[i_y]] = z.make_histogram_frame(stats) | |
return pd.concat(trend) | |
def scalar_trends(shpfile, years, rasterpattern, lc_pattern, mask_fn, | |
valid_range=None, raster_nodata=None) : | |
rasterfiles = [ glob.glob(rasterpattern.format(y))[0] for y in years ] | |
landcover_files = [ glob.glob(lc_pattern.format(y))[0] for y in years ] | |
trend = {} | |
for i_y in range(len(years)) : | |
stats = z.zonal_stats(shpfile, rasterfiles[i_y], | |
mask_path=landcover_files[i_y], | |
mask_fn=mask_fn,valid_range=valid_range, | |
nodata_value=raster_nodata) | |
trend[years[i_y]] = z.make_stats_frame(stats) | |
return pd.concat(trend) | |
def scalar_trends_bands(shpfile, years, bands, rasterfile, raster_nodata=None) : | |
"""A variant for the case where time is represented by a band of raster information.""" | |
trend = {} | |
for i_y in range(len(years)) : | |
print years[i_y], bands[i_y] | |
stats = z.zonal_stats(shpfile, rasterfile, | |
band=bands[i_y], | |
nodata_value=raster_nodata) | |
trend[years[i_y]] = z.make_stats_frame(stats) | |
return pd.concat(trend) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Zonal Statistics | |
Vector-Raster Analysis | |
Copyright 2013 Matthew Perry | |
Usage: | |
zonal_stats.py VECTOR RASTER | |
zonal_stats.py -h | --help | |
zonal_stats.py --version | |
Options: | |
-h --help Show this screen. | |
--version Show version. | |
""" | |
from osgeo import gdal, ogr | |
from osgeo.gdalconst import * | |
import warnings | |
import numpy as np | |
import numpy.ma as ma | |
import pandas as pd | |
import sys | |
import collections as c | |
from functools import partial | |
gdal.PushErrorHandler('CPLQuietErrorHandler') | |
Offsets = c.namedtuple("Offsets", ['x0','y0','x_size','y_size']) | |
def bbox_to_pixel_offsets(gt, bbox): | |
originX = gt[0] | |
originY = gt[3] | |
pixel_width = gt[1] | |
pixel_height = gt[5] | |
# ensure there are no negative pixel indices | |
x1 = max(0,int((bbox[0] - originX) / pixel_width)) | |
x2 = int((bbox[1] - originX) / pixel_width) # + 1 | |
# ensure there are no negative pixel indices | |
y1 = max(0,int((bbox[3] - originY) / pixel_height)) | |
y2 = int((bbox[2] - originY) / pixel_height) # + 1 | |
xsize = max(1, x2 - x1) | |
ysize = max(1, y2 - y1) | |
return Offsets(x1, y1, xsize, ysize) | |
def offset_polygon(offsets) : | |
x0 = offsets.x0 | |
y0 = offsets.y0 | |
x1 = x0 + offsets.x_size | |
y1 = y0 + offsets.y_size | |
ring = ogr.Geometry(ogr.wkbLinearRing) | |
ring.AddPoint(x0,y0) | |
ring.AddPoint(x1,y0) | |
ring.AddPoint(x1,y1) | |
ring.AddPoint(x0,y1) | |
ring.AddPoint(x0,y0) | |
poly = ogr.Geometry(ogr.wkbPolygon) | |
poly.AddGeometry(ring) | |
return poly | |
def clip_offsets_to_raster(offsets, raster) : | |
rast_offsets = Offsets(0,0,raster.RasterXSize, raster.RasterYSize) | |
o_env = offset_polygon(offsets) | |
r_env = offset_polygon(rast_offsets) | |
clipped_offsets = None | |
clipped_geom = o_env.Intersection(r_env) | |
if clipped_geom is not None : | |
clipped_area = clipped_geom.GetEnvelope() | |
x1 = int(clipped_area[0]) | |
x2 = int(clipped_area[1]) | |
y1 = int(clipped_area[2]) | |
y2 = int(clipped_area[3]) | |
x_size = max(1, x2-x1) | |
y_size = max(1, y2-y1) | |
if x_size > 1 and y_size > 1 : | |
clipped_offsets = Offsets(x1,y1,x_size,y_size) | |
return clipped_offsets | |
def calc_histobins(histobins, historange) : | |
# if histobins is the "int" type, that means to make integer-sized bins | |
# to cover the specified range. | |
if histobins == int : | |
min_left = int(np.floor(historange[0])) | |
max_right = int(np.ceil(historange[1])) | |
histobins = np.linspace(min_left,max_right+1,num=max_right-min_left+2) | |
# otherwise we just return what the user passed in. | |
return histobins | |
def calc_stats(masked) : | |
feature_stats = { | |
'min': float(masked.min()), | |
'mean': float(masked.mean()), | |
'max': float(masked.max()), | |
'std': float(masked.std()), | |
'sum': float(masked.sum()), | |
'count': int(masked.count())} | |
return feature_stats | |
def calc_histogram(histobins, historange, masked) : | |
feature_stats = { | |
'histogram' : np.histogram(masked.compressed(), bins=histobins, | |
range=historange)} | |
return feature_stats | |
def mask_nodata_and_rv(src_array, rv_array, nodata_value, mask_array) : | |
"""Masks any pixels where the src_array is the nodata value, or outside of the | |
rasterized vector. | |
+ src_array: the raster data | |
+ rv_array : boolean raster array converted from shapefile False=outside vector | |
+ nodata_value: nodata value to be applied to the array | |
+ mask_array: ignored""" | |
return np.ma.MaskedArray( | |
src_array, | |
mask=np.logical_or( | |
src_array == nodata_value, | |
np.logical_not(rv_array) | |
) | |
) | |
def mask_nodata_range_and_rv(src_array, rv_array, nodata_value, mask_array, low, high) : | |
"""Masks pixels where: | |
+ src_array is the nodata value | |
+ pixel outside the rasterized vector | |
+ mask_array pixel value is outside the closed interval [low, high]. | |
""" | |
return np.ma.MaskedArray( | |
src_array, | |
mask=np.logical_or( | |
np.logical_or( | |
np.logical_not(rv_array), | |
src_array == nodata_value), | |
np.logical_or( | |
mask_array < low, | |
mask_array > high))) | |
def select_forest(src_array, rv_array, nodata_value, mask_array) : | |
"""Assuming mask_array is a MODIS landcover raster, selects only forest types""" | |
return mask_nodata_range_and_rv(src_array, rv_array, nodata_value, mask_array, 1,5) | |
def select_non_forest(src_array, rv_array, nodata_value, mask_array) : | |
"""Assuming mask_array is a MODIS landcover raster, selects grassland/savannah types""" | |
return mask_nodata_range_and_rv(src_array, rv_array, nodata_value, mask_array, 6, 10) | |
def select_non_ag(src_array, rv_array, nodata_value, mask_array) : | |
"""Assuming mask_array is a MODIS landcover raster, selects forest and grassland/savannah types""" | |
return mask_nodata_range_and_rv(src_array, rv_array, nodata_value, mask_array, 1, 10) | |
def zonal_stats(vector_path, raster_path, mask_path=None, nodata_value=None, | |
global_src_extent=False, only_global=False, | |
calc_fn=calc_stats, mask_fn=mask_nodata_and_rv, | |
valid_range=None, band=1): | |
"""computes zonal statistics for the raster over the vector. | |
Allows user to optionally specify a mask layer, an external mask calculation | |
function, nodata value, whether to perform the stats globally, and whether | |
to load the raster data just once, globally, or once for each feature in the | |
vector file. | |
If the user specifies a mask file, it must be the same dimensions, projection, and | |
coverage as the raster file.""" | |
warnings.simplefilter("error", UserWarning) | |
rds = gdal.Open(raster_path, GA_ReadOnly) | |
assert(rds) | |
rb = rds.GetRasterBand(band) | |
rgt = rds.GetGeoTransform() | |
mask_array = None | |
if mask_path is not None : | |
mds = gdal.Open(mask_path, GA_ReadOnly) | |
assert(mds) | |
mb = mds.GetRasterBand(1) | |
mgt = mds.GetGeoTransform() | |
if not np.allclose(mgt,rgt) : | |
raise ValueError("Raster and Mask must have same resolution and coverage.") | |
rproj=rds.GetProjection() | |
mproj=mds.GetProjection() | |
if rproj != mproj : | |
raise ValueError("Raster and Mask must have same projection.") | |
if nodata_value: | |
nodata_value = float(nodata_value) | |
rb.SetNoDataValue(nodata_value) | |
vds = ogr.Open(vector_path, GA_ReadOnly) # TODO maybe open update if we want to write stats | |
assert(vds) | |
vlyr = vds.GetLayer(0) | |
# create an in-memory numpy array of the source raster data | |
# covering the whole extent of the vector layer | |
if global_src_extent or only_global: | |
# find "global min/max" to calculate common bins for all histograms | |
src_offset = bbox_to_pixel_offsets(rgt, vlyr.GetExtent()) | |
src_offset = clip_offsets_to_raster(src_offset, rds) | |
if src_offset is None: | |
raise ValueError("raster and vector do not intersect") | |
src_array = rb.ReadAsArray(*src_offset) | |
src_array = ma.masked_values(src_array, rb.GetNoDataValue()) | |
if valid_range is not None: | |
src_array = ma.masked_outside(src_array,valid_range[0],valid_range[1]) | |
if mask_path is not None : | |
mask_array = mb.ReadAsArray(*src_offset) | |
# use global source extent | |
# useful only when disk IO or raster scanning inefficiencies are your limiting factor | |
# advantage: reads raster data in one pass | |
# disadvantage: large vector extents may have big memory requirements | |
# calculate new geotransform of the layer subset | |
new_gt = ( | |
(rgt[0] + (src_offset[0] * rgt[1])), | |
rgt[1], | |
0.0, | |
(rgt[3] + (src_offset[1] * rgt[5])), | |
0.0, | |
rgt[5] | |
) | |
# if asked, compute the stats just for the contents of the bounding box | |
if only_global : | |
masked = ma.masked_values(src_array, rb.GetNoDataValue()) | |
feature_stats = calc_fn(masked) | |
return [ feature_stats ] | |
mem_drv = ogr.GetDriverByName('Memory') | |
driver = gdal.GetDriverByName('MEM') | |
# Loop through vectors | |
stats = [] | |
feat = vlyr.GetNextFeature() | |
while feat is not None: | |
if not global_src_extent: | |
# use local source extent | |
# fastest option when you have fast disks and well indexed raster (ie tiled Geotiff) | |
# advantage: each feature uses the smallest raster chunk | |
# disadvantage: lots of reads on the source raster | |
src_offset = bbox_to_pixel_offsets(rgt, feat.geometry().GetEnvelope()) | |
src_offset = clip_offsets_to_raster(src_offset,rds) | |
# what follows should read as | |
# if src_offset is None : | |
# feat=vlyr.GetNextFeature() | |
# continue | |
# | |
# Unfortunately, ogr has a bug that makes the while | |
# loop spin infinitely if a continue statement is used. | |
if src_offset is not None : | |
src_array = rb.ReadAsArray(*src_offset) | |
src_array = ma.masked_values(src_array, rb.GetNoDataValue()) | |
if valid_range is not None: | |
src_array = ma.masked_outside(src_array,valid_range[0],valid_range[1]) | |
if mask_path is not None : | |
mask_array = mb.ReadAsArray(*src_offset) | |
# calculate new geotransform of the feature subset | |
new_gt = ( | |
(rgt[0] + (src_offset[0] * rgt[1])), | |
rgt[1], | |
0.0, | |
(rgt[3] + (src_offset[1] * rgt[5])), | |
0.0, | |
rgt[5] | |
) | |
# workaround for python/ogr bug involving continue statements | |
# see above. | |
if src_offset is not None: | |
# Create a temporary vector layer in memory | |
mem_ds = mem_drv.CreateDataSource('out') | |
mem_layer = mem_ds.CreateLayer('poly', None, ogr.wkbPolygon) | |
mem_layer.CreateFeature(feat.Clone()) | |
# Rasterize it | |
rvds = driver.Create('', src_offset[2], src_offset[3], 1, gdal.GDT_Byte) | |
rvds.SetGeoTransform(new_gt) | |
gdal.RasterizeLayer(rvds, [1], mem_layer, burn_values=[1]) | |
rv_array = rvds.ReadAsArray() | |
# Mask the source data array with our current feature | |
# we take the logical_not to flip 0<->1 to get the correct mask effect | |
# we also mask out nodata values explictly | |
masked = mask_fn(src_array, rv_array, nodata_value, mask_array) | |
# Compute stats only if there are one or more pixels remaining (unmasked) | |
if masked.count() > 0 : | |
feature_stats = calc_fn(masked) | |
feature_stats['fid'] = int(feat.GetFID()) | |
stats.append(feature_stats) | |
rvds = None | |
mem_ds = None | |
feat = vlyr.GetNextFeature() | |
vds = None | |
rds = None | |
if mask_path is not None : | |
mds = None | |
return stats | |
def calc_range(vector_path, raster_path) : | |
stats = zonal_stats(vector_path, raster_path, only_global=True) | |
return (stats[0]['min'], stats[0]['max']) | |
def calc_range_many_rasters(vector_path, raster_path_array) : | |
totalrange = None | |
for raster_path in raster_path_array : | |
cur_range = calc_range(vector_path, raster_path) | |
if totalrange is None : | |
totalrange = list(cur_range) | |
else : | |
totalrange[0] = min(totalrange[0],cur_range[0]) | |
totalrange[1] = max(totalrange[1],cur_range[1]) | |
return totalrange | |
def zonal_histogram(vector_path, raster_path, | |
histobins=10, historange=None) : | |
# go compute the range if necessary | |
if historange is None : | |
historange = calc_range(vector_path, raster_path) | |
# does the number of bins need modifying? | |
histobins = calc_histobins(histobins, historange) | |
#compose the stats function | |
histo_fn = partial(calc_histogram, histobins, historange) | |
# go compute the zonal histogram, using the same bins for all zones/fids | |
return zonal_stats(vector_path, raster_path, calc_fn=histo_fn) | |
def make_histogram_frame(stats) : | |
"""Processes stats from zonal histogram to return a data frame containing | |
only the histogram data. Fid + bins are columns. Each row contains data for | |
one fid.""" | |
# post process the results into a data frame where the | |
fids = [ i['fid'] for i in stats ] | |
num_histobins = len(stats[0]['histogram'][0]) | |
histobins = [None] * num_histobins | |
for i in range(num_histobins) : | |
histobins[i] = [ item['histogram'][0][i] for item in stats ] | |
tabledef = c.OrderedDict() | |
tabledef['FID'] = fids | |
histo_edges = stats[0]['histogram'][1] | |
for i in range(num_histobins) : | |
tabledef['Bin_{}'.format(histo_edges[i])] = histobins[i] | |
return pd.DataFrame(tabledef) | |
def make_stats_frame(stats) : | |
return pd.DataFrame(stats) | |
if __name__ == "__main__": | |
opts = {'VECTOR': sys.argv[1], 'RASTER': sys.argv[2]} | |
stats = zonal_stats(opts['VECTOR'], opts['RASTER']) | |
try: | |
from pandas import DataFrame | |
print DataFrame(stats) | |
except ImportError: | |
import json | |
print json.dumps(stats, indent=2) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ time python zonal_stats.py test.shp terrain/slope.tif | |
count fid max mean min std sum | |
0 203 0 96 65.876847 3 17.968489 13373 | |
1 130 1 90 60.100000 3 16.728994 7813 | |
2 1341 2 102 53.211037 2 17.901655 71356 | |
3 130 3 90 60.100000 3 16.728994 7813 | |
4 132 4 64 15.962121 1 15.360519 2107 | |
5 132 5 53 31.515152 17 7.970100 4160 | |
6 131 6 42 9.893130 0 8.168317 1296 | |
7 132 7 64 28.712121 2 14.853594 3790 | |
8 133 8 54 35.548872 11 8.878856 4728 | |
9 131 9 82 52.297710 4 17.349877 6851 | |
10 131 10 11 3.030534 0 1.781752 397 | |
11 134 11 57 10.156716 1 11.960042 1361 | |
12 133 12 45 19.000000 0 13.727750 2527 | |
13 132 13 64 26.507576 1 18.848075 3499 | |
14 132 14 94 52.787879 1 22.297585 6968 | |
15 131 15 84 19.450382 1 15.992944 2548 | |
16 132 16 52 11.583333 0 11.538501 1529 | |
17 132 17 108 53.515152 6 18.198603 7064 | |
18 341 18 76 39.117302 9 11.540482 13339 | |
19 337 19 57 19.988131 4 9.593512 6736 | |
20 336 20 78 48.636905 11 13.357014 16342 | |
21 338 21 3 0.855030 0 0.527067 289 | |
22 337 22 34 5.347181 0 7.069888 1802 | |
23 341 23 0 0.000000 0 0.000000 0 | |
24 341 24 42 16.612903 0 9.041271 5665 | |
25 337 25 128 78.848665 5 18.689028 26572 | |
26 341 26 29 7.973607 1 5.341357 2719 | |
27 339 27 78 35.616519 5 14.455317 12074 | |
28 341 28 65 20.199413 0 16.636394 6888 | |
29 340 29 84 35.855882 1 17.022989 12191 | |
30 338 30 96 61.440828 2 16.703587 20767 | |
31 340 31 101 57.832353 8 18.161971 19663 | |
real 0m1.311s | |
user 0m0.372s | |
sys 0m0.752s | |
#### Starspan equivalent | |
$ time starspan --vector test.shp --out-prefix testout --out-type table \ | |
--summary-suffix _stats.csv --raster terrain/slope.tif \ | |
--stats avg mode median min max sum stdev nulls && \ | |
cat testout_stats.csv | |
1: Extracting from /usr/local/apps/land_owner_tools/lot/fixtures/downloads/terrain/slope.tif | |
Summary: | |
Intersecting features: 32 | |
Polygons: 32 | |
Processed pixels: 8379 | |
real 0m1.440s | |
user 0m0.944s | |
sys 0m0.296s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment