Skip to content

Instantly share code, notes, and snippets.

@mdsumner
Last active March 23, 2025 23:26
Show Gist options
  • Save mdsumner/55de7c645dbfa0e364a24283876eb4d5 to your computer and use it in GitHub Desktop.
Save mdsumner/55de7c645dbfa0e364a24283876eb4d5 to your computer and use it in GitHub Desktop.
## https://developmentseed.org/obstore/latest/examples/fastapi/

# Example large Parquet file hosted in AWS open data
#store = S3Store("ookla-open-data", region="us-west-2", skip_signature=True)
#path = "parquet/performance/type=fixed/year=2024/quarter=1/2024-01-01_performance_fixed_tiles.parquet"


Sys.setenv("AWS_REGION" = "us-west-2")
Sys.setenv("AWS_NO_SIGN_REQUEST" = "YES")
#Sys.setenv("AWS_VIRTUAL_HOSTING" = "TRUE") ## true by default bucket.host vs host/bucket

library(gdalraster)
bucket <- "/vsis3/ookla-open-data"
vsi_read_dir(bucket)

## all
obs <- vsi_read_dir("/vsis3/ookla-open-data/parquet", recursive = TRUE)
length(obs)

## drill into what we're looking for ()
grep("2024-01-01", obs, value = TRUE)

(key <- grep("2024-01-01.*fixed", obs, value = TRUE))
## "performance/type=fixed/year=2024/quarter=1/2024-01-01_performance_fixed_tiles.parquet"

ds <- new(GDALVector, glue::glue("{bucket}/parquet/{key}"))
ds$getDriverLongName()
#[1] "(Geo)Parquet"

str(ds$getNextFeature())
# List of 12
# $ FID            :integer64 1 
# $ quadkey        : chr "0022133222330012"
# $ tile           : chr "POLYGON((-160.037841796875 70.6399478155463, -160.032348632812 70.6399478155463, -160.032348632812 70.638126730"| __truncated__
# $ tile_x         : num -160
# $ tile_y         : num 70.6
# $ avg_d_kbps     :integer64 158062 
# $ avg_u_kbps     :integer64 14698 
# $ avg_lat_ms     :integer64 223 
# $ avg_lat_down_ms: int 402
# $ avg_lat_up_ms  : int 204
# $ tests          :integer64 1 
# $ devices        :integer64 1 
# - attr(*, "gis")=List of 5
# ..$ type         : chr "vector"
# ..$ geom_column  : chr(0) 
# ..$ geom_col_type: chr(0) 
# ..$ geom_col_srs : chr(0) 
# ..$ geom_format  : chr "NONE"
# - attr(*, "class")= chr [1:2] "OGRFeature" "list"

ds$close()
@mdsumner
Copy link
Author

mdsumner commented Mar 23, 2025

# https://developmentseed.org/async-tiff/latest/#reading-sentinel-2-l2a
#store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
#path = "sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"


Sys.setenv("AWS_REGION" = "us-west-2")
Sys.setenv("AWS_NO_SIGN_REQUEST" = "YES")
#Sys.setenv("AWS_VIRTUAL_HOSTING" = "TRUE") ## true by default bucket.host vs host/bucket
library(gdalraster)
bucket <- "sentinel-cogs"
path <- "sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"
dsn <- glue::glue("/vsis3/{bucket}/{path}")
ds <- new(GDALRaster, dsn)
ds$bbox()
ds$getProjectionRef()
ds$dim()
ds$getRasterCount()
ds$getBlockSize(1L)
data <- read_ds(ds, xoff  = 0, yoff = 0, xsize = 1024, ysize = 1024)
plot_raster(data)


## more interestingly we might warp to our chosen sensible map projection, and get a decimated subset of the source 10980x10980
## and communicate between packages without having to plumb orrible code or use files
reproj::reproj_extent(ds$bbox()[c(1, 3, 2, 4)], "EPSG:4326", source = ds$getProjectionRef())
gdalraster::warp(ds, "/vsimem/local.tif", "+proj=laea +lon_0=-111 +lat_0=36", cl_arg = c("-ts", 1024, 0))
terra::plot(terra::rast("/vsimem/local.tif"))

@mdsumner
Copy link
Author

VSIFile, we can seek and read raw bytes from an object

vsi <- new(VSIFile, dsn)
vsi$rewind()
vsi$tell()
vsi$read(10)
vsi$close()

Here's a real example where I warp to MEM, ingest with VSIFile, and write that load of COG bytes to object store:

https://github.com/mdsumner/pawsey-aad/blob/7c8ad261da764a79095150b9293bea86164aca67/acacia/sync-ghrsst.R#L57-L60

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment