Skip to content

Instantly share code, notes, and snippets.

@h-a-graham
Last active March 17, 2025 13:47
Show Gist options
  • Save h-a-graham/53a7ce9a3095245017acf93f6c674754 to your computer and use it in GitHub Desktop.
Save h-a-graham/53a7ce9a3095245017acf93f6c674754 to your computer and use it in GitHub Desktop.
Download Major-TOM geospatial embeddings
#' Download geospatial embeddings for Major-TOM datasets
#'
#' Reads the embedding parquet files of a Major-TOM dataset from Hugging Face
#' through an in-memory DuckDB connection (using the `httpfs` extension) and
#' returns the rows whose `centre_lon` / `centre_lat` fall inside `bbox`.
#'
#' @param bbox numeric vector of length 4 representing the bounding box in
#'   longitude and latitude coordinates (WGS84). Either named with `xmin`,
#'   `xmax`, `ymin` and `ymax` (in any order), or unnamed and ordered as
#'   c(xmin, xmax, ymin, ymax).
#' @param embed character vector of length 1 representing the embedding to
#'   download.
#' @param limit numeric vector of length 1 representing the maximum number of
#'   rows to return; must be a non-negative whole number. Useful for testing,
#'   if NULL then all rows within the bbox are returned.
#' @return a data frame containing the embeddings within the specified bbox.
#' @details
#' https://huggingface.co/Major-TOM for more info.
majortom_embeds <- function(
  bbox,
  embed = c(
    "Core-S2RGB-DINOv2",
    "Core-S2L1C-SSL4EO",
    "Core-S1RTC-SSL4EO",
    "Core-S2RGB-SigLIP"
  ),
  limit = NULL
) {
  if (!is.numeric(bbox) || length(bbox) != 4) {
    rlang::abort("bbox must be a numeric vector of length 4")
  }
  if (anyNA(bbox)) {
    rlang::abort("bbox must not contain missing values")
  }
  # The bounds are looked up by name below. An unnamed vector is interpreted
  # in the documented positional order instead of silently yielding NA.
  bbox_names <- c("xmin", "xmax", "ymin", "ymax")
  if (is.null(names(bbox))) {
    names(bbox) <- bbox_names
  } else if (!all(bbox_names %in% names(bbox))) {
    rlang::abort("bbox must be unnamed or named xmin, xmax, ymin and ymax")
  }

  embed <- rlang::arg_match(embed)

  if (!is.null(limit) && (!is.numeric(limit) || length(limit) != 1)) {
    rlang::abort("limit must be a numeric vector of length 1")
  }
  if (
    !is.null(limit) &&
      (!is.finite(limit) || limit < 0 || limit != trunc(limit))
  ) {
    rlang::abort("limit must be a non-negative whole number")
  }

  url <- glue::glue("hf://datasets/Major-TOM/{embed}/embeddings/*.parquet")

  con <- DBI::dbConnect(duckdb::duckdb())
  # Release the connection on every exit path, including errors raised by
  # the remote query.
  on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)

  DBI::dbExecute(con, "INSTALL httpfs; LOAD httpfs;")
  DBI::dbExecute(
    con,
    glue::glue("CREATE VIEW embeddings AS SELECT * FROM '{url}'")
  )

  if (is.null(limit)) {
    limit_clause <- ""
  } else {
    # Fixed notation: as.character(1e5) would render as "1e+05".
    limit_txt <- format(limit, scientific = FALSE, trim = TRUE)
    limit_clause <- glue::glue("LIMIT {limit_txt}")
  }

  xmin <- bbox[["xmin"]]
  xmax <- bbox[["xmax"]]
  ymin <- bbox[["ymin"]]
  ymax <- bbox[["ymax"]]

  sql <- glue::glue(
    "
    SELECT *
    FROM embeddings
    WHERE centre_lon BETWEEN {xmin} AND {xmax}
    AND centre_lat BETWEEN {ymin} AND {ymax}
    {limit_clause}
    "
  )

  # dbGetQuery() materialises the full result as a data frame, so it stays
  # valid after the connection is closed by on.exit().
  DBI::dbGetQuery(con, sql)
}
# Example: download embeddings for a bounding box, reduce them with PCA and
# plot the first three principal components as an RGB colour per feature.
library(dplyr)
# ggplot2 supplies ggplot(), geom_sf(), aes(), scale_fill_identity() and the
# theme functions used for the plot at the end of this script.
library(ggplot2)
library(sf)

bounds <- c(
  xmin = 115.33,
  xmax = 119.41,
  ymin = 4.09,
  ymax = 7.55
)

result <- majortom_embeds(bounds)

# NOTE(review): assumes `result` carries a geometry column that
# sf::st_as_sf() can detect, and that it is in lon/lat (EPSG:4326) -- confirm.
embeds <- tibble(result) |>
  sf::st_as_sf() |>
  sf::st_set_crs(4326)

em_pca <- embeds |>
  # group_by(grid_cell) |>
  reframe(
    # Stack the per-row `embedding` vectors into one matrix (one row each).
    embedding_matrix = list(do.call(rbind, embedding)),
    pca = list(prcomp(embedding_matrix[[1]])),
    # Scale PCA values to [0,1] range for RGB
    pca1_scaled = scales::rescale(pca[[1]]$x[, 1], to = c(0, 1)),
    pca2_scaled = scales::rescale(pca[[1]]$x[, 2], to = c(0, 1)),
    pca3_scaled = scales::rescale(pca[[1]]$x[, 3], to = c(0, 1))
  ) |>
  mutate(
    # Combine into RGB hex color
    rgb_color = rgb(pca1_scaled, pca2_scaled, pca3_scaled),
    # Re-attach the geometries taken from `embeds`; relies on the PCA scores
    # being in the same row order as `embeds`.
    geometry = embeds$geometry
  ) |>
  sf::st_as_sf() |>
  sf::st_set_crs(4326) |>
  # Drop the helper list columns that were only needed to compute the scores.
  select(!c(pca, embedding_matrix))

em_pca |>
  ggplot() +
  geom_sf(aes(fill = rgb_color), alpha = 0.9) +
  # Use the hex strings in `rgb_color` directly as fill colours.
  scale_fill_identity() +
  theme_light() +
  theme(legend.position = "none")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment