Last active
March 17, 2025 13:47
-
-
Save h-a-graham/53a7ce9a3095245017acf93f6c674754 to your computer and use it in GitHub Desktop.
Download Major-Tom geospatial embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Download geospatial embeddings for Major-TOM datasets
#'
#' Opens an in-memory DuckDB connection, exposes the Hugging Face parquet files
#' of the chosen embedding as a view, and returns the rows whose `centre_lon` /
#' `centre_lat` fall inside `bbox`. The connection is closed when the function
#' exits, whether it succeeds or errors.
#'
#' @param bbox numeric vector of length 4 representing the bounding box in
#'   longitude and latitude coordinates (WGS84). Either named with `xmin`,
#'   `xmax`, `ymin` and `ymax` (in any order), or unnamed and ordered as
#'   c(xmin, xmax, ymin, ymax).
#' @param embed character vector of length 1 representing the embedding to
#'   download.
#' @param limit integer vector of length 1 representing the maximum number of
#'   rows to return. Useful for testing, if NULL then all rows within the bbox
#'   are returned.
#' @return a data frame containing the embeddings within the specified bbox.
#' @details
#' https://huggingface.co/Major-TOM for more info.
majortom_embeds <- function(
  bbox,
  embed = c(
    "Core-S2RGB-DINOv2",
    "Core-S2L1C-SSL4EO",
    "Core-S1RTC-SSL4EO",
    "Core-S2RGB-SigLIP"
  ),
  limit = NULL
) {
  if (!is.numeric(bbox) || length(bbox) != 4) {
    rlang::abort("bbox must be a numeric vector of length 4")
  }

  # The query reads bbox by name, so an unnamed vector gets the documented
  # positional order and a named one must supply exactly the four names.
  bbox_names <- c("xmin", "xmax", "ymin", "ymax")
  if (is.null(names(bbox))) {
    names(bbox) <- bbox_names
  } else if (!setequal(names(bbox), bbox_names)) {
    rlang::abort(
      "bbox must be unnamed or named with xmin, xmax, ymin and ymax"
    )
  }
  if (!all(is.finite(bbox))) {
    rlang::abort("bbox must contain only finite values")
  }
  xmin <- bbox[["xmin"]]
  xmax <- bbox[["xmax"]]
  ymin <- bbox[["ymin"]]
  ymax <- bbox[["ymax"]]
  # An inverted range would make BETWEEN silently match nothing.
  if (xmin > xmax || ymin > ymax) {
    rlang::abort("bbox must satisfy xmin <= xmax and ymin <= ymax")
  }

  embed <- rlang::arg_match(embed)

  if (!is.null(limit)) {
    if (!is.numeric(limit) || length(limit) != 1) {
      rlang::abort("limit must be a numeric vector of length 1")
    }
    if (!is.finite(limit) || limit < 0 || limit != trunc(limit)) {
      rlang::abort("limit must be a non-negative whole number")
    }
  }

  url <- glue::glue("hf://datasets/Major-TOM/{embed}/embeddings/*.parquet")

  con <- DBI::dbConnect(duckdb::duckdb())
  # Release the connection and the embedded database on every exit path.
  on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)

  DBI::dbExecute(con, "INSTALL httpfs;")
  DBI::dbExecute(con, "LOAD httpfs;")
  DBI::dbExecute(
    con,
    glue::glue("CREATE VIEW embeddings AS SELECT * FROM '{url}'")
  )

  if (is.null(limit)) {
    limit_clause <- ""
  } else {
    # Fixed notation keeps large limits from being written as e.g. 1e+05.
    limit_value <- format(limit, scientific = FALSE, trim = TRUE)
    limit_clause <- glue::glue("LIMIT {limit_value}")
  }

  sql <- glue::glue(
    "
    SELECT *
    FROM embeddings
    WHERE centre_lon BETWEEN {xmin} AND {xmax}
    AND centre_lat BETWEEN {ymin} AND {ymax}
    {limit_clause}
    "
  )

  DBI::dbGetQuery(con, sql)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr)
library(ggplot2) # needed for ggplot(), geom_sf(), aes() and the theme calls
library(sf)

# Bounding box in WGS84 longitude/latitude, named as majortom_embeds() expects.
bounds <- c(
  xmin = 115.33,
  xmax = 119.41,
  ymin = 4.09,
  ymax = 7.55
)

result <- majortom_embeds(bounds)

# NOTE(review): assumes `result` carries a geometry column that
# sf::st_as_sf() can detect -- confirm against the parquet schema.
embeds <- tibble(result) |>
  sf::st_as_sf() |>
  sf::st_set_crs(4326)

# Bind the per-row embeddings into one matrix and run the PCA once.
# NOTE(review): assumes `embedding` is a list column of equal-length numeric
# vectors -- confirm.
embedding_matrix <- do.call(rbind, embeds$embedding)
pca <- prcomp(embedding_matrix)

# The three leading components become the red, green and blue channels.
if (ncol(pca$x) < 3) {
  stop("PCA produced fewer than 3 components; need at least 3 embeddings")
}

em_pca <- tibble(
  # Scale PCA values to [0,1] range for RGB
  pca1_scaled = scales::rescale(pca$x[, 1], to = c(0, 1)),
  pca2_scaled = scales::rescale(pca$x[, 2], to = c(0, 1)),
  pca3_scaled = scales::rescale(pca$x[, 3], to = c(0, 1))
) |>
  mutate(
    # Combine into RGB hex color
    rgb_color = rgb(pca1_scaled, pca2_scaled, pca3_scaled),
    geometry = embeds$geometry
  ) |>
  sf::st_as_sf() |>
  sf::st_set_crs(4326)

# Draw each feature filled with its own PCA-derived colour.
ggplot(em_pca) +
  geom_sf(aes(fill = rgb_color), alpha = 0.9) +
  scale_fill_identity() +
  theme_light() +
  theme(legend.position = "none")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment