Created
January 6, 2025 18:07
-
-
Save bmschmidt/250ac80bb5a843e2db0ffb8da794e164 to your computer and use it in GitHub Desktop.
Overture maps data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pyarrow as pa | |
import duckdb | |
from nomic import AtlasDataset | |
import typer | |
import time | |
# Reference point as [x, y]: main() interpolates center[0] next to bbox.xmin/xmax
# and center[1] next to bbox.ymin/ymax when building its query text. Those filter
# lines are SQL-commented there, so the value does not currently restrict results.
center = [-71, 42.3] # Boston.
def main(name: str, limit: int = 1_000_000, timeout: float = 5) -> None:
    """Stream Overture Maps places out of DuckDB in Arrow batches.

    Each batch is split into a 2-D coordinate array (used as "embeddings")
    and a table of the remaining attribute columns, ready to be added to the
    Atlas dataset. The upload call itself is currently disabled, so running
    this only reports how many rows would have been added.

    Args:
        name: Identifier passed to ``AtlasDataset``.
        limit: Maximum number of rows requested from the places query.
        timeout: Seconds to sleep after each batch.

    Raises:
        ValueError: If the target dataset already contains data.
    """
    ds = AtlasDataset(name, unique_id_field='id')
    # Check once, up front: doing this per batch would only fail after the
    # expensive query has run, and would abort on the second batch as soon as
    # uploading is re-enabled.
    if ds.total_datums > 0:
        raise ValueError("Project already created!!")

    con = duckdb.connect(":memory:")
    try:
        con.execute("""
            INSTALL spatial;
            INSTALL httpfs;
            LOAD spatial;
            LOAD httpfs;
            SET s3_region='us-west-2'""")
        con.execute("""
            CREATE VIEW overture AS SELECT * FROM read_parquet('2024-10-23.0/*/*/*',filename=true, hive_partitioning=1, union_by_name=True) wHERE theme='places';
        """)
        con.execute("""
            create table topbrands AS SELECT brand['names']['primary'] brand_name, COUNT(*) AS count FROM overture WHERE BRAND IS NOT NULL GROUP BY brand_name ORDER BY count DESC LIMIT 2000;
        """)
        # NOTE: the {center[...]} expressions below sit on SQL-commented lines
        # but are still evaluated by the f-string, so `center` must exist.
        # The latitude bounds keep rows away from both poles, where dividing
        # by cos(latitude) blows up; the upper bound previously read
        # `bbox.ymax > -80`, which is implied by the lower bound and therefore
        # never excluded anything.
        reader = con.query(f"""
            SELECT
                id,
                [(bbox.xmin + bbox.xmax)/2,
                -- Turns out this isn't the mercator projection. Thanks for nothing, Claude.
                -1 * ((bbox.ymin + bbox.ymax) / 2) / cos(radians(bbox.ymin)) ] xy,
                websites[1] url,
                categories['primary'] AS category,
                brand['names']['primary'] brand_text,
                -- brand_name brand_category,
                TRY_CAST(REGEXP_REPLACE(phones[1], '[^0-9]', '', 'g') AS DOUBLE) phone,
                socials[1] social,
                names['primary']::STRING AS "name",
                addresses[1]['country'] country,
                addresses[1]['postcode'] as postcode,
            FROM overture
            LEFT JOIN topbrands ON (topbrands.brand_name = overture.brand['names']['primary'])
            WHERE
                --bbox.ymin > {center[1] - .5} AND bbox.ymax < {center[1] + .5} AND
                --bbox.xmin > {center[0] - .5} AND bbox.xmax < {center[0] + .5} AND
                type='place'
                AND names['primary'] IS NOT NULL
                AND bbox.ymin > -80 AND bbox.ymax < 80
            LIMIT {limit}
        """).fetch_arrow_reader()

        added = 0
        for sample in reader:
            # np.vstack raises on an empty sequence, so skip empty batches.
            if sample.num_rows == 0:
                continue
            # One row per place: the two-element `xy` list becomes a (rows, 2) array.
            embeddings = np.vstack(sample['xy'].to_numpy(zero_copy_only=False))
            # Everything except the coordinates travels as tabular metadata.
            data = pa.Table.from_batches([sample]).drop(['xy'])
            # commenting out the upload step
            # ds.add_data(data, embeddings = embeddings)
            added += len(embeddings)
            print(f"Added {added}")
            time.sleep(timeout)
    finally:
        # Release the in-memory database even if the query or a batch fails.
        con.close()
# Script entry point: hand main() to typer so its parameters can be supplied
# from the command line.
if __name__ == "__main__":
    typer.run(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment