Skip to content

Instantly share code, notes, and snippets.

@bmschmidt
Created January 6, 2025 18:07
Show Gist options
  • Save bmschmidt/250ac80bb5a843e2db0ffb8da794e164 to your computer and use it in GitHub Desktop.
Save bmschmidt/250ac80bb5a843e2db0ffb8da794e164 to your computer and use it in GitHub Desktop.
Overture maps data
import numpy as np
import pyarrow as pa
import duckdb
from nomic import AtlasDataset
import typer
import time
# Approximate [longitude, latitude] of Boston. Index 0 is compared against
# bbox.xmin/xmax and index 1 against bbox.ymin/ymax in the (currently
# commented-out) bounding-box filter of the places query.
center = [-71, 42.3]
def main(name: str, limit: int = 1_000_000, timeout : float = 5):
    """Stream Overture Maps "places" rows out of DuckDB, batch by batch.

    Opens an in-memory DuckDB connection, defines an ``overture`` view over
    the parquet files matching ``2024-10-23.0/*/*/*`` (restricted to
    ``theme='places'``), builds a ``topbrands`` table of the 2000 most
    frequent brand names, and then iterates over the selected places as
    Arrow record batches. For each batch the ``xy`` column is stacked into a
    2-column numpy array (the would-be embeddings) and the remaining columns
    are collected into a ``pyarrow.Table``.

    NOTE(review): the actual ``ds.add_data`` upload is commented out, so this
    currently only counts and reports rows; nothing is sent to Atlas.

    Args:
        name: Dataset name handed to ``AtlasDataset``.
        limit: Maximum number of rows selected (SQL ``LIMIT``).
        timeout: Seconds to sleep after each processed batch.

    Raises:
        ValueError: If the dataset already contains data
            (``ds.total_datums > 0``).
    """
    ds = AtlasDataset(name, unique_id_field='id')
    # Fail fast: this check does not depend on the query results, so do it
    # before installing extensions and scanning parquet files.
    if ds.total_datums > 0:
        raise ValueError("Project already created!!")

    con = duckdb.connect(":memory:")
    try:
        con.execute("""
            INSTALL spatial;
            INSTALL httpfs;
            LOAD spatial;
            LOAD httpfs;
            SET s3_region='us-west-2'""")
        con.execute("""
            CREATE VIEW overture AS SELECT * FROM read_parquet('2024-10-23.0/*/*/*',filename=true, hive_partitioning=1, union_by_name=True) wHERE theme='places';
            """)
        con.execute("""
            create table topbrands AS SELECT brand['names']['primary'] brand_name, COUNT(*) AS count FROM overture WHERE BRAND IS NOT NULL GROUP BY brand_name ORDER BY count DESC LIMIT 2000;
            """)
        # The y coordinate is scaled by 1 / cos(latitude), which blows up
        # towards either pole, so rows are clipped to latitudes inside
        # (-80, 80). The original filter read `bbox.ymax > -80`, which is
        # implied by `bbox.ymin > -80` and left the north unbounded.
        # The `center`-based bounding box lines are SQL comments; they are
        # still interpolated by the f-string but do not filter anything.
        reader = con.query(f"""
            SELECT
            id,
            [(bbox.xmin + bbox.xmax)/2,
            -- Turns out this isn't the mercator projection. Thanks for nothing, Claude.
            -1 * ((bbox.ymin + bbox.ymax) / 2) / cos(radians(bbox.ymin)) ] xy,
            websites[1] url,
            categories['primary'] AS category,
            brand['names']['primary'] brand_text,
            -- brand_name brand_category,
            TRY_CAST(REGEXP_REPLACE(phones[1], '[^0-9]', '', 'g') AS DOUBLE) phone,
            socials[1] social,
            names['primary']::STRING AS "name",
            addresses[1]['country'] country,
            addresses[1]['postcode'] as postcode,
            FROM overture
            LEFT JOIN topbrands ON (topbrands.brand_name = overture.brand['names']['primary'])
            WHERE
            --bbox.ymin > {center[1] - .5} AND bbox.ymax < {center[1] + .5} AND
            --bbox.xmin > {center[0] - .5} AND bbox.xmax < {center[0] + .5} AND
            type='place'
            AND names['primary'] IS NOT NULL
            AND bbox.ymin > -80 AND bbox.ymax < 80
            LIMIT {limit}
            """).fetch_arrow_reader()

        added = 0
        for batch in reader:
            # np.vstack raises on an empty sequence, so skip empty batches.
            if batch.num_rows == 0:
                continue
            # Each `xy` entry is a 2-element list; stack them into an
            # (n_rows, 2) array.
            embeddings = np.vstack(batch['xy'].to_numpy(zero_copy_only=False))
            # Everything except the coordinates is the payload that the
            # (disabled) upload below would send alongside the embeddings.
            data = pa.Table.from_batches([batch]).drop(['xy'])
            # NOTE(review): upload step intentionally left disabled, as in
            # the original; re-enable to actually push data to Atlas.
            # ds.add_data(data, embeddings = embeddings)
            added += len(embeddings)
            print(f"Added {added}")
            time.sleep(timeout)
    finally:
        # Always release the DuckDB connection, even if a query or a batch
        # fails part-way through.
        con.close()
# Script entry point: let typer build the command-line interface from the
# signature of `main` and invoke it when this file is run directly.
if __name__ == "__main__":
    typer.run(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment