Skip to content

Instantly share code, notes, and snippets.

@thedivtagguy
Created November 5, 2024 08:50
Show Gist options
  • Save thedivtagguy/d2f58da8a81a954c14eb734f5c2d0c35 to your computer and use it in GitHub Desktop.
Save thedivtagguy/d2f58da8a81a954c14eb734f5c2d0c35 to your computer and use it in GitHub Desktop.
Easier way to download stuff from the Microsoft Buildings dataset. Find your quadkey here: https://aman.bh/blog/2023/building-footprints
import argparse
import json
import math
import re
import sys
import webbrowser
from typing import Optional, Tuple, Dict

import geopandas as gpd
import inquirer
import pandas as pd
from shapely.geometry import shape, box
def parse_bbox_string(bbox_str: str) -> Optional[Tuple[float, float, float, float]]:
    """
    Parse a bounding box string from bboxfinder.com.

    Accepts formats like:
    - 78.359299,17.402408,78.568039,17.615883
    - Long,Lat: (78.359299,17.402408) (78.568039,17.615883)

    Whitespace around the individual numbers is ignored in both formats.

    Returns:
        The four coordinates as a tuple of floats, in the order they appear
        in the input, or None when the text matches neither format or any
        value is not a finite number (nan / inf).
    """
    coords = None

    # Try simple comma-separated format
    if ',' in bbox_str and '(' not in bbox_str:
        try:
            values = [float(part.strip()) for part in bbox_str.split(',')]
        except ValueError:
            values = []
        if len(values) == 4:
            coords = values

    # Try bboxfinder's "Long,Lat" format: two parenthesised coordinate pairs
    if coords is None:
        number = r'\s*([-+0-9.]+)\s*'
        match = re.search(
            rf'\({number},{number}\)\s*\({number},{number}\)', bbox_str
        )
        if match:
            try:
                coords = [float(group) for group in match.groups()]
            except ValueError:
                # The character class also matches junk such as "1-2" or "..".
                coords = None

    if coords is None:
        return None

    # float() happily parses "nan" and "inf"; such values defeat any later
    # min < max comparison (NaN comparisons are always False), so reject them.
    if not all(math.isfinite(value) for value in coords):
        return None

    return (coords[0], coords[1], coords[2], coords[3])
def prompt_for_bounds_with_bboxfinder() -> Optional[Tuple[float, float, float, float]]:
    """
    Interactive prompt for bounding box coordinates using bboxfinder.com.

    Opens bboxfinder.com in the default browser, then repeatedly asks for
    pasted coordinates until they parse and describe a non-empty box.

    Returns:
        The parsed bounds in the order they were entered (expected to be
        min longitude, min latitude, max longitude, max latitude), or None
        if the user types 'cancel' or stdin reaches end-of-file.
    """
    print("\nOpening bboxfinder.com in your default browser...")
    print("Instructions:")
    print("1. Draw a rectangle on the map to select your area")
    print("2. Copy the coordinates from the 'Box' field at the bottom")
    print("3. Paste them here when ready")
    webbrowser.open('http://bboxfinder.com')

    while True:
        try:
            bbox_input = input(
                "\nPaste the coordinates from bboxfinder.com (or type 'cancel' to skip): "
            ).strip()
        except EOFError:
            # stdin is closed or exhausted, so no answer can ever arrive;
            # treat it the same as an explicit 'cancel' instead of crashing.
            return None

        if bbox_input.lower() == 'cancel':
            return None

        bounds = parse_bbox_string(bbox_input)
        if bounds is None:
            print("Invalid coordinate format. Please copy the coordinates exactly as shown in bboxfinder.com")
            print("Expected format: longitude,latitude,longitude,latitude")
            print("Example: 78.359299,17.402408,78.568039,17.615883")
            continue

        # Validate that min is less than max
        if bounds[0] >= bounds[2] or bounds[1] >= bounds[3]:
            print("Error: Minimum coordinates must be less than maximum coordinates")
            continue

        return bounds
def get_available_regions() -> list:
    """
    Fetch list of available regions from the dataset.

    Returns:
        The sorted unique values of the 'Location' column of the dataset
        links CSV. If the CSV cannot be fetched or read, the error is printed
        and a short hard-coded fallback list is returned instead.
    """
    try:
        dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
        # Drop missing values first: a NaN among the strings makes sorted()
        # raise TypeError, which would silently discard the real list in
        # favour of the fallback below.
        return sorted(dataset_links['Location'].dropna().unique())
    except Exception as e:
        print(f"Error fetching regions: {e}")
        return ['India', 'United States', 'Canada', 'Australia']  # Fallback list
def process_buildings(region: str, quadkey: int, bounds: Optional[Tuple] = None) -> Optional[gpd.GeoDataFrame]:
    """
    Process building footprints for a given region and quadkey, optionally cropping to a bounding box.

    The dataset links CSV is downloaded, the rows matching `region` and
    `quadkey` are selected, and every file they point to is loaded as
    line-delimited JSON. The footprints are written to a GeoJSON file in the
    current directory named "<region>_<quadkey>[_cropped].geojson" (region
    lower-cased, spaces replaced by underscores).

    Args:
        region: Value to match against the 'Location' column.
        quadkey: Value to match against the 'QuadKey' column.
        bounds: Optional (minx, miny, maxx, maxy) box; when given, footprints
            are clipped to it and those outside it are dropped.

    Returns:
        The GeoDataFrame that was written (geometry column only, CRS 4326),
        or None if anything failed; the error is printed, not raised.
    """
    try:
        # Load dataset links
        dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")

        # Select tile
        selected_tiles = dataset_links[
            (dataset_links.Location == region) & (dataset_links.QuadKey == quadkey)
        ]
        if selected_tiles.empty:
            raise ValueError(f"No data found for region {region} and quadkey {quadkey}")

        # Collect the geometries of every matching row before writing, so
        # several rows for one tile cannot overwrite each other's output.
        # NOTE(review): a single row per (Location, QuadKey) is the expected
        # case; confirm duplicate rows are not re-uploads of the same data.
        geometries = []
        for url in selected_tiles["Url"]:
            print(f"Loading data from {url}...")
            df = pd.read_json(url, lines=True)

            if "geometry" not in df.columns:
                print(f"Warning: No geometry column found in {url}")
                continue

            print("Converting geometries...")
            # Series.items() avoids building a full row object per building,
            # which DataFrame.iterrows() does.
            for idx, raw_geometry in df["geometry"].items():
                try:
                    geometries.append(shape(raw_geometry))
                except Exception as e:
                    print(f"Warning: Skipping invalid geometry at index {idx}: {e}")

        print("Creating GeoDataFrame...")
        gdf = gpd.GeoDataFrame(geometry=geometries, crs=4326)

        if bounds:
            print("Cropping to bounding box...")
            bbox = box(*bounds)
            # Keep only footprints that reach the box, then trim them to it.
            gdf = gdf[gdf.intersects(bbox)].copy()
            gdf['geometry'] = gdf.geometry.intersection(bbox)
            gdf = gdf[~gdf.geometry.is_empty]

        # Generate output filename
        output_filename = f"{region.lower().replace(' ', '_')}_{quadkey}"
        if bounds:
            output_filename += "_cropped"
        output_filename += ".geojson"

        print(f"Writing results to {output_filename}...")
        gdf.to_file(output_filename, driver="GeoJSON")
        print(f"Successfully processed {len(gdf)} buildings")
        return gdf
    except Exception as e:
        # Best-effort CLI helper: report the problem and signal failure
        # through the None return value rather than a traceback.
        print(f"Error: {str(e)}")
        return None
def main():
    """
    Command-line entry point.

    With --non-interactive, the region and quadkey come from --region and
    --quadkey. The bounding box comes from --bbox when given; otherwise the
    user is asked on stdin whether to pick one via bboxfinder.com (end of
    input counts as "no"). Without --non-interactive, the region, quadkey and
    bounding-box choice are collected through inquirer prompts.

    Exits with status 1 when the prompts are aborted or processing fails.
    """
    parser = argparse.ArgumentParser(description='Process building footprints from global dataset')
    parser.add_argument('--non-interactive', action='store_true',
                        help='Run in non-interactive mode (requires --region and --quadkey)')
    parser.add_argument('--region', help='Region name')
    parser.add_argument('--quadkey', type=int, help='Quadkey identifier')
    parser.add_argument('--bbox',
                        help='Bounding box as "minlon,minlat,maxlon,maxlat" '
                             '(non-interactive mode only; skips the bounding box prompt)')
    args = parser.parse_args()

    if args.non_interactive:
        # Compare against None explicitly: a quadkey of 0 is falsy but valid.
        if not args.region or args.quadkey is None:
            parser.error("Non-interactive mode requires --region and --quadkey arguments")
        region = args.region
        quadkey = args.quadkey

        if args.bbox is not None:
            bounds = parse_bbox_string(args.bbox)
            if bounds is None:
                parser.error("Invalid --bbox value. Expected format: longitude,latitude,longitude,latitude")
            if bounds[0] >= bounds[2] or bounds[1] >= bounds[3]:
                parser.error("Invalid --bbox value. Minimum coordinates must be less than maximum coordinates")
        else:
            # Optional bounds prompt
            try:
                use_bounds = input(
                    "Do you want to specify a bounding box using bboxfinder.com? (y/N): "
                ).lower().strip()
            except EOFError:
                # No stdin available (e.g. run from a scheduler): take the default.
                use_bounds = ''
            bounds = prompt_for_bounds_with_bboxfinder() if use_bounds == 'y' else None
    else:
        # Interactive mode
        regions = get_available_regions()
        questions = [
            inquirer.List('region',
                          message="Select a region",
                          choices=regions),
            inquirer.Text('quadkey',
                          message="Enter the quadkey",
                          # isdecimal() only accepts characters int() can
                          # convert; isdigit() also accepts e.g. superscripts.
                          validate=lambda _, x: x.isdecimal()),
            inquirer.Confirm('use_bounds',
                             message="Do you want to specify a bounding box using bboxfinder.com?",
                             default=False),
        ]
        answers = inquirer.prompt(questions)
        if not answers:
            sys.exit(1)
        region = answers['region']
        quadkey = int(answers['quadkey'])
        bounds = prompt_for_bounds_with_bboxfinder() if answers['use_bounds'] else None

    # Process the buildings
    print(f"\nProcessing buildings for {region} (Quadkey: {quadkey})")
    if bounds:
        print(f"Using bounding box: {bounds}")

    result = process_buildings(region, quadkey, bounds)
    if result is None:
        # process_buildings already printed the error; make the failure
        # visible to shells and scripts through the exit status.
        sys.exit(1)
    print("\nProcessing completed successfully!")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment