Download https://gist.github.com/marklit/237bec7b8a82e1290e2f481c8adda611 to manifest.lod.txt.
# Download every LOD tile URL from the manifest and keep each GeoJSON as
# its own ZIP. Re-runnable: tiles whose ZIP already exists are skipped.
# Manifest URLs contain no whitespace, so word-splitting the file is safe.
for LINE in $(cat manifest.lod.txt); do
    # Tile name comes from the URL's "files=" query parameter.
    GEOJSON=$(echo "$LINE" | grep -o 'files=.*' | sed 's/files=//g')
    ZIPFILE="$GEOJSON.zip"
    if [ ! -f "$ZIPFILE" ]; then
        echo "$LINE"
        # Only archive the download if wget succeeded AND the file is
        # non-empty; otherwise a server outage silently leaves us zipping
        # 0-byte GeoJSONs (which is exactly what happened overnight).
        if wget -c -O "$GEOJSON" "$LINE" && [ -s "$GEOJSON" ]; then
            zip -9 "$ZIPFILE" "$GEOJSON"
            rm "$GEOJSON"
        else
            echo "Download failed or empty, will retry next run: $LINE" >&2
            rm -f "$GEOJSON"
        fi
    else
        echo "$ZIPFILE already exists, skipping.."
    fi
done

923 ZIPs in total. 187 files are less than 4K.
$ ls *.zip | wc -l # 923
$ find . -type f -name '*geojson.zip' -size -4k | wc -l # 187

These files are valid ZIPs with GeoJSON but the GeoJSON is zero bytes.
$ unzip -l e170_n10_e175_n05.geojson.zip

Archive: e170_n10_e175_n05.geojson.zip
Length Date Time Name
--------- ---------- ----- ----
0 2025-09-03 07:27 e170_n10_e175_n05.geojson
--------- -------
0 1 file
$ hexdump -C e170_n10_e175_n05.geojson.zip

00000000 50 4b 03 04 0a 00 00 00 00 00 69 3b 23 5b 00 00 |PK........i;#[..|
00000010 00 00 00 00 00 00 00 00 00 00 19 00 1c 00 65 31 |..............e1|
00000020 37 30 5f 6e 31 30 5f 65 31 37 35 5f 6e 30 35 2e |70_n10_e175_n05.|
00000030 67 65 6f 6a 73 6f 6e 55 54 09 00 03 a6 c3 b7 68 |geojsonUT......h|
00000040 a6 c3 b7 68 75 78 0b 00 01 04 e8 03 00 00 04 e8 |...hux..........|
00000050 03 00 00 50 4b 01 02 1e 03 0a 00 00 00 00 00 69 |...PK..........i|
00000060 3b 23 5b 00 00 00 00 00 00 00 00 00 00 00 00 19 |;#[.............|
00000070 00 18 00 00 00 00 00 00 00 00 00 ff 81 00 00 00 |................|
00000080 00 65 31 37 30 5f 6e 31 30 5f 65 31 37 35 5f 6e |.e170_n10_e175_n|
00000090 30 35 2e 67 65 6f 6a 73 6f 6e 55 54 05 00 03 a6 |05.geojsonUT....|
000000a0 c3 b7 68 75 78 0b 00 01 04 e8 03 00 00 04 e8 03 |..hux...........|
000000b0 00 00 50 4b 05 06 00 00 00 00 01 00 01 00 5f 00 |..PK.........._.|
000000c0 00 00 53 00 00 00 00 00 |..S.....|
000000c8
Turn GeoJSON files into Parquet.
# Forever-loop: every 5 minutes, convert any zipped GeoJSON tile that does
# not yet have a Parquet file. DuckDB re-projects EPSG:3857 -> EPSG:4326,
# flips coordinates, Hilbert-sorts the rows for spatial locality and
# writes ZSTD-compressed Parquet.
while :
do
    for ZIPFILE in *.geojson.zip; do
        # Strip from the first dot: e170_n10_e175_n05.geojson.zip -> e170_n10_e175_n05
        BASENAME=${ZIPFILE%%.*}
        PQ_FILE="$BASENAME.parquet"
        if [ ! -f "$PQ_FILE" ]; then
            echo "$BASENAME"
            mkdir -p working
            # Clear leftovers from any earlier (possibly interrupted) run.
            # The 2>/dev/null must attach to rm itself; in the old form
            # 'rm working/* || true 2>/dev/null' it silenced 'true' instead.
            rm -f working/* 2>/dev/null || true
            unzip "$ZIPFILE" -d working/
            # Each archive holds a single GeoJSON; take the first match.
            FIRST=$(ls working/*.geojson | head -n1)
            echo "COPY (
SELECT * EXCLUDE (geom),
ST_FLIPCOORDINATES(
ST_TRANSFORM(geom,
'EPSG:3857',
'EPSG:4326')) geom
FROM ST_READ('$FIRST')
ORDER BY HILBERT_ENCODE([ST_Y(ST_CENTROID(geom)),
ST_X(ST_CENTROID(geom))]::double[2])
) TO '$BASENAME.parquet' (
FORMAT 'PARQUET',
CODEC 'ZSTD',
COMPRESSION_LEVEL 22,
ROW_GROUP_SIZE 15000);" | ~/duckdb
        else
            echo "$PQ_FILE already exists, skipping.."
        fi
    done
    echo "Sleeping for 5 minutes.."
    sleep 300
done

45 GB of Parquet across 211 files.
$ du -hsc *.parquet | tail -n1 # 45G
$ ls *.parquet | wc -l # 211

763,111,107 buildings
select count(*) from read_parquet('*.parquet');

┌──────────────────┐
│ count_star() │
│ int64 │
├──────────────────┤
│ 763111107 │
│ (763.11 million) │
└──────────────────┘
$ ~/duckdb stats.duckdb

CREATE OR REPLACE TABLE h3_4_stats AS
SELECT H3_LATLNG_TO_CELL(
ST_Y(ST_CENTROID(geom)),
ST_X(ST_CENTROID(geom)), 4) as h3_4,
COUNT(*) num_buildings
FROM read_parquet('*.parquet')
WHERE ST_X(ST_CENTROID(geom)) < 178.5
AND ST_X(ST_CENTROID(geom)) > -178.5
GROUP BY 1;
COPY (
SELECT ST_ASWKB(H3_CELL_TO_BOUNDARY_WKT(h3_4)::geometry) geom,
num_buildings
FROM h3_4_stats
) TO 'h3_4_stats.pq' (
FORMAT 'PARQUET',
CODEC 'ZSTD',
COMPRESSION_LEVEL 22,
ROW_GROUP_SIZE 15000);

922 URLs for the LOD dataset.
$ wc -l manifest.lod.txt # 922
$ touch lods1.json && rm lods1.json
$ for URL in `cat manifest.lod.txt`; do
echo $URL
curl -sI "$URL" | jc --kv >> lods1.json
done

1,344 GB of data across these 922 URLs.
SELECT sum("content-length"::BIGINT) / 1024 ** 3 AS GB FROM 'lods1.json';
-- 1344.3861094005406

There are 21 files that are under ~40KB. I was expecting this number to match the 187 ZIPs that had an empty GeoJSON file inside of them.
SELECT "content-length"::BIGINT > 40096, count(*) FROM 'lods1.json' group by 1;

┌────────────────────────────────────────────┬──────────────┐
│ (CAST("content-length" AS BIGINT) > 40096) │ count_star() │
│ boolean │ int64 │
├────────────────────────────────────────────┼──────────────┤
│ false │ 21 │
│ true │ 901 │
└────────────────────────────────────────────┴──────────────┘
The ZIPs were produced by my download script. Their server went offline sometime overnight and I began collecting 0-byte GeoJSONs because I wasn't checking their HTTPS server's response codes. I'm waiting for their server to start up again.
