Download https://gist.github.com/marklit/237bec7b8a82e1290e2f481c8adda611 to manifest.lod.txt.
# Download every LOD tile URL from the manifest and keep each GeoJSON as
# its own ZIP. Re-runnable: tiles whose ZIP already exists are skipped.
# Manifest URLs contain no whitespace, so word-splitting the file is safe.
for LINE in $(cat manifest.lod.txt); do
    # Tile name comes from the URL's "files=" query parameter.
    GEOJSON=$(echo "$LINE" | grep -o 'files=.*' | sed 's/files=//g')
    ZIPFILE="$GEOJSON.zip"
    if [ ! -f "$ZIPFILE" ]; then
        echo "$LINE"
        # Only archive the download if wget succeeded AND the file is
        # non-empty; otherwise a server outage silently leaves us zipping
        # 0-byte GeoJSONs (which is exactly what happened overnight).
        if wget -c -O "$GEOJSON" "$LINE" && [ -s "$GEOJSON" ]; then
            zip -9 "$ZIPFILE" "$GEOJSON"
            rm "$GEOJSON"
        else
            echo "Download failed or empty, will retry next run: $LINE" >&2
            rm -f "$GEOJSON"
        fi
    else
        echo "$ZIPFILE already exists, skipping.."
    fi
done

923 ZIPs in total. 187 files are less than 4K.
$ ls *.zip | wc -l # 923
$ find . -type f -name '*geojson.zip' -size -4k | wc -l # 187

These files are valid ZIPs with GeoJSON but the GeoJSON is zero bytes.
$ unzip -l e170_n10_e175_n05.geojson.zip

Archive: e170_n10_e175_n05.geojson.zip
Length Date Time Name
--------- ---------- ----- ----
0 2025-09-03 07:27 e170_n10_e175_n05.geojson
--------- -------
0 1 file
$ hexdump -C e170_n10_e175_n05.geojson.zip

00000000 50 4b 03 04 0a 00 00 00 00 00 69 3b 23 5b 00 00 |PK........i;#[..|
00000010 00 00 00 00 00 00 00 00 00 00 19 00 1c 00 65 31 |..............e1|
00000020 37 30 5f 6e 31 30 5f 65 31 37 35 5f 6e 30 35 2e |70_n10_e175_n05.|
00000030 67 65 6f 6a 73 6f 6e 55 54 09 00 03 a6 c3 b7 68 |geojsonUT......h|
00000040 a6 c3 b7 68 75 78 0b 00 01 04 e8 03 00 00 04 e8 |...hux..........|
00000050 03 00 00 50 4b 01 02 1e 03 0a 00 00 00 00 00 69 |...PK..........i|
00000060 3b 23 5b 00 00 00 00 00 00 00 00 00 00 00 00 19 |;#[.............|
00000070 00 18 00 00 00 00 00 00 00 00 00 ff 81 00 00 00 |................|
00000080 00 65 31 37 30 5f 6e 31 30 5f 65 31 37 35 5f 6e |.e170_n10_e175_n|
00000090 30 35 2e 67 65 6f 6a 73 6f 6e 55 54 05 00 03 a6 |05.geojsonUT....|
000000a0 c3 b7 68 75 78 0b 00 01 04 e8 03 00 00 04 e8 03 |..hux...........|
000000b0 00 00 50 4b 05 06 00 00 00 00 01 00 01 00 5f 00 |..PK.........._.|
000000c0 00 00 53 00 00 00 00 00 |..S.....|
000000c8
Turn GeoJSON files into Parquet.
# Forever-loop: every 5 minutes, convert any zipped GeoJSON tile that does
# not yet have a Parquet file. DuckDB re-projects EPSG:3857 -> EPSG:4326,
# flips coordinates, Hilbert-sorts the rows for spatial locality and
# writes ZSTD-compressed Parquet.
while :
do
    for ZIPFILE in *.geojson.zip; do
        # Strip from the first dot: e170_n10_e175_n05.geojson.zip -> e170_n10_e175_n05
        BASENAME=${ZIPFILE%%.*}
        PQ_FILE="$BASENAME.parquet"
        if [ ! -f "$PQ_FILE" ]; then
            echo "$BASENAME"
            mkdir -p working
            # Clear leftovers from any earlier (possibly interrupted) run.
            # The 2>/dev/null must attach to rm itself; in the old form
            # 'rm working/* || true 2>/dev/null' it silenced 'true' instead.
            rm -f working/* 2>/dev/null || true
            unzip "$ZIPFILE" -d working/
            # Each archive holds a single GeoJSON; take the first match.
            FIRST=$(ls working/*.geojson | head -n1)
            echo "COPY (
SELECT * EXCLUDE (geom),
ST_FLIPCOORDINATES(
ST_TRANSFORM(geom,
'EPSG:3857',
'EPSG:4326')) geom
FROM ST_READ('$FIRST')
ORDER BY HILBERT_ENCODE([ST_Y(ST_CENTROID(geom)),
ST_X(ST_CENTROID(geom))]::double[2])
) TO '$BASENAME.parquet' (
FORMAT 'PARQUET',
CODEC 'ZSTD',
COMPRESSION_LEVEL 22,
ROW_GROUP_SIZE 15000);" | ~/duckdb
        else
            echo "$PQ_FILE already exists, skipping.."
        fi
    done
    echo "Sleeping for 5 minutes.."
    sleep 300
done

45 GB of Parquet across 211 files.
$ du -hsc *.parquet | tail -n1 # 45G
$ ls *.parquet | wc -l # 211

763,111,107 buildings
select count(*) from read_parquet('*.parquet');

┌──────────────────┐
│ count_star() │
│ int64 │
├──────────────────┤
│ 763111107 │
│ (763.11 million) │
└──────────────────┘
$ ~/duckdb stats.duckdb

CREATE OR REPLACE TABLE h3_4_stats AS
SELECT H3_LATLNG_TO_CELL(
ST_Y(ST_CENTROID(geom)),
ST_X(ST_CENTROID(geom)), 4) as h3_4,
COUNT(*) num_buildings
FROM read_parquet('*.parquet')
WHERE ST_X(ST_CENTROID(geom)) < 178.5
AND ST_X(ST_CENTROID(geom)) > -178.5
GROUP BY 1;
COPY (
SELECT ST_ASWKB(H3_CELL_TO_BOUNDARY_WKT(h3_4)::geometry) geom,
num_buildings
FROM h3_4_stats
) TO 'h3_4_stats.pq' (
FORMAT 'PARQUET',
CODEC 'ZSTD',
COMPRESSION_LEVEL 22,
ROW_GROUP_SIZE 15000);

922 URLs for the LOD dataset.
$ wc -l manifest.lod.txt # 922
$ touch lods1.json && rm lods1.json
$ for URL in `cat manifest.lod.txt`; do
echo $URL
curl -sI "$URL" | jc --kv >> lods1.json
done

1,344 GB of data across these 922 URLs.
SELECT sum("content-length"::BIGINT) / 1024 ** 3 AS GB FROM 'lods1.json';
-- 1344.3861094005406

There are 21 files that are under ~40KB. I was expecting this number to match the 187 ZIPs that had an empty GeoJSON file inside of them.
SELECT "content-length"::BIGINT > 40096, count(*) FROM 'lods1.json' group by 1;

┌────────────────────────────────────────────┬──────────────┐
│ (CAST("content-length" AS BIGINT) > 40096) │ count_star() │
│ boolean │ int64 │
├────────────────────────────────────────────┼──────────────┤
│ false │ 21 │
│ true │ 901 │
└────────────────────────────────────────────┴──────────────┘
The ZIPs were produced by my download script. Their server went offline sometime overnight and I began collecting 0-byte GeoJSONs because I wasn't checking their HTTPS server's response codes. I'm waiting for their server to start up again.
