-
-
Save ochafik/4a6d071b3e434f6bf239f63e7f15f436 to your computer and use it in GitHub Desktop.
Import a Wikidata JSON Dump to a nearly readable DuckDB format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Copyright 2024 Google LLC.
-- SPDX-License-Identifier: Apache-2.0

-- To get similar data about some entities from SPARQL on https://query.wikidata.org/:
-- https://gist.github.com/ochafik/94c4ecb811c71989ec4f74cb660959a8

-- # Get example dump of 10 items:
-- wget https://gist.githubusercontent.com/ochafik/d871e233d3563eccb10238ab4b2615a7/raw/4486482f31eef6109d85a1ddba2a2b4aab527cb4/wiki.json
-- # Or get the full dump (140GB download, unzips to 1.7TB):
-- aria2c --max-connection-per-server 16 https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.gz

-- # Run on example (near-instant)
-- DATA=wiki.json duckdb --init wikidata_duckdb.sql tmp.ddb
-- # Run on full dump (8+ hours on ungzipped content, unclear how long w/ gzipped; single core operation :-( )
-- cat wikidata_duckdb.sql | DATA=latest-all.json.gz duckdb all.ddb

-- TODO: Add support for references

-- Row order of the source dump doesn't matter for this bulk load; relaxing
-- order preservation lets DuckDB reorder work during the huge read_json scan.
SET preserve_insertion_order = false; -- https://duckdb.org/docs/sql/dialect/order_preservation.html
-- Renders a Wikidata "snak" (a single property/value assertion, as JSON) as a
-- two-element list: [property id, human-readable value string].
-- Value formatting by datavalue type:
--   wikibase-entityid -> the referenced entity id (e.g. 'Q42')
--   globecoordinate   -> '@ <latitude>, <longitude>'
--   monolingualtext   -> the text, double-quoted
--   quantity          -> amount, plus ' x <unit>' unless the unit is '1' (dimensionless)
--   string            -> the value, double-quoted
--   time              -> the raw Wikidata time literal
--   anything else     -> the raw datavalue JSON cast to a string
-- external-id datatypes render as '#<id>' (the main query below filters most
-- of them out anyway); missing/unknown datatypes yield NULL (no ELSE branch).
CREATE MACRO describe_snak(snak) AS [
    snak->>'$.property',
    CASE snak->>'$.datatype'
        -- Fix: the original concatenated a dangling closing quote here
        -- ('#' || value || '"'), producing unbalanced output like `#12345"`.
        WHEN 'external-id' THEN '#' || (snak->>'$.datavalue.value')
        ELSE CASE snak->>'$.datavalue.type'
            WHEN 'wikibase-entityid' THEN (snak->>'$.datavalue.value.id')
            WHEN 'globecoordinate' THEN '@ ' || (snak->>'$.datavalue.value.latitude') || ', ' || (snak->>'$.datavalue.value.longitude')
            WHEN 'monolingualtext' THEN '"' || (snak->>'$.datavalue.value.text') || '"'
            WHEN 'quantity' THEN (snak->>'$.datavalue.value.amount') || IF(snak->>'$.datavalue.value.unit' = '1', '', ' x ' || (snak->>'$.datavalue.value.unit'))
            WHEN 'string' THEN '"' || (snak->>'$.datavalue.value') || '"'
            WHEN 'time' THEN (snak->>'$.datavalue.value.time')
            ELSE CAST(snak->>'$.datavalue' as STRING)
        END
    END,
];
-- Materialize the dump into one table: a row per entity, keeping only English
-- texts, with all surviving claims flattened into a list of
-- [property id, value string] pairs (each claim contributes its mainsnak pair
-- followed by the pairs of its filtered qualifiers).
CREATE TABLE wikidata AS
SELECT
  type,
  id,
  -- English-only projections; all other languages are dropped.
  labels->>'$.en.value' AS label,
  descriptions->>'$.en.value' AS description,
  aliases->>'$.en[*].value' AS aliases,
  -- Three nested DuckDB list comprehensions over the claims JSON object:
  --   outer : for each property key k of claims
  --   middle: for each claim stored under that property
  --   inner : the claim's mainsnak pair, then one pair per kept qualifier
  flatten([
    [
      flatten([
        [
          describe_snak(claim->'$.mainsnak')
        ],
        flatten([
          [
            describe_snak(qualifier)
            -- Keep a monolingual qualifier only if it is English; always
            -- skip external-id qualifiers.
            FOR qualifier IN (claim->('$.qualifiers.' || kk)->'$[*]')
            IF ((qualifier->>'$.datavalue.type' != 'monolingualtext') OR (qualifier->>'$.datavalue.value.language' = 'en'))
              AND (qualifier->>'$.datatype' != 'external-id')
          ]
          FOR kk IN json_keys(claim->'$.qualifiers')
          -- Guard: a qualifiers entry can be a JSON null.
          IF json_extract(claim, '$.qualifiers.' || kk) != 'null'::JSON
        ]),
      ])
      FOR claim IN (claims->k->'$[*]')
      -- Drop deprecated-rank claims and external-id claims entirely.
      IF (claim->>'$.rank' != 'deprecated')
        AND ((claim->>'$.mainsnak.datatype') != 'external-id')
      -- AND ((snak->>'$.mainsnak.datavalue.type') != 'monolingualtext' OR snak->>'$.mainsnak.datavalue.value.language' = 'en')
    ]
    FOR k IN json_keys(claims)
    -- Guard: a claims entry can be a JSON null.
    IF json_extract(claims, '$.' || k) != 'null'::JSON
  ]) AS claims,
-- DATA env var points at the dump (plain JSON, gzipped JSON, or /dev/stdin).
-- Only the listed top-level fields are parsed; the rest of each record is skipped.
FROM read_json(getenv('DATA'), columns={
  type: 'VARCHAR',
  id: 'VARCHAR',
  -- title: 'VARCHAR',
  labels: 'JSON',
  descriptions: 'JSON',
  aliases: 'JSON',
  claims: 'JSON',
  modified: 'TIMESTAMP',
})
-- where id = 'Q42'
-- limit 1000
;
-- Preview the imported rows, then report how many entities were loaded.
SELECT * FROM wikidata;
SELECT count(*) FROM wikidata;

-- Index entity ids to speed up the lookup/join queries sketched below.
CREATE INDEX wikidata_id ON wikidata(id);
-- # Turn an entry's statements into human readable strings (w/ entity / property ids in parentheses)
| -- select | |
| -- FORMAT( | |
| -- E'{}\n{}', | |
| -- FORMAT('{} ({}): {}', a.label, a.id, a.description), | |
| -- STRING_AGG(claim_desc, E'\n') | |
| -- ) AS claims_desc, | |
| -- from ( | |
| -- select | |
| -- a.id, | |
| -- a.label, | |
| -- a.description, | |
| -- claims, | |
| -- from wikidata AS a | |
| -- where id = 'Q42' | |
| -- -- join unnest(['Q42', 'Q1']) AS iid ON (a.id = iid['unnest']) | |
| -- -- Filter not used with the following: | |
| -- -- where id IN ('Q1', 'Q42') | |
| -- -- where id = 'Q1' OR id = 'Q42' | |
| -- ) AS a, | |
| -- ( | |
| -- SELECT | |
| -- STRING_AGG( | |
| -- FORMAT( | |
| -- '- {} -> {}', | |
| -- -- FORMAT('{} ({}): {}', k.label, k.id, k.description), | |
| -- FORMAT('{} ({})', k.label, k.id), | |
| -- IFNULL( | |
| -- FORMAT('{} ({}): {}', v.label, v.id, v.description), | |
| -- qual_value | |
| -- ) | |
| -- ), | |
| -- ', ') AS claim_desc, | |
| -- from ( | |
| -- select | |
| -- id, | |
| -- label, | |
| -- description, | |
| -- pair['unnest'][1] AS qual_id, | |
| -- pair['unnest'][2] AS qual_value, | |
| -- from | |
| -- unnest(a.claims) AS claim, | |
| -- unnest(claim['unnest']) AS pair | |
| -- ) | |
| -- inner join wikidata AS k ON (k.id = qual_id) | |
| -- left outer join wikidata AS v ON (v.id = qual_value) | |
| -- ) | |
| -- GROUP BY a.label, a.id, a.description | |
| -- ; | |
| -- select | |
| -- -- a.label, | |
| -- -- k.label, | |
| -- FORMAT('{} ({}): {}', a.label, a.id, a.description), | |
| -- FORMAT('{} ({}): {}', k.label, k.id, k.description), | |
| -- -- FORMAT('{} ({})', a.label, a.description), | |
| -- -- FORMAT('{} ({})', k.label, k.description), | |
| -- IFNULL(FORMAT('{} ({}): {}', v.label, v.id, v.description), qual_value) AS value, | |
| -- from ( | |
| -- select | |
| -- id, | |
| -- label, | |
| -- description, | |
| -- pair['unnest'][1] AS qual_id, | |
| -- pair['unnest'][2] AS qual_value, | |
| -- from ( | |
| -- select | |
| -- a.id, | |
| -- a.label, | |
| -- a.description, | |
| -- claims, | |
| -- from wikidata AS a | |
| -- where id = 'Q1' | |
| -- ), | |
| -- unnest(claims) AS claim, | |
| -- unnest(claim['unnest']) AS pair | |
| -- ) AS a | |
| -- inner join wikidata AS k ON (k.id = qual_id) | |
| -- left outer join wikidata AS v ON (v.id = qual_value) | |
| -- ; | |
| -- cat latest-all.json.gz | gunzip | DATA=/dev/stdin duckdb --init wikidata_duckdb.sql all.ddb | |
| -- cat latest-all.json.gz | gunzip | jsonarray2jsonl | DATA=/dev/stdin duckdb --init wikidata_duckdb.sql all.ddb |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment