Skip to content

Instantly share code, notes, and snippets.

@ochafik
Last active November 12, 2024 14:33
Show Gist options
  • Select an option

  • Save ochafik/4a6d071b3e434f6bf239f63e7f15f436 to your computer and use it in GitHub Desktop.

Select an option

Save ochafik/4a6d071b3e434f6bf239f63e7f15f436 to your computer and use it in GitHub Desktop.
Import a Wikidata JSON Dump to a nearly readable DuckDB format
-- Copyright 2024 Google LLC.
-- SPDX-License-Identifier: Apache-2.0
-- To get similar data about some entities from SPARQL on https://query.wikidata.org/:
-- https://gist.github.com/ochafik/94c4ecb811c71989ec4f74cb660959a8
-- # Get example dump of 10 items:
-- wget https://gist.githubusercontent.com/ochafik/d871e233d3563eccb10238ab4b2615a7/raw/4486482f31eef6109d85a1ddba2a2b4aab527cb4/wiki.json
-- # Or get the full dump (140GB download, unzips to 1.7TB):
-- aria2c --max-connection-per-server 16 https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.gz
-- # Run on example (near-instant)
-- DATA=wiki.json duckdb --init wikidata_duckdb.sql tmp.ddb
-- # Run on full dump (8+ hours on ungzipped content, unclear how long w/ gzipped; single core operation :-( )
-- cat wikidata_duckdb.sql | DATA=latest-all.json.gz duckdb all.ddb
-- TODO: Add support for references
-- Disable insertion-order preservation so DuckDB can parallelize/stream the
-- huge JSON scan without buffering rows in input order.
SET preserve_insertion_order = false; -- https://duckdb.org/docs/sql/dialect/order_preservation.html
-- Render a Wikidata "snak" (a single property/value assertion, as JSON) into a
-- two-element list: [property id, human-readable value string].
-- The value rendering is chosen from the snak's datatype / datavalue.type.
CREATE MACRO describe_snak(snak) AS [
snak->>'$.property',
CASE snak->>'$.datatype'
-- NOTE(review): the quoting here is asymmetric ('#' prefix, '"' suffix) —
-- looks accidental; confirm the intended external-id formatting.
WHEN 'external-id' THEN '#' || (snak->>'$.datavalue.value') || '"'
ELSE CASE snak->>'$.datavalue.type'
-- Entity references render as their bare id (e.g. 'Q42').
WHEN 'wikibase-entityid' THEN (snak->>'$.datavalue.value.id')
-- Coordinates render as '@ lat, lon'.
WHEN 'globecoordinate' THEN '@ '|| (snak->>'$.datavalue.value.latitude') || ', ' || (snak->>'$.datavalue.value.longitude')
WHEN 'monolingualtext' THEN '"' || (snak->>'$.datavalue.value.text') || '"'
-- Quantities append ' x <unit>' unless the unit is the dimensionless '1'.
WHEN 'quantity' THEN (snak->>'$.datavalue.value.amount') || IF(snak->>'$.datavalue.value.unit' = '1', '', ' x ' || (snak->>'$.datavalue.value.unit'))
WHEN 'string' THEN '"' || (snak->>'$.datavalue.value') || '"'
WHEN 'time' THEN (snak->>'$.datavalue.value.time')
-- Fallback: dump the raw datavalue JSON as text (NULL if absent).
ELSE CAST(snak->>'$.datavalue' as STRING)
END
END,
];
-- Import the Wikidata JSON dump (path taken from the DATA env var) into a
-- flat table: one row per entity, with English label/description/aliases and
-- all non-deprecated, non-external-id claims rendered via describe_snak.
-- Each claim becomes a list of [property, value] pairs: the mainsnak first,
-- followed by its qualifiers (English-only for monolingual text).
CREATE TABLE wikidata AS
SELECT
type,
id,
-- English-language label/description; NULL when the entity lacks them.
labels->>'$.en.value' AS label,
descriptions->>'$.en.value' AS description,
aliases->>'$.en[*].value' AS aliases,
-- Nested DuckDB list comprehensions: iterate over every property key k in
-- 'claims', then over every claim under that property.
flatten([
[
flatten([
[
-- The claim's main assertion.
describe_snak(claim->'$.mainsnak')
],
-- Qualifier snaks attached to the claim, grouped per qualifier property kk.
flatten([
[
describe_snak(qualifier)
FOR qualifier IN (claim->('$.qualifiers.' || kk)->'$[*]')
-- Keep only English monolingual text; skip external-id qualifiers.
IF ((qualifier->>'$.datavalue.type' != 'monolingualtext') OR (qualifier->>'$.datavalue.value.language' = 'en'))
AND (qualifier->>'$.datatype' != 'external-id')
]
FOR kk IN json_keys(claim->'$.qualifiers')
IF json_extract(claim, '$.qualifiers.' || kk) != 'null'::JSON
]),
])
FOR claim IN (claims->k->'$[*]')
-- Skip deprecated statements and external-id properties entirely.
IF (claim->>'$.rank' != 'deprecated')
AND ((claim->>'$.mainsnak.datatype') != 'external-id')
-- AND ((snak->>'$.mainsnak.datavalue.type') != 'monolingualtext' OR snak->>'$.mainsnak.datavalue.value.language' = 'en')
]
FOR k IN json_keys(claims)
IF json_extract(claims, '$.' || k) != 'null'::JSON
]) AS claims,
-- Read the dump with an explicit schema; JSON columns are parsed lazily above.
FROM read_json(getenv('DATA'), columns={
type: 'VARCHAR',
id: 'VARCHAR',
-- title: 'VARCHAR',
labels: 'JSON',
descriptions: 'JSON',
aliases: 'JSON',
claims: 'JSON',
modified: 'TIMESTAMP',
})
-- where id = 'Q42'
-- limit 1000
;
-- Sanity check: dump the imported rows (fine on the 10-item sample; noisy on
-- the full dump).
SELECT * FROM wikidata;
-- Report how many entities were imported.
SELECT COUNT(*) FROM wikidata;
-- Index entity ids (e.g. 'Q42') for fast point lookups in follow-up queries.
CREATE INDEX wikidata_id ON wikidata(id);
-- # Turn an entry's statements into human-readable strings (w/ entity / property ids in parentheses)
-- select
-- FORMAT(
-- E'{}\n{}',
-- FORMAT('{} ({}): {}', a.label, a.id, a.description),
-- STRING_AGG(claim_desc, E'\n')
-- ) AS claims_desc,
-- from (
-- select
-- a.id,
-- a.label,
-- a.description,
-- claims,
-- from wikidata AS a
-- where id = 'Q42'
-- -- join unnest(['Q42', 'Q1']) AS iid ON (a.id = iid['unnest'])
-- -- Filter not used with the following:
-- -- where id IN ('Q1', 'Q42')
-- -- where id = 'Q1' OR id = 'Q42'
-- ) AS a,
-- (
-- SELECT
-- STRING_AGG(
-- FORMAT(
-- '- {} -> {}',
-- -- FORMAT('{} ({}): {}', k.label, k.id, k.description),
-- FORMAT('{} ({})', k.label, k.id),
-- IFNULL(
-- FORMAT('{} ({}): {}', v.label, v.id, v.description),
-- qual_value
-- )
-- ),
-- ', ') AS claim_desc,
-- from (
-- select
-- id,
-- label,
-- description,
-- pair['unnest'][1] AS qual_id,
-- pair['unnest'][2] AS qual_value,
-- from
-- unnest(a.claims) AS claim,
-- unnest(claim['unnest']) AS pair
-- )
-- inner join wikidata AS k ON (k.id = qual_id)
-- left outer join wikidata AS v ON (v.id = qual_value)
-- )
-- GROUP BY a.label, a.id, a.description
-- ;
-- select
-- -- a.label,
-- -- k.label,
-- FORMAT('{} ({}): {}', a.label, a.id, a.description),
-- FORMAT('{} ({}): {}', k.label, k.id, k.description),
-- -- FORMAT('{} ({})', a.label, a.description),
-- -- FORMAT('{} ({})', k.label, k.description),
-- IFNULL(FORMAT('{} ({}): {}', v.label, v.id, v.description), qual_value) AS value,
-- from (
-- select
-- id,
-- label,
-- description,
-- pair['unnest'][1] AS qual_id,
-- pair['unnest'][2] AS qual_value,
-- from (
-- select
-- a.id,
-- a.label,
-- a.description,
-- claims,
-- from wikidata AS a
-- where id = 'Q1'
-- ),
-- unnest(claims) AS claim,
-- unnest(claim['unnest']) AS pair
-- ) AS a
-- inner join wikidata AS k ON (k.id = qual_id)
-- left outer join wikidata AS v ON (v.id = qual_value)
-- ;
-- cat latest-all.json.gz | gunzip | DATA=/dev/stdin duckdb --init wikidata_duckdb.sql all.ddb
-- cat latest-all.json.gz | gunzip | jsonarray2jsonl | DATA=/dev/stdin duckdb --init wikidata_duckdb.sql all.ddb
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment