Last active
December 23, 2019 08:43
-
-
Save VladimirAlexiev/303cbe6af12f2b2103a68b074756d6df to your computer and use it in GitHub Desktop.
Count VIAF links per source, e.g. for https://www.wikidata.org/wiki/Wikidata:WikiProject_Authority_control#VIAF_Links_per_Source
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Decompress the VIAF links dump to stdout, feed it to the Perl counting
# script, and store the per-source counts in a text file.
gunzip -c viaf-20191104-links.txt.gz | perl viaf-links-count.pl > viaf-links-count-201911.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl -n
# Count VIAF links per source.
# The -n switch wraps this script in a read loop, so the top-level statements
# run once per input line; the END block prints the totals after the last line.
#
# "@" indicates "special identifiers" as outlined below:
# 1. Duplicated identifiers:
#   BNF@http://catalogue.bnf.fr/ark:/12148/cb155028263   BNF|15502826
#   DNB@http://d-nb.info/gnd/174361009                   DNB|174361009
#   SZ@http://d-nb.info/gnd/1172505020                   SZ|1172505020
#   NSZL@http://nektar.oszk.hu/resource/auth/32902       NSZL|000000000979
#   LC@n99000919                                         LC|n 99000919
#   LIH@LNB:CNYr; = By                                   LIH|LNB:CNY_r_;=B_y_
#   ICCU@IT\ICCU\RAVV\040528                             ICCU|RAVV040528
#   NLR@RU\NLR\AUTH\770159456                            NLR|RU NLR AUTH 770159456
#   BNC@<id obscured by e-mail obfuscation in this copy> BNC|a10682752
#   BNCHL@BNC10000000000000000285646                     BNCHL|10000000000000000285646
#   DBC@<id obscured by e-mail obfuscation in this copy> DBC|87097969688349
#   BNE@0XX5646206                                       BNE|XX5646206
#   BNE@1 XX5806035                                      BNE|XX5806035
#   BNE@2 XX5042388                                      BNE|XX5042388
# 2. Semi-duplicated identifiers: some variants in the same database, probably resulting from some migration
#   NUKAT@vtls000457356     NUKAT|n 01090237 (no web link apart from https://viaf.org/processed/NUKAT|n01090237)
#   SELIBR@xv8cgp7g5cn1fms  SELIBR|277004 (https://libris.kb.se/auth/277004 -> https://libris.kb.se/xv8cgp7g5cn1fms)
#     https://www.wikidata.org/wiki/Property:P5587 vs https://www.wikidata.org/wiki/Property:P906
# 3. Web links that are not duplicated
#   Wikipedia@https://cs.wikipedia.org/wiki/Pavel_Hrach
#   GeoNames@http://www.geonames.org/2927777
#   ORCID@http://orcid.org/0000-0001-9280-9007
#   Identities@https://www.worldcat.org/identities/viaf-100347282
#   IMAGINE@http://www.imj.org.il/imagine/thesaurus/artists/T69335.htm

# Return the counting key for one input line, or nothing if the line is skipped.
# The rules are tried in order and the first match wins. Writing them as a
# single "and ... or ... and" chain around a post-increment is a trap: the
# post-increment yields 0 the first time a key is seen, so the fallback rule
# would fire as well and add a stray "Identities" row.
sub source_key {
    my ($line) = @_;
    # Cases 1 and 2: "@" variants of sources that also appear in "|" form are cut out.
    return if $line =~ m{\b(BNF|DNB|SZ|NSZL|LC|LIH|ICCU|BNC|BNCHL|NLR|DBC|BNE|NUKAT|SELIBR)[@]};
    # Identities links are split by the word before the first "-" in the last path segment (e.g. "viaf").
    return "Identities-$1" if $line =~ m{Identities[@|].*/(\w+)-};
    # Everything else: the source name is the text between the first tab and the next "|" or "@".
    return $1 if $line =~ m{\t(.*?)[|@]};
    return;
}

my $key = source_key($_);
$count{$key}++ if defined $key;

END {
    # One "count source" row per key, in default (string) sort order of the keys.
    printf "%10d %s\n", $count{$_}, $_ for sort keys %count;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count external-id prop instances that are VIAF Components
# Unfortunately times out or returns empty :-(
#
# For each property listed in the VALUES block, counts every triple that uses
# it as predicate (subject and object are anonymous), one result row per property.
select ?prop (count(*) as ?c)
where {
  values ?prop {
    wdt:P3280
    wdt:P1017
    wdt:P4619
    wdt:P1015
    wdt:P1273
    wdt:P7369
    wdt:P950
    wdt:P268
    wdt:P7028
    wdt:P1048
    wdt:P5504
    wdt:P3846
    wdt:P227
    wdt:P1309
    wdt:P6394
    wdt:P2163
    wdt:P1566
    wdt:P396
    wdt:P213
    wdt:P245
    wdt:P5034
    wdt:P1670
    wdt:P244
    wdt:P1368
    wdt:P7026
    wdt:P7058
    wdt:P1946
    wdt:P349
    wdt:P271
    wdt:P691
    wdt:P409
    wdt:P3988
    wdt:P949
    wdt:P1695
    wdt:P7029
    wdt:P1375
    wdt:P951
    wdt:P1006
    wdt:P1207
    wdt:P496
    wdt:P7041
    wdt:P1005
    wdt:P3065
    wdt:P906
    wdt:P6934
    wdt:P269
    # wdt:P227   (already listed above)
    wdt:P7039
    wdt:P7024
    # wdt:P1015  (already listed above)
  }
  [] ?prop []
}
group by ?prop
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment