Skip to content

Instantly share code, notes, and snippets.

@phochste
Created June 3, 2015 09:34
Show Gist options
  • Save phochste/c87c81c79d8b8a6a2179 to your computer and use it in GitHub Desktop.
Save phochste/c87c81c79d8b8a6a2179 to your computer and use it in GitHub Desktop.
Match authors against VIAF using Catmandu and Linked Data Fragments
#!/usr/bin/env perl
#
# Match authors against VIAF
#
# License: http://dev.perl.org/licenses/artistic.html
#
# Author: Patrick Hochstenbach <[email protected]>
#
# Apr 2015
$|++;
use Catmandu::Sane;
use Catmandu;
use Catmandu::Fix::Inline::marc_map qw(:all);
use RDF::LDF;
use Data::Dumper;
use Cache::LRU;
use Getopt::Long;
Catmandu->load(':up');
my $store = 'rug01';
my $type = 'ALEPHSEQ';
my $fix = undef;
GetOptions("fix=s" => \$fix , "store=s" => \$store , "type=s" => \$type);
my $query = shift;
unless ($query) {
print STDERR <<EOF;
usage: $0 [--fix fix] [--type USMARC|JSON|XML|RAW|ALEPHSEQ] file
usage: $0 [--fix fix] [--store store] query
usage: $0 [--fix fix] [--store store] all
EOF
exit(1);
}
$query = undef if $query eq 'all';
my $viaf_endpoint = 'http://data.linkeddatafragments.org/viaf';
my $client = RDF::LDF->new(url => $viaf_endpoint);
my $cache = Cache::LRU->new(size => 10000);
my $iterator;
if (-r $query) {
$iterator = Catmandu->importer('MARC',file => $query, type => $type);
}
else {
$iterator = Catmandu->store($store)->bag->searcher(query => $query);
}
binmode(STDOUT,':encoding(UTF-8)');
&do_import($fix,$query);
sub do_import {
my $fix = shift;
my $query = shift;
my $fixer;
$fixer = Catmandu->fixer($fix) if defined $fix;
my $n = $iterator->each(sub {
my $item = shift;
my $id = $item->{_id};
my $record = $item->{record};
my $marc;
my $found = 0;
for my $field (@$record) {
my ($tag,$ind1,$ind2,@data) = @$field;
next unless ($tag eq '100' || $tag eq '700');
my $doc = { record => [['100',$ind1,$ind2,@data]] };
my @aut = flat(marc_map($doc,'100ad' , -split => 1 , -pluck => 1));
$marc .= &alephseq($id,@$field);
if (@aut == 0) {
$marc .= "\n";
next;
}
my ($name,$date) = @aut;
$name =~ s/,$//;
if (defined $date && $date =~/^(\d{4}-(\d{4})?)$/) {
$date = $1;
}
else {
$marc .= "\n";
next;
}
my @res = &get_viaf_id($name,$date);
if (@res == 1) {
my $uri = pop @res;
$marc .= "\t$uri\n";
$found = 1;
}
else {
my $num = int(@res);
# more than one hit
$marc .= "\n";
}
}
print $marc if $found;
});
print STDERR "Processed $n rug01 records\n";
}
sub alephseq {
my ($id,$tag,$ind1,$ind2,@data) = @_;
my $str = "$id $tag$ind1$ind2 L ";
for (my $i = 0 ; $i < @data ; $i += 2) {
if ($data[$i] eq '_') {
$str .= $data[$i+1];
}
else {
$str .= "\$\$" . $data[$i] . $data[$i+1];
}
}
$str;
}
sub get_viaf_id {
my ($name,$date) = @_;
my $key = "\"$name, $date\"";
if (defined(my $value = $cache->get($key))) {
return @$value;
}
else {
my $value = &ldf_query($key);
$cache->set($key => $value);
return @$value;
}
}
sub ldf_query {
my $object = shift;
my $it = $client->get_statements(undef, 'http://schema.org/alternateName', $object);
my @res = ();
while (my $st = $it->()) {
push @res, $st->subject->uri;
}
return \@res;
}
sub flat(@) {
return map { ref eq 'ARRAY' ? @$_ : $_ } @_;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment