Created
June 3, 2015 09:34
-
-
Save phochste/c87c81c79d8b8a6a2179 to your computer and use it in GitHub Desktop.
Match authors against VIAF using Catmandu and Linked Data Fragments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# | |
# Match authors against VIAF | |
# | |
# License: http://dev.perl.org/licenses/artistic.html | |
# | |
# Author: Patrick Hochstenbach <[email protected]> | |
# | |
# Apr 2015 | |
$|++;    # turn on autoflush for the currently selected output handle

use Catmandu::Sane;
use Catmandu;
use Catmandu::Fix::Inline::marc_map qw(:all);
use RDF::LDF;
use Data::Dumper;
use Cache::LRU;
use Getopt::Long;

# NOTE(review): ':up' presumably makes Catmandu search upwards for its
# configuration file -- confirm against the Catmandu documentation.
Catmandu->load(':up');

# Command line defaults; each can be overridden with the matching option.
my $store = 'rug01';
my $type  = 'ALEPHSEQ';
my $fix   = undef;

# GetOptions returns false when it met an unknown or malformed option; treat
# that like a missing argument instead of silently carrying on.
my $options_ok = GetOptions(
    "fix=s"   => \$fix,
    "store=s" => \$store,
    "type=s"  => \$type,
);

my $query = shift;

unless ($options_ok && $query) {
    print STDERR <<EOF;
usage: $0 [--fix fix] [--type USMARC|JSON|XML|RAW|ALEPHSEQ] file
usage: $0 [--fix fix] [--store store] query
usage: $0 [--fix fix] [--store store] all
EOF
    exit(1);
}

# The literal argument 'all' means "no query": the searcher below is then
# created with an undefined query.
$query = undef if $query eq 'all';

my $viaf_endpoint = 'http://data.linkeddatafragments.org/viaf';
my $client        = RDF::LDF->new(url => $viaf_endpoint);

# Caches VIAF lookups per "name, date" key (see get_viaf_id).
my $cache = Cache::LRU->new(size => 10000);

my $iterator;

# A readable file is imported as MARC; anything else is a query on the store.
# $query is undef for 'all', so guard the file test to avoid an
# "uninitialized value" warning from -r.
if (defined $query && -r $query) {
    $iterator = Catmandu->importer('MARC', file => $query, type => $type);
}
else {
    $iterator = Catmandu->store($store)->bag->searcher(query => $query);
}

binmode(STDOUT, ':encoding(UTF-8)');

do_import($fix, $query);
# Walk every record of the file-level $iterator and try to match each 100/700
# author heading against VIAF.
#
# For every 100/700 field one ALEPH sequential line is built; when exactly one
# VIAF URI is found for the "name, dates" key, the URI is appended after a tab.
# The lines of a record are printed to STDOUT only when at least one of its
# fields was matched. A record count is reported on STDERR at the end.
#
# Parameters:
#   $fix   - optional Catmandu fix (file or expression); applied to every
#            record before matching when defined
#   $query - accepted for caller compatibility; not used here
sub do_import {
    my ($fix, $query) = @_;

    my $fixer = defined $fix ? Catmandu->fixer($fix) : undef;

    my $n = $iterator->each(sub {
        my $item = shift;

        # Apply the user supplied fix. The original built the fixer but never
        # used it, which made --fix a silent no-op.
        if (defined $fixer) {
            $item = $fixer->fix($item);
            # NOTE(review): presumably a fix can reject a record and return
            # nothing -- skip anything that is no longer a hash.
            return unless ref $item eq 'HASH';
        }

        my $id     = $item->{_id};
        my $record = $item->{record};
        return unless ref $record eq 'ARRAY';

        my $marc  = '';
        my $found = 0;

        for my $field (@$record) {
            my ($tag, $ind1, $ind2, @data) = @$field;

            next unless ($tag eq '100' || $tag eq '700');

            # Re-tag the field as 100 so a single marc_map path serves both
            # 100 and 700 headings.
            my $doc = { record => [['100', $ind1, $ind2, @data]] };
            my @aut = flat(marc_map($doc, '100ad', -split => 1, -pluck => 1));

            $marc .= alephseq($id, @$field);

            my ($name, $date) = @aut;

            # Only headings with a name and a "YYYY-" or "YYYY-YYYY" date are
            # looked up; everything else is emitted without a URI.
            my $lifespan;
            if (defined $name && defined $date && $date =~ /^(\d{4}-(\d{4})?)$/) {
                $lifespan = $1;
            }

            unless (defined $lifespan) {
                $marc .= "\n";
                next;
            }

            $name =~ s/,$//;

            my @res = get_viaf_id($name, $lifespan);

            if (@res == 1) {
                my $uri = pop @res;
                $marc .= "\t$uri\n";
                $found = 1;
            }
            else {
                # no hit, or more than one hit: ambiguous, so no URI
                $marc .= "\n";
            }
        }

        print $marc if $found;
    });

    print STDERR "Processed $n rug01 records\n";
}
# Render one MARC field as a line in ALEPH sequential style.
#
# Parameters:
#   $id, $tag, $ind1, $ind2 - record id, field tag and the two indicators
#   @data                   - flat list of (subfield code, value) pairs
#
# Returns the string "<id> <tag><ind1><ind2> L " followed by the subfields:
# a pair whose code is '_' contributes only its value, any other pair
# contributes '$$' . code . value.
sub alephseq {
    my ($id, $tag, $ind1, $ind2, @data) = @_;

    my $line = $id . ' ' . $tag . $ind1 . $ind2 . ' L ';

    my @pairs = @data;
    while (@pairs) {
        my $code  = shift @pairs;
        my $value = shift @pairs;

        $line .= $code eq '_' ? $value : '$$' . $code . $value;
    }

    return $line;
}
# Look up the VIAF URIs for an author heading, going through the file-level
# LRU cache first.
#
# Parameters:
#   $name, $date - combined into the quoted literal "name, date", which is
#                  both the cache key and the object passed to ldf_query
#
# Returns the list of URIs stored for that key (possibly empty).
sub get_viaf_id {
    my ($name, $date) = @_;

    my $key  = qq{"$name, $date"};
    my $hits = $cache->get($key);

    # Cache miss: ask the LDF server and remember the answer.
    unless (defined $hits) {
        $hits = ldf_query($key);
        $cache->set($key => $hits);
    }

    return @$hits;
}
# Ask the file-level LDF client for every statement whose predicate is
# schema:alternateName and whose object is the given value.
#
# Parameters:
#   $object - object term handed to get_statements as-is
#
# Returns an array reference with the subject URI of each statement found.
sub ldf_query {
    my ($object) = @_;

    my $next_statement = $client->get_statements(
        undef,
        'http://schema.org/alternateName',
        $object,
    );

    my @subjects;
    for (;;) {
        my $statement = $next_statement->();
        last unless $statement;
        push @subjects, $statement->subject->uri;
    }

    return \@subjects;
}
# Flatten one level of array references.
#
# Every argument that is an ARRAY reference is replaced by its elements; all
# other arguments are passed through unchanged. Nested references inside those
# arrays are not expanded further.
#
# The former "(@)" prototype was dropped: it changed nothing about how the
# arguments are parsed, and because this sub is defined after its call site it
# only produced a "called too early to check prototype" warning.
sub flat {
    return map { ref($_) eq 'ARRAY' ? @$_ : $_ } @_;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment