What's the advantage of MARC in binary format vs. xml format, except that it's hard to read? Ok, space can be an issue, e.g. when processing GND dumps that can be found on dnb.de.
Last active
August 9, 2021 05:26
-
-
Save gonter/b6525395d3328ff32e6427d7cdbb406b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use Util::JSON; | |
use Data::Dumper; | |
$Data::Dumper::Indent= 1; | |
use FileHandle; | |
binmode(STDOUT, ':utf8'); | |
my $fnm= shift(@ARGV); | |
my $fnm_out= 'test.xml'; | |
my $record_type="Authority"; | |
local *FO_xml; | |
if (defined ($fnm_out)) | |
{ | |
open (FO_xml, '>:utf8', $fnm_out) or die; | |
} | |
proc_marcbin($fnm); | |
exit(0); | |
sub proc_marcbin | |
{ | |
my $fnm= shift; | |
my $marcbin= Util::JSON::read_text($fnm); | |
my @records= split(/\x1d/, $marcbin); # group separator | |
my $rec_num= 0; | |
if (defined ($fnm_out)) | |
{ | |
print FO_xml <<EOX; | |
<?xml version="1.0" encoding="UTF-8"?><collection xmlns="http://www.loc.gov/MARC21/slim"> | |
EOX | |
} | |
foreach my $record (@records) | |
{ | |
$rec_num++; | |
# print "record $rec_num\n"; | |
my $record_xml= "<record type=\"$record_type\">\n"; | |
# hex_dump ($record); | |
my @fields= split(/\x1e/, $record); # record separator | |
my $field_num= 0; | |
my @tags= (); | |
my ($tag, $x1, $x2, $x3); | |
FIELD: foreach my $field (@fields) | |
{ | |
if ($field_num++ == 0) | |
{ | |
if ($field =~ m#(........................)(.*)#) | |
{ | |
my ($leader, $trailer)= ($1, $2); | |
print __LINE__, " leader=[$leader] trailer=[$trailer]\n"; | |
$trailer =~ s#(...)#push(@tags, $1); ''#ge; | |
print __LINE__, " leader=[$leader] trailer=[$trailer]\n"; | |
print __LINE__, " tags=[", join(' ', @tags), "]\n"; | |
print __LINE__, " tag_count=[", scalar(@tags), "]\n"; | |
$record_xml .= " <leader>$leader</leader>\n"; | |
} | |
next FIELD; | |
} | |
($tag, $x1, $x2, $x3)= splice(@tags, 0, 4); | |
if ($tag+0 <= 9) | |
{ | |
# hex_dump ($field); | |
# print __LINE__, " controlfield: tag=[$tag] x1=[$x1] x2=[$x2] x3=[$x3]\n"; | |
$record_xml .= " <controlfield tag=\"$tag\" x1=\"$x1\" x2=\"$x2\" x3=\"$x3\">" . $field . "</controlfield>\n"; | |
next FIELD; | |
} | |
# print "field $field_num\n"; | |
my @items= split(/\x1f/, $field); # unit separator | |
my $item_num= 0; | |
ITEM: foreach my $item (@items) | |
{ | |
$item_num++; | |
if ($item_num == 1) | |
{ | |
if ($item =~ m#^(.)(.)$#) | |
{ | |
my ($ind1, $ind2)= ($1, $2); | |
# print "tag=[$tag] ind1=[$ind1] ind2=[$ind2] x1=[$x1] x2=[$x2] x3=[$x3]\n"; | |
$record_xml .= " <datafield tag=\"$tag\" ind1=\"$ind1\" ind2=\"$ind2\" x1=\"$x1\" x2=\"$x2\" x3=\"$x3\">\n"; | |
next ITEM; | |
} | |
} | |
# print __LINE__, " item=[$item]\n"; | |
if ($item=~ m#(.)(.*)#) | |
{ | |
my ($code, $text)= ($1, $2); | |
$record_xml .= " <subfield code=\"$code\">$text</subfield>\n"; | |
} | |
} | |
$record_xml .= " </datafield>\n"; | |
} | |
print __LINE__, " remaining tags=[", join(' ', @tags), "]\n"; | |
$record_xml .= "</record>\n"; | |
if (defined ($fnm_out)) | |
{ | |
print FO_xml $record_xml; | |
} | |
else | |
{ | |
print "record as xml: [\n", $record_xml, "]\n"; | |
} | |
} | |
if (defined ($fnm_out)) | |
{ | |
print FO_xml <<EOX; | |
</collection> | |
EOX | |
} | |
} | |
# ---------------------------------------------------------------------------- | |
sub hex_dump | |
{ | |
my $data= shift; | |
local *FX= shift || *STDOUT; | |
my $off= 0; | |
my ($i, $c, $v); | |
my $run= 1; | |
DATA: while ($run) | |
{ | |
my $char= ''; | |
my $hex= ''; | |
my $offx= sprintf ('%08X', $off); | |
for ($i= 0; $i < 16; $i++) | |
{ | |
$c= substr ($data, $off+$i, 1); | |
if ($i == 8) | |
{ | |
$hex .= ' '; | |
} | |
if ($c ne '') | |
{ | |
# $data= substr ($data, 1); | |
$v= unpack ('C', $c); | |
$c= '.' if ($v < 0x20 || $v >= 0x7F); | |
$char .= $c; | |
$hex .= sprintf (' %02X', $v); | |
} | |
else | |
{ | |
$char .= ' '; | |
$hex .= ' '; | |
$run= 0; | |
} | |
} | |
print FX "$offx $hex |$char|\n"; | |
$off += 0x10; | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment