Skip to content

Instantly share code, notes, and snippets.

@ostephens
Last active May 10, 2019 15:56
Show Gist options
  • Save ostephens/eb2ab3a4a210021e33d49647cf78b8d6 to your computer and use it in GitHub Desktop.
Save ostephens/eb2ab3a4a210021e33d49647cf78b8d6 to your computer and use it in GitHub Desktop.
example-marc-xml-parser.php
<?php
// TODO
//
// Change dc:subject to tags as with OpenLearn
// Add notes at top of each output file
// Add namespaces at top of each output file
// Add Table of Contents?

/**
 * Open an output file for writing, truncating any existing content.
 *
 * @param string $path Path of the file to create or overwrite.
 * @return resource Writable stream handle.
 * @throws RuntimeException When the file cannot be opened, so the script stops
 *                          here rather than failing later on the first fwrite().
 */
function lucero_open_output($path)
{
    $handle = fopen($path, 'w');
    if ($handle === false) {
        throw new RuntimeException("Unable to open output file for writing: ".$path);
    }
    return $handle;
}

// One text buffer per material category; each is written to its file in a
// single pass once the whole input has been read.
$books = "";
$serials = "";
$nonmusicrecs = "";
$musicrecs = "";
$videos = "";
$compfiles = "";
$unknowns = "";

// Number of records appended to each buffer (integers, since they are only ever incremented).
$books_count = 0;
$serials_count = 0;
$nonmusicrecs_count = 0;
$musicrecs_count = 0;
$videos_count = 0;
$compfiles_count = 0;
$unknowns_count = 0;

// check.txt receives the date-comparison lines and the final per-category counts.
$checkFile = "check.txt";
$checkfh = lucero_open_output($checkFile);
$booksFile = "books.txt";
$booksfh = lucero_open_output($booksFile);
$serialsFile = "serials.txt"; // Shouldn't be any of these
$serialsfh = lucero_open_output($serialsFile);
$nonmusicrecsFile = "nonmusicrecs.txt"; // This is recorded sound, not music
$nonmusicrecsfh = lucero_open_output($nonmusicrecsFile);
$musicrecsFile = "musicrecs.txt"; // This is recorded sound music
$musicrecsfh = lucero_open_output($musicrecsFile);
$videosFile = "videos.txt";
$videosfh = lucero_open_output($videosFile);
$compfilesFile = "compfiles.txt"; // Computer Files
$compfilesfh = lucero_open_output($compfilesFile);
$unknownsFiles = "unknowns.txt"; // Anything other than books,serials,a-v or computer files
$unknownsfh = lucero_open_output($unknownsFiles);

// Going to use XMLReader
// When parsing all records, DOMDocument runs out of memory
$reader = new XMLReader();
if (!$reader->open("lucero.xml")) {
    throw new RuntimeException("Unable to open input file: lucero.xml");
}
/**
 * Evaluate an XPath query relative to a context node and return the node list.
 *
 * @param DOMXPath $xpath   Evaluator bound to the document that owns $context.
 * @param string   $query   XPath expression.
 * @param DOMNode  $context Node the expression is evaluated against.
 * @return DOMNodeList
 * @throws RuntimeException When evaluate() reports failure by returning false.
 */
function marc_query($xpath, $query, $context)
{
    $nodes = $xpath->evaluate($query, $context);
    if ($nodes === false) {
        throw new RuntimeException("XPath query could not be evaluated: ".$query);
    }
    return $nodes;
}

/**
 * Return the value of the first node matched by $query, or null when nothing matches.
 *
 * @param DOMXPath $xpath
 * @param string   $query
 * @param DOMNode  $context
 * @return string|null
 */
function marc_first_text($xpath, $query, $context)
{
    $nodes = marc_query($xpath, $query, $context);
    return ($nodes->length > 0) ? $nodes->item(0)->nodeValue : null;
}

/**
 * Return every datafield of a record that carries the given tag.
 *
 * @param DOMXPath $xpath
 * @param DOMNode  $record
 * @param string   $tag    Three-character field tag, e.g. '245'.
 * @return DOMNodeList
 */
function marc_datafields($xpath, $record, $tag)
{
    return marc_query($xpath, "./marc:datafield[@tag='".$tag."']", $record);
}

/**
 * Return the first text node of the first subfield with the given code, or null.
 *
 * @param DOMXPath $xpath
 * @param DOMNode  $field Datafield element.
 * @param string   $code  Subfield code, e.g. 'a'.
 * @return string|null
 */
function marc_subfield($xpath, $field, $code)
{
    return marc_first_text($xpath, "./marc:subfield[@code='".$code."']/text()", $field);
}

/**
 * Find every occurrence of a local identifier pattern in a piece of text.
 *
 * Each whitespace character inside a match is replaced with "//", exactly as
 * the script has always done.
 * NOTE(review): "//" looks like it may have been meant to be an empty
 * replacement (i.e. strip the space); confirm before changing, because it
 * alters the identifiers written to the output files.
 *
 * @param string $pattern PCRE pattern for one identifier family.
 * @param string $text    Subfield text to search.
 * @return array List of matched identifiers, in order of appearance.
 */
function marc_find_codes($pattern, $text)
{
    preg_match_all($pattern, $text, $found);
    $codes = array();
    foreach ($found[0] as $code) {
        $codes[] = preg_replace('/\s/', '//', $code);
    }
    return $codes;
}

/**
 * Scan the $a subfield of every field with the given tag for course information.
 *
 * For each field: text matching /Course(?! )/ flags the record as a course
 * record; otherwise any course codes found are collected. FT, CDR and DVD
 * identifiers are collected in every case.
 *
 * @param DOMXPath $xpath
 * @param DOMNode  $record
 * @param string   $tag             Field tag to scan ('084', '110' or '710').
 * @param bool     $credit_creators When true, a field holding course codes adds
 *                                  "The Open University" as creator organization,
 *                                  and a field holding neither marker nor codes is
 *                                  recorded as a creator agent.
 * @param array    $data            Record data collected so far.
 * @return array $data with the findings merged in.
 */
function marc_scan_course_fields($xpath, $record, $tag, $credit_creators, array $data)
{
    $course_code_pattern = '/[ABDEHKLMPQSTUWY][ABDEHKLMPSTUXYZ]?[CDEGHJKMNPRSTWXYZ]?[HLNRSTX]?[0-9]{3}[ABCDEFGHIJLMPSTVXYZ]?/';
    $id_patterns = array(
        'ft'  => '/FT ?[0-9]{4}/',
        'cdr' => '/CDR ?[0-9]{4}/',
        'dvd' => '/DVD ?[0-9]{4}/',
    );

    foreach (marc_datafields($xpath, $record, $tag) as $field) {
        $text = marc_subfield($xpath, $field, 'a');
        if ($text === null) {
            continue;
        }
        if (preg_match('/Course(?! )/', $text) > 0) {
            $data['is_course_record'] = true;
        } elseif (preg_match_all($course_code_pattern, $text, $found, PREG_PATTERN_ORDER)) {
            foreach ($found[0] as $course_code) {
                $data['courses'][] = $course_code;
            }
            if ($credit_creators) {
                $data['creator_organizations'][] = "The Open University";
            }
        } elseif ($credit_creators) {
            $data['creator_agents'][] = trim($text, '.');
        }
        foreach ($id_patterns as $kind => $pattern) {
            $data['identifiers'][$kind] = array_merge($data['identifiers'][$kind], marc_find_codes($pattern, $text));
        }
    }
    return $data;
}

/**
 * Extract everything the export needs from one record element.
 *
 * Every value starts from an empty default, so nothing leaks from one record
 * into the next when a field (e.g. 008 or 245) is absent.
 *
 * @param DOMXPath $xpath  Evaluator bound to the record's document.
 * @param DOMNode  $record The record element.
 * @return array Associative array of extracted values (see the initialiser below for the keys).
 */
function marc_extract_record($xpath, $record)
{
    $data = array(
        'leader_type' => '',
        'doc_type' => '',
        'is_course_record' => false,
        'id' => '',
        'title' => '',
        'statement_responsibility' => '',
        'place_code' => '',
        'date_type' => '',
        'first_date' => '',
        'second_date' => '',
        'isbns_10' => array(),
        'isbns_13' => array(),
        // Local identifier families, keyed in the order they are written out.
        'identifiers' => array(
            'sup' => array(),
            'ac' => array(),
            'rc' => array(),
            'vc' => array(),
            'bc' => array(),
            'cda' => array(),
            'ft' => array(),
            'cdr' => array(),
            'dvd' => array(),
        ),
        'courses' => array(),
        'subjects' => array(),
        'lcsubjects_topical' => array(),
        'lcsubjects_subdiv' => array(),
        'lcsubject_strings' => array(),
        'creator_persons' => array(),
        'creator_organizations' => array(),
        'creator_agents' => array(),
        'publisher_places' => array(),
        'publisher_organizations' => array(),
        'dates' => array(),
        'copyright_dates' => array(),
        'approximate_dates' => array(),
        'odd_dates' => array(),
        'extents' => array(),
        'general_notes' => array(),
        'urls' => array(),
    );

    // Leader characters 6-7 decide the document type.
    $leader = marc_first_text($xpath, "./marc:leader/text()", $record);
    $leader_type = (string) substr((string) $leader, 6, 2);
    $doc_types = array(
        'am' => "a bibo:Book",
        'as' => "a bibo:Periodical",
        // This is a non-musical recording
        // Media ontology collection?
        'im' => "a bibo:AudioDocument",
        // This is a musical recording
        // Media ontology collection? Or Music Ontology collection?
        // BIBO doesn't really have anything better than Audio Document to deal with music
        // Ideally would look at music ontology and link to musicbrainz - although this seems to be more granular than our records
        'jm' => "a bibo:AudioDocument",
        // This is a video of some type
        'gm' => "a bibo:AudioVisualDocument",
        // This is actually a computer file, so for BIBO 'document' is the best we can do
        // Should look at other ontologies for Computer Files (possible DOAP?)
        'mm' => "a bibo:Document",
    );
    $data['leader_type'] = $leader_type;
    $data['doc_type'] = isset($doc_types[$leader_type])
        ? $doc_types[$leader_type]
        : "Unknown, Leader type: ".$leader_type;

    $content_001 = marc_first_text($xpath, "./marc:controlfield[@tag='001']/text()", $record);
    $data['id'] = "http://voyager.open.ac.uk/vwebv/holdingsInfo?bibId=".$content_001;

    $content_008 = marc_first_text($xpath, "./marc:controlfield[@tag='008']/text()", $record);
    if ($content_008 !== null) {
        // Keep the place code only when positions 15-17 are not blank.
        $place_code = (string) substr($content_008, 15, 3);
        if (trim($place_code) !== '') {
            $data['place_code'] = $place_code;
        }
        // Need to sort out dates from 008 - not included at the moment
        $data['date_type'] = (string) substr($content_008, 6, 1);
        $data['first_date'] = (string) substr($content_008, 7, 4);
        $data['second_date'] = (string) substr($content_008, 11, 4);
    }

    foreach (marc_datafields($xpath, $record, '020') as $field) {
        $isbn = marc_subfield($xpath, $field, 'a');
        if ($isbn === null) {
            continue;
        }
        // Differentiate ISBN-10 and ISBN-13 and add to appropriate array
        if (strlen($isbn) == 10) {
            $data['isbns_10'][] = $isbn;
        } elseif (strlen($isbn) == 13) {
            $data['isbns_13'][] = $isbn;
        }
    }

    foreach (marc_datafields($xpath, $record, '035') as $field) {
        $text = marc_subfield($xpath, $field, 'a');
        // 035 Can contain a variety of identifiers - SUP is the only one of possible interest to us I think
        // Only the first SUP code in each 035$a is taken.
        if ($text !== null && preg_match('/SUP ?[0-9]{6}/', $text, $found)) {
            $data['identifiers']['sup'][] = preg_replace('/\s/', '//', $found[0]);
        }
    }

    $data = marc_scan_course_fields($xpath, $record, '084', false, $data);
    $data = marc_scan_course_fields($xpath, $record, '110', true, $data);

    // 100 then 700: personal names, with any full stops trimmed from both ends.
    foreach (array('100', '700') as $tag) {
        foreach (marc_datafields($xpath, $record, $tag) as $field) {
            $name = marc_subfield($xpath, $field, 'a');
            if ($name !== null) {
                $data['creator_persons'][] = trim($name, '.');
            }
        }
    }

    foreach (marc_datafields($xpath, $record, '245') as $field) {
        // While it is possible to have records without a title (e.g. a letter) in our case everything should have a title in 245$$a
        // If not, a warning is raised which should be followed up
        $title = marc_subfield($xpath, $field, 'a');
        if ($title === null) {
            trigger_error('Record '.$data['id'].' has a 245 field without $a', E_USER_WARNING);
            $title = '';
        }
        $remainder = marc_subfield($xpath, $field, 'b');
        if ($remainder !== null) {
            $title .= " ".$remainder;
        }
        $data['title'] = trim($title, " /:");
        $responsibility = marc_subfield($xpath, $field, 'c');
        $data['statement_responsibility'] = ($responsibility !== null) ? $responsibility : "";
    }

    foreach (marc_datafields($xpath, $record, '260') as $field) {
        $place = marc_subfield($xpath, $field, 'a');
        if ($place !== null) {
            $data['publisher_places'][] = trim($place, ' ,:;');
        }
        // 'Open University' not 'The Open University'
        $publisher = marc_subfield($xpath, $field, 'b');
        if ($publisher !== null) {
            $data['publisher_organizations'][] = trim($publisher, ' ,:;');
        }
        $date_text = marc_subfield($xpath, $field, 'c');
        if ($date_text === null) {
            continue;
        }
        // before splitting need to check for patterns of (YYYY repr) or (YYYY repr.) and convert these to plain dates
        $date_text = preg_replace(array('/(\()/', '/(\))/', '/(repr)/'), '', $date_text);
        foreach (preg_split("/[\s\/,;]+/", $date_text) as $part) {
            $part = trim($part, ",.-");
            if (preg_match('/^[0-9]{4}$/', $part) > 0) {
                $data['dates'][] = $part;
            } elseif (preg_match('/^c[0-9]{4}$/', $part) > 0) {
                $data['copyright_dates'][] = substr($part, 1, 4);
            } elseif (preg_match('/\[([0-9]{4})\??\]/', $part, $bracketed) > 0) {
                // Capture the digits so the year is right wherever the bracket sits in the token.
                $data['approximate_dates'][] = $bracketed[1];
            } else {
                // Main remaining issue is date ranges (with and without end dates) e.g. YYYY-YYYY
                // Investigate YYYY-YYYY see if just courses
                // Other issue is where only partial date known e.g. YY--. However only one example of this
                $data['odd_dates'][] = $part;
            }
        }
    }

    // Can we recognise durations?
    foreach (marc_datafields($xpath, $record, '300') as $field) {
        foreach (array('a', 'b') as $code) {
            $extent = marc_subfield($xpath, $field, $code);
            if ($extent !== null) {
                $data['extents'][] = trim($extent, ' :;');
            }
        }
    }

    // All the 5XX fields are Notes fields
    // 500 is 'General Note'
    foreach (marc_datafields($xpath, $record, '500') as $field) {
        $note = marc_subfield($xpath, $field, 'a');
        if ($note === null) {
            continue;
        }
        $data['general_notes'][] = $note;
        // 500 Sometimes includes a SUP code
        $data['identifiers']['sup'] = array_merge($data['identifiers']['sup'], marc_find_codes('/SUP ?[0-9]{6}/', $note));
    }

    // 515 is 'Numbering Peculiarities', 562 is 'Copy and Version Identification Note';
    // both sometimes contain local identifiers for materials.
    $note_id_patterns = array(
        'ac'  => '/AC ?[0-9]{4}/',
        'rc'  => '/RC ?[0-9]{5}/',
        'vc'  => '/VC ?[0-9]{4}/',
        'bc'  => '/BC ?[0-9]{5}/',
        'cda' => '/CDA ?[0-9]{4}/',
        'cdr' => '/CDR ?[0-9]{4}/',
        'dvd' => '/DVD ?[0-9]{5}/',
    );
    foreach (array('515', '562') as $tag) {
        foreach (marc_datafields($xpath, $record, $tag) as $field) {
            $note = marc_subfield($xpath, $field, 'a');
            if ($note === null) {
                continue;
            }
            foreach ($note_id_patterns as $kind => $pattern) {
                $data['identifiers'][$kind] = array_merge($data['identifiers'][$kind], marc_find_codes($pattern, $note));
            }
        }
    }

    foreach (marc_datafields($xpath, $record, '650') as $field) {
        // All 650 fields should have at least $$a subfield
        $topical = marc_subfield($xpath, $field, 'a');
        if ($topical === null) {
            trigger_error('Record '.$data['id'].' has a 650 field without $a', E_USER_WARNING);
            $topical = '';
        }
        $topical = trim($topical, ".");
        $data['lcsubjects_topical'][] = $topical;
        $subject_str = $topical;

        // $b is a further topical term.
        $second_topical = marc_subfield($xpath, $field, 'b');
        if ($second_topical !== null) {
            $data['lcsubjects_topical'][] = trim($second_topical, ".");
            $subject_str .= "--".trim($second_topical, ".");
        }
        // $v, $x, $y and $z are subdivisions.
        foreach (array('v', 'x', 'y', 'z') as $code) {
            $subdivision = marc_subfield($xpath, $field, $code);
            if ($subdivision !== null) {
                $data['lcsubjects_subdiv'][] = trim($subdivision, ".");
                $subject_str .= "--".trim($subdivision, ".");
            }
        }
        // Only keep the combined string when something was appended to $a.
        if ($subject_str !== $topical) {
            $data['lcsubject_strings'][] = $subject_str;
        }
    }

    foreach (marc_datafields($xpath, $record, '653') as $field) {
        $keyword = marc_subfield($xpath, $field, 'a');
        if ($keyword !== null) {
            $data['subjects'][] = $keyword;
        }
    }

    $data = marc_scan_course_fields($xpath, $record, '710', true, $data);

    foreach (marc_datafields($xpath, $record, '856') as $field) {
        $url = marc_subfield($xpath, $field, 'u');
        $url_note = marc_subfield($xpath, $field, 'z');
        $data['urls'][] = array(
            ($url !== null) ? $url : "",
            ($url_note !== null) ? $url_note : "",
        );
    }

    return $data;
}

/**
 * Render one extracted record as a block of output text.
 *
 * NOTE(review): values are written between double quotes without any escaping,
 * so a quote character inside a title or note ends the literal early.
 *
 * @param array $data Record data as returned by marc_extract_record().
 * @return string
 */
function marc_render_record(array $data)
{
    $identifier_notes = array(
        'sup' => "SUP (Books without an ISBN should have a SUP)",
        'ac'  => "AC code for Audio Cassette",
        'rc'  => "RC code for Radio programme on Audio Cassette",
        'vc'  => "VC code for Video Cassette",
        'bc'  => "BC code for 'broadcast' (TV) video cassette",
        'cda' => "CDA code for Audio CD",
        'ft'  => "FT code for Computer disks",
        'cdr' => "CDR code for CD-ROM",
        'dvd' => "DVD code for DVDs",
    );

    $content = "";
    $content .= "<".$data['id'].">\n";
    $content .= $data['doc_type']."\n";
    foreach (array_unique($data['courses']) as $course) {
        $content .= " ; hascourse \"".$course."\"\n";
    }
    $content .= " ; dc:title \"".$data['title']."\"\n";
    foreach ($data['isbns_10'] as $isbn_10) {
        $content .= " ; bibo:isbn10 \"".$isbn_10."\" // ISBN10 \n";
    }
    foreach ($data['isbns_13'] as $isbn_13) {
        $content .= " ; bibo:isbn13 \"".$isbn_13."\" // ISBN13 \n";
    }
    foreach ($identifier_notes as $kind => $note) {
        foreach (array_unique($data['identifiers'][$kind]) as $code) {
            $content .= " ; dc:identifier \"".$code."\" // ".$note."\n";
        }
    }
    foreach ($data['dates'] as $date) {
        $content .= " ; dc:date \"".$date."\"\n";
    }
    foreach ($data['copyright_dates'] as $date) {
        $content .= " ; dc:dateCopyrighted \"".$date."\"\n";
    }
    foreach ($data['approximate_dates'] as $date) {
        $content .= " ; dc:date \"".$date."?\" // This is only a 'probable date' - there is some level of uncertainty\n";
    }
    // $data['odd_dates'] is collected but deliberately not written out yet.
    foreach ($data['creator_persons'] as $creator_person) {
        $content .= " ; dc:creator [ a foaf:Person ; foaf:name \"".$creator_person."\" ]\n";
    }
    foreach ($data['creator_organizations'] as $creator_organization) {
        $content .= " ; dc:creator [ a foaf:Organization ; foaf:name \"".$creator_organization."\" ]\n";
    }
    foreach ($data['creator_agents'] as $creator_agent) {
        $content .= " ; dc:creator [ a foaf:Agent ; foaf:name \"".$creator_agent."\" ]\n";
    }
    if (strlen($data['statement_responsibility']) > 0) {
        $content .= " ; rdfs:comment \"".$data['statement_responsibility']."\"\n";
    }
    foreach ($data['extents'] as $extent) {
        $content .= " ; dc:extent \"".$extent."\"\n";
    }
    foreach ($data['publisher_organizations'] as $publisher_organization) {
        $content .= " ; dc:publisher [ a foaf:Organization ; foaf:name \"".$publisher_organization."\" ]\n";
    }
    foreach ($data['publisher_places'] as $publisher_place) {
        $content .= " ; isbd:P1016 \"".$publisher_place."\" // isbd is a proposed ontology - see details for isbd:P1016 at http://metadataregistry.org/schemahistory/list/schema_property_id/1957.html - it is what the BL are using (even though I'm not keen)\n";
    }
    $place_code = $data['place_code'];
    if (strlen($place_code) > 0) {
        $content .= " ; isbd:P1016 \"".$place_code."\" // Encoded. Should be a URI for this code at http://id.loc.gov/vocabulary/countries/".$place_code.", also at http://marccodes.heroku.com/countries/".$place_code." (with sameAs to geonames etc.) but not sure how to represent\n";
        $content .= " // isbd is a proposed ontology - see details for isbd:P1016 at http://metadataregistry.org/schemahistory/list/schema_property_id/1957.html - it is what the BL are using (even though I'm not keen)\n";
    }
    foreach ($data['general_notes'] as $general_note) {
        $content .= " ; dc:description \"".$general_note."\"\n";
    }
    foreach (array_unique($data['lcsubjects_topical']) as $lcsubject_topical) {
        $content .= " ; dc:subject \"".$lcsubject_topical."\" // This is from 650a or 650b (Topical term). There should be a URI for this heading on id.loc.gov - use version from 'Topical Term' concept type not 'General Subdivision'\n";
    }
    foreach (array_unique($data['lcsubjects_subdiv']) as $lcsubject_subdiv) {
        $content .= " ; dc:subject \"".$lcsubject_subdiv."\" // This is from 650v,650x,650y or 650z (Subdivision). There should be a URI for this heading on id.loc.gov - use version from 'xxx Subdivision' concept type not 'Topical Term'\n";
    }
    foreach ($data['lcsubject_strings'] as $lcsubject) {
        $content .= " ; dc:subject \"".$lcsubject."\" // Not clear how combined headings should be handled. Some will have a URI on id.loc.gov, but not all\n";
    }
    foreach ($data['subjects'] as $subject) {
        $content .= " ; dc:subject \"".$subject."\" // This is a local Keyword - Use tag ontology? Like OpenLearn?\n";
    }
    foreach ($data['urls'] as $url) {
        $content .= " ; foaf:page \"".$url[0]."\" // A URL associated with resource in some way. If there is a note in the catalogue it is reproduced here: \"".$url[1]."\"\n";
    }
    $content .= ".\n";
    return $content;
}

// 008 date-type codes for which a date-comparison line is written to the check file.
$date_types = array('b','c','d','e','i','k','m','n','p','q','r','s','t','u');

// Stream through the input one node at a time; only <record> elements are
// expanded into a DOM, so memory use stays bounded by a single record.
while ($reader->read()) {
    if ($reader->nodeType !== XMLReader::ELEMENT || $reader->localName !== "record") {
        continue;
    }

    $node = $reader->expand();
    if ($node === false) {
        throw new RuntimeException("Unable to expand a record element from the input");
    }
    // Copy the record into its own DOMDocument so XPath queries can be run against it.
    $doc = new DOMDocument();
    $doc->appendChild($doc->importNode($node, true));
    // No namespace is registered explicitly: the "marc:" prefix used in the
    // queries resolves only because evaluate() registers the namespaces that
    // are in scope on the context node, so the input has to declare that prefix.
    $xpath = new DOMXPath($doc);

    foreach ($doc->getElementsByTagName("record") as $record) {
        $data = marc_extract_record($xpath, $record);

        // Only decided at the end as 084, 110 and 710 all have to be parsed before we
        // know if this is a record for a course rather than an item.
        if ($data['is_course_record']) {
            continue;
        }
        // If there wasn't a course code in the 084, 110 or 710, then not related to a course so ignore
        if (count($data['courses']) == 0) {
            continue;
        }

        if (in_array($data['date_type'], $date_types, true)) {
            $date_compare = $data['id']." ".$data['date_type']." 1 ".$data['first_date']." 2 ".$data['second_date'];
            foreach ($data['dates'] as $date) {
                $date_compare .= " 3 ".$date;
            }
            foreach ($data['copyright_dates'] as $date) {
                $date_compare .= " 4 ".$date;
            }
            foreach ($data['approximate_dates'] as $date) {
                $date_compare .= " 5 ".$date;
            }
            fwrite($checkfh, $date_compare."\n");
        }

        $content = marc_render_record($data);

        // File the rendered record under the buffer for its leader type.
        switch ($data['leader_type']) {
            case "am":
                $books .= $content;
                $books_count += 1;
                break;
            case "as":
                $serials .= $content;
                $serials_count += 1;
                break;
            case "im":
                $nonmusicrecs .= $content;
                $nonmusicrecs_count += 1;
                break;
            case "jm":
                $musicrecs .= $content;
                $musicrecs_count += 1;
                break;
            case "mm":
                $compfiles .= $content;
                $compfiles_count += 1;
                break;
            case "gm":
                $videos .= $content;
                $videos_count += 1;
                break;
            default:
                $unknowns .= $content;
                $unknowns_count += 1;
        }
    }
} // end of XMLRecord while (to read xml in)
// Pair every category's file handle with the text accumulated for it.
$category_outputs = array(
    array($booksfh, $books),
    array($serialsfh, $serials),
    array($nonmusicrecsfh, $nonmusicrecs),
    array($musicrecsfh, $musicrecs),
    array($compfilesfh, $compfiles),
    array($videosfh, $videos),
    array($unknownsfh, $unknowns),
);

// Write all buffers first, then close all handles.
foreach ($category_outputs as $category_output) {
    fwrite($category_output[0], $category_output[1]);
}
foreach ($category_outputs as $category_output) {
    fclose($category_output[0]);
}

// Finish the check file with one count per category.
$summary_lines = array(
    "Books: ".$books_count,
    "Serials: ".$serials_count,
    "Non-Musical Recordings: ".$nonmusicrecs_count,
    "Musical Recordings: ".$musicrecs_count,
    "Computer Files: ".$compfiles_count,
    "Videos: ".$videos_count,
    "Unknown: ".$unknowns_count,
);
fwrite($checkfh, implode("\n", $summary_lines));
fclose($checkfh);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment