Last active
May 10, 2019 15:56
-
-
Save ostephens/eb2ab3a4a210021e33d49647cf78b8d6 to your computer and use it in GitHub Desktop.
example-marc-xml-parser.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// TODO | |
// | |
// Change dc:subject to tags as with OpenLearn | |
// Add notes at top of each output file | |
// Add namespaces at top of each output file | |
// Add Table of Contents? | |
// Output buffers, one per material type. Each one accumulates the serialised
// records of that type and is written to its file after the input is exhausted.
$books = "";
$serials = "";
$nonmusicrecs = "";
$musicrecs = "";
$videos = "";
$compfiles = "";
$unknowns = "";

// Record counters, one per material type. Kept as integers (previously numeric
// strings) so that "+= 1" does not depend on an implicit string-to-number cast.
$books_count = 0;
$serials_count = 0;
$nonmusicrecs_count = 0;
$musicrecs_count = 0;
$videos_count = 0;
$compfiles_count = 0;
$unknowns_count = 0;

// Output files. check.txt receives the date-comparison lines and the final counts.
$checkFile = "check.txt";
$checkfh = fopen($checkFile, 'w');
$booksFile = "books.txt";
$booksfh = fopen($booksFile, 'w');
$serialsFile = "serials.txt"; // Shouldn't be any of these
$serialsfh = fopen($serialsFile, 'w');
$nonmusicrecsFile = "nonmusicrecs.txt"; // This is recorded sound, not music
$nonmusicrecsfh = fopen($nonmusicrecsFile, 'w');
$musicrecsFile = "musicrecs.txt"; // This is recorded sound music
$musicrecsfh = fopen($musicrecsFile, 'w');
$videosFile = "videos.txt";
$videosfh = fopen($videosFile, 'w');
$compfilesFile = "compfiles.txt"; // Computer Files
$compfilesfh = fopen($compfilesFile, 'w');
$unknownsFiles = "unknowns.txt"; // Anything other than books, serials, a-v or computer files
$unknownsfh = fopen($unknownsFiles, 'w');

// fopen() returns false on failure. Stop here with a clear message instead of
// failing later, on the first fwrite(), after the whole input has been parsed.
$outputHandles = array(
    $checkFile => $checkfh,
    $booksFile => $booksfh,
    $serialsFile => $serialsfh,
    $nonmusicrecsFile => $nonmusicrecsfh,
    $musicrecsFile => $musicrecsfh,
    $videosFile => $videosfh,
    $compfilesFile => $compfilesfh,
    $unknownsFiles => $unknownsfh,
);
foreach ($outputHandles as $outputPath => $outputHandle) {
    if ($outputHandle === false) {
        throw new RuntimeException("Unable to open ".$outputPath." for writing");
    }
}

// Going to use XMLReader
// When parsing all records, DOMDocument runs out of memory
$reader = new XMLReader();
if (!$reader->open("lucero.xml")) {
    throw new RuntimeException("Unable to open lucero.xml for reading");
}
while ($reader->read()) { | |
switch ($reader->nodeType) { | |
case (XMLREADER::ELEMENT): | |
if ($reader->localName == "record") { | |
// Expand the current <record> element into a DOM node and copy it into a
// document of its own, so that only one record is held as DOM at a time.
$node = $reader->expand();
$doc = new DOMDocument();
$n = $doc->importNode($node, true);
$doc->appendChild($n);

// XPath evaluator over the single-record document.
// NOTE(review): no namespace is registered explicitly; the 'marc' prefix used in
// the expressions below presumably resolves from the namespace declarations
// present in the input document - confirm against lucero.xml.
$xpath = new DOMXPath($doc);
$records = $doc->getElementsByTagName("record");

// XPath expressions, all relative to a context node: first the record-level
// fields, then the subfields that are evaluated relative to a datafield.
// N.B. done like this for clarity here, not really the best way of doing it

// Leader and control fields
$xpath_leader = "./marc:leader/text()";
$xpath_001 = "./marc:controlfield[@tag='001']/text()";
$xpath_008 = "./marc:controlfield[@tag='008']/text()";

// Data fields
$xpath_020 = "./marc:datafield[@tag='020']";
$xpath_035 = "./marc:datafield[@tag='035']";
$xpath_084 = "./marc:datafield[@tag='084']";
$xpath_100 = "./marc:datafield[@tag='100']";
$xpath_110 = "./marc:datafield[@tag='110']";
$xpath_245 = "./marc:datafield[@tag='245']";
$xpath_260 = "./marc:datafield[@tag='260']";
$xpath_300 = "./marc:datafield[@tag='300']";
$xpath_500 = "./marc:datafield[@tag='500']";
$xpath_515 = "./marc:datafield[@tag='515']";
$xpath_562 = "./marc:datafield[@tag='562']";
$xpath_650 = "./marc:datafield[@tag='650']";
$xpath_653 = "./marc:datafield[@tag='653']";
$xpath_700 = "./marc:datafield[@tag='700']";
$xpath_710 = "./marc:datafield[@tag='710']";
$xpath_856 = "./marc:datafield[@tag='856']";

// Subfields
$xpath_a = "./marc:subfield[@code='a']/text()";
$xpath_b = "./marc:subfield[@code='b']/text()";
$xpath_c = "./marc:subfield[@code='c']/text()";
$xpath_u = "./marc:subfield[@code='u']/text()";
$xpath_v = "./marc:subfield[@code='v']/text()";
$xpath_x = "./marc:subfield[@code='x']/text()";
$xpath_y = "./marc:subfield[@code='y']/text()";
$xpath_z = "./marc:subfield[@code='z']/text()";
foreach( $records as $record ) { | |
// Start every record with empty collectors. PHP arrays are copied by value, so a
// chained assignment gives each variable its own independent empty array.

// ISBNs (020)
$isbns_10 = $isbns_13 = array();
// Local identifier codes found in 035, 084, 110, 5XX and 710
$sups = $acs = $rcs = $vcs = $bcs = $cdas = $fts = $cdrs = $dvds = array();
// Course codes (084, 110, 710)
$courses = array();
// Subjects (650, 653)
$subjects = $lcsubjects_topical = $lcsubjects_subdiv = $lcsubject_strings = array();
// Creators (100, 110, 700, 710)
$creator_persons = $creator_organizations = $creator_agents = array();
// Publication details (260)
$publisher_places = $publisher_organizations = array();
$dates = $copyright_dates = $approximate_dates = $odd_dates = array();
// Extents (300), URLs (856) and general notes (500)
$extents = $urls = $general_notes = array();

$doc_type = "";
$publisher_count = 0;
// Classify the record from the two characters at leader offsets 6 and 7.
// A missing leader is treated as an empty string and ends up as "Unknown".
$nodeList_leader = $xpath->evaluate($xpath_leader, $record);
$content_leader = $nodeList_leader->length > 0 ? $nodeList_leader->item(0)->nodeValue : "";
$leader_type = (string) substr($content_leader, 6, 2);

// Leader type => RDF type statement written as the second line of the record.
$doc_types_by_leader = array(
    'am' => "a bibo:Book",
    'as' => "a bibo:Periodical",
    // Non-musical recording. Media ontology collection?
    'im' => "a bibo:AudioDocument",
    // Musical recording. Media ontology collection? Or Music Ontology collection?
    // BIBO doesn't really have anything better than Audio Document to deal with music.
    // Ideally would look at music ontology and link to musicbrainz - although this
    // seems to be more granular than our records.
    'jm' => "a bibo:AudioDocument",
    // Video of some type
    'gm' => "a bibo:AudioVisualDocument",
    // Actually a computer file, so for BIBO 'document' is the best we can do.
    // Should look at other ontologies for Computer Files (possible DOAP?).
    // (Fixed: the prefix separator was missing, giving "a biboDocument".)
    'mm' => "a bibo:Document",
);

if (isset($doc_types_by_leader[$leader_type])) {
    $doc_type = $doc_types_by_leader[$leader_type];
} else {
    $doc_type = "Unknown, Leader type: ".$leader_type;
}
// 001: the record identifier, used to build the record's URI.
$nodeList_001 = $xpath->evaluate($xpath_001, $record);
$content_001 = $nodeList_001->item(0)->nodeValue;
$id = "http://voyager.open.ac.uk/vwebv/holdingsInfo?bibId=".$content_001;

// Reset everything derived from 008, so that a record with no 008 (or a blank
// place code) does not inherit the values of the previously processed record.
$place_of_prod_pub_create = "";
$date_type = "";
$first_date = "";
$second_date = "";

$nodeList_008 = $xpath->evaluate($xpath_008, $record);
if ($nodeList_008->length > 0) {
    $content_008 = $nodeList_008->item(0)->nodeValue;

    // Offsets 15-17: place code. Only keep it when it is not blank; trim() makes
    // the check independent of how many padding spaces the field contains.
    $place_code = (string) substr($content_008, 15, 3);
    if (trim($place_code) !== '') {
        $place_of_prod_pub_create = $place_code;
    }

    // Need to sort out dates from 008 - not included at the moment
    $date_type = substr($content_008, 6, 1);
    $first_date = substr($content_008, 7, 4);
    $second_date = substr($content_008, 11, 4);
}
// 020$a: sort each value into the ISBN-10 or ISBN-13 list by its length.
// Values of any other length are ignored.
foreach ($xpath->evaluate($xpath_020, $record) as $node_020) {
    $isbn_nodes = $xpath->evaluate($xpath_a, $node_020);
    if ($isbn_nodes->length == 0) {
        continue;
    }
    $isbn = $isbn_nodes->item(0)->nodeValue;
    switch (strlen($isbn)) {
        case 10:
            $isbns_10[] = $isbn;
            break;
        case 13:
            $isbns_13[] = $isbn;
            break;
    }
}
// 035$a can contain a variety of identifiers - SUP is the only one of possible
// interest to us I think. Uses preg_match_all (as the 500 handling does) so that
// every SUP code in the subfield is collected, not just the first one.
$nodeList_035 = $xpath->evaluate($xpath_035, $record);
foreach ($nodeList_035 as $node_035) {
    $nodeList_035a = $xpath->evaluate($xpath_a, $node_035);
    if ($nodeList_035a->length > 0) {
        $content_035a = $nodeList_035a->item(0)->nodeValue;
        preg_match_all('/SUP ?[0-9]{6}/', $content_035a, $matches);
        foreach ($matches[0] as $match) {
            // NOTE(review): the replacement is the literal string '//', so
            // "SUP 123456" becomes "SUP//123456". Probably '' was intended - confirm
            // before changing, since it alters the emitted identifiers.
            array_push($sups, preg_replace('/\s/','//',$match));
        }
    }
}
// 084$a: either flags the record as a course record ($doc_type = "course") or
// supplies course codes; FT / CDR / DVD codes are collected in both cases.
$nodeList_084 = $xpath->evaluate($xpath_084, $record);
foreach ($nodeList_084 as $node_084) {
    $subfield_084a = $xpath->evaluate($xpath_a, $node_084);
    if ($subfield_084a->length == 0) {
        continue;
    }
    $content_084a = $subfield_084a->item(0)->nodeValue;

    if (preg_match('/Course(?! )/', $content_084a) > 0) {
        $doc_type = "course";
    } elseif (preg_match_all('/[ABDEHKLMPQSTUWY][ABDEHKLMPSTUXYZ]?[CDEGHJKMNPRSTWXYZ]?[HLNRSTX]?[0-9]{3}[ABCDEFGHIJLMPSTVXYZ]?/', $content_084a, $matches, PREG_PATTERN_ORDER)) {
        foreach ($matches[0] as $course_code) {
            $courses[] = $course_code;
        }
    }

    // Name of the collector array => pattern of the codes that belong in it.
    $code_patterns = array(
        'fts' => '/FT ?[0-9]{4}/',
        'cdrs' => '/CDR ?[0-9]{4}/',
        'dvds' => '/DVD ?[0-9]{4}/',
    );
    foreach ($code_patterns as $bucket => $pattern) {
        preg_match_all($pattern, $content_084a, $matches);
        foreach ($matches[0] as $match) {
            // ${$bucket} is the collector array named by the table key.
            ${$bucket}[] = preg_replace('/\s/', '//', $match);
        }
    }
}
// 100$a: personal name, recorded as a person creator with surrounding '.' removed.
foreach ($xpath->evaluate($xpath_100, $record) as $node_100) {
    $name_nodes = $xpath->evaluate($xpath_a, $node_100);
    if ($name_nodes->length > 0) {
        $creator_persons[] = trim($name_nodes->item(0)->nodeValue, '.');
    }
}
// 110$a: corporate name. Three outcomes: it flags a course record, it carries
// course codes (creator is then recorded as "The Open University"), or it is
// taken as the name of a creator agent. FT / CDR / DVD codes are always collected.
$nodeList_110 = $xpath->evaluate($xpath_110, $record);
foreach ($nodeList_110 as $node_110) {
    $subfield_110a = $xpath->evaluate($xpath_a, $node_110);
    if ($subfield_110a->length == 0) {
        continue;
    }
    $content_110a = $subfield_110a->item(0)->nodeValue;

    if (preg_match('/Course(?! )/', $content_110a) > 0) {
        $doc_type = "course";
    } elseif (preg_match_all('/[ABDEHKLMPQSTUWY][ABDEHKLMPSTUXYZ]?[CDEGHJKMNPRSTWXYZ]?[HLNRSTX]?[0-9]{3}[ABCDEFGHIJLMPSTVXYZ]?/', $content_110a, $matches, PREG_PATTERN_ORDER)) {
        foreach ($matches[0] as $course_code) {
            $courses[] = $course_code;
        }
        $creator_organizations[] = "The Open University";
    } else {
        $creator_agents[] = trim($content_110a, '.');
    }

    // Name of the collector array => pattern of the codes that belong in it.
    $code_patterns = array(
        'fts' => '/FT ?[0-9]{4}/',
        'cdrs' => '/CDR ?[0-9]{4}/',
        'dvds' => '/DVD ?[0-9]{4}/',
    );
    foreach ($code_patterns as $bucket => $pattern) {
        preg_match_all($pattern, $content_110a, $matches);
        foreach ($matches[0] as $match) {
            // ${$bucket} is the collector array named by the table key.
            ${$bucket}[] = preg_replace('/\s/', '//', $match);
        }
    }
}
// Reset first: without this a record that has no 245 would be written out with
// the title and statement of responsibility of the previously processed record.
$title = "";
$statement_responsibility = "";

// 245: title ($a plus optional $b) and statement of responsibility ($c).
$nodeList_245 = $xpath->evaluate($xpath_245, $record);
foreach ($nodeList_245 as $node_245) {
    $nodeList_245a = $xpath->evaluate($xpath_a, $node_245);
    // While it is possible to have records without a title (e.g. a letter) in our
    // case everything should have a title in 245$$a.
    // If not, this will raise an error which should be followed up.
    $title = $nodeList_245a->item(0)->nodeValue;

    $nodeList_245b = $xpath->evaluate($xpath_b, $node_245);
    if ($nodeList_245b->length > 0) {
        $title .= " ".$nodeList_245b->item(0)->nodeValue;
    }
    // Drop the ISBD punctuation and spaces left at either end of the title.
    $title = trim($title, " /:");

    $nodeList_245c = $xpath->evaluate($xpath_c, $node_245);
    if ($nodeList_245c->length > 0) {
        $statement_responsibility = $nodeList_245c->item(0)->nodeValue;
    } else {
        $statement_responsibility = "";
    }
}
// 260: place of publication ($a), publisher ($b) and dates ($c).
$nodeList_260 = $xpath->evaluate($xpath_260, $record);
foreach ($nodeList_260 as $node_260) {
    $nodeList_260a = $xpath->evaluate($xpath_a, $node_260);
    if ($nodeList_260a->length > 0) {
        $publisher_places[] = trim($nodeList_260a->item(0)->nodeValue, ' ,:;');
    }

    $nodeList_260b = $xpath->evaluate($xpath_b, $node_260);
    if ($nodeList_260b->length > 0) {
        // 'Open University' not 'The Open University'
        $publisher_organizations[] = trim($nodeList_260b->item(0)->nodeValue, ' ,:;');
    }

    $nodeList_260c = $xpath->evaluate($xpath_c, $node_260);
    if ($nodeList_260c->length == 0) {
        continue;
    }

    // Before splitting, strip parentheses and "repr" so that patterns such as
    // (YYYY repr) or (YYYY repr.) are reduced to plain dates.
    $content_260c = str_replace(array('(', ')', 'repr'), '', $nodeList_260c->item(0)->nodeValue);

    foreach (preg_split("/[\s\/,;]+/", $content_260c) as $date_260c) {
        $date_260c = trim($date_260c, ",.-");
        if (preg_match('/^[0-9]{4}$/', $date_260c) > 0) {
            $dates[] = $date_260c;
        } elseif (preg_match('/^c([0-9]{4})$/', $date_260c, $year) > 0) {
            // Copyright date: keep the year without the leading "c".
            $copyright_dates[] = $year[1];
        } elseif (preg_match('/\[([0-9]{4})\??\]/', $date_260c, $year) > 0) {
            // Approximate date in square brackets. The year is taken from the
            // capture group, so it is correct even when the bracket is not the
            // first character of the token (the pattern is not anchored).
            $approximate_dates[] = $year[1];
        } else {
            // Main remaining issue is date ranges (with and without end dates) e.g. YYYY-YYYY
            // Investigate YYYY-YYYY see if just courses
            // Other issue is where only partial date known e.g. YY--. However only one example of this
            $odd_dates[] = $date_260c;
        }
    }
}
// 300: physical description. $a and then $b are each added to the extents,
// with spaces, ':' and ';' stripped from both ends.
// Can we recognise durations?
foreach ($xpath->evaluate($xpath_300, $record) as $node_300) {
    foreach (array($xpath_a, $xpath_b) as $extent_xpath) {
        $extent_nodes = $xpath->evaluate($extent_xpath, $node_300);
        if ($extent_nodes->length > 0) {
            $extents[] = trim($extent_nodes->item(0)->nodeValue, ' :;');
        }
    }
}
// All the 5XX fields are Notes fields; 500 is 'General Note'.
// The note is kept verbatim, and any SUP codes it contains are collected too.
foreach ($xpath->evaluate($xpath_500, $record) as $node_500) {
    $note_nodes = $xpath->evaluate($xpath_a, $node_500);
    if ($note_nodes->length == 0) {
        continue;
    }
    $content_500a = $note_nodes->item(0)->nodeValue;
    $general_notes[] = $content_500a;

    // 500 sometimes includes a SUP code
    preg_match_all('/SUP ?[0-9]{6}/', $content_500a, $matches);
    foreach ($matches[0] as $sup_code) {
        $sups[] = preg_replace('/\s/', '//', $sup_code);
    }
}
// All the 5XX fields are Notes fields.
// 515 is 'Numbering Peculiarities', but sometimes contains local identifiers for materials.
$nodeList_515 = $xpath->evaluate($xpath_515, $record);
foreach ($nodeList_515 as $node_515) {
    $subfield_515a = $xpath->evaluate($xpath_a, $node_515);
    if ($subfield_515a->length == 0) {
        continue;
    }
    $content_515a = $subfield_515a->item(0)->nodeValue;

    // Name of the collector array => pattern of the codes that belong in it.
    $code_patterns = array(
        'acs' => '/AC ?[0-9]{4}/',
        'rcs' => '/RC ?[0-9]{5}/',
        'vcs' => '/VC ?[0-9]{4}/',
        'bcs' => '/BC ?[0-9]{5}/',
        'cdas' => '/CDA ?[0-9]{4}/',
        'cdrs' => '/CDR ?[0-9]{4}/',
        'dvds' => '/DVD ?[0-9]{5}/',
    );
    foreach ($code_patterns as $bucket => $pattern) {
        preg_match_all($pattern, $content_515a, $matches);
        foreach ($matches[0] as $match) {
            // ${$bucket} is the collector array named by the table key.
            ${$bucket}[] = preg_replace('/\s/', '//', $match);
        }
    }
}
// All the 5XX fields are Notes fields.
// 562 is 'Copy and Version Identification Note', but sometimes contains local
// identifiers for materials.
// Fixed: this used to evaluate $xpath_515, so field 562 was never read and the
// 515 codes were simply collected a second time.
$nodeList_562 = $xpath->evaluate($xpath_562, $record);
foreach ($nodeList_562 as $node_562) {
    $nodeList_562a = $xpath->evaluate($xpath_a, $node_562);
    if ($nodeList_562a->length == 0) {
        continue;
    }
    $content_562a = $nodeList_562a->item(0)->nodeValue;

    // Name of the collector array => pattern of the codes that belong in it.
    // NOTE(review): DVD codes are matched with 5 digits here but with 4 digits in
    // the 084/110/710 handling - confirm which is intended.
    $code_patterns = array(
        'acs' => '/AC ?[0-9]{4}/',
        'rcs' => '/RC ?[0-9]{5}/',
        'vcs' => '/VC ?[0-9]{4}/',
        'bcs' => '/BC ?[0-9]{5}/',
        'cdas' => '/CDA ?[0-9]{4}/',
        'cdrs' => '/CDR ?[0-9]{4}/',
        'dvds' => '/DVD ?[0-9]{5}/',
    );
    foreach ($code_patterns as $bucket => $pattern) {
        preg_match_all($pattern, $content_562a, $matches);
        foreach ($matches[0] as $match) {
            // ${$bucket} is the collector array named by the table key.
            ${$bucket}[] = preg_replace('/\s/', '//', $match);
        }
    }
}
// 650: subject heading. $a and $b are collected as topical terms, $v/$x/$y/$z as
// subdivisions. When any subfield besides $a is present, the parts are also
// joined with "--" into one combined heading string.
foreach ($xpath->evaluate($xpath_650, $record) as $node_650) {
    // All 650 fields should have at least $$a subfield
    $content_650a = $xpath->evaluate($xpath_a, $node_650)->item(0)->nodeValue;
    $topical_term = trim($content_650a, ".");
    $lcsubjects_topical[] = $topical_term;
    $subject_str = $topical_term;

    $subfield_xpaths = array(
        'b' => $xpath_b,
        'v' => $xpath_v,
        'x' => $xpath_x,
        'y' => $xpath_y,
        'z' => $xpath_z,
    );
    foreach ($subfield_xpaths as $subfield_code => $subfield_xpath) {
        // A node list is returned even when nothing matches, so check its length.
        $subfield_nodes = $xpath->evaluate($subfield_xpath, $node_650);
        if ($subfield_nodes->length == 0) {
            continue;
        }
        $term = trim($subfield_nodes->item(0)->nodeValue, ".");
        if ($subfield_code === 'b') {
            $lcsubjects_topical[] = $term;
        } else {
            $lcsubjects_subdiv[] = $term;
        }
        $subject_str .= "--".$term;
    }

    // Only keep the combined string when it adds something to the bare $a term.
    if ($subject_str != $topical_term) {
        $lcsubject_strings[] = $subject_str;
    }
}
// 653$a: collected verbatim into $subjects.
foreach ($xpath->evaluate($xpath_653, $record) as $node_653) {
    $keyword_nodes = $xpath->evaluate($xpath_a, $node_653);
    if ($keyword_nodes->length > 0) {
        $subjects[] = $keyword_nodes->item(0)->nodeValue;
    }
}
// 700$a: personal name, recorded as a person creator with surrounding '.' removed.
foreach ($xpath->evaluate($xpath_700, $record) as $node_700) {
    $name_nodes = $xpath->evaluate($xpath_a, $node_700);
    if ($name_nodes->length > 0) {
        $creator_persons[] = trim($name_nodes->item(0)->nodeValue, '.');
    }
}
// 710$a: corporate name. Three outcomes: it flags a course record, it carries
// course codes (creator is then recorded as "The Open University"), or it is
// taken as the name of a creator agent. FT / CDR / DVD codes are always collected.
$nodeList_710 = $xpath->evaluate($xpath_710, $record);
foreach ($nodeList_710 as $node_710) {
    $subfield_710a = $xpath->evaluate($xpath_a, $node_710);
    if ($subfield_710a->length == 0) {
        continue;
    }
    $content_710a = $subfield_710a->item(0)->nodeValue;

    if (preg_match('/Course(?! )/', $content_710a) > 0) {
        $doc_type = "course";
    } elseif (preg_match_all('/[ABDEHKLMPQSTUWY][ABDEHKLMPSTUXYZ]?[CDEGHJKMNPRSTWXYZ]?[HLNRSTX]?[0-9]{3}[ABCDEFGHIJLMPSTVXYZ]?/', $content_710a, $matches, PREG_PATTERN_ORDER)) {
        foreach ($matches[0] as $course_code) {
            $courses[] = $course_code;
        }
        $creator_organizations[] = "The Open University";
    } else {
        $creator_agents[] = trim($content_710a, '.');
    }

    // Name of the collector array => pattern of the codes that belong in it.
    $code_patterns = array(
        'fts' => '/FT ?[0-9]{4}/',
        'cdrs' => '/CDR ?[0-9]{4}/',
        'dvds' => '/DVD ?[0-9]{4}/',
    );
    foreach ($code_patterns as $bucket => $pattern) {
        preg_match_all($pattern, $content_710a, $matches);
        foreach ($matches[0] as $match) {
            // ${$bucket} is the collector array named by the table key.
            ${$bucket}[] = preg_replace('/\s/', '//', $match);
        }
    }
}
// 856: one (URL, note) pair per field, taken from $u and $z. A missing subfield
// is stored as an empty string so that every pair has both elements.
foreach ($xpath->evaluate($xpath_856, $record) as $node_856) {
    $url_nodes = $xpath->evaluate($xpath_u, $node_856);
    $content_856u = $url_nodes->length > 0 ? $url_nodes->item(0)->nodeValue : "";

    $note_nodes = $xpath->evaluate($xpath_z, $node_856);
    $content_856z = $note_nodes->length > 0 ? $note_nodes->item(0)->nodeValue : "";

    $urls[] = array($content_856u, $content_856z);
}
// Now to output results.
// Records for a course itself, and items with no course code in 084, 110 or 710,
// are skipped. This can only be decided here because all three fields have to be
// parsed before we know whether this is a course record rather than an item.
if ($doc_type == "course" || count($courses) == 0) {
    continue;
}

// When the 008 date type is one of the listed codes, log the 008 dates next to
// the dates parsed from 260$c (3 = plain, 4 = copyright, 5 = approximate).
$date_types = array('b','c','d','e','i','k','m','n','p','q','r','s','t','u');
if (in_array($date_type, $date_types)) {
    $date_compare = $id." ".$date_type." 1 ".$first_date." 2 ".$second_date;
    $parsed_dates = array(
        " 3 " => $dates,
        " 4 " => $copyright_dates,
        " 5 " => $approximate_dates,
    );
    foreach ($parsed_dates as $marker => $date_list) {
        foreach ($date_list as $date) {
            $date_compare .= $marker.$date;
        }
    }
    fwrite($checkfh, $date_compare."\n");
}

// Subject line and type statement
$content = "<".$id.">\n".$doc_type."\n";

// Course codes and title
$courses = array_unique($courses);
foreach ($courses as $course) {
    $content .= " ; hascourse \"".$course."\"\n";
}
$content .= " ; dc:title \"".$title."\"\n";

// ISBNs
foreach ($isbns_10 as $isbn_10) {
    $content .= " ; bibo:isbn10 \"".$isbn_10."\" // ISBN10 \n";
}
foreach ($isbns_13 as $isbn_13) {
    $content .= " ; bibo:isbn13 \"".$isbn_13."\" // ISBN13 \n";
}

// Local identifier codes: name of the collector array => text that closes the
// statement. Each collector is de-duplicated before being written out.
$identifier_endings = array(
    'sups' => "\" // SUP (Books without an ISBN should have a SUP)\n",
    'acs' => "\" // AC code for Audio Cassette\n",
    'rcs' => "\" // RC code for Radio programme on Audio Cassette\n",
    'vcs' => "\" // VC code for Video Cassette\n",
    'bcs' => "\" // BC code for 'broadcast' (TV) video cassette\n",
    'cdas' => "\" // CDA code for Audio CD\n",
    'fts' => "\" // FT code for Computer disks\n",
    'cdrs' => "\" // CDR code for CD-ROM\n",
    'dvds' => "\" // DVD code for DVDs\n",
);
foreach ($identifier_endings as $bucket => $ending) {
    ${$bucket} = array_unique(${$bucket});
    foreach (${$bucket} as $identifier) {
        $content .= " ; dc:identifier \"".$identifier.$ending;
    }
}

// Dates. Tokens collected in $odd_dates are deliberately not written out.
foreach ($dates as $date) {
    $content .= " ; dc:date \"".$date."\"\n";
}
foreach ($copyright_dates as $date) {
    $content .= " ; dc:dateCopyrighted \"".$date."\"\n";
}
foreach ($approximate_dates as $date) {
    $content .= " ; dc:date \"".$date."?\" // This is only a 'probable date' - there is some level of uncertainty\n";
}

// Creators
foreach ($creator_persons as $creator_person) {
    $content .= " ; dc:creator [ a foaf:Person ; foaf:name \"".$creator_person."\" ]\n";
}
foreach ($creator_organizations as $creator_organization) {
    $content .= " ; dc:creator [ a foaf:Organization ; foaf:name \"".$creator_organization."\" ]\n";
}
foreach ($creator_agents as $creator_agent) {
    $content .= " ; dc:creator [ a foaf:Agent ; foaf:name \"".$creator_agent."\" ]\n";
}
if (strlen($statement_responsibility) > 0) {
    $content .= " ; rdfs:comment \"".$statement_responsibility."\"\n";
}

// Extent and publication details
foreach ($extents as $extent) {
    $content .= " ; dc:extent \"".$extent."\"\n";
}
foreach ($publisher_organizations as $publisher_organization) {
    $content .= " ; dc:publisher [ a foaf:Organization ; foaf:name \"".$publisher_organization."\" ]\n";
}
foreach ($publisher_places as $publisher_place) {
    $content .= " ; isbd:P1016 \"".$publisher_place."\" // isbd is a proposed ontology - see details for isbd:P1016 at http://metadataregistry.org/schemahistory/list/schema_property_id/1957.html - it is what the BL are using (even though I'm not keen)\n";
}
if (strlen($place_of_prod_pub_create) > 0) {
    $content .= " ; isbd:P1016 \"".$place_of_prod_pub_create."\" // Encoded. Should be a URI for this code at http://id.loc.gov/vocabulary/countries/".$place_of_prod_pub_create.", also at http://marccodes.heroku.com/countries/".$place_of_prod_pub_create." (with sameAs to geonames etc.) but not sure how to represent\n"
        ." // isbd is a proposed ontology - see details for isbd:P1016 at http://metadataregistry.org/schemahistory/list/schema_property_id/1957.html - it is what the BL are using (even though I'm not keen)\n";
}

// Notes
foreach ($general_notes as $general_note) {
    $content .= " ; dc:description \"".$general_note."\"\n";
}

// Subjects: topical terms, subdivisions, combined headings, then local keywords
$lcsubjects_topical = array_unique($lcsubjects_topical);
foreach ($lcsubjects_topical as $lcsubject_topical) {
    $content .= " ; dc:subject \"".$lcsubject_topical."\" // This is from 650a or 650b (Topical term). There should be a URI for this heading on id.loc.gov - use version from 'Topical Term' concept type not 'General Subdivision'\n";
}
$lcsubjects_subdiv = array_unique($lcsubjects_subdiv);
foreach ($lcsubjects_subdiv as $lcsubject_subdiv) {
    $content .= " ; dc:subject \"".$lcsubject_subdiv."\" // This is from 650v,650x,650y or 650z (Subdivision). There should be a URI for this heading on id.loc.gov - use version from 'xxx Subdivision' concept type not 'Topical Term'\n";
}
foreach ($lcsubject_strings as $lcsubject) {
    $content .= " ; dc:subject \"".$lcsubject."\" // Not clear how combined headings should be handled. Some will have a URI on id.loc.gov, but not all\n";
}
foreach ($subjects as $subject) {
    $content .= " ; dc:subject \"".$subject."\" // This is a local Keyword - Use tag ontology? Like OpenLearn?\n";
}

// URLs: $url[0] is the address, $url[1] the catalogue note (possibly empty)
foreach ($urls as $url) {
    $content .= " ; foaf:page \"".$url[0]."\" // A URL associated with resource in some way. If there is a note in the catalogue it is reproduced here: \"".$url[1]."\"\n";
}

// Statement terminator
$content .= ".\n";
// Append the serialised record to the buffer for its leader type and bump the
// matching counter. Any leader type not listed here goes to the "unknowns".
$buffers_by_leader = array(
    "am" => 'books',
    "as" => 'serials',
    "im" => 'nonmusicrecs',
    "jm" => 'musicrecs',
    "mm" => 'compfiles',
    "gm" => 'videos',
);
$buffer_name = isset($buffers_by_leader[$leader_type]) ? $buffers_by_leader[$leader_type] : 'unknowns';
// ${$buffer_name} is the output buffer (e.g. $books) and ${$buffer_name.'_count'}
// is its counter (e.g. $books_count).
${$buffer_name} .= $content;
${$buffer_name.'_count'} += 1;
} | |
} // end of XMLRecord if | |
} // end of XMLRecord switch | |
} // end of XMLRecord while (to read xml in) | |
// Flush every buffer to its file, then close the files. Writes and closes are
// kept as two passes, in the same order for both.
$outputs = array(
    array($booksfh, $books),
    array($serialsfh, $serials),
    array($nonmusicrecsfh, $nonmusicrecs),
    array($musicrecsfh, $musicrecs),
    array($compfilesfh, $compfiles),
    array($videosfh, $videos),
    array($unknownsfh, $unknowns),
);
foreach ($outputs as $output) {
    fwrite($output[0], $output[1]);
}
foreach ($outputs as $output) {
    fclose($output[0]);
}

// Finish the check file with the number of records written per material type.
$summary = "Books: ".$books_count
    ."\nSerials: ".$serials_count
    ."\nNon-Musical Recordings: ".$nonmusicrecs_count
    ."\nMusical Recordings: ".$musicrecs_count
    ."\nComputer Files: ".$compfiles_count
    ."\nVideos: ".$videos_count
    ."\nUnknown: ".$unknowns_count;
fwrite($checkfh, $summary);
fclose($checkfh);
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment