ostephens · May 10, 2019 15:56
diff --git a/example-marc-xml-parser.php b/example-marc-xml-parser.php
 <?php
 	// TODO
 	//
 	// Change dc:subject to tags as with OpenLearn
 	// Add notes at top of each output file
 	// Add namespaces at top of each output file
 	// Add Table of Contents?
 	
 	// Output is accumulated in one string per material type and written out at the end of the script
 	$books = "";
 	$serials = "";
 	$nonmusicrecs = "";
 	$musicrecs = "";
 	$videos = "";
 	$compfiles = "";
 	$unknowns = "";

 	// Number of records added to each of the strings above (integers, as they are only ever incremented)
 	$books_count = 0;
 	$serials_count = 0;
 	$nonmusicrecs_count = 0;
 	$musicrecs_count = 0;
 	$videos_count = 0;
 	$compfiles_count = 0;
 	$unknowns_count = 0;

 	// Open (and truncate) a file for writing, stopping the script with a clear message if that fails
 	// rather than letting a later fwrite() fail on an invalid handle
 	function open_output_file($path)
 	{
 		$handle = fopen($path, 'w');
 		if ($handle === false) {
 			throw new RuntimeException("Unable to open output file for writing: ".$path);
 		}
 		return $handle;
 	}

 	// check.txt receives the date comparison lines and the final record counts
 	$checkFile = "check.txt";
 	$checkfh = open_output_file($checkFile);

 	$booksFile = "books.txt";
 	$booksfh = open_output_file($booksFile);
 	$serialsFile = "serials.txt"; //Shouldn't be any of these
 	$serialsfh = open_output_file($serialsFile);
 	$nonmusicrecsFile = "nonmusicrecs.txt"; //This is recorded sound, not music
 	$nonmusicrecsfh = open_output_file($nonmusicrecsFile);
 	$musicrecsFile = "musicrecs.txt"; //This is recorded sound music
 	$musicrecsfh = open_output_file($musicrecsFile);
 	$videosFile = "videos.txt";
 	$videosfh = open_output_file($videosFile);
 	$compfilesFile = "compfiles.txt"; //Computer Files
 	$compfilesfh = open_output_file($compfilesFile);
 	$unknownsFiles = "unknowns.txt"; //Anything other than books,serials,a-v or computer files
 	$unknownsfh = open_output_file($unknownsFiles);

 	// Going to use XMLReader
 	// When parsing all records, DOMDocument runs out of memory
 	$reader = new XMLReader();
 	if (!$reader->open("lucero.xml")) {
 		throw new RuntimeException("Unable to open input file: lucero.xml");
 	}
 	
 	// Stream through the XML one node at a time using the XMLReader in $reader.
 	// Each 'record' element is copied into its own small DOMDocument so XPath can be used on it
 	// without holding the whole file in memory.
 	while ($reader->read()) {
 		if ($reader->nodeType != XMLReader::ELEMENT || $reader->localName != "record") {
 			continue;
 		}

 		$node = $reader->expand();
 		if ($node === false) {
 			trigger_error("Could not expand a record element - skipping it", E_USER_WARNING);
 			continue;
 		}
 		$doc = new DOMDocument();
 		$doc->appendChild($doc->importNode($node, true));

 		// The XPath queries used by the helper functions below all use the 'marc:' prefix.
 		// No namespace is registered explicitly here.
 		// NOTE(review): presumably this relies on the 'marc' prefix being declared in the source XML - confirm
 		$xpath = new DOMXPath($doc);

 		foreach ($doc->getElementsByTagName("record") as $record) {
 			$rec = marc_extract_record($xpath, $record);

 			// We only know at this point (after 084, 110 and 710 have been parsed) whether this is a record
 			// for a course rather than an item. Records for courses are skipped, as are records with no
 			// course code in 084, 110 or 710 (not related to a course).
 			if ($rec['doc_type'] == "course" || count($rec['courses']) == 0) {
 				continue;
 			}

 			$date_line = marc_date_check_line($rec);
 			if ($date_line !== null) {
 				fwrite($checkfh, $date_line."\n");
 			}

 			$content = marc_render_record($rec);

 			// Add the output to the string (and counter) for this type of material
 			switch ($rec['leader_type']) {
 				case "am":
 					$books .= $content;
 					$books_count += 1;
 					break;
 				case "as":
 					$serials .= $content;
 					$serials_count += 1;
 					break;
 				case "im":
 					$nonmusicrecs .= $content;
 					$nonmusicrecs_count += 1;
 					break;
 				case "jm":
 					$musicrecs .= $content;
 					$musicrecs_count += 1;
 					break;
 				case "mm":
 					$compfiles .= $content;
 					$compfiles_count += 1;
 					break;
 				case "gm":
 					$videos .= $content;
 					$videos_count += 1;
 					break;
 				default:
 					$unknowns .= $content;
 					$unknowns_count += 1;
 			}
 		}
 	} // end of while (to read xml in)

 	// Return the value of the first node matched by $query relative to $context, or null if nothing matches
 	function marc_text($xpath, $query, $context)
 	{
 		$nodes = $xpath->evaluate($query, $context);
 		return ($nodes->length > 0) ? $nodes->item(0)->nodeValue : null;
 	}

 	// Return the list of datafields with the given tag (e.g. '245') in $record
 	function marc_fields($xpath, $record, $tag)
 	{
 		return $xpath->evaluate("./marc:datafield[@tag='".$tag."']", $record);
 	}

 	// Return the text of the first subfield with the given code (e.g. 'a') in $field, or null if there is none
 	function marc_subfield($xpath, $field, $code)
 	{
 		return marc_text($xpath, "./marc:subfield[@code='".$code."']/text()", $field);
 	}

 	// Return every match of $pattern in $text, with each whitespace character replaced by '//'
 	// (so "SUP 123456" becomes "SUP//123456")
 	// NOTE(review): '//' may have been meant as an empty replacement (i.e. strip the space) - confirm before
 	// changing, as it alters the identifiers that are written out
 	function marc_local_codes($pattern, $text)
 	{
 		$codes = array();
 		preg_match_all($pattern, $text, $found);
 		foreach ($found[0] as $code) {
 			$codes[] = preg_replace('/\s/', '//', $code);
 		}
 		return $codes;
 	}

 	// Map the two characters at leader positions 6-7 to the type statement used in the output
 	function marc_doc_type($leader_type)
 	{
 		$types = array(
 			'am' => "a bibo:Book",
 			'as' => "a bibo:Periodical",
 			// Non-musical recording. Media ontology collection?
 			'im' => "a bibo:AudioDocument",
 			// Musical recording. Media ontology collection? Or Music Ontology collection?
 			// BIBO doesn't really have anything better than Audio Document to deal with music
 			// Ideally would look at music ontology and link to musicbrainz - although this seems to be more granular than our records
 			'jm' => "a bibo:AudioDocument",
 			// A video of some type
 			'gm' => "a bibo:AudioVisualDocument",
 			// Actually a computer file, so for BIBO 'document' is the best we can do
 			// Should look at other ontologies for Computer Files (possible DOAP?)
 			'mm' => "a bibo:Document",
 		);
 		if (isset($types[$leader_type])) {
 			return $types[$leader_type];
 		}
 		return "Unknown, Leader type: ".$leader_type;
 	}

 	// Look at the content of an 084, 110 or 710 $$a for course information and local identifiers.
 	// - Content matching /Course(?! )/ marks the whole record as a record for a course
 	// - Otherwise any course codes found are collected
 	// - For the name fields (110/710, $is_name_field true) a course code means the creator is recorded as
 	//   "The Open University"; with no course code the content itself is recorded as a creator (agent)
 	// - FT, CDR and DVD codes are collected in all cases
 	function marc_scan_course_field($content, $is_name_field, array &$rec)
 	{
 		$course_code = '/[ABDEHKLMPQSTUWY][ABDEHKLMPSTUXYZ]?[CDEGHJKMNPRSTWXYZ]?[HLNRSTX]?[0-9]{3}[ABCDEFGHIJLMPSTVXYZ]?/';

 		if (preg_match('/Course(?! )/', $content) > 0) {
 			$rec['doc_type'] = "course";
 		}
 		else if (preg_match_all($course_code, $content, $found, PREG_PATTERN_ORDER)) {
 			foreach ($found[0] as $code) {
 				$rec['courses'][] = $code;
 			}
 			if ($is_name_field) {
 				$rec['creator_organizations'][] = "The Open University";
 			}
 		}
 		else if ($is_name_field) {
 			$rec['creator_agents'][] = trim($content, '.');
 		}

 		// NOTE(review): DVD codes are matched with 4 digits here but 5 digits in the 515/562 notes - confirm which is right
 		$rec['fts'] = array_merge($rec['fts'], marc_local_codes('/FT ?[0-9]{4}/', $content));
 		$rec['cdrs'] = array_merge($rec['cdrs'], marc_local_codes('/CDR ?[0-9]{4}/', $content));
 		$rec['dvds'] = array_merge($rec['dvds'], marc_local_codes('/DVD ?[0-9]{4}/', $content));
 	}

 	// Split the content of a 260$$c into individual dates and sort them into plain dates (YYYY),
 	// copyright dates (cYYYY), approximate dates ([YYYY] or [YYYY?]) and everything else ('odd' dates)
 	function marc_sort_260c_dates($content, array &$rec)
 	{
 		// Before splitting, convert patterns of (YYYY repr) or (YYYY repr.) to plain dates
 		$content = str_replace(array('(', ')', 'repr'), '', $content);
 		foreach (preg_split("/[\s\/,;]+/", $content) as $date) {
 			$date = trim($date, ",.-");
 			if (preg_match('/^[0-9]{4}$/', $date) > 0) {
 				$rec['dates'][] = $date;
 			}
 			else if (preg_match('/^c[0-9]{4}$/', $date) > 0) {
 				$rec['copyright_dates'][] = substr($date, 1, 4);
 			}
 			else if (preg_match('/\[[0-9]{4}\??\]/', $date) > 0) {
 				$rec['approximate_dates'][] = substr($date, 1, 4);
 			}
 			// Main remaining issue is date ranges (with and without end dates) e.g. YYYY-YYYY
 			// Investigate YYYY-YYYY see if just courses
 			// Other issue is where only partial date known e.g. YY--. However only one example of this
 			else {
 				// Collected but not currently written to any output
 				$rec['odd_dates'][] = $date;
 			}
 		}
 	}

 	// Extract everything we want from one MARC record and return it as an array.
 	// The list-type entries (ISBNs, identifiers, subjects etc.) are arrays, the others are strings.
 	// Every entry starts empty for each record, so a record that lacks a field (e.g. 008 or 245)
 	// does not pick up values left over from the previous record.
 	// Fields are processed in a fixed order, which determines the order of values in the output.
 	function marc_extract_record($xpath, $record)
 	{
 		$rec = array_fill_keys(array(
 			'isbns_10', 'isbns_13', 'sups', 'acs', 'rcs', 'vcs', 'bcs', 'cdas', 'fts', 'cdrs', 'dvds',
 			'courses', 'subjects', 'lcsubjects_topical', 'lcsubjects_subdiv', 'lcsubject_strings',
 			'creator_persons', 'creator_organizations', 'creator_agents',
 			'publisher_places', 'publisher_organizations',
 			'dates', 'copyright_dates', 'approximate_dates', 'odd_dates',
 			'extents', 'urls', 'general_notes',
 		), array());
 		$rec['title'] = "";
 		$rec['statement_responsibility'] = "";
 		$rec['place_of_prod_pub_create'] = "";
 		$rec['date_type'] = "";
 		$rec['first_date'] = "";
 		$rec['second_date'] = "";

 		// Leader positions 6-7 tell us what type of material this is
 		$leader = (string) marc_text($xpath, "./marc:leader/text()", $record);
 		$rec['leader_type'] = (string) substr($leader, 6, 2);
 		$rec['doc_type'] = marc_doc_type($rec['leader_type']);

 		// 001 is used to build the identifier for the record
 		$rec['id'] = "http://voyager.open.ac.uk/vwebv/holdingsInfo?bibId=".marc_text($xpath, "./marc:controlfield[@tag='001']/text()", $record);

 		$content_008 = marc_text($xpath, "./marc:controlfield[@tag='008']/text()", $record);
 		if ($content_008 !== null) {
 			$place = (string) substr($content_008, 15, 3);
 			if ($place !== '   ') {
 				$rec['place_of_prod_pub_create'] = $place;
 			}
 			// Need to sort out dates from 008 - not included in the main output at the moment
 			$rec['date_type'] = (string) substr($content_008, 6, 1);
 			$rec['first_date'] = (string) substr($content_008, 7, 4);
 			$rec['second_date'] = (string) substr($content_008, 11, 4);
 		}

 		foreach (marc_fields($xpath, $record, '020') as $field) {
 			$isbn = marc_subfield($xpath, $field, 'a');
 			if ($isbn === null) {
 				continue;
 			}
 			// Differentiate ISBN-10 and ISBN-13 and add to appropriate array
 			if (strlen($isbn) == 10) {
 				$rec['isbns_10'][] = $isbn;
 			}
 			else if (strlen($isbn) == 13) {
 				$rec['isbns_13'][] = $isbn;
 			}
 		}

 		foreach (marc_fields($xpath, $record, '035') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				// 035 Can contain a variety of identifiers - SUP is the only one of possible interest to us I think
 				$rec['sups'] = array_merge($rec['sups'], marc_local_codes('/SUP ?[0-9]{6}/', $a));
 			}
 		}

 		foreach (marc_fields($xpath, $record, '084') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				marc_scan_course_field($a, false, $rec);
 			}
 		}

 		foreach (marc_fields($xpath, $record, '100') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				$rec['creator_persons'][] = trim($a, '.');
 			}
 		}

 		foreach (marc_fields($xpath, $record, '110') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				marc_scan_course_field($a, true, $rec);
 			}
 		}

 		foreach (marc_fields($xpath, $record, '245') as $field) {
 			// While it is possible to have records without a title (e.g. a letter) in our case everything should have a title in 245$$a
 			// If not, raise a warning so it can be followed up
 			$title = marc_subfield($xpath, $field, 'a');
 			if ($title === null) {
 				trigger_error("No 245\$\$a (title) in record ".$rec['id'], E_USER_WARNING);
 				$title = "";
 			}
 			$b = marc_subfield($xpath, $field, 'b');
 			if ($b !== null) {
 				$title .= " ".$b;
 			}
 			$rec['title'] = trim($title, " /:");
 			$c = marc_subfield($xpath, $field, 'c');
 			$rec['statement_responsibility'] = ($c !== null) ? $c : "";
 		}

 		foreach (marc_fields($xpath, $record, '260') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				$rec['publisher_places'][] = trim($a, ' ,:;');
 			}
 			// 'Open University' not 'The Open University'
 			$b = marc_subfield($xpath, $field, 'b');
 			if ($b !== null) {
 				$rec['publisher_organizations'][] = trim($b, ' ,:;');
 			}
 			$c = marc_subfield($xpath, $field, 'c');
 			if ($c !== null) {
 				marc_sort_260c_dates($c, $rec);
 			}
 		}

 		// Can we recognise durations?
 		foreach (marc_fields($xpath, $record, '300') as $field) {
 			foreach (array('a', 'b') as $code) {
 				$extent = marc_subfield($xpath, $field, $code);
 				if ($extent !== null) {
 					$rec['extents'][] = trim($extent, ' :;');
 				}
 			}
 		}

 		// All the 5XX fields are Notes fields
 		// 500 is 'General Note'
 		foreach (marc_fields($xpath, $record, '500') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				$rec['general_notes'][] = $a;
 				// 500 Sometimes includes a SUP code
 				$rec['sups'] = array_merge($rec['sups'], marc_local_codes('/SUP ?[0-9]{6}/', $a));
 			}
 		}

 		// 515 is 'Numbering Peculiarities' and 562 is 'Copy and Version Identification Note',
 		// but both sometimes contain local identifiers for materials
 		$note_codes = array(
 			'acs' => '/AC ?[0-9]{4}/',
 			'rcs' => '/RC ?[0-9]{5}/',
 			'vcs' => '/VC ?[0-9]{4}/',
 			'bcs' => '/BC ?[0-9]{5}/',
 			'cdas' => '/CDA ?[0-9]{4}/',
 			'cdrs' => '/CDR ?[0-9]{4}/',
 			'dvds' => '/DVD ?[0-9]{5}/',
 		);
 		foreach (array('515', '562') as $tag) {
 			foreach (marc_fields($xpath, $record, $tag) as $field) {
 				$a = marc_subfield($xpath, $field, 'a');
 				if ($a === null) {
 					continue;
 				}
 				foreach ($note_codes as $key => $pattern) {
 					$rec[$key] = array_merge($rec[$key], marc_local_codes($pattern, $a));
 				}
 			}
 		}

 		foreach (marc_fields($xpath, $record, '650') as $field) {
 			// All 650 fields should have at least $$a subfield - raise a warning if not so it can be followed up
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a === null) {
 				trigger_error("No 650\$\$a in record ".$rec['id'], E_USER_WARNING);
 				$a = "";
 			}
 			$head = trim($a, ".");
 			$rec['lcsubjects_topical'][] = $head;
 			// Build up the combined heading (parts joined with '--') as we go
 			$heading = $head;
 			$b = marc_subfield($xpath, $field, 'b');
 			if ($b !== null) {
 				$rec['lcsubjects_topical'][] = trim($b, ".");
 				$heading .= "--".trim($b, ".");
 			}
 			foreach (array('v', 'x', 'y', 'z') as $code) {
 				$subdiv = marc_subfield($xpath, $field, $code);
 				if ($subdiv !== null) {
 					$rec['lcsubjects_subdiv'][] = trim($subdiv, ".");
 					$heading .= "--".trim($subdiv, ".");
 				}
 			}
 			// Only record the combined heading if something was added to the $$a
 			if ($heading !== $head) {
 				$rec['lcsubject_strings'][] = $heading;
 			}
 		}

 		foreach (marc_fields($xpath, $record, '653') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				$rec['subjects'][] = $a;
 			}
 		}

 		foreach (marc_fields($xpath, $record, '700') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				$rec['creator_persons'][] = trim($a, '.');
 			}
 		}

 		foreach (marc_fields($xpath, $record, '710') as $field) {
 			$a = marc_subfield($xpath, $field, 'a');
 			if ($a !== null) {
 				marc_scan_course_field($a, true, $rec);
 			}
 		}

 		// Each URL is stored with its note from the catalogue (either may be an empty string)
 		foreach (marc_fields($xpath, $record, '856') as $field) {
 			$u = marc_subfield($xpath, $field, 'u');
 			$z = marc_subfield($xpath, $field, 'z');
 			$rec['urls'][] = array(($u !== null) ? $u : "", ($z !== null) ? $z : "");
 		}

 		return $rec;
 	}

 	// Build the line written to the check file for comparing the dates in the 008 with those found in the 260.
 	// Returns null if the 008 date type is not one of the listed codes (including when there is no 008).
 	function marc_date_check_line(array $rec)
 	{
 		$date_types = array('b','c','d','e','i','k','m','n','p','q','r','s','t','u');
 		if (!in_array($rec['date_type'], $date_types, true)) {
 			return null;
 		}
 		$line = $rec['id']."	".$rec['date_type']."	1	".$rec['first_date']."	2	".$rec['second_date'];
 		foreach ($rec['dates'] as $date) {
 			$line .= "	3	".$date;
 		}
 		foreach ($rec['copyright_dates'] as $date) {
 			$line .= "	4	".$date;
 		}
 		foreach ($rec['approximate_dates'] as $date) {
 			$line .= "	5	".$date;
 		}
 		return $line;
 	}

 	// Turn the array from marc_extract_record() into the text written to the output files.
 	// Values are written as extracted; nothing is escaped.
 	function marc_render_record(array $rec)
 	{
 		$isbd_note = "isbd is a proposed ontology - see details for isbd:P1016 at http://metadataregistry.org/schemahistory/list/schema_property_id/1957.html - it is what the BL are using (even though I'm not keen)";

 		// Local identifier lists, in output order, with the note written after each value
 		$identifier_notes = array(
 			'sups' => "SUP (Books without an ISBN should have a SUP)",
 			'acs' => "AC code for Audio Cassette",
 			'rcs' => "RC code for Radio programme on Audio Cassette",
 			'vcs' => "VC code for Video Cassette",
 			'bcs' => "BC code for 'broadcast' (TV) video cassette",
 			'cdas' => "CDA code for Audio CD",
 			'fts' => "FT code for Computer disks",
 			'cdrs' => "CDR code for CD-ROM",
 			'dvds' => "DVD code for DVDs",
 		);

 		$content = "<".$rec['id'].">\n";
 		$content .= $rec['doc_type']."\n";
 		foreach (array_unique($rec['courses']) as $course) {
 			$content .= "		; hascourse \"".$course."\"\n";
 		}
 		$content .= "		; dc:title \"".$rec['title']."\"\n";
 		foreach ($rec['isbns_10'] as $isbn_10) {
 			$content .= "		; bibo:isbn10 \"".$isbn_10."\" // ISBN10 \n";
 		}
 		foreach ($rec['isbns_13'] as $isbn_13) {
 			$content .= "		; bibo:isbn13 \"".$isbn_13."\" // ISBN13 \n";
 		}
 		foreach ($identifier_notes as $key => $note) {
 			foreach (array_unique($rec[$key]) as $code) {
 				$content .= "		; dc:identifier \"".$code."\" // ".$note."\n";
 			}
 		}
 		foreach ($rec['dates'] as $date) {
 			$content .= "		; dc:date \"".$date."\"\n";
 		}
 		foreach ($rec['copyright_dates'] as $date) {
 			$content .= "		; dc:dateCopyrighted \"".$date."\"\n";
 		}
 		foreach ($rec['approximate_dates'] as $date) {
 			$content .= "		; dc:date \"".$date."?\" // This is only a 'probable date' - there is some level of uncertainty\n";
 		}
 		foreach ($rec['creator_persons'] as $creator_person) {
 			$content .= "		; dc:creator [ a foaf:Person ; foaf:name \"".$creator_person."\" ]\n";
 		}
 		foreach ($rec['creator_organizations'] as $creator_organization) {
 			$content .= "		; dc:creator [ a foaf:Organization ; foaf:name \"".$creator_organization."\" ]\n";
 		}
 		foreach ($rec['creator_agents'] as $creator_agent) {
 			$content .= "		; dc:creator [ a foaf:Agent ; foaf:name \"".$creator_agent."\" ]\n";
 		}
 		if (strlen($rec['statement_responsibility']) > 0) {
 			$content .= "		; rdfs:comment \"".$rec['statement_responsibility']."\"\n";
 		}
 		foreach ($rec['extents'] as $extent) {
 			$content .= "		; dc:extent \"".$extent."\"\n";
 		}
 		foreach ($rec['publisher_organizations'] as $publisher_organization) {
 			$content .= "		; dc:publisher [ a foaf:Organization ; foaf:name \"".$publisher_organization."\" ]\n";
 		}
 		foreach ($rec['publisher_places'] as $publisher_place) {
 			$content .= "		; isbd:P1016 \"".$publisher_place."\" // ".$isbd_note."\n";
 		}
 		$place = $rec['place_of_prod_pub_create'];
 		if (strlen($place) > 0) {
 			$content .= "		; isbd:P1016 \"".$place."\" // Encoded. Should be a URI for this code at http://id.loc.gov/vocabulary/countries/".$place.", also at http://marccodes.heroku.com/countries/".$place." (with sameAs to geonames etc.) but not sure how to represent\n";
 			$content .= "		// ".$isbd_note."\n";
 		}
 		foreach ($rec['general_notes'] as $general_note) {
 			$content .= "		; dc:description	\"".$general_note."\"\n";
 		}
 		foreach (array_unique($rec['lcsubjects_topical']) as $lcsubject_topical) {
 			$content .= "		; dc:subject \"".$lcsubject_topical."\"	// This is from 650a or 650b (Topical term). There should be a URI for this heading on id.loc.gov - use version from 'Topical Term' concept type not 'General Subdivision'\n";
 		}
 		foreach (array_unique($rec['lcsubjects_subdiv']) as $lcsubject_subdiv) {
 			$content .= "		; dc:subject \"".$lcsubject_subdiv."\"	// This is from 650v,650x,650y or 650z (Subdivision). There should be a URI for this heading on id.loc.gov - use version from 'xxx Subdivision' concept type not 'Topical Term'\n";
 		}
 		foreach ($rec['lcsubject_strings'] as $lcsubject) {
 			$content .= "		; dc:subject \"".$lcsubject."\" // Not clear how combined headings should be handled. Some will have a URI on id.loc.gov, but not all\n";
 		}
 		foreach ($rec['subjects'] as $subject) {
 			$content .= "		; dc:subject \"".$subject."\" // This is a local Keyword - Use tag ontology? Like OpenLearn?\n";
 		}
 		foreach ($rec['urls'] as $url) {
 			$content .= "		; foaf:page \"".$url[0]."\" // A URL associated with resource in some way. If there is a note in the catalogue it is reproduced here: \"".$url[1]."\"\n";
 		}
 		$content .= ".\n";

 		return $content;
 	}
 	// All records have been read, so the XML reader is no longer needed
 	$reader->close();

 	// Write the accumulated output for each type of material to its file, then close the file
 	$outputs = array(
 		array($booksfh, $books),
 		array($serialsfh, $serials),
 		array($nonmusicrecsfh, $nonmusicrecs),
 		array($musicrecsfh, $musicrecs),
 		array($compfilesfh, $compfiles),
 		array($videosfh, $videos),
 		array($unknownsfh, $unknowns),
 	);
 	foreach ($outputs as $output) {
 		fwrite($output[0], $output[1]);
 		fclose($output[0]);
 	}

 	// Finish the check file with a count of the records written for each type of material
 	$summary = "Books: ".$books_count
 		."\nSerials: ".$serials_count
 		."\nNon-Musical Recordings: ".$nonmusicrecs_count
 		."\nMusical Recordings: ".$musicrecs_count
 		."\nComputer Files: ".$compfiles_count
 		."\nVideos: ".$videos_count
 		."\nUnknown: ".$unknowns_count;
 	fwrite($checkfh, $summary);

 	fclose($checkfh);
 ?>