Last active
November 18, 2022 21:31
-
-
Save stojg/3045663 to your computer and use it in GitHub Desktop.
Parsing a huge XML with closure in PHP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// An example of how to parse massive XML files with PHP by reading them in chunks to avoid running out of memory
// Open the XML
$handle = fopen('file.xml', 'r');
if ($handle === false) {
    // Fail fast: every later call needs a valid stream resource.
    throw new RuntimeException('Unable to open file.xml for reading.');
}
// Get the node strings incrementally from the XML file by defining a callback,
// in this case an anonymous function.
nodeStringFromXMLFile($handle, '<item>', '</item>', function($nodeText){
    // Transform the XML string into an array and print it
    print_r(getArrayFromXMLString($nodeText));
});
fclose($handle);
/**
 * For every node that starts with $startNode and ends with $endNode, call $callback
 * with the node text as its only argument.
 *
 * The markers are matched as literal text: the node runs from an occurrence of
 * $startNode to the first occurrence of $endNode after it. Nested elements with
 * the same name are therefore not handled specially.
 *
 * Note (from the original author): sometimes the string contains two nodes instead
 * of a single one; the callback is expected to cope with that. The primary job of
 * this function is to split a large file into manageable XML nodes.
 *
 * Scanning stops at the first start marker that has no matching end marker.
 *
 * @param resource $handle - a seekable file handle
 * @param string $startNode - the literal start marker, e.g. <item>
 * @param string $endNode - the literal end marker, e.g. </item>
 * @param callable $callback - receives the XML node(s) as a string; when null nothing is done
 */
function nodeStringFromXMLFile($handle, $startNode, $endNode, $callback=null) {
    // Nothing to report to, or nothing meaningful to search for.
    if ($callback === null || $startNode === '' || $endNode === '') {
        return;
    }
    // File offsets are byte offsets, so the marker length must be counted in bytes too.
    $endLength = strlen($endNode);
    $cursorPos = 0;
    while (true) {
        // Find start position
        $startPos = getPos($handle, $startNode, $cursorPos);
        // We reached the end of the file or an error
        if ($startPos === false) {
            break;
        }
        // Find where the node ends
        $endPos = getPos($handle, $endNode, $startPos);
        // Unterminated node: without this check false would be treated as offset 0
        if ($endPos === false) {
            break;
        }
        $length = $endPos + $endLength - $startPos;
        // A non-positive length would stop the cursor from advancing
        if ($length <= 0) {
            break;
        }
        // Jump back to the start position
        if (fseek($handle, $startPos) === -1) {
            break;
        }
        // Read the data
        $data = fread($handle, $length);
        if ($data === false) {
            break;
        }
        // pass the $data into the callback
        $callback($data);
        // next iteration starts reading right after this node
        $cursorPos = $startPos + $length;
    }
}
/**
 * Return the byte offset of the first occurrence of $string in a stream.
 *
 * The stream is read $chunk bytes at a time starting at $startFrom, so the whole
 * file is never held in memory. The tail of what has already been read is kept
 * between reads so that a match straddling a read boundary is still found.
 *
 * All offsets are byte offsets (the unit fseek() works in), not character offsets.
 *
 * @param resource $handle - a seekable stream, typically a file handle
 * @param string $string - what string to search for
 * @param int $startFrom - byte offset to start searching from
 * @param int $chunk - number of bytes to read per step
 * @param string $prev - bytes located immediately before $startFrom in which a match may begin
 * @return int|bool - the absolute byte offset of the match, or false on EOF or on a seek/read error
 */
function getPos($handle, $string, $startFrom=0, $chunk=1024, $prev='') {
    // Set the file cursor on the startFrom position
    if (fseek($handle, $startFrom, SEEK_SET) === -1) {
        return false;
    }
    $needleLength = strlen($string);
    // $offset is the absolute position of the first byte of the next read
    $offset = $startFrom;
    $carry = $prev;
    // Iterate instead of recursing: one stack frame per chunk does not scale to huge files
    while (true) {
        // Read data
        $data = fread($handle, $chunk);
        if ($data === false) {
            return false;
        }
        // Try to find the search $string in the carried-over tail plus this chunk
        $haystack = $carry . $data;
        $stringPos = strpos($haystack, $string);
        // We found the string, translate the position back to an absolute offset
        if ($stringPos !== false) {
            return $offset - strlen($carry) + $stringPos;
        }
        // We reached the end of the file
        if ($data === '' || feof($handle)) {
            return false;
        }
        $offset += strlen($data);
        // Only the last (needle length - 1) bytes can be the beginning of a match
        // that continues in the next chunk
        $carry = $needleLength > 1 ? substr($haystack, -($needleLength - 1)) : '';
    }
}
/**
 * Parse a string of XML with SimpleXML and convert the result into an array.
 *
 * Parse errors reported by libxml are collected and raised as a single
 * E_USER_WARNING that lists the individual messages.
 *
 * @param string $nodeAsString - a string representation of a XML node
 * @return array|string - whatever simplexml2array() produces for the parsed node
 */
function getArrayFromXMLString($nodeAsString) {
    // libxml only records errors for libxml_get_errors() while internal error handling is on
    $previousSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();
    $simpleXML = simplexml_load_string($nodeAsString);
    $errors = libxml_get_errors();
    libxml_clear_errors();
    // Restore the caller's setting so this function has no lasting side effect
    libxml_use_internal_errors($previousSetting);
    if ($errors) {
        // LibXMLError objects cannot be cast to string, so pull out the messages
        $messages = array();
        foreach ($errors as $error) {
            $messages[] = trim($error->message);
        }
        user_error('Libxml throws some errors. ' . implode(', ', $messages), E_USER_WARNING);
    }
    return simplexml2array($simpleXML);
}
/**
 * Recursively turn a SimpleXMLElement into plain arrays and strings.
 *
 * - A SimpleXMLElement (or subclass) is expanded through get_object_vars(); when that
 *   yields nothing the element's string value is returned instead.
 * - An array is converted entry by entry; an empty array yields ''.
 * - Anything else is cast to string.
 *
 * Attributes of an element are stored under the '@attributes' key of its array.
 *
 * @param SimpleXMLElement|array|mixed $xml
 * @return array|string
 */
function simplexml2array($xml) {
    // The element being expanded, kept so an empty element can fall back to its string value
    $element = null;
    $attributes = array();
    if ($xml instanceof SimpleXMLElement) {
        $attributeNodes = $xml->attributes();
        if ($attributeNodes !== null) {
            foreach ($attributeNodes as $name => $value) {
                $attributes[$name] = (string) $value;
            }
        }
        $element = $xml;
        $xml = get_object_vars($xml);
    }
    if (!is_array($xml)) {
        return (string) $xml;
    }
    if (count($xml) == 0) {
        return $element === null ? '' : (string) $element;
    }
    $result = array();
    foreach ($xml as $key => $value) {
        $result[$key] = simplexml2array($value);
    }
    // Make sure the attributes are present as plain strings
    if ($attributes) {
        $result['@attributes'] = $attributes;
    }
    return $result;
}
this is a great start for me. thanks so much
The error "simplexml_load_string(): namespace error : Namespace prefix commons on preference-order is not defined" is showing. How do I fix it?
Hi, how can I use nodeStringFromXMLFile with XML attributes like an ID? Thanks a lot.
Example:
"<reservation id="60613"><reservationNumber>38058</reservationNumber></reservation>"
I also get an internal error, and when I go to the logs I can't see it. Can you help me, please?
Thank you so much, I have been stuck for almost a week now, and I really could not work, and with just this now I can start working.
Thank you, you are an inspiration.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Increase the nested level in php.ini.