Last active
November 14, 2016 13:29
-
-
Save kyleparisi/ce1607c960102b4da3fef3d9d715eeed to your computer and use it in GitHub Desktop.
This php script takes a local xml file input and stdouts the equivalent tsv.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* This PHP script reads a local XML file (passed with -f <file>) and writes the equivalent tab-separated values (TSV) to stdout.
*/ | |
// Command-line options: -f <file> (value required) names the XML file to convert.
$shortopts = "";
$shortopts .= "f:"; // Required value
$options = getopt($shortopts);
// getopt() returns false on failure, leaves 'f' out when the flag was not given,
// and returns an array of values when -f is repeated. Only a single string path is usable.
if (!is_array($options) || !isset($options['f']) || !is_string($options['f'])) {
    throw new ErrorException('File to process required. Please pass -f <file>');
}
$file = $options['f'];
/**
 * Base class for one streaming pass over an XML file.
 *
 * Holds the file path, the XMLReader cursor and the namespace map of the
 * first record element found by moveToFirstNodeNamed().
 */
class Parse {
    /** @var string path of the XML file to read */
    public $file;
    /** @var array namespace prefix => URI, taken from the first record element */
    public $namespaces;
    /** @var XMLReader cursor over $file; set by moveToFirstNodeNamed() */
    public $xml;

    /**
     * @param string $file path of the XML file to read
     */
    public function __construct($file) {
        $this->file = $file;
    }

    /**
     * Opens the file and advances the reader to the first element whose
     * lower-cased name is one of $elements, then records that element's namespaces.
     *
     * @param array $elements lower-case element names that count as a record
     * @return $this
     * @throws Exception when the file cannot be opened or contains no such element
     */
    public function moveToFirstNodeNamed($elements = array('item', 'row', 'product', 'items', 'entry')) {
        $this->xml = new XMLReader;
        if (!$this->xml->open($this->file)) {
            throw new Exception('Problem with read xml file ' . $this->file);
        }

        $found = false;
        while ($this->xml->read()) {
            // Only start tags count: other node types (e.g. a DOCTYPE) also expose a name.
            if ($this->xml->nodeType === XMLReader::ELEMENT
                && in_array(strtolower($this->xml->name), $elements, true)) {
                $found = true;
                break;
            }
        }

        if (!$found) {
            $this->xml->close();
            throw new Exception(
                'No element named ' . implode(', ', $elements) . ' found in xml file ' . $this->file
            );
        }

        $node = new SimpleXMLElement($this->xml->readOuterXML());
        $this->namespaces = $node->getNamespaces(true);

        return $this;
    }
}
/**
 * First pass over the file: collects every child-element name used by any record.
 */
class ParseHeaders extends Parse {
    /**
     * Merges the children of $node found under each namespace in $this->namespaces.
     *
     * @param SimpleXMLElement $node record element to inspect
     * @return array child name => value as reported by get_object_vars()
     */
    private function getAllNamespaceKeysFromNode($node) {
        $collected = array();
        foreach ($this->namespaces as $uri) {
            $collected = array_merge($collected, get_object_vars($node->children($uri)));
        }
        return $collected;
    }

    /**
     * Walks the record elements from the reader's current position onwards and
     * returns the distinct child names in the order they were first seen.
     *
     * @return array list of header names
     */
    public function getHeaders() {
        $seen = array();
        // next() jumps to the following sibling with the same name; the reader
        // reports an empty name once there is none left.
        for (; $this->xml->name !== ''; $this->xml->next($this->xml->name)) {
            $node = new SimpleXMLElement($this->xml->readOuterXML());
            $plainNames = array_keys(get_object_vars($node->children()));
            $namespacedNames = array_keys($this->getAllNamespaceKeysFromNode($node));
            // Un-namespaced names first, then namespaced ones; keep only the first sighting.
            foreach (array_merge($plainNames, $namespacedNames) as $name) {
                if (!isset($seen[$name])) {
                    $seen[$name] = true;
                }
            }
        }
        return array_keys($seen);
    }
}
// Pass 1: scan the whole file once to learn every column name.
$parser = new ParseHeaders($file);
$parser->moveToFirstNodeNamed();
$headers = $parser->getHeaders();
/**
 * Second pass over the file: writes the header line and one tab-delimited
 * line per record element to stdout.
 */
class ParseBody extends Parse {
    /**
     * Reduces a field value to single-line plain text: decodes entities, drops
     * markup, maps common typographic characters to ASCII and removes the rest.
     *
     * Not called by this class as written; mapBodyWithHeaders() only shows
     * (commented out) how to apply it to a column.
     *
     * @param string $value raw field value
     * @return string cleaned value
     */
    private function remove_chars($value) {
        $value = htmlspecialchars_decode($value);
        // ----- remove html elements -----
        $value = trim(preg_replace('/<[^>]*>/', ' ', $value));
        // ----- turn line breaks and tabs into spaces -----
        // Has to run before the non-printable strip below, which would otherwise
        // delete these characters outright and glue neighbouring words together.
        $value = str_replace(array("\r", "\n", "\t"), ' ', $value);
        // ----- replace MS special characters -----
        // Also has to run before the strip below: that strip is byte-wise and
        // would remove these multi-byte characters before they could be mapped.
        $value = strtr($value, array(
            '‘' => '\'',
            '’' => '\'',
            '“' => '"',
            '”' => '"',
            '—' => '-',
            '–' => '-',
            "\xC2\xA0" => ' ', // non-breaking space
            '®' => '',
            '™' => '',
            '°' => '',
            '¾' => '3/4',
            '½' => '1/2',
            '¼' => '1/4',
        ));
        // ----- remove whatever is still not printable -----
        $value = preg_replace('/[[:^print:]]/', '', $value);
        // ----- remove double space & white characters -----
        $value = preg_replace('/\s+/', ' ', $value);
        //make sure _all_ html entities are converted to the plain ascii equivalents - it appears
        //in some MS headers, some html entities are encoded and some aren't
        $value = html_entity_decode(strip_tags(html_entity_decode($value, ENT_QUOTES, 'UTF-8')));
        return $value;
    }

    /**
     * Converts the children of one record into name => string pairs.
     * Repeated child names arrive as arrays and are joined with ' || '.
     *
     * @param array $fields result of get_object_vars() on a children() list
     * @return array name => string
     */
    private function flattenFields($fields) {
        $flat = array();
        foreach ($fields as $name => $value) {
            $flat[$name] = is_array($value) ? implode(' || ', $value) : (string) $value;
        }
        return $flat;
    }

    /**
     * Writes $headers and then every record from the reader's current position
     * onwards to stdout, tab-delimited, one column per header.
     *
     * @param array $headers column names, in output order
     */
    public function mapBodyWithHeaders($headers) {
        $stdout = fopen('php://stdout', 'w');
        // The escape character is passed explicitly (it is the historical default)
        // so output stays the same and PHP 8.4 does not emit a deprecation notice.
        fputcsv($stdout, $headers, "\t", '"', '\\');

        $blankRow = array_fill_keys($headers, null);

        while ($this->xml->name !== '') {
            $node = new SimpleXMLElement($this->xml->readOuterXML());

            // Start every record from an empty row; otherwise a field missing
            // here would silently repeat the previous record's value.
            $row = $blankRow;

            // handle elements
            $fields = $this->flattenFields(get_object_vars($node->children()));

            // handle namespaces
            foreach ($this->namespaces as $uri) {
                $fields = array_merge($fields, $this->flattenFields(get_object_vars($node->children($uri))));
            }

            // Keep only known columns so each line has exactly one cell per header.
            $row = array_replace($row, array_intersect_key($fields, $row));

            // if a field needs to be cleaned do the following
            // $row['title'] = $this->remove_chars($row['title']);
            fputcsv($stdout, $row, "\t", '"', '\\');

            // advance to the next sibling with the same element name
            $this->xml->next($this->xml->name);
        }

        fclose($stdout);
    }
}
// Pass 2: re-read the file and emit one line per record using the collected headers.
$parser = new ParseBody($file);
$parser->moveToFirstNodeNamed();
$parser->mapBodyWithHeaders($headers);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment