Skip to content

Instantly share code, notes, and snippets.

@kyleparisi
Last active November 14, 2016 13:29
Show Gist options
  • Save kyleparisi/ce1607c960102b4da3fef3d9d715eeed to your computer and use it in GitHub Desktop.
Save kyleparisi/ce1607c960102b4da3fef3d9d715eeed to your computer and use it in GitHub Desktop.
This php script takes a local xml file input and stdouts the equivalent tsv.
<?php
/**
* This PHP script reads a local XML file (passed with -f <file>) and writes the equivalent tab-separated values (TSV) to stdout.
*/
// Command-line options: "-f <file>" names the XML document to convert (value required).
$shortopts = "";
$shortopts .= "f:"; // Required value
$options = getopt($shortopts);
// getopt() returns false on failure, leaves "f" out when -f is missing or has no
// value, and yields an array of values when -f is repeated. Only a single string
// path is usable here, so every other outcome is reported as a missing file.
if (!is_array($options) || !isset($options['f']) || !is_string($options['f'])) {
    throw new ErrorException('File to process required. Please pass -f <file>');
}
$file = $options['f'];
/**
 * Base class for the XML passes: opens the input with XMLReader and positions the
 * cursor on the first "row-like" element so subclasses can walk the rows from there.
 */
class Parse {
    /** @var string Path of the XML document handed to XMLReader::open(). */
    public $file;
    /** @var array Namespaces (prefix => URI) found in the first matched node; set by moveToFirstNodeNamed(). */
    public $namespaces;
    /** @var XMLReader Reader positioned by moveToFirstNodeNamed(). */
    public $xml;

    /**
     * @param string $file Path of the XML document to read.
     */
    public function __construct($file) {
        $this->file = $file;
    }

    /**
     * Opens the file and advances the reader to the first node whose lower-cased
     * name is listed in $elements, then records the namespaces used by that node
     * and its descendants.
     *
     * @param array $elements Lower-case node names that mark the first row.
     * @return $this
     * @throws Exception When the file cannot be opened, or when no node with one
     *                   of the given names exists in the document.
     */
    public function moveToFirstNodeNamed($elements = array('item', 'row', 'product', 'items', 'entry')) {
        $path = (string) $this->file;
        $this->xml = new XMLReader;
        // An empty path is rejected up front so the failure is always reported as
        // the documented Exception rather than whatever open() does with "".
        if ($path === '' || !$this->xml->open($path)) {
            throw new Exception('Problem with read xml file ' . $path);
        }

        $found = false;
        while ($this->xml->read()) {
            if (in_array(strtolower($this->xml->name), $elements, true)) {
                $found = true;
                break;
            }
        }

        // Without this guard the reader would sit at end-of-document, readOuterXML()
        // would return an empty string and SimpleXMLElement would fail with an
        // unhelpful "String could not be parsed as XML".
        if (!$found) {
            $this->xml->close();
            throw new Exception(
                'No element named ' . implode(', ', $elements) . ' found in xml file ' . $path
            );
        }

        $node = new SimpleXMLElement($this->xml->readOuterXML());
        $this->namespaces = $node->getNamespaces(true);

        return $this;
    }
}
/**
 * First pass over the document: gathers the distinct child-element names of every
 * row node so they can be used as the column headers.
 */
class ParseHeaders extends Parse {
    /**
     * Merges the children of $node from every namespace recorded in $this->namespaces.
     *
     * @param SimpleXMLElement $node Row node to inspect.
     * @return array Child name => value, as reported by get_object_vars().
     */
    private function getAllNamespaceKeysFromNode($node) {
        $collected = array();
        foreach ($this->namespaces as $uri) {
            $collected = array_merge($collected, get_object_vars($node->children($uri)));
        }

        return $collected;
    }

    /**
     * Visits the current node and every following node of the same name (until the
     * reader's node name becomes empty) and returns each child name once, in order
     * of first appearance. Requires moveToFirstNodeNamed() to have been called.
     *
     * @return array List of header names.
     */
    public function getHeaders() {
        $seen = array();
        $reader = $this->xml;

        while ('' !== $reader->name) {
            $row = new SimpleXMLElement($reader->readOuterXML());

            $plainNames = array_keys(get_object_vars($row->children()));
            $namespacedNames = array_keys($this->getAllNamespaceKeysFromNode($row));

            // Names are stored as array keys so that repeats collapse into one entry.
            $seen = array_merge($seen, array_flip($plainNames), array_flip($namespacedNames));

            // Jump to the next node carrying the same name as the current one.
            $reader->next($reader->name);
        }

        return array_keys($seen);
    }
}
// Pass 1: scan every row once to learn the full set of column names.
$parser = new ParseHeaders($file);
$parser->moveToFirstNodeNamed();
$headers = $parser->getHeaders();
/**
 * Second pass over the document: writes the header line and one tab-separated line
 * per row node to stdout.
 */
class ParseBody extends Parse {
    /**
     * Flattens a field value to single-line text: decodes HTML entities, strips
     * tags, turns line breaks and tabs into spaces, drops non-printable characters
     * and collapses runs of whitespace. Not called by default; see the note in
     * mapBodyWithHeaders().
     *
     * @param string $value Raw field value.
     * @return string Cleaned value.
     */
    private function remove_chars($value) {
        $value = htmlspecialchars_decode($value);
        // ----- remove html elements -----
        $value = trim(preg_replace('/<[^>]*>/', ' ', $value));
        // ----- turn line breaks and tabs into spaces -----
        // This has to run BEFORE the non-printable strip below: \r, \n and \t are
        // themselves non-printable, so stripping first would delete them outright
        // and glue the neighbouring words together.
        $value = str_replace(array("\r", "\n", "\t"), ' ', $value);
        // ----- remove remaining non-printable characters -----
        // NOTE(review): without the /u modifier this class works on bytes; in the
        // default C locale that presumably also drops every non-ASCII byte - confirm
        // that ASCII-only output is intended.
        $value = preg_replace('/[[:^print:]]/', '', $value);
        // ----- collapse repeated whitespace -----
        $value = preg_replace('/\s+/', ' ', $value);
        // ----- replace MS special characters -----
        $search = array('/&lsquo;/u', '/&rsquo;/u', '/&ldquo;/u', '/&rdquo;/u', '/&mdash;/u', '/&nbsp;/u', '/&amp;/u', '/&quot;/u', '/&apos;/u', '/&reg;/u', '/&trade;/u', '/&frac34;/u', '/&frac12;/u', '/&frac14;/u', '/&ndash;/u', '/&deg;/u');
        $replace = array('\'', '\'', '"', '"', '-', '', '&', '"', '\'', '', '', '3/4', '1/2', '1/4', '-', '');
        $value = preg_replace($search, $replace, $value);
        // Decode twice around strip_tags(): some inputs carry a mix of encoded and
        // unencoded entities, so one pass is not enough to reach plain text.
        $value = html_entity_decode(strip_tags(html_entity_decode($value, ENT_QUOTES, 'UTF-8')));

        return $value;
    }

    /**
     * Copies the given fields into the row, but only into columns that exist in
     * the row. Repeated elements (arrays) are joined with " || ".
     *
     * @param array $row    Header name => cell value for the current row.
     * @param array $fields Child name => value, as reported by get_object_vars().
     * @return array The row with the matching cells filled in.
     */
    private function mergeFields($row, $fields) {
        foreach ($fields as $name => $value) {
            // A name without a header has no column; appending it would shift
            // every later cell out of alignment with the header line.
            if (!array_key_exists($name, $row)) {
                continue;
            }
            $row[$name] = is_array($value) ? implode(' || ', $value) : (string) $value;
        }

        return $row;
    }

    /**
     * Writes $headers followed by one line per row node to stdout, tab-delimited.
     * Requires moveToFirstNodeNamed() to have been called first.
     *
     * @param array $headers Column names, in output order.
     */
    public function mapBodyWithHeaders($headers) {
        $stdout = fopen('php://stdout', 'w');
        // The escape character is passed explicitly (it is the historical default)
        // so the call does not depend on the default value.
        fputcsv($stdout, $headers, chr(9), '"', '\\');

        // Template with one empty cell per header. Every row starts from a fresh
        // copy so a field missing from one row cannot inherit the previous row's value.
        $blankRow = array_fill_keys($headers, null);

        while ($this->xml->name !== '') {
            $node = new SimpleXMLElement($this->xml->readOuterXML());
            $row = $blankRow;

            // handle un-namespaced elements
            $row = $this->mergeFields($row, get_object_vars($node->children()));

            // handle elements of every namespace recorded for the document
            foreach ($this->namespaces as $uri) {
                $row = $this->mergeFields($row, get_object_vars($node->children($uri)));
            }

            // If a field needs to be cleaned, do it here, for example:
            //   $row['title'] = $this->remove_chars($row['title']);
            fputcsv($stdout, $row, chr(9), '"', '\\');

            // Jump to the next node carrying the same name as the current one.
            $this->xml->next($this->xml->name);
        }

        fclose($stdout);
    }
}
// Pass 2: re-read the file from the start and emit the header line plus one line per row.
$parser = new ParseBody($file);
$parser->moveToFirstNodeNamed();
$parser->mapBodyWithHeaders($headers);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment