kategray · April 21, 2017 00:42
diff --git a/extract.php b/extract.php
 #!/usr/bin/env php
 <?php
 /**
  * Extracts labelled business details from a saved HTML page and dumps them.
  *
  * Usage: extract.php [input file]
  *
  * The page must contain exactly one <div> whose class attribute contains
  * both "col-md-4" and "mrgn-sm-sm".  The direct children of that div are
  * read as follows:
  *   - <h2>      supplies the label (array key) for the data that follows it
  *   - <address> supplies a value stored under the most recent label
  *   - <p>       holds <abbr title="..."> labels, each followed by a text value
  *
  * The collected key/value pairs are printed with var_dump().
  *
  * Note: This requires the html5lib-php library.
  * https://github.com/html5lib/html5lib-php
  */

 /*
  * Exit Codes
  */
 define ('ERROR_NONE',       0);
 define ('ERROR_USAGE',      1);
 define ('ERROR_NOSUCHFILE', 2);
 define ('ERROR_PARSING',    3);

 // Ensure usage is correct
 if ($_SERVER['argc'] != 2) {
 	printf ("Usage: %s [input file]\n", $_SERVER['argv'][0]);
 	exit (ERROR_USAGE);
 }

 // Ensure that a proper filename was passed.  is_file() (rather than
 // file_exists()) also rejects directories, which cannot be read as a document.
 $input_file = $_SERVER['argv'][1];
 if (!is_file ($input_file) || !is_readable ($input_file)) {
 	echo "Input file must exist and be readable.\n";
 	exit (ERROR_NOSUCHFILE);
 }

 // The file can still fail to load (e.g. it vanished after the check above)
 $html = file_get_contents ($input_file);
 if ($html === false) {
 	echo "Input file must exist and be readable.\n";
 	exit (ERROR_NOSUCHFILE);
 }

 // Support library.  This uses proper separators to load html5lib-php/library/HTML5/Parser.php
 require_once  __DIR__ . join(DIRECTORY_SEPARATOR, array ('', 'html5lib-php', 'library', 'HTML5', 'Parser.php'));

 // Load the document
 $doc = HTML5_Parser::parse ($html);
 $xpath = new DOMXpath($doc);
 unset ($html);

 // Look for divs with the class of "col-md-4 mrgn-sm-sm"
 $elements = $xpath->query('//div[contains(@class, "col-md-4") and contains(@class, "mrgn-sm-sm")]');

 // Use the node list's own length: count() only works on DOMNodeList from PHP 7.2
 // onwards, and query() returns false when the expression cannot be evaluated.
 if ($elements === false || $elements->length != 1) {
 	echo "Unable to parse document.  There should be exactly one tag with the class col-md-4 mrgn-sm-sm.\n";
 	exit (ERROR_PARSING);
 }
 $element = $elements->item(0);
 unset ($elements);

 /**
  * Characters stripped from both ends of labels and values: whitespace and colons
  */
 $strip_chars = " \t\n\r\0\x0B:";

 /**
  * Temporary variable used to store the current attribute
  */
 $attr_name = NULL;

 /**
  * Stores the current business
  */
 $business = array();

 // Now that we have the section we want, extract the data
 foreach ($element->childNodes as $child_node) {
 	// Only element nodes are inspected; anything else between them is ignored
 	if (!($child_node instanceof DOMElement)) {
 		continue;
 	}

 	if ($child_node->tagName == 'h2') {
 		// h2 tags are labels.  Clean any surrounding whitespace or colons from
 		// the heading text itself, then convert it to a key for storage:
 		// "Mailing Address:" becomes "mailing_address"
 		$label = trim ($child_node->nodeValue, $strip_chars);
 		$attr_name = str_replace (' ', '_', strtolower ($label));
 		unset ($label);
 	} else if ($child_node->tagName == 'address') {
 		// The key will come from the H2 tag, and the value will come from the address tag
 		$address = $child_node->nodeValue;

 		/*
 		 * Clean up white space a little bit.
 		 */
 		// Eliminate any DOS-style line endings, and replace all multiple newlines and tabs with a single newline
 		$address = preg_replace ("/[\t\r\n]+/", "\n", $address);

 		// Eliminate any spaces after a newline
 		$address = preg_replace ("/\n[\s]+/", "\n", $address);

 		// Remove leading and trailing spaces
 		$address = trim ($address);

 		// Save the data
 		$business[$attr_name] = $address;
 		unset ($address);
 	} else if ($child_node->tagName == 'p') {
 		/*
 		 * Paragraphs are a little different.  They have a <abbr> tag, then the value.
 		 */
 		foreach ($child_node->childNodes as $sub_node) {
 			if (($sub_node instanceof DOMElement) && $sub_node->tagName == 'abbr') {
 				// The title attribute contains the full label
 				$attr_name = strtolower ($sub_node->getAttribute ('title'));
 			} else if ($sub_node instanceof DOMText) {
 				// Clean any leading or trailing whitespace or colons
 				$value = trim ($sub_node->nodeValue, $strip_chars);

 				// Whitespace-only text (e.g. the indentation before an <abbr> or
 				// after a line break) carries no data; storing it would overwrite
 				// a real value that was already saved under the current key.
 				if ($value !== '') {
 					$business[$attr_name] = $value;
 				}

 				// Clean up (not necessary, but still good practice)
 				unset ($value);
 			}
 		}
 	}
 }

 var_dump ($business);
 exit (ERROR_NONE);
	#!/usr/bin/env php
	<?php
	/**
	 * Extracts labelled business details from a saved HTML page and dumps them.
	 *
	 * Usage: extract.php [input file]
	 *
	 * The page must contain exactly one <div> whose class attribute contains
	 * both "col-md-4" and "mrgn-sm-sm".  The direct children of that div are
	 * read as follows:
	 *   - <h2>      supplies the label (array key) for the data that follows it
	 *   - <address> supplies a value stored under the most recent label
	 *   - <p>       holds <abbr title="..."> labels, each followed by a text value
	 *
	 * The collected key/value pairs are printed with var_dump().
	 *
	 * Note: This requires the html5lib-php library.
	 * https://github.com/html5lib/html5lib-php
	 */

	/*
	 * Exit Codes
	 */
	define ('ERROR_NONE', 0);
	define ('ERROR_USAGE', 1);
	define ('ERROR_NOSUCHFILE', 2);
	define ('ERROR_PARSING', 3);

	// Ensure usage is correct
	if ($_SERVER['argc'] != 2) {
		printf ("Usage: %s [input file]\n", $_SERVER['argv'][0]);
		exit (ERROR_USAGE);
	}

	// Ensure that a proper filename was passed.  is_file() (rather than
	// file_exists()) also rejects directories, which cannot be read as a document.
	$input_file = $_SERVER['argv'][1];
	if (!is_file ($input_file) || !is_readable ($input_file)) {
		echo "Input file must exist and be readable.\n";
		exit (ERROR_NOSUCHFILE);
	}

	// The file can still fail to load (e.g. it vanished after the check above)
	$html = file_get_contents ($input_file);
	if ($html === false) {
		echo "Input file must exist and be readable.\n";
		exit (ERROR_NOSUCHFILE);
	}

	// Support library. This uses proper separators to load html5lib-php/library/HTML5/Parser.php
	require_once __DIR__ . join(DIRECTORY_SEPARATOR, array ('', 'html5lib-php', 'library', 'HTML5', 'Parser.php'));

	// Load the document
	$doc = HTML5_Parser::parse ($html);
	$xpath = new DOMXpath($doc);
	unset ($html);

	// Look for divs with the class of "col-md-4 mrgn-sm-sm"
	$elements = $xpath->query('//div[contains(@class, "col-md-4") and contains(@class, "mrgn-sm-sm")]');

	// Use the node list's own length: count() only works on DOMNodeList from PHP 7.2
	// onwards, and query() returns false when the expression cannot be evaluated.
	if ($elements === false || $elements->length != 1) {
		echo "Unable to parse document. There should be exactly one tag with the class col-md-4 mrgn-sm-sm.\n";
		exit (ERROR_PARSING);
	}
	$element = $elements->item(0);
	unset ($elements);

	/**
	 * Characters stripped from both ends of labels and values: whitespace and colons
	 */
	$strip_chars = " \t\n\r\0\x0B:";

	/**
	 * Temporary variable used to store the current attribute
	 */
	$attr_name = NULL;

	/**
	 * Stores the current business
	 */
	$business = array();

	// Now that we have the section we want, extract the data
	foreach ($element->childNodes as $child_node) {
		// Only element nodes are inspected; anything else between them is ignored
		if (!($child_node instanceof DOMElement)) {
			continue;
		}

		if ($child_node->tagName == 'h2') {
			// h2 tags are labels.  Clean any surrounding whitespace or colons from
			// the heading text itself, then convert it to a key for storage:
			// "Mailing Address:" becomes "mailing_address"
			$label = trim ($child_node->nodeValue, $strip_chars);
			$attr_name = str_replace (' ', '_', strtolower ($label));
			unset ($label);
		} else if ($child_node->tagName == 'address') {
			// The key will come from the H2 tag, and the value will come from the address tag
			$address = $child_node->nodeValue;

			/*
			 * Clean up white space a little bit.
			 */
			// Eliminate any DOS-style line endings, and replace all multiple newlines and tabs with a single newline
			$address = preg_replace ("/[\t\r\n]+/", "\n", $address);

			// Eliminate any spaces after a newline
			$address = preg_replace ("/\n[\s]+/", "\n", $address);

			// Remove leading and trailing spaces
			$address = trim ($address);

			// Save the data
			$business[$attr_name] = $address;
			unset ($address);
		} else if ($child_node->tagName == 'p') {
			/*
			 * Paragraphs are a little different. They have a <abbr> tag, then the value.
			 */
			foreach ($child_node->childNodes as $sub_node) {
				if (($sub_node instanceof DOMElement) && $sub_node->tagName == 'abbr') {
					// The title attribute contains the full label
					$attr_name = strtolower ($sub_node->getAttribute ('title'));
				} else if ($sub_node instanceof DOMText) {
					// Clean any leading or trailing whitespace or colons
					$value = trim ($sub_node->nodeValue, $strip_chars);

					// Whitespace-only text (e.g. the indentation before an <abbr> or
					// after a line break) carries no data; storing it would overwrite
					// a real value that was already saved under the current key.
					if ($value !== '') {
						$business[$attr_name] = $value;
					}

					// Clean up (not necessary, but still good practice)
					unset ($value);
				}
			}
		}
	}

	var_dump ($business);
	exit (ERROR_NONE);