Last active
April 21, 2017 00:42
-
-
Save kategray/fd4c9bd4229bc9a792bbe0b311e14e44 to your computer and use it in GitHub Desktop.
Extract text from the Canadian business directory
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env php
<?php
/**
 * Extract the details of a single business from a saved page of the
 * Canadian business directory and dump them as an associative array.
 *
 * Usage: <script> [input file]
 *
 * Note: This requires the html5lib-php library, checked out next to this
 * script in a directory named "html5lib-php".
 * https://github.com/html5lib/html5lib-php
 */

/*
 * Exit Codes
 */
define ('ERROR_NONE', 0);
define ('ERROR_USAGE', 1);
define ('ERROR_NOSUCHFILE', 2);
define ('ERROR_PARSING', 3);

/**
 * Convert a human readable label into an array key.
 *
 * Surrounding whitespace and colons are stripped, the text is lower-cased
 * and spaces become underscores, so "Mailing Address:" becomes
 * "mailing_address".
 *
 * @param string $label Label text as found in the document.
 * @return string Normalised key.
 */
function normalize_key ($label) {
    $label = trim ($label, " \t\n\r\0\x0B:");
    return str_replace (' ', '_', strtolower ($label));
}

// Ensure usage is correct
if ($_SERVER['argc'] != 2) {
    echo sprintf ("Usage: %s [input file]\n", $_SERVER['argv'][0]);
    exit (ERROR_USAGE);
}

// Ensure that a proper filename was passed
$input_file = $_SERVER['argv'][1];
if (!file_exists ($input_file) || !is_readable ($input_file)) {
    echo sprintf ("Input file must exist and be readable.\n");
    exit (ERROR_NOSUCHFILE);
}

// Support library. This uses proper separators to load html5lib-php/library/HTML5/Parser.php
require_once __DIR__ . join (DIRECTORY_SEPARATOR, array ('', 'html5lib-php', 'library', 'HTML5', 'Parser.php'));

// Load the document. The file can still become unreadable between the check
// above and this read, so the result is verified as well.
$html = file_get_contents ($input_file);
if ($html === false) {
    echo sprintf ("Input file must exist and be readable.\n");
    exit (ERROR_NOSUCHFILE);
}
$doc = HTML5_Parser::parse ($html);
unset ($html);
$xpath = new DOMXpath ($doc);

// Look for divs with the class of "col-md-4 mrgn-sm-sm"
$elements = $xpath->query ('//div[contains(@class, "col-md-4") and contains(@class, "mrgn-sm-sm")]');

// query() returns false on failure; ->length is used instead of count()
// because DOMNodeList is only Countable from PHP 7.2 onwards.
if ($elements === false || $elements->length !== 1) {
    echo "Unable to parse document. There should be exactly one tag with the class col-md-4 mrgn-sm-sm.\n";
    exit (ERROR_PARSING);
}
$element = $elements->item (0);
unset ($elements);

/**
 * Key derived from the most recent <h2> label; used for the next <address>
 */
$attr_name = NULL;

/**
 * Stores the current business
 */
$business = array ();

// Now that we have the section we want, extract the data
foreach ($element->childNodes as $child_node) {
    // Only elements carry data at this level; skip text and comment nodes
    if (!($child_node instanceof DOMElement)) {
        continue;
    }

    if ($child_node->tagName === 'h2') {
        // h2 tags are labels: convert "Mailing Address:" to "mailing_address"
        $attr_name = normalize_key ($child_node->nodeValue);
    } else if ($child_node->tagName === 'address') {
        // The key comes from the preceding h2 tag, the value from the address tag
        if ($attr_name === NULL) {
            // Without a label there is no meaningful key to store this under
            fwrite (STDERR, "Warning: skipping an address that has no preceding h2 label.\n");
            continue;
        }
        $address = $child_node->nodeValue;

        /*
         * Clean up white space a little bit.
         */
        // Eliminate any DOS-style line endings, and replace all multiple newlines and tabs with a single newline
        $address = preg_replace ("/[\t\r\n]+/", "\n", $address);
        // Eliminate any spaces after a newline
        $address = preg_replace ("/\n[\s]+/", "\n", $address);
        // Remove leading and trailing spaces
        $address = trim ($address);

        // Save the data
        $business[$attr_name] = $address;
        unset ($address);
    } else if ($child_node->tagName === 'p') {
        /*
         * Paragraphs are a little different. They have an <abbr> tag, then the value.
         * The key is tracked separately from the h2 label so that stray text
         * inside a paragraph can never overwrite a previously stored address.
         */
        $field_name = NULL;
        foreach ($child_node->childNodes as $sub_node) {
            if (($sub_node instanceof DOMElement) && $sub_node->tagName === 'abbr') {
                // The title attribute contains the full name of the field
                $field_name = normalize_key ($sub_node->getAttribute ('title'));
                // Record the field even if no value follows it
                if (!isset ($business[$field_name])) {
                    $business[$field_name] = '';
                }
            } else if ($sub_node instanceof DOMText) {
                // Clean any leading or trailing whitespace or colons
                $value = trim ($sub_node->nodeValue, " \t\n\r\0\x0B:");
                // Ignore text that precedes the <abbr>, and whitespace-only
                // text that would otherwise wipe out the real value
                if ($field_name !== NULL && $value !== '') {
                    $business[$field_name] = $value;
                }
                unset ($value);
            }
        }
        unset ($field_name);
    }
}

var_dump ($business);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment