Skip to content

Instantly share code, notes, and snippets.

@kategray
Last active April 21, 2017 00:42
Show Gist options
  • Save kategray/fd4c9bd4229bc9a792bbe0b311e14e44 to your computer and use it in GitHub Desktop.
Save kategray/fd4c9bd4229bc9a792bbe0b311e14e44 to your computer and use it in GitHub Desktop.
Extract text from the Canadian business directory
#!/usr/bin/env php
<?php
/**
* Note: This requires the html5lib-php library (it provides the HTML5_Parser class used below).
* https://github.com/html5lib/html5lib-php
*/
/*
 * Process exit codes handed back to the calling shell.
 */
const ERROR_NONE = 0;        // Success
const ERROR_USAGE = 1;       // Wrong number of command-line arguments
const ERROR_NOSUCHFILE = 2;  // Input file missing or unreadable
const ERROR_PARSING = 3;     // Document did not have the expected structure
// Ensure usage is correct: exactly one argument (the input file) after the script name
if ($_SERVER['argc'] !== 2) {
    printf("Usage: %s [input file]\n", $_SERVER['argv'][0]);
    exit(ERROR_USAGE);
}
// Ensure that a proper filename was passed.
// file_exists() is also true for directories, which cannot be read as a document,
// so directories are rejected here rather than failing later while loading.
$input_file = $_SERVER['argv'][1];
if (!file_exists($input_file) || is_dir($input_file) || !is_readable($input_file)) {
    echo "Input file must exist and be readable.\n";
    exit(ERROR_NOSUCHFILE);
}
// Support library. This uses proper separators to load html5lib-php/library/HTML5/Parser.php
require_once __DIR__ . join(DIRECTORY_SEPARATOR, array('', 'html5lib-php', 'library', 'HTML5', 'Parser.php'));
// Load the document. The file can still become unreadable between the earlier
// check and this read, so the result is verified instead of parsing `false`.
$html = file_get_contents($input_file);
if ($html === false) {
    echo "Input file must exist and be readable.\n";
    exit(ERROR_NOSUCHFILE);
}
$doc = HTML5_Parser::parse($html);
$xpath = new DOMXPath($doc);
// Look for divs with the class of "col-md-4 mrgn-sm-sm"
$elements = $xpath->query('//div[contains(@class, "col-md-4") and contains(@class, "mrgn-sm-sm")]');
// Use ->length rather than count(): DOMNodeList only became Countable in PHP 7.2,
// and on older versions count() reports 1 for any object, hiding a failed match.
if ($elements === false || $elements->length !== 1) {
    echo "Unable to parse document. There should be exactly one tag with the class col-md-4 mrgn-sm-sm.\n";
    exit(ERROR_PARSING);
}
// The single matching container that holds the business details
$element = $elements->item(0);
unset($elements);
/**
 * Temporary variable used to store the current attribute (the key the next value is saved under)
 */
$attr_name = null;
/**
 * Stores the current business as key => value pairs
 */
$business = array();
// Now that we have the section we want, extract the data.
// The container is walked in document order: an <h2> sets the current key,
// an <address> supplies the value for that key, and a <p> carries its own
// key (in an <abbr title="...">) followed by the value as plain text.
foreach ($element->childNodes as $child_node) {
    // Only element nodes matter at this level; skip text and comment nodes
    if (!($child_node instanceof DOMElement)) {
        continue;
    }
    if ($child_node->tagName == 'h2') {
        // h2 tags are labels. Clean any surrounding whitespace or colons from the
        // label text itself *before* building the key, so that "Mailing Address:"
        // becomes "mailing_address" rather than "mailing_address:".
        $label = trim($child_node->nodeValue, " \t\n\r\0\x0B:");
        $attr_name = str_replace(' ', '_', strtolower($label));
        unset($label);
    } else if ($child_node->tagName == 'address') {
        // The key will come from the H2 tag, and the value will come from the address tag
        $address = $child_node->nodeValue;
        /*
         * Clean up white space a little bit.
         */
        // Eliminate any DOS-style line endings, and replace all multiple newlines and tabs with a single newline
        $address = preg_replace("/[\t\r\n]+/", "\n", $address);
        // Eliminate any spaces after a newline
        $address = preg_replace("/\n[\s]+/", "\n", $address);
        // Remove leading and trailing spaces
        $address = trim($address);
        // Save the data
        $business[$attr_name] = $address;
    } else if ($child_node->tagName == 'p') {
        /*
         * Paragraphs are a little different. They have a <abbr> tag, then the value.
         */
        foreach ($child_node->childNodes as $sub_node) {
            if (($sub_node instanceof DOMElement) && $sub_node->tagName == 'abbr') {
                // The title attribute contains the full (unabbreviated) field name
                $attr_name = strtolower($sub_node->getAttribute('title'));
            } else if ($sub_node instanceof DOMText) {
                // Clean any leading or trailing whitespace or colons
                $value = trim($sub_node->nodeValue, " \t\n\r\0\x0B:");
                // Whitespace-only text nodes (indentation around the <abbr>) carry no
                // data. Storing them would overwrite the previous field when they
                // precede the <abbr>, or blank out the real value when they follow it.
                if ($value !== '') {
                    // Save this to the business
                    $business[$attr_name] = $value;
                }
                // Clean up (not necessary, but still good practice)
                unset($value);
            }
        }
    }
}
var_dump($business);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment