Skip to content

Instantly share code, notes, and snippets.

@mondalaci
Last active December 16, 2015 02:39
Show Gist options
  • Save mondalaci/5364150 to your computer and use it in GitHub Desktop.
Save mondalaci/5364150 to your computer and use it in GitHub Desktop.
Convert the HTML of the Mechanical Keyboard Guide forum thread of OverClock.net to Atom feed.
<?php
// It's a webscraper that converts the most recent posts of the
// Mechanical Keyboard Guide forum thread of Overclock.net to RSS.
//
// This piece of code used to work but not anymore because the site has changed its
// layout but it's expected - chasing a moving target is the nature of webscraping.
//
// MySQL table schema:
// CREATE TABLE IF NOT EXISTS `mechanical_keyboard_guide` (
// `id` INT(11) NOT NULL,
// `timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
// PRIMARY KEY (`id`)
// ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
include 'config.php'; // include $database_{servername, username, password, dbname}
$start_url = 'http://www.overclock.net/computer-peripherals/491752-mechanical-keyboard-guide-10000.html';
function DOMinnerHTML($element)
{
// Borrowed from php.net
$innerHTML = "";
$children = $element->childNodes;
foreach ($children as $child) {
$tmp_dom = new DOMDocument();
$tmp_dom->appendChild($tmp_dom->importNode($child, true));
$innerHTML .= trim($tmp_dom->saveHTML()) . ' ';
}
return $innerHTML;
}
function page_to_entries($html)
{
$domdocument = new DOMDocument();
@$domdocument->loadHTML($html);
$domxpath = new DomXPath($domdocument);
$xpath = '/html/body/div[2]/div/div/div/div/div/table';
$nodelist = $domxpath->query($xpath);
$entries = array();
foreach ($nodelist as $node) {
$link_node = $node->firstChild->childNodes->Item(2)->childNodes->Item(1);
$id = $link_node->textContent;
$url = $link_node->getAttribute('href');
$username = $node->childNodes->Item(1)->firstChild->childNodes->Item(1)->textContent;
$comment = DOMInnerHTML($node->childNodes->Item(1)->childNodes->Item(2)->childNodes->Item(8));
$entry = array('id'=>$id, 'url'=>$url, 'username'=>$username, 'comment'=>$comment);
$entries[] = $entry;
}
return $entries;
}
function query($sql)
{
if (($result=mysql_query($sql)) === false) {
die(mysql_error());
}
return $result;
}
function register_entry_and_get_timestamp($id)
{
if (!is_numeric($id)) {
die("Entry ID is not numeric!");
}
$result = query("INSERT IGNORE INTO mechanical_keyboard_guide SET id=$id");
$result = query("SELECT timestamp FROM mechanical_keyboard_guide WHERE id=$id");
$row = mysql_fetch_assoc($result);
$timestamp = strtr($row['timestamp'], ' ', 'T') . 'Z';
return $timestamp;
}
function add_updated_timestamp(&$entries)
{
for ($i=0; $i<count ($entries); $i++) {
$entries[$i]['updated'] = register_entry_and_get_timestamp($entries[$i]['id']);
}
}
function print_entry($entry)
{
$url = $entry['url'];
$username = htmlspecialchars($entry['username']);
$comment = $entry['comment'];
$updated = $entry['updated'];
print "<entry>\n";
print " <title>$username</title>\n";
print " <link href=\"$url\"/>\n";
print " <id>$url</id>\n";
print " <updated>$updated</updated>\n";
print " <summary type=\"html\">< ![CDATA[$comment]]></summary>\n";
print "\n";
}
// Set up MySQL connection.
if (mysql_connect($database_servername, $database_username, $database_password) === false) {
die('Failed to connect to the MySQL server. Please check the $database_servername, $database_username, $database_password variables in config.php');
}
if (mysql_select_db($database_dbname) === false) {
die('Failed to select the MySQL database. Please check the $database_dbname variable in config.php');
}
// Set up cURL.
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
// Fetch last page.
curl_setopt($ch, CURLOPT_URL, $start_url);
$last_page_html = curl_exec($ch);
// Get the page ID of the page before the last page.
$last_page_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
preg_match('/-([0-9]+)\.html$/', $last_page_url, $matches);
$last_page_id = $matches[1];
$almost_last_page_id = $last_page_id - 1;
// Fetch the page before the last page.
$almost_last_page_url = "http://www.overclock.net/computer-peripherals/491752-mechanical-keyboard-guide-$almost_last_page_id.html";
curl_setopt($ch, CURLOPT_URL, $almost_last_page_url);
$almost_last_page_html = curl_exec($ch);
$almost_last_page_entries = page_to_entries($almost_last_page_html);
$last_page_entries = page_to_entries($last_page_html);
$entries = array_merge($almost_last_page_entries, $last_page_entries);
add_updated_timestamp($entries);
$last_updated_timestamp = $entries[count($entries)-1]['updated'];
print '< ?xml version="1.0" encoding="utf-8" ?>';
?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Overclock.net's Mechanical Keyboards Guide</title>
<subtitle>Overclock.net's Mechanical Keyboards Guide</subtitle>
<link href="http://www.overclock.net/computer-peripherals/491752-mechanical-keyboard-guide.html"/>
<updated>< ?php print $last_updated_timestamp ?></updated>
<author>
<name>Overclock.net's Mechanical Keyboards Guide</name>
<email>[email protected]</email>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
< ?php foreach ($entries as $entry) print_entry($entry) ?>
</feed>
</count>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment