Skip to content

Instantly share code, notes, and snippets.

@xlanor
Last active September 19, 2017 16:23
Show Gist options
  • Save xlanor/1298cd3bc0a950069edbded87c8dd960 to your computer and use it in GitHub Desktop.
Save xlanor/1298cd3bc0a950069edbded87c8dd960 to your computer and use it in GitHub Desktop.
<?php
include('simple_html_dom.php');
include('connection.php');
// Entry point of the crawl: the sitemap page lists every category we will walk.
$sitemap = 'http://www.todayonline.com/sitemap';
$rsslist = array();
$htmlsite = file_get_html($sitemap);
if (!$htmlsite)
{
// file_get_html() yields false when the download fails or the body is
// empty/oversized; without the sitemap there is nothing to crawl, and
// calling find() on false would abort with a fatal error instead.
echo 'Could not load sitemap '.$sitemap.PHP_EOL;
exit(1);
}
// Category links in the sitemap sit inside div.zerolevel containers (handled
// here) and div.firstlevel containers (handled right after this loop).
$zerolv = $htmlsite->find('div[class=zerolevel]');
foreach ($zerolv as $row)
{
$acont = $row->find('a');
foreach ($acont as $cat)
{
// hrefs are site-relative, so prefix the host to get a fetchable URL.
$front = "http://www.todayonline.com";
$href = $cat->href;
$concantated = $front.$href;
$rsslist[] = array('categoryurl'=>$concantated);
}
}
// Second pass over the sitemap: every anchor inside a div.firstlevel container
// becomes another category URL appended to $rsslist.
$siteroot = "http://www.todayonline.com";
foreach ($htmlsite->find('div[class=firstlevel]') as $levelblock)
{
foreach ($levelblock->find('a') as $anchor)
{
// Links are site-relative; prefix the host to make them absolute.
$rsslist[] = array('categoryurl' => $siteroot.$anchor->href);
}
}
/**
 * Downloads one article page.
 *
 * Transport failures (cURL errors) and HTTP error statuses (>= 400) are
 * appended to exception.log and reported as a failure, so that error pages
 * are never parsed as if they were articles.
 *
 * @param string $fullurl Absolute URL of the article.
 * @return string|null The response body, or null when the download failed.
 */
function today_fetch_article($fullurl)
{
    $curl = curl_init();
    // NOTE(review): peer verification is disabled exactly as before; consider
    // enabling it once a CA bundle is known to be configured on the host.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_URL, $fullurl);
    curl_setopt($curl, CURLOPT_REFERER, $fullurl);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");
    $body = curl_exec($curl);
    $error = curl_error($curl);
    // curl_error() only covers transport problems; the HTTP status has to be
    // inspected separately to notice 403/404/5xx responses.
    $status = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);

    if ($body === false || $error !== '') {
        $reason = ($error !== '') ? $error : 'unknown cURL error';
    } elseif ($status >= 400) {
        $reason = 'HTTP '.$status;
    } else {
        return $body;
    }

    file_put_contents('exception.log', $fullurl." failed: ".$reason."\n", FILE_APPEND | LOCK_EX);
    echo 'Placed exception in log. continuing on'.PHP_EOL;
    return null;
}

/**
 * Pulls the publication timestamp out of an article page.
 *
 * Looks inside each div.authoring.full-date for a date-label span reading
 * "Updated: " and, when one is present, takes the last parseable date-value
 * span of that div.
 *
 * @param string $articlehtml Raw HTML of the article page.
 * @return string|null Timestamp formatted as "Y-m-d H:i:s", or null when no
 *                     parseable date was found.
 */
function today_extract_pubdate($articlehtml)
{
    $pubdate = null;
    $dom = new simple_html_dom();
    $dom->load($articlehtml);
    foreach ($dom->find('div[class=authoring full-date]') as $datediv) {
        foreach ($datediv->find('span[class=date-label]') as $datelbl) {
            if ($datelbl->innertext !== "Updated: ") {
                continue;
            }
            foreach ($datediv->find('span[class=date-value]') as $dateval) {
                $timestamp = strtotime(str_replace(",", "", $dateval->innertext));
                // An unparseable date must not turn into 1970-01-01.
                if ($timestamp !== false) {
                    $pubdate = date("Y-m-d H:i:s", $timestamp);
                }
            }
        }
    }
    // simple_html_dom trees hold circular references; release them explicitly.
    $dom->clear();
    return $pubdate;
}

// log.txt holds one already-processed listing URL per line. It may not exist
// on the first run, and a keyed lookup avoids rescanning the list per page.
$array = is_file('log.txt') ? file('log.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) : array();
if ($array === false) {
    $array = array();
}
$checkedpages = array_flip($array);

foreach ($rsslist as $list) {
    // The try block is scoped to a single category so that one failure is
    // logged and the crawl really does continue with the next category.
    try {
        // Prepared once per category and re-executed for every article.
        $checkx = $dbh->prepare("SELECT url_link FROM combinedarticle WHERE url_link = :link");
        $insertstx = $dbh->prepare("INSERT INTO combinedarticle VALUES(NULL,:title,:link,:pubtime,:cat,3)");

        $parsedurl = $list['categoryurl']."?page=";
        $counter = 0;
        $checkingvar = true;
        while ($checkingvar) {
            $url = $parsedurl.$counter;
            if (isset($checkedpages[$url])) {
                echo $url.' has been checked before'.PHP_EOL;
                $counter++;
                continue;
            }

            $html = file_get_html($url);
            if (!$html) {
                // Download failed: leave the page out of log.txt so a later
                // run retries it, and stop paging through this category.
                file_put_contents('exception.log', $url." could not be loaded\n", FILE_APPEND | LOCK_EX);
                echo 'Placed exception in log. continuing on'.PHP_EOL;
                break;
            }

            // Gather the article teasers of the listing page.
            $articles = array();
            foreach ($html->find('div[id=content-main]') as $contentdiv) {
                foreach ($contentdiv->find('article') as $articlediv) {
                    $articles[] = $articlediv;
                }
            }
            if (count($articles) === 0) {
                // No teasers (or no content container at all) means we ran
                // past the last page of this category.
                $checkingvar = false;
            }

            foreach ($articles as $articlediv) {
                foreach ($articlediv->find('header') as $header) {
                    // Reset per article so nothing leaks in from the previous one.
                    $extractedurl = null;
                    $title = null;

                    foreach ($header->find('h2[class=node__title node-title]') as $h2) {
                        foreach ($h2->find('a') as $a) {
                            $extractedurl = $a->href;
                        }
                    }
                    if (!is_string($extractedurl) || $extractedurl === '') {
                        // Header without a usable link: nothing to fetch or store.
                        continue;
                    }
                    $fullurl = "http://www.todayonline.com".$extractedurl;

                    foreach ($header->find('span[property=dc:title]') as $t) {
                        $title = html_entity_decode($t->content, ENT_QUOTES | ENT_XML1);
                    }

                    $checkx->execute(array(':link' => $fullurl));
                    if ($checkx->fetchAll()) {
                        echo $fullurl." already exists".PHP_EOL;
                        continue;
                    }

                    // The listing does not show the full date, so the article
                    // page itself has to be fetched to read it.
                    $str = today_fetch_article($fullurl);
                    if ($str === null) {
                        continue;
                    }
                    $pubdate = today_extract_pubdate($str);

                    // First path segment of the link is used as the category.
                    $exploded = explode("/", $extractedurl);
                    $capcategory = isset($exploded[1]) ? ucfirst(strtolower($exploded[1])) : '';

                    $insertstx->execute(array(
                        ':title' => $title,
                        ':link' => $fullurl,
                        ':pubtime' => $pubdate,
                        ':cat' => $capcategory,
                    ));
                    echo $fullurl." scraped".PHP_EOL;
                    echo "Sleeping for 1 second to avoid being timed".PHP_EOL;
                    sleep(1); // throttle so the site does not time us out
                }
            }

            // Free the listing DOM before moving on; these trees are large.
            $html->clear();
            unset($html);

            file_put_contents('log.txt', $url."\n", FILE_APPEND | LOCK_EX);
            // Remember it in memory too, so a category listed twice in the
            // sitemap is not crawled twice within the same run.
            $checkedpages[$url] = true;
            $counter++;
        }
        echo $list["categoryurl"]." has been hashed".PHP_EOL;
    } catch (Exception $e) {
        file_put_contents('exception.txt', $e."\n", FILE_APPEND | LOCK_EX);
        echo 'Placed exception in log. continuing on'.PHP_EOL;
    }
    echo "Begin waiting for 10 seconds...";
    sleep(10);
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment