Last active
September 19, 2017 16:23
-
-
Save xlanor/1298cd3bc0a950069edbded87c8dd960 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Builds $rsslist: one ['categoryurl' => absolute URL] entry for every link
// found in the category sections of TodayOnline's sitemap page.
// connection.php is loaded here for the rest of the script; this section does not use it.
require_once 'simple_html_dom.php';
require_once 'connection.php';

$sitemap = 'http://www.todayonline.com/sitemap';
$rsslist = [];

$htmlsite = file_get_html($sitemap);
if (!$htmlsite) {
    // file_get_html() returns false when the page cannot be loaded; without
    // the sitemap there is nothing to crawl, so stop instead of calling
    // find() on a non-object.
    echo 'Unable to load sitemap '.$sitemap.PHP_EOL;
    exit(1);
}

// TodayOnline's sitemap DOM keeps its category links in div.zerolevel and
// div.firstlevel containers. All zerolevel links are collected first, then
// all firstlevel links, so the order of $rsslist matches the two passes.
foreach (['zerolevel', 'firstlevel'] as $levelclass) {
    foreach ($htmlsite->find('div[class='.$levelclass.']') as $row) {
        foreach ($row->find('a') as $cat) {
            // The hrefs are prefixed with the site root to form absolute URLs.
            $rsslist[] = ['categoryurl' => "http://www.todayonline.com".$cat->href];
        }
    }
}

// Release the parsed sitemap; simple_html_dom trees are not freed automatically.
$htmlsite->clear();
/**
 * Downloads one article page with cURL.
 *
 * @param string $url Absolute article URL.
 * @return array Two-element list: [response body or null, failure reason or null].
 */
function today_fetch_article($url)
{
    $curl = curl_init();
    // NOTE(review): certificate verification is switched off, exactly as in
    // the original script. Confirm whether this is still required.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_REFERER, $url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");
    $body = curl_exec($curl);
    $err = curl_error($curl);
    $status = (int) curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl);

    if ($body === false || $err) {
        return [null, $err ? $err : 'empty response'];
    }
    // curl_error() stays empty for HTTP error statuses (CURLOPT_FAILONERROR
    // is not set), so a 403/404/5xx has to be detected from the status code.
    if ($status >= 400) {
        return [null, 'HTTP '.$status];
    }
    return [$body, null];
}

/**
 * Extracts the "Updated: " timestamp from an article page.
 *
 * @param string $markup Raw HTML of the article page.
 * @return string|null Timestamp as "Y-m-d H:i:s", or null when the page has
 *                     no such date or the date cannot be parsed.
 */
function today_extract_pubdate($markup)
{
    $dom = new simple_html_dom();
    $dom->load($markup);

    $pubdate = null;
    foreach ($dom->find('div[class=authoring full-date]') as $datediv) {
        foreach ($datediv->find('span[class=date-label]') as $datelbl) {
            if ($datelbl->innertext !== "Updated: ") {
                continue;
            }
            // As before, the last parseable date-value in the block wins.
            foreach ($datediv->find('span[class=date-value]') as $dateval) {
                $stamp = strtotime(str_replace(",", "", $dateval->innertext));
                if ($stamp !== false) {
                    $pubdate = date("Y-m-d H:i:s", $stamp);
                }
            }
        }
    }

    $dom->clear();
    return $pubdate;
}

// Listing-page URLs that earlier runs already processed (one per line in
// log.txt). Flipped into a key set so each lookup is a constant-time isset().
$loggedpages = is_readable('log.txt')
    ? file('log.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
    : false;
$seenpages = $loggedpages ? array_flip($loggedpages) : [];

// Walk every category's paginated listing (?page=0, 1, 2, ...) until a page
// without articles is reached, inserting each article that is not yet stored.
foreach ($rsslist as $list) {
    // The try sits inside the loop so one failing category is logged and the
    // crawl really does continue with the next one.
    try {
        $checkx = $dbh->prepare("SELECT url_link FROM combinedarticle WHERE url_link = :link");
        $insertstx = $dbh->prepare("INSERT INTO combinedarticle VALUES(NULL,:title,:link,:pubtime,:cat,3)");

        $parsedurl = $list['categoryurl']."?page=";
        $counter = 0;
        while (true) {
            $url = $parsedurl.$counter;
            $counter++;

            if (isset($seenpages[$url])) {
                echo $url.'has been checked before'.PHP_EOL;
                continue;
            }

            $html = file_get_html($url);
            if (!$html) {
                // Not written to log.txt, so the page is retried on the next run.
                file_put_contents('exception.log', $url." could not be loaded \n", FILE_APPEND | LOCK_EX);
                break;
            }

            $containers = $html->find('div[id=content-main]');
            if (!$containers) {
                // Previously this case never advanced the counter and looped
                // forever on the same URL. Treat it as the end of the category.
                $html->clear();
                file_put_contents('exception.log', $url." has no content-main container \n", FILE_APPEND | LOCK_EX);
                break;
            }

            $foundarticles = false;
            foreach ($containers as $contentdiv) {
                foreach ($contentdiv->find('article') as $articlediv) {
                    $foundarticles = true;
                    foreach ($articlediv->find('header') as $header) {
                        // Reset per header so a header without a link or title
                        // cannot inherit values from the previous article.
                        $extractedurl = null;
                        foreach ($header->find('h2[class=node__title node-title]') as $h2) {
                            foreach ($h2->find('a') as $a) {
                                $extractedurl = $a->href;
                            }
                        }
                        if (!$extractedurl) {
                            continue; // nothing to store without a link
                        }
                        $fullurl = "http://www.todayonline.com".$extractedurl;

                        $title = null; // stays null (bound as SQL NULL) when the header has no dc:title span
                        foreach ($header->find('span[property=dc:title]') as $t) {
                            $title = html_entity_decode($t->content, ENT_QUOTES | ENT_XML1);
                        }

                        $checkx->bindValue(':link', $fullurl);
                        $checkx->execute();
                        if ($checkx->fetchAll()) {
                            echo $fullurl." already exists".PHP_EOL;
                            continue;
                        }

                        // The listing does not show the full date, so the
                        // article page itself has to be fetched to read it.
                        list($str, $reason) = today_fetch_article($fullurl);
                        if ($str === null) {
                            file_put_contents('exception.log', $fullurl." could not be fetched: ".$reason." \n", FILE_APPEND | LOCK_EX);
                            echo 'Placed exception in log. continuing on'.PHP_EOL;
                            continue;
                        }

                        $pubdate = today_extract_pubdate($str);

                        // The category is the first path segment of the href.
                        $exploded = explode("/", $extractedurl);
                        $capcategory = isset($exploded[1]) ? ucfirst(strtolower($exploded[1])) : '';

                        $insertstx->execute([
                            ':title' => $title,
                            ':link' => $fullurl,
                            ':pubtime' => $pubdate,
                            ':cat' => $capcategory,
                        ]);
                        echo $fullurl." scraped".PHP_EOL;
                        echo "Sleeping for 1 second to avoid being timed".PHP_EOL;
                        sleep(1); // throttle so the site does not time us out
                    }
                }
            }

            // Free the parsed page; simple_html_dom trees are not released
            // automatically and this loop can visit many pages.
            $html->clear();
            file_put_contents('log.txt', $url."\n", FILE_APPEND | LOCK_EX);

            if (!$foundarticles) {
                break; // an empty listing page marks the end of the category
            }
        }

        echo $list["categoryurl"]." has been hashed".PHP_EOL;
        echo "Begin waiting for 10 seconds...";
        sleep(10);
    } catch (Exception $e) {
        file_put_contents('exception.txt', $e."\n", FILE_APPEND | LOCK_EX);
        echo 'Placed exception in log. continuing on'.PHP_EOL;
    }
}
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment