Last active
December 30, 2015 13:39
-
-
Save poizan42/7836618 to your computer and use it in GitHub Desktop.
mx.dk recent news scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/* Scraper for recent news stories on mx.dk (MetroXpress).
   All methods are static. Pages are fetched with file_get_contents() over
   plain HTTP and queried with DOMDocument/DOMXPath. Every failure (network,
   missing markup, unparsable date) is reported by throwing \Exception. */
class MetroXpress
{
    /* Value for the User-Agent request header. When null, this class adds no
       User-Agent header to the stream context. */
    const userAgent = null;

    /* Danish month names (lowercase) mapped to their month number. */
    private static $monthMap = array(
        'januar' => 1,
        'februar' => 2,
        'marts' => 3,
        'april' => 4,
        'maj' => 5,
        'juni' => 6,
        'juli' => 7,
        'august' => 8,
        'september' => 9,
        'oktober' => 10,
        'november' => 11,
        'december' => 12
    );

    /* Build the HTTP stream context used for all requests.
       $method: HTTP method to use (defaults to 'GET').
       Returns a stream context resource for file_get_contents(). */
    private static function GetContext($method = 'GET')
    {
        $httpOpts = array('method' => $method);
        if (self::userAgent !== null)
            $httpOpts['header'] = 'User-Agent: '.self::userAgent;
        return stream_context_create(array('http' => $httpOpts));
    }

    /* Build an XPath predicate that matches elements whose class attribute
       contains $class as a whole, whitespace-separated token.
       $class is inserted verbatim, so it must not contain a double quote. */
    private static function GetClassSelector($class)
    {
        return '[contains(concat(" ", normalize-space(@class), " "), " '.$class.' ")]';
    }

    /* Fetch $url and return a DOMXPath over the parsed HTML document.
       $failureMessage: message of the \Exception thrown when the request
       fails or returns an empty body.
       libxml parse errors are collected internally and discarded, and the
       previous libxml error-handling mode is restored afterwards. */
    private static function LoadDocument($url, $failureMessage)
    {
        $html = file_get_contents($url, false, self::GetContext());
        // An empty body cannot be parsed: loadHTML() warns on it and throws
        // ValueError on PHP 8, so treat it like a failed request.
        if ($html === false || $html === '')
            throw new \Exception($failureMessage);

        $dom = new \DOMDocument();
        $internalErrors = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($internalErrors);
        return new \DOMXPath($dom);
    }

    /* Extract the publication time from a byline string.
       Recognised formats (the first date found in the string is used):
         Af Michael Bo Mortensen - 01/01-14 18:54
         Af Bo Poulsen - 06. december 2013 22:28; Opdateret: 06.12.2013 22:47
       Month names are matched case-insensitively.
       Returns a \DateTime in the Europe/Copenhagen time zone.
       Throws \Exception when no date is found or the date/time is not a
       valid calendar value. */
    private static function ParseDate($date)
    {
        if (preg_match('@(\d\d)/(\d\d)-(\d\d)\s+(\d?\d):(\d\d)@', $date, $matches))
        {
            // Short numeric form: dd/mm-yy hh:mm, two-digit year counted from 2000.
            list(, $day, $month, $yearShort, $hour, $min) = $matches;
            $year = 2000 + (int)$yearShort;
        }
        else
        {
            // Long form: dd. <month name> yyyy hh:mm
            $monthMatch = '(' . implode('|', array_keys(self::$monthMap)) . ')';
            $dateMatch = '/(\d\d)\.\s+'.$monthMatch.'\s+(\d\d\d\d)\s+(\d?\d):(\d\d)/i';
            if (!preg_match($dateMatch, $date, $matches))
                throw new \Exception("Failed parsing date '$date'");
            list(, $day, $monthName, $year, $hour, $min) = $matches;
            $monthKey = strtolower($monthName);
            if (!isset(self::$monthMap[$monthKey]))
                throw new \Exception("Failed parsing month '$monthName'");
            $month = self::$monthMap[$monthKey];
        }

        // Reject impossible values (e.g. 31/02) instead of letting DateTime
        // silently roll them over into a neighbouring day or month.
        if (!checkdate((int)$month, (int)$day, (int)$year) || (int)$hour > 23 || (int)$min > 59)
            throw new \Exception("Failed parsing date '$date'");

        // Build an unambiguous ISO 8601 local time string.
        $iso = sprintf('%04d-%02d-%02dT%02d:%02d:00', $year, $month, $day, $hour, $min);
        return new \DateTime($iso, new \DateTimeZone('Europe/Copenhagen'));
    }

    /* Get a news item from MetroXpress.
       $section: e.g. 'nyheder/kobenhavn' or 'sport/sportsnyheder' or 'nyheder/danmark'
       $id: the id of the news item
       Both values are inserted into the request URL as given.
       Returns an associative array with the keys:
       'id', 'section', 'title', 'pubDate', 'link', 'description'
       where 'pubDate' is the publication time converted to UTC and formatted
       like 'Fri, 06 Dec 2013 21:28:00 GMT'.
       Throws \Exception when the page cannot be fetched or an expected
       element (story_titles div, published div, h1, h3) is missing. */
    public static function GetNewsItem($section, $id)
    {
        $link = "http://www.mx.dk/$section/story/$id";
        $xPath = self::LoadDocument($link, 'Receiving news item failed');

        $storyHeadSelector = 'div'.self::GetClassSelector('story_head');
        $storyTitlesSelector = 'div'.self::GetClassSelector('story_titles');
        $storyTitlesNodeList = $xPath->query("//$storyHeadSelector/$storyTitlesSelector");
        if ($storyTitlesNodeList->length == 0)
            throw new \Exception('Missing story_titles div');
        $storyTitlesNode = $storyTitlesNodeList->item(0);

        // The remaining queries are relative to the story_titles div.
        $publishedDateNodeList = $xPath->query('div'.self::GetClassSelector('published'), $storyTitlesNode);
        if ($publishedDateNodeList->length == 0)
            throw new \Exception('Missing published date div');
        $publishedDate = self::ParseDate($publishedDateNodeList->item(0)->nodeValue);
        $publishedDate->setTimezone(new \DateTimeZone('UTC'));
        $pubDate = $publishedDate->format('D, d M Y H:i:s').' GMT';

        $titleNodeList = $xPath->query('h1', $storyTitlesNode);
        if ($titleNodeList->length == 0)
            throw new \Exception('Missing title');
        $title = trim($titleNodeList->item(0)->nodeValue);

        $descriptionNodeList = $xPath->query('h3', $storyTitlesNode);
        if ($descriptionNodeList->length == 0)
            throw new \Exception('Missing description');
        $description = trim($descriptionNodeList->item(0)->nodeValue);

        return array(
            'id' => $id,
            'section' => $section,
            'title' => $title,
            'pubDate' => $pubDate,
            'link' => $link,
            'description' => $description);
    }

    /* Returns a list with the section and id of every story linked from the
       "Seneste nyt" teaser on the news front page (described by the original
       author as the 9 most recent news entries).
       Each element is an array with the keys 'section' and 'id'.
       Throws \Exception when the page cannot be fetched, the teaser contains
       no links, or a link does not look like /<a>/<b>/story/<id>. */
    public static function GetLatestNewsIds()
    {
        $xPath = self::LoadDocument('http://www.mx.dk/nyheder/', 'Receiving latest news failed');

        $teaserSelector = 'div'.self::GetClassSelector('teaser');
        $galleryBarSelector = 'div'.self::GetClassSelector('gallerybar');
        // href attributes of all links in the gallery bar of the teaser that
        // contains an element whose text is exactly "Seneste nyt".
        $latestNewsNodeList = $xPath->query('//'.$teaserSelector.'[.//*[text()="Seneste nyt"]]/'
            .$galleryBarSelector.'//a/@href');
        if ($latestNewsNodeList->length == 0)
            throw new \Exception('Missing latest news');

        $newsIds = array();
        foreach ($latestNewsNodeList as $href)
        {
            if (!preg_match('@/(\w+)/(\w+)/story/(\d+)@', $href->value, $matches))
                throw new \Exception("Could not parse url '{$href->value}'");
            $newsIds[] = array('section' => $matches[1].'/'.$matches[2], 'id' => $matches[3]);
        }
        return $newsIds;
    }

    /* Get the news entries for the items returned by GetLatestNewsIds().
       Each element is either the array returned by GetNewsItem(), or, when
       fetching that item failed, an array with the keys 'id', 'section' and
       'error' (the caught \Exception). A failure to get the id list itself
       is not caught and propagates to the caller. */
    public static function GetLatestNews()
    {
        $newsEntries = array();
        foreach (self::GetLatestNewsIds() as $newsId)
        {
            try
            {
                $newsEntries[] = self::GetNewsItem($newsId['section'], $newsId['id']);
            }
            catch (\Exception $e)
            {
                // Keep going: one broken story must not hide the others.
                $newsEntries[] = array('id' => $newsId['id'], 'section' => $newsId['section'], 'error' => $e);
            }
        }
        return $newsEntries;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment