poizan42 · December 30, 2015 13:39
diff --git a/mx-scrape.php b/mx-scrape.php
 <?php

 /* Scraper for news items published on www.mx.dk (MetroXpress).
    All methods are static; failures are reported by throwing \Exception. */
 class MetroXpress
 {
 	/* User-Agent sent with every HTTP request; null means no explicit
 	   User-Agent header is added to the stream context. */
 	const userAgent = null;
 
 	/* Danish month names (lower case) mapped to their month number. */
 	private static $monthMap = array(
 		'januar' => 1,
 		'februar' => 2,
 		'marts' => 3,
 		'april' => 4,
 		'maj' => 5,
 		'juni' => 6,
 		'juli' => 7,
 		'august' => 8,
 		'september' => 9,
 		'oktober' => 10,
 		'november' => 11,
 		'december' => 12
 	);
 
 	/* Build the stream context used for HTTP requests.
 	   $method: HTTP method to use.
 	   Returns a stream context resource; it carries a User-Agent header
 	   only when self::userAgent is not null. */
 	private static function GetContext($method = 'GET')
 	{
 		$httpOpts = array('method' => $method);
 		if (self::userAgent !== null)
 			$httpOpts['header'] = 'User-Agent: '.self::userAgent;
 		return stream_context_create(array('http' => $httpOpts));
 	}
 
 	/* Return an XPath predicate matching elements whose class attribute
 	   contains $class as a whole, whitespace-separated token.
 	   $class is inserted verbatim, so it must not contain a double quote. */
 	private static function GetClassSelector($class)
 	{
 		return '[contains(concat(" ", normalize-space(@class), " "), " '.$class.' ")]';
 	}
 
 	/* Download $url and return a DOMXPath over the parsed HTML.
 	   $failureMessage: message of the \Exception thrown when the download
 	   fails or yields an empty body.
 	   The request always goes through GetContext() so the configured
 	   user agent applies to every page that is fetched. */
 	private static function LoadXPath($url, $failureMessage)
 	{
 		$html = file_get_contents($url, false, self::GetContext());
 		// An empty body is treated as a failure too: loadHTML('') raises a
 		// ValueError on PHP 8, which is not an \Exception.
 		if ($html === false || $html === '')
 			throw new \Exception($failureMessage);
 
 		$dom = new \DOMDocument();
 		// Real-world HTML is rarely well formed; collect the parser
 		// warnings internally and discard them, then restore the setting.
 		$previousSetting = libxml_use_internal_errors(true);
 		$dom->loadHTML($html);
 		libxml_clear_errors();
 		libxml_use_internal_errors($previousSetting);
 		return new \DOMXPath($dom);
 	}
 
 	/* Extract the publication time from a byline string.
 	   Two formats are recognised:
 	     short: "Af Michael Bo Mortensen - 01/01-14 18:54"
 	     long:  "Af Bo Poulsen  - 06. december 2013 22:28; Opdateret: 06.12.2013 22:47"
 	   Returns a \DateTime in the Europe/Copenhagen time zone.
 	   Throws \Exception when neither format matches. */
 	private static function ParseDate($date)
 	{
 		if (preg_match('@(\d\d)/(\d\d)-(\d\d)\s+(\d?\d):(\d\d)@', $date, $matches))
 		{
 			list(, $day, $month, $yearShort, $hour, $min) = $matches;
 			// The short format carries a two-digit year; assume the 2000s.
 			$year = 2000 + (int)$yearShort;
 		}
 		else
 		{
 			// Month names are plain letters, so they can be joined into a
 			// regex alternation without escaping.
 			$monthMatch = '(' . implode('|', array_keys(self::$monthMap)) . ')';
 			$dateMatch = '/(\d\d)\.\s+'.$monthMatch.'\s+(\d\d\d\d)\s+(\d?\d):(\d\d)/';
 
 			if (!preg_match($dateMatch, $date, $matches))
 				throw new \Exception("Failed parsing date '$date'");
 			list(, $day, $monthName, $year, $hour, $min) = $matches;
 			if (!isset(self::$monthMap[$monthName]))
 				throw new \Exception("Failed parsing month '$monthName'");
 			$month = self::$monthMap[$monthName];
 		}
 
 		// Build an ISO 8601 local time so the parser cannot misread the
 		// day/month order; the time zone is supplied separately.
 		$isoLocal = sprintf('%04d-%02d-%02dT%02d:%02d', $year, $month, $day, $hour, $min);
 		return new \DateTime($isoLocal, new \DateTimeZone('Europe/Copenhagen'));
 	}
 
 	/* Get a news item from MetroXpress.
 	   $section: e.g. 'nyheder/kobenhavn' or 'sport/sportsnyheder' or 'nyheder/danmark'
 	   $id: the id of the news item
 	   Returns an associative array with the keys:
 		 'id', 'section', 'title', 'pubDate', 'link', 'description'
 	   'pubDate' is formatted like 'Fri, 06 Dec 2013 21:28:00 GMT' (UTC).
 	   Throws \Exception when the page cannot be fetched or an expected
 	   element is missing. */
 	public static function GetNewsItem($section, $id)
 	{
 		$link = "http://www.mx.dk/$section/story/$id";
 		$xPath = self::LoadXPath($link, 'Receiving news item failed');
 
 		$storyHeadSelector = 'div'.self::GetClassSelector('story_head');
 		$storyTitlesSelector = 'div'.self::GetClassSelector('story_titles');
 		$storyTitlesNodeList = $xPath->query("//$storyHeadSelector/$storyTitlesSelector");
 		if ($storyTitlesNodeList->length == 0)
 			throw new \Exception('Missing story_titles div');
 		$storyTitlesNode = $storyTitlesNodeList->item(0);
 
 		// All remaining queries are relative to the story_titles div.
 		$publishedDateNodeList = $xPath->query('div'.self::GetClassSelector('published'), $storyTitlesNode);
 		if ($publishedDateNodeList->length == 0)
 			throw new \Exception('Missing published date div');
 		$publishedDate = self::ParseDate($publishedDateNodeList->item(0)->nodeValue);
 		// Convert the local Copenhagen time to UTC before formatting.
 		$publishedDate->setTimezone(new \DateTimeZone('UTC'));
 		$pubDate = $publishedDate->format('D, d M Y H:i:s').' GMT';
 
 		$titleNodeList = $xPath->query('h1', $storyTitlesNode);
 		if ($titleNodeList->length == 0)
 			throw new \Exception('Missing title');
 		$title = trim($titleNodeList->item(0)->nodeValue);
 
 		$descriptionNodeList = $xPath->query('h3', $storyTitlesNode);
 		if ($descriptionNodeList->length == 0)
 			throw new \Exception('Missing description');
 		$description = trim($descriptionNodeList->item(0)->nodeValue);
 
 		return array(
 			'id' => $id,
 			'section' => $section,
 			'title' => $title,
 			'pubDate' => $pubDate,
 			'link' => $link,
 			'description' => $description);
 	}
 
 	/* Returns a list with the section and id of every story linked from
 	   the "Seneste nyt" teaser on the news front page (the original note
 	   says this is the 9 most recent entries).
 	   Each element is an array with the keys 'section' and 'id'.
 	   Throws \Exception when the page cannot be fetched, the teaser is
 	   missing, or a link does not look like a story URL. */
 	public static function GetLatestNewsIds()
 	{
 		$xPath = self::LoadXPath('http://www.mx.dk/nyheder/', 'Receiving news item failed');
 
 		$teaserSelector = 'div'.self::GetClassSelector('teaser');
 		$galleryBarSelector = 'div'.self::GetClassSelector('gallerybar');
 		// Pick the teaser box that contains the "Seneste nyt" heading and
 		// collect the href of every link in its gallery bar.
 		$latestNewsNodeList = $xPath->query('//'.$teaserSelector.'[.//*[text()="Seneste nyt"]]/'
 			.$galleryBarSelector.'//a/@href');
 		if ($latestNewsNodeList->length == 0)
 			throw new \Exception('Missing latest news');
 
 		$newsIds = array();
 		foreach ($latestNewsNodeList as $href)
 		{
 			if (!preg_match('@/(\w+)/(\w+)/story/(\d+)@', $href->value, $matches))
 				throw new \Exception("Could not parse url '{$href->value}'");
 			$newsIds[] = array('section' => $matches[1].'/'.$matches[2], 'id' => $matches[3]);
 		}
 		return $newsIds;
 	}
 
 	/* Get the news entries for the most recent news items.
 	   Returns one element per id from GetLatestNewsIds(): either the
 	   array produced by GetNewsItem(), or, when that item failed, an
 	   array with the keys 'id', 'section' and 'error' (the \Exception). */
 	public static function GetLatestNews()
 	{
 		$newsEntries = array();
 		foreach (self::GetLatestNewsIds() as $newsId)
 		{
 			try
 			{
 				$newsEntries[] = self::GetNewsItem($newsId['section'], $newsId['id']);
 			}
 			catch (\Exception $e)
 			{
 				// One broken story must not hide the others; report it inline.
 				$newsEntries[] = array('id' => $newsId['id'], 'section' => $newsId['section'], 'error' => $e);
 			}
 		}
 		return $newsEntries;
 	}
 }
	<?php

	/* Scraper for news items published on www.mx.dk (MetroXpress).
	   All methods are static; failures are reported by throwing \Exception. */
	class MetroXpress
	{
		/* User-Agent sent with every HTTP request; null means no explicit
		   User-Agent header is added to the stream context. */
		const userAgent = null;

		/* Danish month names (lower case) mapped to their month number. */
		private static $monthMap = array(
			'januar' => 1,
			'februar' => 2,
			'marts' => 3,
			'april' => 4,
			'maj' => 5,
			'juni' => 6,
			'juli' => 7,
			'august' => 8,
			'september' => 9,
			'oktober' => 10,
			'november' => 11,
			'december' => 12
		);

		/* Build the stream context used for HTTP requests.
		   $method: HTTP method to use.
		   Returns a stream context resource; it carries a User-Agent header
		   only when self::userAgent is not null. */
		private static function GetContext($method = 'GET')
		{
			$httpOpts = array('method' => $method);
			if (self::userAgent !== null)
				$httpOpts['header'] = 'User-Agent: '.self::userAgent;
			return stream_context_create(array('http' => $httpOpts));
		}

		/* Return an XPath predicate matching elements whose class attribute
		   contains $class as a whole, whitespace-separated token.
		   $class is inserted verbatim, so it must not contain a double quote. */
		private static function GetClassSelector($class)
		{
			return '[contains(concat(" ", normalize-space(@class), " "), " '.$class.' ")]';
		}

		/* Download $url and return a DOMXPath over the parsed HTML.
		   $failureMessage: message of the \Exception thrown when the download
		   fails or yields an empty body.
		   The request always goes through GetContext() so the configured
		   user agent applies to every page that is fetched. */
		private static function LoadXPath($url, $failureMessage)
		{
			$html = file_get_contents($url, false, self::GetContext());
			// An empty body is treated as a failure too: loadHTML('') raises a
			// ValueError on PHP 8, which is not an \Exception.
			if ($html === false || $html === '')
				throw new \Exception($failureMessage);

			$dom = new \DOMDocument();
			// Real-world HTML is rarely well formed; collect the parser
			// warnings internally and discard them, then restore the setting.
			$previousSetting = libxml_use_internal_errors(true);
			$dom->loadHTML($html);
			libxml_clear_errors();
			libxml_use_internal_errors($previousSetting);
			return new \DOMXPath($dom);
		}

		/* Extract the publication time from a byline string.
		   Two formats are recognised:
		     short: "Af Michael Bo Mortensen - 01/01-14 18:54"
		     long:  "Af Bo Poulsen - 06. december 2013 22:28; Opdateret: 06.12.2013 22:47"
		   Returns a \DateTime in the Europe/Copenhagen time zone.
		   Throws \Exception when neither format matches. */
		private static function ParseDate($date)
		{
			if (preg_match('@(\d\d)/(\d\d)-(\d\d)\s+(\d?\d):(\d\d)@', $date, $matches))
			{
				list(, $day, $month, $yearShort, $hour, $min) = $matches;
				// The short format carries a two-digit year; assume the 2000s.
				$year = 2000 + (int)$yearShort;
			}
			else
			{
				// Join the month names with a bare '|' so the group is a regex
				// alternation; an escaped '\|' would only match a literal pipe
				// and no real date would ever parse.
				$monthMatch = '(' . implode('|', array_keys(self::$monthMap)) . ')';
				$dateMatch = '/(\d\d)\.\s+'.$monthMatch.'\s+(\d\d\d\d)\s+(\d?\d):(\d\d)/';

				if (!preg_match($dateMatch, $date, $matches))
					throw new \Exception("Failed parsing date '$date'");
				list(, $day, $monthName, $year, $hour, $min) = $matches;
				if (!isset(self::$monthMap[$monthName]))
					throw new \Exception("Failed parsing month '$monthName'");
				$month = self::$monthMap[$monthName];
			}

			// Build an ISO 8601 local time so the parser cannot misread the
			// day/month order; the time zone is supplied separately.
			$isoLocal = sprintf('%04d-%02d-%02dT%02d:%02d', $year, $month, $day, $hour, $min);
			return new \DateTime($isoLocal, new \DateTimeZone('Europe/Copenhagen'));
		}

		/* Get a news item from MetroXpress.
		   $section: e.g. 'nyheder/kobenhavn' or 'sport/sportsnyheder' or 'nyheder/danmark'
		   $id: the id of the news item
		   Returns an associative array with the keys:
		     'id', 'section', 'title', 'pubDate', 'link', 'description'
		   'pubDate' is formatted like 'Fri, 06 Dec 2013 21:28:00 GMT' (UTC).
		   Throws \Exception when the page cannot be fetched or an expected
		   element is missing. */
		public static function GetNewsItem($section, $id)
		{
			$link = "http://www.mx.dk/$section/story/$id";
			$xPath = self::LoadXPath($link, 'Receiving news item failed');

			$storyHeadSelector = 'div'.self::GetClassSelector('story_head');
			$storyTitlesSelector = 'div'.self::GetClassSelector('story_titles');
			$storyTitlesNodeList = $xPath->query("//$storyHeadSelector/$storyTitlesSelector");
			if ($storyTitlesNodeList->length == 0)
				throw new \Exception('Missing story_titles div');
			$storyTitlesNode = $storyTitlesNodeList->item(0);

			// All remaining queries are relative to the story_titles div.
			$publishedDateNodeList = $xPath->query('div'.self::GetClassSelector('published'), $storyTitlesNode);
			if ($publishedDateNodeList->length == 0)
				throw new \Exception('Missing published date div');
			$publishedDate = self::ParseDate($publishedDateNodeList->item(0)->nodeValue);
			// Convert the local Copenhagen time to UTC before formatting.
			$publishedDate->setTimezone(new \DateTimeZone('UTC'));
			$pubDate = $publishedDate->format('D, d M Y H:i:s').' GMT';

			$titleNodeList = $xPath->query('h1', $storyTitlesNode);
			if ($titleNodeList->length == 0)
				throw new \Exception('Missing title');
			$title = trim($titleNodeList->item(0)->nodeValue);

			$descriptionNodeList = $xPath->query('h3', $storyTitlesNode);
			if ($descriptionNodeList->length == 0)
				throw new \Exception('Missing description');
			$description = trim($descriptionNodeList->item(0)->nodeValue);

			return array(
				'id' => $id,
				'section' => $section,
				'title' => $title,
				'pubDate' => $pubDate,
				'link' => $link,
				'description' => $description);
		}

		/* Returns a list with the section and id of every story linked from
		   the "Seneste nyt" teaser on the news front page (the original note
		   says this is the 9 most recent entries).
		   Each element is an array with the keys 'section' and 'id'.
		   Throws \Exception when the page cannot be fetched, the teaser is
		   missing, or a link does not look like a story URL. */
		public static function GetLatestNewsIds()
		{
			$xPath = self::LoadXPath('http://www.mx.dk/nyheder/', 'Receiving news item failed');

			$teaserSelector = 'div'.self::GetClassSelector('teaser');
			$galleryBarSelector = 'div'.self::GetClassSelector('gallerybar');
			// Pick the teaser box that contains the "Seneste nyt" heading and
			// collect the href of every link in its gallery bar.
			$latestNewsNodeList = $xPath->query('//'.$teaserSelector.'[.//*[text()="Seneste nyt"]]/'
				.$galleryBarSelector.'//a/@href');
			if ($latestNewsNodeList->length == 0)
				throw new \Exception('Missing latest news');

			$newsIds = array();
			foreach ($latestNewsNodeList as $href)
			{
				if (!preg_match('@/(\w+)/(\w+)/story/(\d+)@', $href->value, $matches))
					throw new \Exception("Could not parse url '{$href->value}'");
				$newsIds[] = array('section' => $matches[1].'/'.$matches[2], 'id' => $matches[3]);
			}
			return $newsIds;
		}

		/* Get the news entries for the most recent news items.
		   Returns one element per id from GetLatestNewsIds(): either the
		   array produced by GetNewsItem(), or, when that item failed, an
		   array with the keys 'id', 'section' and 'error' (the \Exception). */
		public static function GetLatestNews()
		{
			$newsEntries = array();
			foreach (self::GetLatestNewsIds() as $newsId)
			{
				try
				{
					$newsEntries[] = self::GetNewsItem($newsId['section'], $newsId['id']);
				}
				catch (\Exception $e)
				{
					// One broken story must not hide the others; report it inline.
					$newsEntries[] = array('id' => $newsId['id'], 'section' => $newsId['section'], 'error' => $e);
				}
			}
			return $newsEntries;
		}
	}