Skip to content

Instantly share code, notes, and snippets.

@markwatson
Created June 13, 2009 23:58
Show Gist options
  • Save markwatson/129479 to your computer and use it in GitHub Desktop.
Save markwatson/129479 to your computer and use it in GitHub Desktop.
<?php
/**
* Intelligent News Parser Library
*
* @package Parse_news
* @category Libraries
* @author Mark Watson
* @link http://markwatson.us
*
* A simple library that parses the html from news articles to pull out metadata.
*/
class Parse_news
{
    /**
     * Raw HTML of the article being inspected.
     *
     * Holds null until a page is loaded, or whatever file_get_contents()
     * returned (false when the fetch failed). Callers may also assign a
     * string directly to parse markup they already have in memory.
     *
     * @var string|false|null
     */
    public $html = null;

    /**
     * Parse_news constructor
     *
     * Optionally downloads the article so the get_* methods have markup to
     * work with.
     *
     * @access public
     * @param string|null $url Location of the article (anything
     *                         file_get_contents() accepts). Nothing is
     *                         fetched when omitted or empty.
     */
    public function __construct($url = null)
    {
        // Loose comparison on purpose: '' and null both mean "no URL given".
        if ($url != null) {
            $this->html = file_get_contents($url);
        }
    }

    /**
     * Extracts author names from a "By ..." byline in the loaded HTML.
     *
     * The first text that directly follows a tag and starts with "by"
     * (any letter case) is treated as the byline. Markup inside it is
     * stripped, the names are split on the standalone word "and", and names
     * written entirely in capitals are converted to title case.
     *
     * @access public
     * @return array List of author names; empty when no HTML is loaded or no
     *               plausible byline (shorter than 255 bytes) is found.
     */
    public function get_authors()
    {
        // Nothing loaded (null) or a failed download (false): no authors.
        if (!is_string($this->html) || $this->html === '') {
            return array();
        }

        $found = preg_match('/>[Bb][Yy][ ]+.+</', $this->html, $match);

        // The pattern runs to the last "<" on the line, so an overly long
        // match is most likely body text rather than a byline.
        if ($found !== 1 || strlen($match[0]) >= 255) {
            return array();
        }

        // After strip_tags() the text still begins with ">By"; drop those
        // three characters and the surrounding whitespace.
        $byline = trim(substr(strip_tags($match[0]), 3));

        // Split only on the whole word "and" (any case) so that names which
        // merely contain those letters, e.g. "Sandra" or "Alexander", stay
        // intact.
        $pieces = preg_split('/\band\b/i', $byline);

        $authors = array();
        foreach ($pieces as $piece) {
            $name = trim($piece);
            if ($name === '') {
                // e.g. a dangling "and" at the end of the byline
                continue;
            }

            // Re-case only names without any lowercase letter ("JOHN SMITH").
            // Mixed-case names such as "McDonald" are already correct.
            if (preg_match('/[a-z]/', $name) === 0) {
                $name = ucwords(strtolower($name));
            }

            $authors[] = $name;
        }

        return $authors;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment