Created
May 25, 2012 19:54
-
-
Save connors511/2790194 to your computer and use it in GitHub Desktop.
scraper_imdb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Some methods might fail due to pcre.backtrack_limit when using preg_match_all | |
*/ | |
class Scraper_Imdb extends Scraper | |
{ | |
/**
 * URL templates for the IMDb pages this scraper reads, keyed by page kind.
 * Each template has one %s placeholder inside the /title/.../ path segment.
 *
 * @var array
 */
protected $_urls = array(
    'main'          => 'http://www.imdb.com/title/%s/combined',
    'plot'          => 'http://www.imdb.com/title/%s/plotsummary',
    'summary'       => 'http://www.imdb.com/title/%s/synopsis',
    'cast'          => 'http://www.imdb.com/title/%s/fullcredits',
    'officialsites' => 'http://www.imdb.com/title/%s/officialsites',
    'releaseinfo'   => 'http://www.imdb.com/title/%s/releaseinfo',
);

/**
 * Field names this scraper handles. Commented-out entries are disabled.
 * The order matters: search_imdb() compares this list with == against the
 * requested field list.
 *
 * @var array
 */
protected $_fields = array(
    'title',
    'originaltitle',
    'released',
    'rating',
    'directors',
    'plot',
    'plotsummary',
    'contentrating',
    //'country',
    //'language',
    'genres',
    //'cast',
    'tagline',
    //'top250',
    //'studio',
    'votes',
    //'releasedate',
    'runtime',
    'producers',
    'actors',
    'poster',
    //'mpaa',
    //'writers',
    //'poster'
);

/** @var Model_Movie The movie being scraped; assigned by set_movie(). */
protected $_movie;

/** @var string Title id substituted into the URL templates (not assigned anywhere in this class body). */
protected $_id;

/** @var bool Overwrite flag; assigned by search_imdb(). */
protected $_overwrite;

/** @var array Field names requested for the current run; assigned by search_imdb(). */
protected $_scrape_fields;
/**
 * Reports who wrote this scraper.
 *
 * @return string Author name.
 */
public function get_author()
{
    $author = "Matthias Larsen";

    return $author;
}
/**
 * Reports the display name of this scraper.
 *
 * @return string Scraper name.
 */
public function get_name()
{
    $name = "IMDb Scraper";

    return $name;
}
/**
 * Looks up the Model_Scraper_Field records whose `field` column is one of
 * the names listed in $_fields.
 *
 * @return mixed Whatever Model_Scraper_Field::find('all', ...) yields.
 */
public function get_supported_fields()
{
    $conditions = array(
        array('field', 'IN', $this->_fields),
    );

    return Model_Scraper_Field::find('all', array('where' => $conditions));
}
/**
 * Looks up the first Model_Scraper_Type record whose `type` column equals 'movies'.
 *
 * @return mixed Whatever Model_Scraper_Type::find('first', ...) yields.
 */
public function get_type()
{
    $conditions = array(
        array('type', '=', 'movies'),
    );

    return Model_Scraper_Type::find('first', array('where' => $conditions));
}
/**
 * Reports the version of this scraper.
 *
 * @return string Version string.
 */
public function get_version()
{
    $version = "0.4";

    return $version;
}
/**
 * Creates the scraper. Performs no initialisation and does not invoke a
 * parent constructor.
 */
public function __construct()
{
    // Nothing to set up; the target movie is supplied through set_movie().
}
/**
 * Stores the movie that later search/scrape calls read from.
 *
 * @param Model_Movie $movie Movie to work on (accepted by reference).
 * @return void
 */
public function set_movie(Model_Movie &$movie)
{
    // Every scrape_* fallback reads its current value from this object.
    $this->_movie = $movie;
}
/**
 * Searches IMDb for the movie set via set_movie() and populates it from the
 * best candidate found.
 *
 * The query is built from the movie's title and release year. When the
 * response is itself a title page (direct match), that page's id is used.
 * Otherwise the result list is parsed and candidates are ranked:
 *   1. same title and same year (taken immediately),
 *   2. same title, year at most one off,
 *   3. same title, any other year,
 *   4. same year only.
 * Within a rank, page order is kept. Progress messages are echoed.
 *
 * @param array $fields    Field names to scrape; an empty array selects every name in $_fields.
 * @param bool  $overwrite True to overwrite fields, false to fill in only missing ones.
 * @return bool Always false (unchanged from the previous behaviour, whether or not a candidate was found).
 */
public function search_imdb($fields = array(), $overwrite = false)
{
    $this->_scrape_fields = empty($fields) ? $this->_fields : $fields;
    $this->_overwrite = $overwrite;

    $url = sprintf('http://www.imdb.com/find?s=tt&q=%s+(%s)', urlencode($this->_movie->title), $this->_movie->released);
    $page = $this->download_url($url);

    // Flatten the markup so the single-line patterns below can match.
    $page = str_replace(array("\n", "\r", "<b>", "</b>"), "", $page);
    $page = preg_replace("#\s{2,}#", "", $page);

    $direct = array();
    $hit = preg_match_all('#<title>(?:IMDb - )?(?<title>.+?) \((?<released>\d{4})\)(?:.+?)rel="canonical" (?:.+?)/(?<id>tt\d{7})#s', $page, $direct);
    if ($hit and !empty($direct['title'][0]))
    {
        echo "- got direct match on {$direct['title'][0]} ({$direct['id'][0]})<br>";

        // Direct match should be 99% correct, so it is used even when the
        // title/year differ from the stored ones.
        // TODO: Config option
        $this->_populate_from_id($direct['id'][0]);

        return false;
    }

    // preg_match_all() always fills the named keys, so the match count (not
    // the size of the array) tells whether any result row was parsed.
    $listing = array();
    $found = preg_match_all('#\?link=/title/(?<id>tt\d{7})/\';">(?<title>.{1,100})</a> \((?<released>\d{4})\)#', $page, $listing);
    if (!$found)
    {
        echo 'Skipping ' . $this->_movie->title;

        return false;
    }

    $exact = null;
    $near_year = array();
    $title_only = array();
    $year_only = array();
    foreach ($listing['id'] as $k => $id)
    {
        $candidate = array(
            'id' => $id,
            'title' => $listing['title'][$k],
            'released' => $listing['released'][$k]
        );
        $same_title = ($this->_movie->title == $candidate['title']);
        $same_year = ($this->_movie->released == $candidate['released']);

        if ($same_title and $same_year)
        {
            // Pretty safe bet..
            // TODO: Config option to allow a title + year match to be auto selected?
            $exact = $candidate;
            break;
        }

        if ($same_title)
        {
            if (abs(intval($this->_movie->released) - intval($candidate['released'])) < 2)
            {
                $near_year[] = $candidate;
            }
            else
            {
                $title_only[] = $candidate;
            }
        }
        else if ($same_year)
        {
            // Pretty lousy match
            $year_only[] = $candidate;
        }
    }

    $bets = ($exact !== null) ? array($exact) : array_merge($near_year, $title_only, $year_only);
    $bet = current($bets); // false when there is no candidate at all
    if ($bet)
    {
        $this->_populate_from_id($bet['id']);
    }

    return false;
}

/**
 * Calls the populate_* method matching the current overwrite flag and field
 * selection (all fields vs. a subset) for the given IMDb id.
 *
 * The populate_* methods are not defined in this class body; presumably they
 * come from the parent Scraper class.
 *
 * @param string $id IMDb title id.
 * @return void
 */
private function _populate_from_id($id)
{
    $all_fields = ($this->_scrape_fields == $this->_fields);

    if ($this->_overwrite)
    {
        if ($all_fields)
        {
            $this->populate_all_by_id($id);
        }
        else
        {
            // Pass the requested subset, not the full field list.
            $this->populate_fields_by_id($this->_scrape_fields, $id);
        }
    }
    else
    {
        if ($all_fields)
        {
            $this->populate_all_missing_by_id($id);
        }
        else
        {
            // Not overwrite, only some fields
            $this->populate_missing_fields_by_id($this->_scrape_fields, $id);
        }
    }
}
/**
 * Scrapes the movie title.
 *
 * @return mixed The title reported by _scrape_title_helper('title'), or the
 *               movie's current title when that result is falsy.
 */
public function scrape_title()
{
    // TODO: Config option to overwrite with original title?
    $scraped = $this->_scrape_title_helper('title');

    return $scraped ? $scraped : $this->_movie->title;
}
/**
 * Scrapes the movie's original title.
 *
 * @return mixed The original title reported by _scrape_title_helper('original'),
 *               or the movie's current title when that result is falsy.
 */
public function scrape_originaltitle()
{
    $scraped = $this->_scrape_title_helper('original');

    return $scraped ? $scraped : $this->_movie->title;
}
/**
 * Collects the title, original title and alternative titles of the current
 * movie from the 'main' and 'releaseinfo' pages.
 *
 * @param string|bool $get 'title', 'original' or 'alts' to get just that entry;
 *                         anything else returns the whole set.
 * @return array|string|bool For 'alts' an array of array('title' => ..., 'detail' => ...);
 *                           for 'title'/'original' a string, or false when not found;
 *                           otherwise array('title' => ..., 'original' => ..., 'alts' => ...).
 */
public function _scrape_title_helper($get = false)
{
    // TODO: Cache this?
    $html = $this->download_url_param($this->_urls['main'], $this->_id);
    $releaseInfoHtml = $this->download_url_param($this->_urls['releaseinfo'], $this->_id);

    // Main title: taken from the <title> element, up to the last " (".
    $title = false;
    $title_match = array();
    if (preg_match('#(<title>)(?<title>.*)( [(].*</title>)#', $html, $title_match))
    {
        $title = $title_match['title'];
    }

    // Alternative titles: rows of the AKA table on the release info page.
    $title_alts = array();
    $aka_table = array();
    $aka_rows = array();
    if (preg_match('#\(AKA\)</a></h5>\s<table border="0" cellpadding="2">(?<html>.+?)</table>#s', $releaseInfoHtml, $aka_table)
        && preg_match_all('#<td>(?<name>.*?)</td>\s+?<td>(?<details>.*?)</td>#s', $aka_table['html'], $aka_rows))
    {
        foreach ($aka_rows['name'] as $idx => $name)
        {
            // Names containing any of these markers are left out.
            if (strpos($name, 'imax') !== FALSE || strpos($name, 'working ') !== FALSE || strpos($name, 'fake ') !== FALSE)
            {
                continue;
            }

            $title_alts[] = array(
                'title' => $name,
                'detail' => $aka_rows['details'][$idx]
            );
        }
    }

    // Original title: only looked for when the page has a "title-extra" marker.
    $title_orig = false;
    $orig_match = array();
    if (strpos($html, 'title-extra') !== FALSE
        && preg_match('#class="title-extra">(?<title>.*?) <i>\(original title\)</i>#s', $html, $orig_match))
    {
        $trimmed = trim($orig_match['title']);
        if (!empty($orig_match['title']) && strlen($trimmed) > 0)
        {
            $title_orig = $trimmed;
        }
    }

    // Remove any "(YYYY)" marker from the titles that were found.
    $year_marker = '#\(\d{4}\)#';
    if ($title)
    {
        $title = preg_replace($year_marker, '', $title);
    }
    if ($title_orig)
    {
        $title_orig = preg_replace($year_marker, '', $title_orig);
    }

    $titles = array(
        'title' => $title,
        'original' => $title_orig,
        'alts' => $title_alts
    );

    if (!$get || !isset($titles[$get]))
    {
        return $titles;
    }

    return $titles[$get];
}
/**
 * Scrapes the release year.
 *
 * Tries the first "(YYYY/...)" or "(YYYY)" marker on the 'main' page, then
 * the per-country years on the 'releaseinfo' page (USA first, then UK, then
 * the first country listed).
 *
 * @return mixed Four-digit year string, or the movie's current `released`
 *               value when nothing was found.
 */
public function scrape_released()
{
    $main_page = $this->download_url_param($this->_urls['main'], $this->_id);
    $year = array();
    // TODO fix regex
    if (preg_match("#\((?<released1>\d{4})/.*?\)|\((?<released2>\d{4})\)#", $main_page, $year))
    {
        // Only one alternative can have matched; return whichever is non-empty.
        foreach (array('released1', 'released2') as $group)
        {
            if ($year[$group] != "")
            {
                return $year[$group];
            }
        }
    }

    $release_page = $this->download_url_param($this->_urls['releaseinfo'], $this->_id);
    $rows = array();
    if (preg_match_all('#\?region=[A-Z]+?">(?<country>[a-zA-Z ]*?)</a>(.*?)/year/(?<released>\d{4})#s', $release_page, $rows))
    {
        $by_country = array_combine($rows['country'], $rows['released']);

        // TODO Config option to select year, or promt for it
        foreach (array('USA', 'UK') as $preferred)
        {
            if (isset($by_country[$preferred]))
            {
                return $by_country[$preferred];
            }
        }

        return current($by_country);
    }

    // return original if not found
    return $this->_movie->released;
}
/**
 * Scrapes the user rating (the number before "/10" in the starbar block of
 * the 'main' page).
 *
 * @return mixed Rating text as found on the page, or the movie's current
 *               rating when the page has no rating block. The fallback
 *               mirrors the other scrape_* methods; it assumes the movie
 *               exposes a `rating` property like the field name - TODO confirm.
 */
public function scrape_rating()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);
    $matches = array();

    // Only the first occurrence is needed, so stop there instead of
    // scanning the whole page with preg_match_all().
    if (preg_match("#<div\sclass=\"starbar-meta\">\s*?<b>(?<rating>.*?)/10</b>#", $page, $matches))
    {
        return $matches['rating'];
    }

    // Previously this fell through and returned null, unlike every sibling.
    return $this->_movie->rating;
}
/**
 * Scrapes the directors from the "Directed by ..." sentence on the 'main' page.
 *
 * Names are split on commas and trimmed. Model_Person records that already
 * exist are reused; a new Model_Person is created for each name not found.
 * Every person is wrapped in a new Model_Director.
 *
 * NOTE(review): the pattern stops at the first period, so a name containing
 * one (initials) is cut short - left unchanged here.
 *
 * @return mixed Array of Model_Director, or the movie's current directors
 *               when no director sentence (or no usable name) was found.
 */
public function scrape_directors()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);
    $matches = array();
    if (!preg_match('#Directed [bB]y (?<director>.*?)\.#', $page, $matches))
    {
        return $this->_movie->directors;
    }

    // "A, B, C" -> clean, unique names. Without trimming, every name after
    // the first keeps a leading space and can never be found in the database.
    $names = array();
    foreach (explode(',', $matches['director']) as $raw)
    {
        $name = trim($raw);
        if ($name !== '' and !in_array($name, $names))
        {
            $names[] = $name;
        }
    }
    if (empty($names))
    {
        return $this->_movie->directors;
    }

    $people = Model_Person::find('all', array(
        'where' => array(
            array('name', 'in', $names)
        )
    ));

    if (count($people) < count($names))
    {
        // We're missing some: create only the names that were not found.
        // Compared case-insensitively in case the database lookup is too.
        $known = array();
        foreach ($people as $person)
        {
            $known[strtolower($person->name)] = true;
        }
        foreach ($names as $name)
        {
            if (!isset($known[strtolower($name)]))
            {
                $person = new Model_Person();
                $person->name = $name;
                $people[] = $person;
            }
        }
    }

    $directors = array();
    foreach ($people as $person)
    {
        $director = new Model_Director();
        $director->person = $person;
        $directors[] = $director;
    }

    return $directors;
}
/**
 * Scrapes the plot text.
 *
 * Sources, in order: the first plot paragraph on the 'plot' page, the
 * synopsis block on the 'summary' page, and finally scrape_plotsummary().
 *
 * @return mixed Cleaned plot text, or whatever scrape_plotsummary() returns
 *               when neither page yields one.
 */
public function scrape_plot()
{
    $matches = array();

    // Only the first occurrence is used, so preg_match() is enough. It stops
    // at the first hit and so avoids the pcre.backtrack_limit failures that
    // preg_match_all() can run into while scanning the rest of the page.
    $page = $this->download_url_param($this->_urls['plot'], $this->_id);
    if (preg_match('#<p class="plotpar">(?<plot>.*?)<i>#s', $page, $matches))
    {
        return $this->_scrape_plot_helper($matches['plot']);
    }

    $page = $this->download_url_param($this->_urls['summary'], $this->_id);
    if (preg_match('#<div id="swiki.2.1">(?<synopsis>.*?)</div>#s', $page, $matches))
    {
        return $this->_scrape_plot_helper($matches['synopsis']);
    }

    // Fall back to the summary
    return $this->scrape_plotsummary();
}
/**
 * Scrapes the short plot summary from the "Plot:" block of the 'main' page.
 *
 * @return mixed Cleaned summary text, or the movie's current plotsummary
 *               when the block is not found.
 */
public function scrape_plotsummary()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);
    $matches = array();

    // Only the first occurrence is used; preg_match() stops there instead of
    // scanning the rest of the page as preg_match_all() would.
    if (preg_match('#<h5>Plot:</h5>\s<div class="info-content">(?<plot>.*?)<a#s', $page, $matches))
    {
        return $this->_scrape_plot_helper($matches['plot']);
    }

    return $this->_movie->plotsummary;
}
/**
 * Cleans a scraped plot fragment: removes the link captions and pipe
 * separators listed below, trims, decodes HTML entities and trims again.
 *
 * @param string $str Raw text captured from the page.
 * @return string Cleaned text.
 */
private function _scrape_plot_helper($str)
{
    $noise = array(
        "Add synopsis »",
        "Full synopsis »",
        "Full summary »",
        "See more »",
        "|",
    );

    $cleaned = str_replace($noise, '', $str);
    $cleaned = trim($cleaned);
    $decoded = html_entity_decode($cleaned);

    return trim($decoded);
}
/**
 * Scrapes the USA content rating from the "Certification:" block of the
 * 'main' page.
 *
 * Preference order: the first linked USA entry containing "PG", then the
 * first linked USA entry, then an unlinked USA entry running to the end of
 * the block.
 *
 * @return mixed Rating text as captured, or the movie's current
 *               contentrating when nothing usable was found.
 */
public function scrape_contentrating()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);

    $section = array();
    if (!preg_match('#<h5>Certification:</h5>(?<cert>.*?)</div>#', $page, $section))
    {
        return $this->_movie->contentrating;
    }

    $usa = array();
    if (!preg_match_all('#USA:(?<mpaa1>.*?)</a>|USA:(?<mpaa2>.*?)$#', $section['cert'], $usa))
    {
        return $this->_movie->contentrating;
    }

    // preg_match_all() always creates both named groups and pads the one
    // that did not take part in a match with "". So isset()/is_array() say
    // nothing here; the empty entries have to be skipped explicitly.
    $linked = array();
    foreach ($usa['mpaa1'] as $m)
    {
        if ($m !== '')
        {
            $linked[] = $m;
        }
    }

    foreach ($linked as $m)
    {
        if (strpos($m, 'PG') !== FALSE)
        {
            // Return the match that contains PG. Unrated could be matched too..
            return $m;
        }
    }
    if (!empty($linked))
    {
        return $linked[0];
    }

    // No linked entry: use the unlinked form, which was unreachable before.
    foreach ($usa['mpaa2'] as $m)
    {
        if ($m !== '')
        {
            return $m;
        }
    }

    return $this->_movie->contentrating;
}
/**
 * Scrapes the genres linked from the 'main' page.
 *
 * For each genre link an existing Model_Genre with that name is looked up;
 * when none is found a new, unsaved Model_Genre is created.
 *
 * @return mixed Array of Model_Genre, or the movie's current genres when the
 *               page has no genre links.
 */
public function scrape_genres()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);

    $links = array();
    if (!preg_match_all('#<a href=\"/Sections/Genres/(?<genre>[a-zA-Z-]*)(/\">|\">)#', $page, $links))
    {
        return $this->_movie->genres;
    }

    $genres = array();
    foreach ($links['genre'] as $name)
    {
        $where = array(
            array('name', '=', $name),
        );
        $record = Model_Genre::find('first', array('where' => $where));

        if ($record == null)
        {
            $record = new Model_Genre();
            $record->name = $name;
        }

        $genres[] = $record;
    }

    return $genres;
}
/**
 * Scrapes the tagline from the "Tagline:" block of the 'main' page.
 *
 * The trailing "more" / "See more" navigation link and the "»" marker are
 * removed, the remaining markup is stripped and the text is trimmed.
 *
 * @return mixed Tagline text, or the movie's current tagline when the block
 *               is not found.
 */
public function scrape_tagline()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);
    $matches = array();
    if (!preg_match('#<h5>Tagline:</h5>\s<div\sclass="info-content">(?<tagline>.*?)</div>#s', $page, $matches))
    {
        return $this->_movie->tagline;
    }

    $raw = $matches['tagline'];

    // Drop the navigation link as a whole element. The old code removed the
    // bare text ' more' first, which turned "See more »" into a trailing
    // "See" and also ate " more" inside real taglines.
    $without_link = preg_replace('#<a\b[^>]*>\s*(?:See\s+)?more\s*(?:»|&raquo;)?\s*</a>#i', '', $raw);
    if ($without_link === null)
    {
        // preg_replace() failed; carry on with the unmodified capture.
        $without_link = $raw;
    }

    $text = strip_tags($without_link);

    // Leftover markers when the caption was not wrapped in a link.
    $text = str_replace(array('See more »', '&raquo;', '»'), '', $text);

    return trim($text);
}
/**
 * Scrapes the vote count from the first "tn15more" link on the 'main' page.
 *
 * @return mixed The digits of the link text as a string (separators and any
 *               surrounding words removed), or the movie's current votes
 *               when no such link exists or its text contains no digit.
 */
public function scrape_votes()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);
    $matches = array();
    if (preg_match('#tn15more">(?<votes>.*?)</a>#', $page, $matches))
    {
        // Keep digits only. Previously just commas were removed, so any
        // other text inside the link was returned as part of the count.
        $digits = preg_replace('#\D+#', '', $matches['votes']);
        if ($digits !== null and $digits !== '')
        {
            return $digits;
        }
    }

    return $this->_movie->votes;
}
/**
 * Scrapes the runtime in minutes from the "Runtime:" block of the 'main' page.
 *
 * @return mixed The first "<digits> min" value inside the block as a digit
 *               string, or the movie's current runtime when none is found.
 */
public function scrape_runtime()
{
    $page = $this->download_url_param($this->_urls['main'], $this->_id);
    $matches = array();

    // - \s* between </h5> and <div>: the sibling patterns for the Plot and
    //   Tagline blocks expect whitespace there, so the old pattern (which
    //   allowed none) could not match the same markup.
    // - (?:(?!</div>).)*? keeps the search inside the runtime block even
    //   though "." may now cross line breaks.
    // - \d+ instead of \d*? so an empty runtime is never returned.
    if (preg_match('#Runtime:</h5>\s*<div class="info-content">(?:(?!</div>).)*?(?<runtime>\d+) min#s', $page, $matches))
    {
        return $matches['runtime'];
    }

    return $this->_movie->runtime;
}
/**
 * Scrapes the producers from the "Produced by" table of the 'cast' page.
 *
 * For every name/role row an existing Model_Producer (same person name and
 * role) is reused when exactly one is found. When none exists, a new one is
 * created, reusing an existing Model_Person of that name if there is one.
 * Rows matching more than one existing producer are skipped as ambiguous.
 *
 * @return mixed Array of Model_Producer, or the movie's current producers
 *               when the table is missing or yields nothing.
 */
public function scrape_producers()
{
    $page = $this->download_url_param($this->_urls['cast'], $this->_id);

    $table = array();
    if (!preg_match('#(<table.*?Produced by.*?</table>)#', $page, $table))
    {
        return $this->_movie->producers;
    }

    $rows = array();
    if (!preg_match_all('#<a href="(?:.*?)/(?<id>nm[0-9]{7})/">(?<name>.{0,40})</a></td><td valign="top" nowrap="1"> .... </td><td valign="top"><a href="(?:.*?)">(?<role>.*?producer.*?)</a>#', $table[0], $rows))
    {
        return $this->_movie->producers;
    }

    echo "* joy for $this->_id<br>";

    $producers = array();
    foreach ($rows['name'] as $k => $name)
    {
        $role = trim($rows['role'][$k]);
        $existing = Model_Producer::find('all', array(
            'related' => array(
                'person' => array(
                    'where' => array(
                        array(
                            'name', '=', $name
                        )
                    )
                )
            ),
            'where' => array(
                array(
                    'role' => $role
                )
            )
        ));
        $found = empty($existing) ? 0 : count($existing);

        if ($found > 1)
        {
            // Ambiguous: several stored producers match this name/role.
            continue;
        }

        if ($found == 1)
        {
            // Reuse the stored record. It used to be fetched and then
            // dropped, so only brand-new producers were ever returned.
            $producers[] = current($existing);
            continue;
        }

        $person = Model_Person::find('first', array(
            'where' => array(
                array('name', '=', $name)
            )
        ));
        if ($person == null)
        {
            $person = new Model_Person();
            $person->name = $name;
        }

        $producer = new Model_Producer();
        $producer->person = $person;
        $producer->role = $role;
        $producers[] = $producer;
    }

    return !empty($producers) ? $producers : $this->_movie->producers;
}
/**
 * Scrapes the cast from the 'cast' page.
 *
 * Rows whose role text contains "uncredited" are skipped. A role cell such
 * as "A / B" is split on "/" into one actor entry per role. For each
 * name/role pair an existing Model_Actor is reused when exactly one is
 * found. When none exists, a new one is created, reusing an existing
 * Model_Person of that name if there is one. Pairs matching more than one
 * existing actor are skipped as ambiguous.
 *
 * @return mixed Array of Model_Actor, or the movie's current actors when the
 *               page yields nothing.
 */
public function scrape_actors()
{
    $page = $this->download_url_param($this->_urls['cast'], $this->_id);

    $rows = array();
    if (!preg_match_all('#>(?<name>.{0,40}?)</a></td><td class="ddd"> ... </td><td class="char">(?<role>.*?)</td>#', $page, $rows))
    {
        return $this->_movie->actors;
    }

    $actors = array();
    foreach ($rows['role'] as $k => $role_html)
    {
        // Leave uncredited appearances out
        // TODO: Config option
        if (strpos($role_html, 'uncredited') !== FALSE)
        {
            continue;
        }

        $name = $rows['name'][$k];
        foreach (explode('/', strip_tags($role_html)) as $r)
        {
            $r = trim($r);
            $existing = Model_Actor::find('all', array(
                'related' => array(
                    'person' => array(
                        'where' => array(
                            array(
                                'name', '=', $name
                            )
                        )
                    )
                ),
                'where' => array(
                    array(
                        'role' => $r
                    )
                )
            ));
            $found = empty($existing) ? 0 : count($existing);

            if ($found > 1)
            {
                // Ambiguous: several stored actors match this name/role.
                continue;
            }

            if ($found == 1)
            {
                // Reuse the stored record. It used to be fetched and then
                // dropped, so only brand-new actors were ever returned.
                $actors[] = current($existing);
                continue;
            }

            $person = Model_Person::find('first', array(
                'where' => array(
                    array('name', '=', $name)
                )
            ));
            if ($person == null)
            {
                $person = new Model_Person();
                $person->name = $name;
            }

            $actor = new Model_Actor();
            $actor->person = $person;
            $actor->role = $r;
            $actors[] = $actor;
        }
    }

    return !empty($actors) ? $actors : $this->_movie->actors;
}
/**
 * Scrapes a poster image URL.
 *
 * Walks the media index for the 'poster' and then the 'product' refinement,
 * opens each thumbnail's media page and returns the first image URL whose
 * advertised height is at least 300.
 *
 * @return mixed Image URL, or the movie's current `thumb` value when no
 *               suitable image is found.
 */
public function scrape_poster()
{
    foreach (array('poster', 'product') as $type)
    {
        $index_url = sprintf('http://www.imdb.com/title/%s/mediaindex?refine=%s', $this->_id, $type);
        $index_page = $this->download_url($index_url);

        $thumbs = array();
        if (!preg_match_all('#(?<url>/rg/mediaindex/unknown-thumbnail/media/rm\d{10}/tt\d{7})#', $index_page, $thumbs))
        {
            continue;
        }

        foreach ($thumbs['url'] as $path)
        {
            $media_page = $this->download_url('http://www.imdb.com' . $path);

            $image = array();
            if (!preg_match('#src="(?<url>http://ia\.media-imdb\.com/images/M/(?<str>[A-Za-z0-9_]+?)@@\._V1\._SX(?<width>\d{3})_SY(?<height>\d{3})_\.jpg)"#', $media_page, $image))
            {
                continue;
            }

            // Just return the first match.
            // TODO: Config to ask for a selection on poster, or download all of them?
            if (intval($image['height']) < 300)
            {
                continue;
            }

            return $image['url'];
        }
    }

    return $this->_movie->thumb;
}
/**
 * Scrapes the writers from the writer list on the 'main' page.
 *
 * For every name/role pair an existing Model_Writer is reused when exactly
 * one is found. When none exists, a new Model_Writer is created, reusing an
 * existing Model_Person of that name if there is one. Pairs matching more
 * than one existing writer are skipped as ambiguous.
 *
 * @return mixed Array of Model_Writer, or the movie's current writers when
 *               the page yields nothing.
 */
public function scrape_writers()
{
    // The stray scrape_producers() call that used to run here was removed.
    // Its result was discarded, so it only cost an extra download and lookups.
    $page = $this->download_url_param($this->_urls['main'], $this->_id);

    $rows = array();
    if (!preg_match_all('#writerlist/(.*?)">(?<name>.*?)</a>(?<role>.*?)<br#', $page, $rows))
    {
        return $this->_movie->writers;
    }

    $writers = array();
    foreach ($rows['name'] as $k => $name)
    {
        $role = trim($rows['role'][$k]);
        $existing = Model_Writer::find('all', array(
            'related' => array(
                'person' => array(
                    'where' => array(
                        array(
                            'name', '=', $name
                        )
                    )
                )
            ),
            'where' => array(
                array(
                    'role' => $role
                )
            )
        ));
        $found = empty($existing) ? 0 : count($existing);

        if ($found > 1)
        {
            // Ambiguous: several stored writers match this name/role.
            continue;
        }

        if ($found == 1)
        {
            // Reuse the stored record instead of dropping it.
            $writers[] = current($existing);
            continue;
        }

        $person = Model_Person::find('first', array(
            'where' => array(
                array('name', '=', $name)
            )
        ));
        if ($person == null)
        {
            $person = new Model_Person();
            $person->name = $name;
        }

        // A writer record; this used to create a Model_Producer by mistake.
        $writer = new Model_Writer();
        $writer->person = $person;
        $writer->role = $role;
        $writers[] = $writer;
    }

    return !empty($writers) ? $writers : $this->_movie->writers;
}
/* public function scrape_top250() | |
{ | |
$page = $this->download_url_param($this->_urls['main'], $this->_id); | |
$matches = array(); | |
if (preg_match_all('#Top 250: #(?<top250>\d{1,3})</a>#s', $page, $matches)) | |
{ | |
return $matches['top250'][0]; | |
} | |
else | |
{ | |
} | |
return $this->_movie->top250; | |
} */ | |
} | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment