Last active
February 9, 2017 18:58
-
-
Save gray/796139 to your computer and use it in GitHub Desktop.
Scrape IMDb to get a list of the best classics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use 5.010; | |
use strict; | |
use warnings; | |
use Carp::Always; | |
use List::Util qw(any none); | |
use URI; | |
use URI::QueryParam; | |
use Web::Scraper::LibXML; | |
# Release years to query, one search per year (both ends inclusive).
use constant MIN_YEAR => 1930;
use constant MAX_YEAR => 1985;

# Lower bounds passed to the search as num_votes / user_rating.
use constant MIN_VOTES  => 100;
use constant MIN_RATING => 6.5;

# Endpoint that the search query parameters are attached to.
use constant BASE_URI => 'http://www.imdb.com/search/title';

# A movie is printed only if it carries at least one of these genres ...
use constant INCLUDE_GENRES => (
    'action',  'adventure', 'comedy',   'crime',
    'mystery', 'sci-fi',    'thriller', 'war',
);

# ... and none of these.
use constant EXCLUDE_GENRES => ('documentary', 'musical');
# IO::Handle provides the autoflush method. Perl only loads it on demand
# from 5.14 onwards, and this script declares 5.010 as its minimum, so load
# it explicitly.
use IO::Handle;

# Emit UTF-8 on both output streams (titles and cast names may be non-ASCII).
binmode $_, ':utf8' for *STDOUT, *STDERR;

# Unbuffer STDOUT so each result line appears as soon as it is printed.
STDOUT->autoflush(1);
# Scraper for a single page of search results.
#
# The scraped structure is a hash reference with:
#   results - array reference, one hash per movie with the keys
#             title, url, rating, votes (scalars) and genres, cast (lists)
#   next    - href of the "next page" link, when the page has one
my $scraper = scraper {
    process 'div.lister-list div.lister-item-content',
        'results[]' => scraper {
            # Title text and link come from the same anchor.
            process 'a[href ^= "/title/tt"]',
                title => 'TEXT', url => '@href';

            # Genres arrive as one comma-separated string; drop all
            # whitespace, then split it into individual names.
            process 'span.genre', 'genres[]' => sub {
                (my $genres = $_[0]->as_text) =~ s/\s+//g;
                return split /,/, $genres;
            };

            process 'div.rating-list > meta[itemprop="ratingValue"]',
                rating => '@content';
            process 'div.rating-list > meta[itemprop="ratingCount"]',
                votes => '@content';

            # The cast is the text following "Stars:" in a class-less
            # paragraph.
            process 'p[class=""]', 'cast[]' => sub {
                my ($cast) = ($_[0]->as_text // '') =~ /\sStars:\s(.*)\s/s;

                # A paragraph without a "Stars:" credit leaves $cast
                # undefined; bail out before the substitution below so it
                # does not warn about an uninitialized value.
                return unless defined $cast;

                $cast =~ s/^ \s+ | \s+ $//gx;
                return split /\s*,\s*/, $cast;
            };
        };

    # Link to the following page of results, if any.
    process 'div.nav a.next-page', next => '@href';
};
# Query parameters shared by every request. The per-year release_date
# filter is added later, on a clone of this URI.
# NOTE(review): the trailing comma in num_votes / user_rating presumably
# denotes an open-ended "minimum," range - confirm against IMDb's search.
my @common_params = (
    count       => 100,
    num_votes   => join(',', MIN_VOTES, ''),
    sort        => 'user_rating,desc',
    title_type  => 'feature',
    user_rating => join(',', MIN_RATING, ''),
);

my $uri = URI->new(BASE_URI);
$uri->query_form(@common_params);
# Query the search one release year at a time, follow the "next page" links
# for that year, and print one line per movie that passes the genre filters:
#
#   year | rating | votes | title | url | genre,genre | star, star
for my $year (MIN_YEAR .. MAX_YEAR) {
    # Work on a copy so the shared base URI keeps only the common parameters.
    my $page_uri = $uri->clone;
    $page_uri->query_param(release_date => "$year,$year");

    # Keep going until a page has no "next" link.
    while ($page_uri) {
        my $res = $scraper->scrape($page_uri);

        MOVIE:
        for my $movie (@{ $res->{results} // [] }) {
            my %genres = map { lc $_ => 1 } @{ $movie->{genres} // [] };

            # Drop anything in an excluded genre, then anything that has
            # none of the wanted genres.
            next MOVIE if any  { exists $genres{$_} } EXCLUDE_GENRES;
            next MOVIE if none { exists $genres{$_} } INCLUDE_GENRES;

            # Fields the scraper could not find are printed as empty
            # strings instead of triggering "uninitialized" warnings.
            say join ' | ', $year,
                (map { $_ // '' } @{$movie}{qw(rating votes title url)}),
                join(',', sort keys %genres),
                join(', ', @{ $movie->{cast} // [] });
        }

        $page_uri = $res->{next};

        # Pause after every request, including follow-up pages of the
        # same year, so the site is never hit back-to-back.
        sleep 1;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment