Skip to content

Instantly share code, notes, and snippets.

@keiya
Created May 12, 2012 15:35
Show Gist options
  • Select an option

  • Save keiya/2667184 to your computer and use it in GitHub Desktop.

Select an option

Save keiya/2667184 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
#
# by Keiya Chinen
use strict;
use warnings;
use utf8;
use LWP;
use LWP::ConnCache;
use Data::Dumper;
use Text::CSV_XS;
use Digest::SHA qw(sha256_hex);
# Unbuffer STDOUT so progress messages appear as they happen.
$| = 1;

# One connection cache and one user agent are shared by every request;
# access() and save() attach the cache to the agent (HTTP keep-alive).
my $conncache = LWP::ConnCache->new;
my $ua = LWP::UserAgent->new;

my $base_url = 'http://www.traileraddict.com/';
my $list_base_url = $base_url.'thefilms/';

# $FH receives one TSV row per downloaded trailer, $ER one line per failure.
# Both are opened in append mode.  Three-argument open keeps the mode separate
# from the file name, and the script stops right away when either file cannot
# be opened instead of printing to a closed handle for the whole crawl.
open(my $FH, '>>', 'trailer_db.tsv')
    or die "Could not open trailer_db.tsv for appending: $!\n";
open(my $ER, '>>', 'fail.log')
    or die "Could not open fail.log for appending: $!\n";

# 2 is LOCK_EX: take an exclusive lock on the database file.  A failed lock
# is reported but not fatal, because the crawl can still proceed without it.
flock($FH, 2)
    or warn "Could not lock trailer_db.tsv: $!\n";
# Main crawl: listing pages -> film pages -> trailer pages -> trailer file.
# For every trailer that is downloaded successfully one tab-separated row is
# appended to $FH.  Any page that cannot be fetched or parsed is skipped (the
# failure is written to $ER, either by access()/save() or right here), so a
# single bad response does not abort the whole run.

# Fields scraped from the film page, in the order they are written to the TSV.
my @movie_fields = qw(Studio Release Director Writer Cast Genre);

# Listing pages are numbered 1 to 95 (hard-coded).
LIST_PAGE:
for my $i (1 .. 95) {
    my $movie_list = access($list_base_url.$i, $ua, $conncache);
    next LIST_PAGE unless defined $movie_list;    # already logged by access()

    my @movie_detail_paths = $movie_list =~ m|<a href="/tags/(.+?)">|g;
    # scalar(@array) is the element count; $#array would be one less.
    print('movie list page '.$i.' : '.scalar(@movie_detail_paths)." movies detected.\n");

    MOVIE:
    foreach my $movie_detail_path (@movie_detail_paths) {
        print('movie page : '.$movie_detail_path."\n");
        my $movie_detail = access($base_url.'tags/'.$movie_detail_path, $ua, $conncache);
        next MOVIE unless defined $movie_detail;

        # Capture each field through a list assignment so that a field which
        # is missing on the page becomes '' instead of silently inheriting
        # the capture of an earlier, unrelated match.
        my %movie_metadata;
        for my $field (@movie_fields) {
            my ($value) = $movie_detail
                =~ m|<span style="color:#848D57;">\Q$field\E:</span> (.+?)</div>|;
            $movie_metadata{$field} = $value // '';
        }

        # \Q...\E: the path is page data, not a pattern, so match it literally.
        my @movie_trailer_paths = $movie_detail
            =~ m|<a href="/trailer/\Q$movie_detail_path\E/(.+?)"|g;

        TRAILER:
        foreach my $movie_trailer_path (uniqArray(\@movie_trailer_paths)) {
            my $trailer_url = $base_url.'trailer/'.$movie_detail_path.'/'.$movie_trailer_path;
            my $movie_trailer = access($trailer_url, $ua, $conncache);
            next TRAILER unless defined $movie_trailer;

            my ($tid) = $movie_trailer =~ m/var tkey = (\d+);/;
            unless (defined $tid) {
                print {$ER} "Could not find trailer id : $trailer_url\n";
                next TRAILER;
            }

            my $movie_metadata_raw = access($base_url.'fvare.php?tid='.$tid, $ua, $conncache);
            next TRAILER unless defined $movie_metadata_raw;

            # Start every trailer from a fresh copy of the film-level fields so
            # that values of the previous trailer cannot leak into this one.
            my %trailer_metadata = %movie_metadata;

            # The response is a '&'-separated list of URL-encoded key=value pairs.
            foreach my $movie_metadata_val (split(/&/, $movie_metadata_raw)) {
                # Limit 2 keeps any '=' that is part of the value.
                my ($key, $value) = split(/=/, $movie_metadata_val, 2);
                next unless defined $key && length $key;
                $value //= '';
                $value =~ tr/+/ /;
                $value =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack('H2', $1)/eg;
                $trailer_metadata{$key} = $value;
            }

            # The local file name is the SHA-256 of the identifying fields.
            my $digest_input = join('||',
                map { $trailer_metadata{$_} // '' }
                qw(filmtitle title Studio Release Director Writer Cast));
            # sha256_hex croaks on characters above 0xFF; encode only in that
            # case so strings that already hashed fine keep the same name.
            utf8::encode($digest_input) if $digest_input =~ /[^\x00-\xFF]/;
            $trailer_metadata{'filename'} = Digest::SHA::sha256_hex($digest_input);

            my $fileurl = $trailer_metadata{'fileurl'};
            unless (defined $fileurl && length $fileurl) {
                print {$ER} "Could not find file url : $trailer_url\n";
                next TRAILER;
            }

            # save() returns undef on failure (and logs it); there is no file
            # and no response object then, so no row is written.
            my $res = save($fileurl, $ua, $trailer_metadata{'filename'}, $conncache);
            next TRAILER unless defined $res;

            # Scalar assignment: in list context content_type would also
            # return the header parameters and shift the columns.
            my $content_type = $res->content_type;

            print {$FH} join("\t",
                (map { nonl($trailer_metadata{$_} // '') }
                    qw(filmtitle title trailerd filename)),
                $content_type,
                (map { nonl($trailer_metadata{$_} // '') } @movie_fields),
            )."\n";
        }
    }
}

# Buffered write errors only surface at close, so report them.
close($FH) or warn "Could not close trailer_db.tsv: $!\n";
close($ER) or warn "Could not close fail.log: $!\n";
# Fetch a page and hand back its decoded body.
#
# Positional arguments:
#   1. URL to request
#   2. user agent object used for the request
#   3. connection cache, attached to the agent before the request is sent
#
# Prints a "[GET] <url>" progress line to STDOUT.  Returns the decoded
# response content on success.  On an unsuccessful response a line is written
# to the failure log handle $ER and undef is returned.
sub access{
    my ($url, $agent, $cache) = @_;

    print('[GET] '.$url."\n");
    $agent->conn_cache($cache);

    my $reply = $agent->get($url);
    unless ($reply->is_success) {
        print $ER "Could not access : $url\n";
        return undef;
    }

    return scalar $reply->decoded_content;
}
# Download a URL straight into a local file.
#
# Positional arguments:
#   1. URL to request
#   2. user agent object used for the request
#   3. name of the file the response body is written to
#   4. connection cache, attached to the agent before the request is sent
#
# Prints a "[SAV] <url>" progress line to STDOUT.  Returns the response
# object on success.  On an unsuccessful response a line is written to the
# failure log handle $ER and undef is returned.
sub save {
    my ($url, $agent, $target_file, $cache) = @_;

    print('[SAV] '.$url."\n");
    $agent->conn_cache($cache);

    my $reply = $agent->get($url, ':content_file' => $target_file);
    return $reply if $reply->is_success;

    print $ER "Could not save : $url\n";
    return undef;
}
# Return the argument with every line feed and carriage return removed.
# A false argument (undef, '' or '0') is returned exactly as it was given.
sub nonl {
    my ($text) = @_;
    return $text unless $text;

    # tr with /d deletes the listed characters.
    $text =~ tr/\n\r//d;
    return $text;
}
# http://katsubemakito.net/cgiperl/variable/array/post-44.html
# Return the distinct elements of an array, keeping the first occurrence of
# each and preserving the order in which they appear in the input.
#
# Argument: reference to the array to de-duplicate (elements are compared by
# their string form).
# Returns:  the list of unique elements; in scalar context, their count.
#
# Returning the keys of a hash would also de-duplicate, but hash key order is
# unspecified, so the result order would change from run to run.
sub uniqArray{
    my $array = shift;
    my %seen;
    # $seen{$_}++ is false only the first time a value is met.
    return grep { !$seen{$_}++ } @$array;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment