Skip to content

Instantly share code, notes, and snippets.

@dann
Created January 4, 2009 09:05
Show Gist options
  • Select an option

  • Save dann/43028 to your computer and use it in GitHub Desktop.

Select an option

Save dann/43028 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Web::Scraper;
use Path::Class;
use Digest::MD5 qw(md5_hex);
use URI;
use Perl6::Say;
use LWP::UserAgent::ProgressBar;
main();
my %args = ();
sub main {
setup();
my @urls = collect_image_urls();
download_images(@urls);
}
sub setup {
_setup_options();
_make_download_dir();
}
sub _setup_options {
GetOptions( ¥%args, "start_page=i", "end_page=i", "help", )
or pod2usage(2);
pod2usage(0) if $args{help};
$args{end_page} = 1 unless $args{end_page};
$args{start_page} = 1 unless $args{start_page};
}
sub _make_download_dir {
dir('4uimg')->mkpath;
}
sub scrape_image_links {
my $url = shift;
my $result;
eval {
$result = scraper {
process
'ul.entry-list>li>div.entry-footer>div.wrapper-entry-description>div.entry-description>p.entry-img-src',
'image[]' => 'TEXT';
result 'image';
}
->scrape( URI->new($url) );
};
if ($@) {
warn $@;
return wantarray ? () : [];
}
my @image_urls = ();
foreach my $img ( @{$result} ) {
my $image_url = 'http://' . $img;
push @image_urls, $image_url;
}
wantarray ? @image_urls : ¥@image_urls;
}
sub fetch_image {
my $image_url = shift;
say "fetching image from " . $image_url;
my $res = _fetch_original_image($image_url);
unless ( $res->is_success ) {
$res = _fetch_4u_image($image_url);
}
$res->content;
}
sub _fetch_original_image {
my $image_url = shift;
_fetch($image_url);
}
sub _fetch_4u_image {
my $image_url = shift;
$image_url =‾ m{/([^/]+)$};
$image_url = 'http://www.straightline.jp/html/found/static/upload/l/l_' . $1;
say "fetching image from 4u >" . $image_url;
_fetch($image_url);
}
sub _fetch {
my $image_url = shift;
my $ua = LWP::UserAgent::ProgressBar->new;
$ua->agent("Mozilla/8.0");
$ua->timeout(20);
$ua->get_with_progress($image_url);
}
sub save_image_as_file {
my $content = shift;
my $file_id = shift;
my $write_path = file( '4uimg', $file_id . ".jpg" );
open my $FH, '>', $write_path;
binmode $FH;
print $FH $content;
close $FH;
say "saved image to : " . $write_path;
}
sub collect_image_urls {
say "*** Collecting image urls ***";
my @all_links = ();
for my $i ( $args{start_page} .. $args{end_page} ) {
my $url = sprintf 'http://4u.straightline.jp/?page=%s', $i;
say "collecting image url from : " . $url;
my @links = scrape_image_links($url);
sleep 1;
push @all_links, @links;
}
@all_links;
}
sub download_images {
my @image_urls = @_;
say "*** Downloading images ***";
for my $image_url (@image_urls) {
download_image($image_url) unless check_file_exist($image_url);
}
say "*** Congratulations! Completed downloding! :-) ***";
}
sub check_file_exist {
my $image_url = shift;
my $file_id = file_id($image_url);
if ( -f file( '4uimg', $file_id . ".jpg" ) ) {
say "file already exsits : " . $image_url . " : " . $file_id;
return 1;
}
else {
return 0;
}
}
sub download_image {
my $image_url = shift;
my $image = fetch_image($image_url);
my $file_id = file_id($image_url);
save_image_as_file( $image, $file_id );
}
sub file_id {
my $url = shift;
md5_hex($url);
}
__END__
=head1 NAME
4u downloader
=head1 SYNOPSIS
4udowonloder.pl [options]
Options:
--start_page start page num (default 1)
--end_page end page num (default 1)
if you give 100 to this option , this script fetches images
from page 1 to 100
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment