Created
January 4, 2009 09:05
-
-
Save dann/43028 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| use strict; | |
| use warnings; | |
| use Getopt::Long; | |
| use Pod::Usage; | |
| use Web::Scraper; | |
| use Path::Class; | |
| use Digest::MD5 qw(md5_hex); | |
| use URI; | |
| use Perl6::Say; | |
| use LWP::UserAgent::ProgressBar; | |
# Command-line options, filled in by _setup_options(). Declared and
# initialized BEFORE main() runs; the original called main() first, so
# the `= ()` assignment executed only after the program had finished.
my %args = ();

main();
# Entry point: parse options and prepare the output directory, then
# scrape the listing pages and download every discovered image.
sub main {
    setup();
    download_images( collect_image_urls() );
}
# One-time initialization: command-line option parsing and creation of
# the download directory.
sub setup {
    _setup_options();
    _make_download_dir();
}
# Parse command-line options into the file-level %args hash.
# Recognized: --start_page, --end_page (both integers), --help.
# Both page bounds default to 1, i.e. fetch a single page.
sub _setup_options {
    # Fix: the hash ref was written with a mojibake yen sign (¥%args);
    # it must be a backslash reference: \%args.
    GetOptions( \%args, "start_page=i", "end_page=i", "help", )
        or pod2usage(2);
    pod2usage(0) if $args{help};

    $args{end_page}   ||= 1;
    $args{start_page} ||= 1;
}
# Ensure the download target directory exists (created recursively).
sub _make_download_dir {
    my $download_dir = dir('4uimg');
    $download_dir->mkpath;
}
# Scrape the image source URLs out of one listing page.
# Returns a list of absolute http:// URLs in list context, or an array
# ref in scalar context; empty on scrape failure or when no entries match.
sub scrape_image_links {
    my $url = shift;

    my $result;
    eval {
        $result = scraper {
            process
                'ul.entry-list>li>div.entry-footer>div.wrapper-entry-description>div.entry-description>p.entry-img-src',
                'image[]' => 'TEXT';
            result 'image';
        }
        ->scrape( URI->new($url) );
    };
    if ($@) {
        warn $@;
        return wantarray ? () : [];
    }

    # Guard: when no node matched, the scraper result may be undef —
    # dereferencing it directly would die under strict.
    my @image_urls = map { 'http://' . $_ } @{ $result || [] };

    # Fix: the array ref was written with a mojibake yen sign (¥@image_urls);
    # it must be a backslash reference: \@image_urls.
    return wantarray ? @image_urls : \@image_urls;
}
# Fetch an image, falling back to the copy hosted on 4u when the
# original host fails. Returns the raw response body.
sub fetch_image {
    my $image_url = shift;

    say "fetching image from " . $image_url;

    my $response = _fetch_original_image($image_url);
    $response = _fetch_4u_image($image_url)
        unless $response->is_success;

    return $response->content;
}
# First attempt: fetch the image from its original host.
sub _fetch_original_image {
    my $url = shift;
    return _fetch($url);
}
# Fallback: rebuild the URL of the 4u-hosted copy from the original
# image's basename and fetch that instead.
sub _fetch_4u_image {
    my $image_url = shift;

    # Fix: the binding operator was mojibake'd as =‾; it must be =~.
    # Capture into a named variable and verify the match succeeded,
    # instead of trusting a possibly-stale/undef $1.
    my ($basename) = $image_url =~ m{/([^/]+)\z};
    die "cannot extract file name from $image_url" unless defined $basename;

    $image_url = 'http://www.straightline.jp/html/found/static/upload/l/l_' . $basename;
    say "fetching image from 4u >" . $image_url;
    _fetch($image_url);
}
# Download a URL, showing a progress bar; returns the HTTP::Response.
sub _fetch {
    my $url = shift;

    my $agent = LWP::UserAgent::ProgressBar->new;
    $agent->agent("Mozilla/8.0");
    $agent->timeout(20);

    return $agent->get_with_progress($url);
}
# Write raw image bytes to 4uimg/<file_id>.jpg.
# Dies on any I/O failure instead of silently producing an empty or
# truncated file (the original never checked open/close).
sub save_image_as_file {
    my $content = shift;
    my $file_id = shift;

    my $write_path = file( '4uimg', $file_id . ".jpg" );

    open my $fh, '>', $write_path
        or die "cannot open $write_path: $!";
    binmode $fh;    # binary-safe write for image data
    print {$fh} $content;
    # Buffered write errors surface at close — must be checked too.
    close $fh
        or die "cannot close $write_path: $!";

    say "saved image to : " . $write_path;
}
# Gather image URLs from every listing page in the configured range
# ($args{start_page} .. $args{end_page}). Returns the flat list.
sub collect_image_urls {
    say "*** Collecting image urls ***";

    my @collected;
    foreach my $page ( $args{start_page} .. $args{end_page} ) {
        my $page_url = sprintf 'http://4u.straightline.jp/?page=%s', $page;
        say "collecting image url from : " . $page_url;

        my @page_links = scrape_image_links($page_url);
        sleep 1;    # be polite to the server between page fetches
        push @collected, @page_links;
    }

    return @collected;
}
# Download every image in @image_urls, skipping files already on disk.
sub download_images {
    my @image_urls = @_;

    say "*** Downloading images ***";
    for my $image_url (@image_urls) {
        download_image($image_url) unless check_file_exist($image_url);
    }
    # Fix: message previously misspelled "downloding".
    say "*** Congratulations! Completed downloading! :-) ***";
}
# Return 1 when the image for this URL was already saved to disk
# (under its content-addressed name), 0 otherwise.
sub check_file_exist {
    my $image_url = shift;

    my $file_id = file_id($image_url);
    if ( -f file( '4uimg', $file_id . ".jpg" ) ) {
        # Fix: message previously misspelled "exsits".
        say "file already exists : " . $image_url . " : " . $file_id;
        return 1;
    }
    return 0;
}
# Download a single image and store it under its content-addressed name.
sub download_image {
    my $image_url = shift;

    my $file_id = file_id($image_url);    # pure: safe to compute first
    my $content = fetch_image($image_url);
    save_image_as_file( $content, $file_id );
}
# Content-address a URL: the MD5 hex digest is used as the local
# file name, so re-downloads of the same URL are detected.
sub file_id {
    my $url = shift;
    return md5_hex($url);
}
| __END__ | |
| =head1 NAME | |
| 4u downloader | |
| =head1 SYNOPSIS | |
4udownloader.pl [options]
| Options: | |
| --start_page start page num (default 1) | |
| --end_page end page num (default 1) | |
if you give 100 to this option, this script fetches images
from page 1 to 100
| =cut | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment