Created
March 2, 2011 15:10
-
-
Save seungwon0/851068 to your computer and use it in GitHub Desktop.
Scrap images from dcinside gallery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# | |
# dcscrap - scrap images from dcinside gallery | |
# | |
# dcscrap downloads image files from dcinside gallery. | |
# | |
# Dcinside is a South Korean internet forum. Initially established as | |
# a community dedicated to digital cameras and photography, it has met | |
# broad notoriety in Korea due to its unique nature. | |
# | |
# Original One-Liner: https://gist.github.com/729723 | |
# | |
# Seungwon Jeong <[email protected]> | |
# | |
# Copyright (C) 2010 by Seungwon Jeong | |
# | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, but | |
# WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program. If not, see | |
# <http://www.gnu.org/licenses/>. | |
use strict; | |
use warnings; | |
use 5.010; | |
use utf8; | |
use English qw( -no_match_vars ); | |
use LWP::Simple qw( get ); | |
use Encode qw( encode_utf8 ); | |
use Getopt::Long; | |
use IO::Prompt; | |
use Image::ExifTool qw( ImageInfo ); | |
use autodie; | |
# Print the version banner, licence notice and command-line help to STDOUT.
#
# Takes no arguments.  Returns nothing (bare return); callers decide
# whether to exit afterwards.
sub print_usage {

    # Single-quoted heredoc: nothing is interpolated, so '@', '$' and the
    # shell-quoted example URL are emitted literally.  The terminator must
    # stand alone on its line.
    print <<'END_USAGE';
dcscrap 0.5.1
Copyright (C) 2011 by Seungwon Jeong
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.

Usages:
  dcscrap [OPTIONS] ADDRESS
  dcscrap [OPTIONS] GALLERY NUM1 [NUM2]

Arguments:
  ADDRESS       the address of a post
  GALLERY       the name of a gallery (e.g. girlsgeneration_new)
  NUM1, NUM2    the serial number of a post

Options:
  -C, --directory DIR    change to directory DIR
  -o, --overwrite        always overwrite
  -n, --no-overwrite     do not overwrite

Examples:
  dcscrap http://gall.dcinside.com/Jessica/1495723
  dcscrap 'http://gall.dcinside.com/list.php?id=yoona&no=1011909&page=16&bbs='
  dcscrap --directory ~/Pictures taehee 96482
  dcscrap -n racinggirl 227120 227180

Please report bugs to <[email protected]>.
END_USAGE

    return;
}
# Extract the gallery name and the post number from a dcinside post address.
#
# Two address shapes are recognised:
#   http://gall.dcinside.com/<gallery>/<num>
#   http://gall.dcinside.com/list.php?id=<gallery>&no=<num>[&...]
#
# Parameters:
#   $address - the address string to parse
#
# Returns the list (gallery, num) on success, or an empty list when the
# address is undefined or matches neither shape.
sub get_gallery_and_num_from {
    my $address = shift;

    return if !defined $address;

    my $gallery_url = quotemeta 'http://gall.dcinside.com';

    # \A and \z anchor to the real start/end of the string.  With ^ and $
    # under /m, a trailing newline or any matching line inside multi-line
    # input would have been accepted.

    # e.g. http://gall.dcinside.com/Jessica/1495723
    my $pattern1
        = qr{\A $gallery_url / (?<gallery>[^/]+) / (?<num>\d+) \z}xms;

    # e.g. http://gall.dcinside.com/list.php?id=yoona&no=1011909&...
    # No end anchor: further query parameters may follow the post number.
    my $pattern2 = qr{\A $gallery_url / list[.]php
                      [?]id= (?<gallery>[^&]+) &no= (?<num>\d+)}xms;

    if ( $address =~ $pattern1 || $address =~ $pattern2 ) {

        # %LAST_PAREN_MATCH (%+) holds the named captures of whichever
        # pattern matched last.
        return $LAST_PAREN_MATCH{gallery}, $LAST_PAREN_MATCH{num};
    }

    return;
}
# Download every image of every post in a range and save them to disk.
#
# Parameters (single hash reference):
#   gallery      - gallery name
#   num1, num2   - first and last post number of the range (inclusive)
#   overwrite    - passed through to save_image()
#   no_overwrite - passed through to save_image()
#
# Returns an empty list/undef when the gallery is not available,
# otherwise 1 (even if individual downloads failed).
sub get_images {
    my ($arg_ref) = @_;

    my ( $gallery, $first_num, $last_num, $overwrite, $no_overwrite )
        = @{$arg_ref}{qw( gallery num1 num2 overwrite no_overwrite )};

    gallery_is_available($gallery) or return;

    for my $post_num ( $first_num .. $last_num ) {
        say "[${gallery} #${post_num}]";    # Print progress

        my @srcs = get_image_srcs( $gallery, $post_num );

        # The position of the image within the post becomes part of the
        # file name, so it advances even when a download fails.
        my $index = -1;

        SRC:
        for my $src (@srcs) {
            $index++;

            say 'Downloading image...';    # Print progress

            my $data = get($src);
            if ( !defined $data ) {
                warn "Cannot download $src\n";
                next SRC;
            }

            # File name: <gallery>_<post>_<index>[.<ext>]
            my $filename = "${gallery}_${post_num}_${index}";
            my $ext      = get_img_file_ext_for( \$data );
            $filename .= ".$ext" if defined $ext && $ext ne q{};

            save_image(
                {   image_ref    => \$data,
                    filename     => $filename,
                    overwrite    => $overwrite,
                    no_overwrite => $no_overwrite,
                }
            );
        }
    }

    return 1;
}
# Fetch one post and return the addresses of the images embedded in it.
#
# Parameters:
#   $gallery - gallery name
#   $num     - serial number of the post
#
# Returns the list of image addresses found in the page.  Returns an empty
# list (after a warning on STDERR) when the page cannot be fetched, when
# dcinside reports that it is busy, when the post has been deleted, or when
# the page contains no matching image.
sub get_image_srcs {
    my ( $gallery, $num ) = @_;

    my $address = "http://gall.dcinside.com/list.php?id=${gallery}&no=${num}";

    my $webpage = get($address);
    if ( !defined $webpage ) {
        warn "Cannot fetch $address.\n";
        return;
    }

    # The page is intentionally NOT passed through encode_utf8().
    # LWP::Simple::get() hands back the decoded content (a character
    # string), and this file is compiled under 'use utf8', so the Korean
    # literals in dcinside_is_busy() / post_is_deleted() are character
    # strings too.  Encoding the page to UTF-8 octets first made those
    # patterns impossible to match.
    return if dcinside_is_busy($webpage);
    return if post_is_deleted($webpage);

    # src='http://dcimg1.dcinside.com/viewimage.php...'
    # or
    # src="http://uccfs.paran.com/PUD/..."
    my $pattern = qr{src=['"]
                     ( http://
                         (?:
                             dcimg1[.]dcinside[.]com/viewimage[.]php
                         |
                             uccfs[.]paran[.]com/PUD/
                         )
                         [^'"]+
                     )}xms;

    # List-context /g match: one element per captured address.
    my @image_srcs = ( $webpage =~ /$pattern/xmsg );

    if ( !@image_srcs ) {
        warn "Cannot find any images.\n";
    }

    return @image_srcs;
}
# Write image data to a file, honouring the overwrite policy.
#
# Parameters (single hash reference):
#   image_ref    - reference to a scalar holding the raw image data
#   filename     - name of the file to write
#   overwrite    - true: replace an existing file without asking
#   no_overwrite - true: never replace an existing file (checked first)
#
# When the file exists and neither flag is set, the user is asked
# interactively.  Returns nothing.  File errors are raised by autodie.
sub save_image {
    my ($arg_ref) = @_;

    my ( $image_ref, $filename, $overwrite, $no_overwrite )
        = @{$arg_ref}{qw( image_ref filename overwrite no_overwrite )};

    if ( -f $filename ) {
        return if $no_overwrite;

        # Ask only when the caller did not force overwriting.
        return
            if !$overwrite
                && !prompt( "Overwrite '$filename'? [y/n] ", -yes_no );
    }

    say "Saving ${filename}...";    # Print progress

    open my $out_fh, '>', $filename;
    binmode $out_fh;                # image data is binary
    print {$out_fh} ${$image_ref};
    close $out_fh;

    return;
}
# Determine the file name extension for image data held in memory.
#
# Parameters:
#   $image_ref - reference to a scalar holding the raw image data
#
# Returns 'bmp', 'gif', 'jpg' or 'png' according to the FileType reported
# by Image::ExifTool's ImageInfo(), or an empty list (undef in scalar
# context) when the type is missing or is not one of those four.
sub get_img_file_ext_for {
    my $image_ref = shift;

    # A lookup table replaces the former given/when construct, which is
    # experimental/deprecated and warns on modern perls.
    my %ext_for = (
        BMP  => 'bmp',
        GIF  => 'gif',
        JPEG => 'jpg',
        PNG  => 'png',
    );

    my $image_info = ImageInfo( $image_ref, 'FileType' );
    my $file_type  = $image_info->{FileType};

    return if !defined $file_type;
    return if !exists $ext_for{$file_type};

    return $ext_for{$file_type};
}
# Check that a gallery exists and that dcinside is able to serve it.
#
# Parameters:
#   $gallery - gallery name
#
# Returns 1 when the gallery's list page can be fetched, does not carry
# the "board has not been created" notice and dcinside is not busy.
# Otherwise warns on STDERR and returns an empty list (undef in scalar
# context).
sub gallery_is_available {
    my $gallery = shift;

    my $address = "http://gall.dcinside.com/list.php?id=${gallery}";

    my $webpage = get($address);
    if ( !defined $webpage ) {
        warn "Cannot fetch $address.\n";
        return;
    }

    # The page is intentionally NOT passed through encode_utf8().
    # LWP::Simple::get() hands back the decoded content (a character
    # string), and under 'use utf8' the Korean literal below is a character
    # string as well.  Encoding the page to UTF-8 octets first made the
    # pattern impossible to match, so missing galleries went undetected.
    if ( $webpage =~ /생성되지 [ ] 않은 [ ] 게시판/xms ) {
        warn "'$gallery' gallery doesn't seem to exist.\n";
        return;
    }

    return if dcinside_is_busy($webpage);

    return 1;
}
# Tell whether a fetched page carries dcinside's "too many users" notice.
#
# Parameters:
#   $webpage - the page text to inspect
#
# Returns 1 (after warning on STDERR) when the notice is present,
# otherwise an empty list (undef in scalar context).
sub dcinside_is_busy {
    my ($webpage) = @_;

    # Guard clause: nothing to report unless the notice is found.
    return if $webpage !~ /사용자가 [ ] 많아/xms;

    warn "Dcinside is busy now.\n";

    return 1;
}
# Tell whether a fetched page carries the "this post has been deleted"
# notice.
#
# Parameters:
#   $webpage - the page text to inspect
#
# Returns 1 (after warning on STDERR) when the notice is present,
# otherwise an empty list (undef in scalar context).
sub post_is_deleted {
    my ($webpage) = @_;

    # Guard clause: nothing to report unless the notice is found.
    return if $webpage !~ /해당 [ ] 게시물은 [ ] 삭제/xms;

    warn "The post has been deleted.\n";

    return 1;
}
# ----------------------------------------------------------------------
# Main program: parse the command line, validate the post range and
# download the images.
#
# Exit status: 2 on a usage error, 1 when get_images() reports failure,
# 0 otherwise.
# ----------------------------------------------------------------------

my $directory;
my $overwrite;
my $no_overwrite;

# The short forms -o and -n are documented in the usage text; declare them
# explicitly instead of relying on Getopt::Long's auto-abbreviation.
my @options = (
    'C|directory=s'  => \$directory,
    'o|overwrite'    => \$overwrite,
    'n|no-overwrite' => \$no_overwrite,
);

if ( !GetOptions(@options) ) {
    print_usage();
    exit 2;
}

if ( defined $directory ) {
    die "'$directory' directory does not exist.\n" if !-d $directory;
    chdir $directory;    # failure is raised by autodie
}

my ( $gallery, $num1, $num2 );

# if/elsif replaces the former given/when construct, which is
# experimental/deprecated and warns on modern perls.
my $arg_count = scalar @ARGV;

if ( $arg_count == 1 ) {

    # dcscrap [OPTIONS] ADDRESS
    my $address = shift @ARGV;

    ( $gallery, $num1 ) = get_gallery_and_num_from($address);

    if ( !defined $gallery || !defined $num1 ) {
        print_usage();
        exit 2;
    }
}
elsif ( $arg_count == 2 || $arg_count == 3 ) {

    # dcscrap [OPTIONS] GALLERY NUM1 [NUM2]
    $gallery = shift @ARGV;
    $num1    = shift @ARGV;
    $num2    = shift @ARGV;    # undef when only NUM1 was given
}
else {
    print_usage();
    exit 2;
}

# A single post is a range of length one.
$num2 //= $num1;

# Positive decimal integers only.  \A/\z and [0-9] are used rather than
# ^/$ and \d so that a trailing newline or non-ASCII digits are rejected
# before the values are used numerically.
my $num_pattern = qr/\A [1-9] [0-9]* \z/xms;

if ( $num1 !~ $num_pattern || $num2 !~ $num_pattern ) {
    print_usage();
    exit 2;
}

# Accept the range in either order.
if ( $num1 > $num2 ) {
    ( $num1, $num2 ) = ( $num2, $num1 );
}

my $arg_ref = {
    gallery      => $gallery,
    num1         => $num1,
    num2         => $num2,
    overwrite    => $overwrite,
    no_overwrite => $no_overwrite,
};

get_images($arg_ref) or exit 1;
__END__ | |
=head1 NAME | |
dcscrap - scrap images from dcinside gallery | |
=head1 SYNOPSIS | |
=over | |
=item dcscrap [OPTIONS] ADDRESS | |
=item dcscrap [OPTIONS] GALLERY NUM1 [NUM2] | |
=back | |
=head1 DESCRIPTION | |
dcscrap downloads image files from dcinside gallery. | |
Dcinside is a South Korean internet forum. Initially established as a | |
community dedicated to digital cameras and photography, it has met | |
broad notoriety in Korea due to its unique nature. | |
=head1 ARGUMENTS | |
=over | |
=item ADDRESS | |
the address of a post | |
=item GALLERY | |
the name of a gallery (e.g. girlsgeneration_new) | |
=item NUM1, NUM2 | |
the serial number of a post | |
=back | |
=head1 OPTIONS | |
=over | |
=item -C, --directory F<DIR> | |
change to directory F<DIR> | |
=item -o, --overwrite | |
always overwrite | |
=item -n, --no-overwrite | |
do not overwrite | |
=back | |
=head1 EXAMPLES

=over

    dcscrap http://gall.dcinside.com/Jessica/1495723
    dcscrap 'http://gall.dcinside.com/list.php?id=yoona&no=1011909&page=16&bbs='
    dcscrap --directory ~/Pictures taehee 96482
    dcscrap -n racinggirl 227120 227180

=back

=head1 URL | |
L<https://gist.github.com/851068> | |
=head1 AUTHOR | |
Seungwon Jeong E<lt>[email protected]E<gt>
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment