mschmitt · September 17, 2014 19:35
diff --git a/twitpicdl.pl b/twitpicdl.pl
 #!/usr/bin/perl -w
 use strict;
 use diagnostics;
 use pQuery;
 use LWP::Simple;
 use JSON;
 use Getopt::Std;

 # A script to download a user's pics from twitpic before the shutdown 
 # in September 2014. 

 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 
 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 
 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 
 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 
 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 
 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

 # We get all printed data from a JSON object, thus I assume that everything
 # is UTF8. Silly me.
 binmode(STDOUT, ":utf8");

 # Parse options: -u <user> and -d <output directory> are both mandatory.
 # getopts returns false when it meets an unknown switch or a switch that
 # lacks its argument; that is treated as a usage error as well.
 my $opts_ok = getopts('u:d:', \my %opts);
 unless ($opts_ok
 	and defined($opts{'u'}) and length($opts{'u'})
 	and defined($opts{'d'}) and length($opts{'d'})){
 	print "Useage: $0 -u <user> -d <output directory>\n";
 	# Non-zero status so that a calling script can detect the failure.
 	exit 1;
 }

 # Open the output HTML page for writing. The three-argument form of open
 # keeps the mode separate from the path, so a directory name with leading
 # whitespace or mode characters is taken literally.
 open my $html_out, '>', "$opts{'d'}/index.html" or die "Can't write to $opts{'d'}/index.html: $!\n";
 binmode($html_out, ":utf8");
 # Page header; the matching footer is written once all pages are scraped.
 print $html_out <<Done;
 <html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 </head>
 <body>
 Done

 # Create directory for the downloaded photos. An already existing directory
 # is reused; a failing mkdir is reported together with its reason instead of
 # surfacing later as an unexplained chdir error.
 my $pix_dir = "$opts{'d'}/pix";
 unless (-d $pix_dir){
 	mkdir $pix_dir or die "Can't create $pix_dir: $!\n";
 }
 # dump_one_twitpic stores the images relative to the current directory.
 chdir $pix_dir or die "Can't chdir to $opts{'d'}/pix o.O\n";

 # Scrape the index pages with photos. Start at page 1:
 my $url_userhome = 'http://twitpic.com/photos/' . $opts{'u'};
 my $page = 1;
 while(1){
 	my $url_thispage = $url_userhome . '?page=' . $page;
 	my $pq_thispage = pQuery($url_thispage);
 	# Every photo wrapper found on this page is handed to dump_one_twitpic.
 	$pq_thispage->find('div.user-photo-wrap')->each(\&dump_one_twitpic);
 	# Stop looping if there is no "Next" link on the current page.
 	last if ($pq_thispage->find('p.pagination')->find('a:contains("Next")')->size() == 0);
 	$page++;
 	# Break after first page (test only)
 	# last;
 }

 # Finish the index page.
 print $html_out <<Done;
 <em>R.I.P. Twitpic, we will miss you. :-(</em> <br />
 </body>
 </html>
 Done

 # Buffered write errors (e.g. a full disk) only show up when the handle is
 # closed, so close explicitly and check the result.
 close $html_out or die "Can't finish writing $opts{'d'}/index.html: $!\n";

 # Archive a single picture. Used as the callback of pQuery's each() in the
 # index page loop; the element to process is taken from $_.
 #
 # Steps: resolve the permalink, look up the full size image URL on the
 # '/full' page, read timestamp/message/type from the Twitpic.Config.set(...)
 # JSON on the picture page, download the image into the current directory
 # and append an entry to the index page behind $html_out.
 #
 # A picture whose pages cannot be fetched or parsed, or whose download
 # fails, is reported with warn and skipped, so that one broken entry does
 # not abort the whole run. Always returns 1.
 sub dump_one_twitpic {
 	# Find the per-pic permalink from the div on the index page.
 	my $pq_div      = pQuery($_);
 	my $pq_anchor   = $pq_div->find('div.user-photo')->find('a')->toHtml;
 	unless (defined($pq_anchor) and length($pq_anchor)){
 		warn "Skipping an index entry without a permalink anchor\n";
 		return 1;
 	}
 	my $anchor_dom  = pQuery::DOM->fromHTML($pq_anchor);
 	my $anchor_href = defined($anchor_dom) ? $anchor_dom->getAttribute('href') : undef;
 	unless (defined($anchor_href)){
 		warn "Skipping an index entry whose anchor has no href\n";
 		return 1;
 	}
 	$anchor_href =~ s/^\///; # Leading slash, omg.

 	# The href comes from a remote page and ends up in a local file name,
 	# so only a plain identifier is accepted (no slashes, no dots).
 	unless ($anchor_href =~ /\A\w[\w-]*\z/){
 		warn "Skipping unexpected permalink '$anchor_href'\n";
 		return 1;
 	}

 	# The two interesting URLs for the next step.
 	my $url_thispic = 'http://twitpic.com/' . $anchor_href;
 	my $url_fullpic = 'http://twitpic.com/' . $anchor_href . '/full';

 	# Load the full size page for the full size IMG SRC
 	my $pq_full_page = pQuery($url_fullpic);
 	my $pq_full_img  = $pq_full_page->find('div#media-full')->find('img')->toHtml;
 	my $pq_full_dom  = (defined($pq_full_img) and length($pq_full_img))
 		? pQuery::DOM->fromHTML($pq_full_img)
 		: undef;
 	my $pq_full_src  = defined($pq_full_dom) ? $pq_full_dom->getAttribute('src') : undef;
 	unless (defined($pq_full_src) and length($pq_full_src)){
 		warn "Skipping $url_thispic: no full size image found\n";
 		return 1;
 	}

 	# Back to the principal per-image page for obscure metadata
 	my $img_page = get($url_thispic);
 	unless (defined($img_page)){
 		warn "Skipping $url_thispic: can't fetch the picture page\n";
 		return 1;
 	}
 	# The metadata is the argument of a Twitpic.Config.set(...) call that
 	# sits on one line; if several lines match, the last one wins.
 	my $tpconfig_raw;
 	foreach my $html_line (split /\n/, $img_page){
 		if ($html_line =~ /Twitpic\.Config\.set\((.+)\);\s*$/){
 			$tpconfig_raw = $1;
 		}
 	}
 	unless (defined($tpconfig_raw)){
 		warn "Skipping $url_thispic: no Twitpic.Config data on the page\n";
 		return 1;
 	}
 	# decode_json dies on malformed input; catch that and skip the picture.
 	my $tpconfig = eval { decode_json($tpconfig_raw) };
 	unless (ref($tpconfig) eq 'HASH' and ref($tpconfig->{'media'}) eq 'HASH'){
 		my $reason = $@ ? $@ : "unexpected structure\n";
 		warn "Skipping $url_thispic: bad Twitpic.Config data: $reason";
 		return 1;
 	}
 	my $timestamp = $tpconfig->{'media'}->{'timestamp'};
 	my $message   = $tpconfig->{'media'}->{'message'};
 	my $type      = $tpconfig->{'media'}->{'type'};
 	$timestamp = '' unless defined($timestamp);
 	$message   = '' unless defined($message);

 	# The type becomes the file extension, so it gets the same scrutiny as
 	# the permalink.
 	unless (defined($type) and $type =~ /\A\w+\z/){
 		warn "Skipping $url_thispic: missing or unexpected media type\n";
 		return 1;
 	}

 	# This is where the image gets downloaded.
 	print "$timestamp $message $url_thispic\n";
 	my $file_name = $anchor_href . '.' . $type;
 	my $status    = getstore($pq_full_src, $file_name); # Remember, chdir was done above.
 	unless (is_success($status)){
 		warn "Skipping $url_thispic: download of $pq_full_src failed with status $status\n";
 		return 1;
 	}

 	# Add to the index page. Text taken from the remote JSON is escaped so
 	# that markup inside a caption cannot break or alter the page.
 	my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;');
 	my $escape = sub {
 		my ($text) = @_;
 		$text =~ s/([&<>"])/$entity_for{$1}/g;
 		return $text;
 	};
 	print $html_out "<b>" . $escape->($timestamp) . "</b><br />\n";
 	print $html_out $escape->($message) . "<br />\n";
 	print $html_out "<img src=\"pix/$file_name\" /><br />\n";
 	print $html_out "<hr />\n";
 	return 1;
 }
	#!/usr/bin/perl -w
	use strict;
	use diagnostics;
	use pQuery;
	use LWP::Simple;
	use JSON;
	use Getopt::Std;

	# A script to download a user's pics from twitpic before the shutdown
	# in September 2014.

	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	# We get all printed data from a JSON object, thus I assume that everything
	# is UTF8. Silly me.
	binmode(STDOUT, ":utf8");

	# Parse options: -u <user> and -d <output directory> are both mandatory.
	# getopts returns false when it meets an unknown switch or a switch that
	# lacks its argument; that is treated as a usage error as well.
	my $opts_ok = getopts('u:d:', \my %opts);
	unless ($opts_ok
		and defined($opts{'u'}) and length($opts{'u'})
		and defined($opts{'d'}) and length($opts{'d'})){
		print "Useage: $0 -u <user> -d <output directory>\n";
		# Non-zero status so that a calling script can detect the failure.
		exit 1;
	}

	# Open the output HTML page for writing. The three-argument form of open
	# keeps the mode separate from the path, so a directory name with leading
	# whitespace or mode characters is taken literally.
	open my $html_out, '>', "$opts{'d'}/index.html" or die "Can't write to $opts{'d'}/index.html: $!\n";
	binmode($html_out, ":utf8");
	# Page header; the matching footer is written once all pages are scraped.
	print $html_out <<Done;
	<html xmlns="http://www.w3.org/1999/xhtml">
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	</head>
	<body>
	Done

	# Create directory for the downloaded photos. An already existing directory
	# is reused; a failing mkdir is reported together with its reason instead of
	# surfacing later as an unexplained chdir error.
	my $pix_dir = "$opts{'d'}/pix";
	unless (-d $pix_dir){
		mkdir $pix_dir or die "Can't create $pix_dir: $!\n";
	}
	# dump_one_twitpic stores the images relative to the current directory.
	chdir $pix_dir or die "Can't chdir to $opts{'d'}/pix o.O\n";

	# Scrape the index pages with photos. Start at page 1:
	my $url_userhome = 'http://twitpic.com/photos/' . $opts{'u'};
	my $page = 1;
	while(1){
		my $url_thispage = $url_userhome . '?page=' . $page;
		my $pq_thispage = pQuery($url_thispage);
		# Every photo wrapper found on this page is handed to dump_one_twitpic.
		$pq_thispage->find('div.user-photo-wrap')->each(\&dump_one_twitpic);
		# Stop looping if there is no "Next" link on the current page.
		last if ($pq_thispage->find('p.pagination')->find('a:contains("Next")')->size() == 0);
		$page++;
		# Break after first page (test only)
		# last;
	}

	# Finish the index page.
	print $html_out <<Done;
	<em>R.I.P. Twitpic, we will miss you. :-(</em> <br />
	</body>
	</html>
	Done

	# Buffered write errors (e.g. a full disk) only show up when the handle is
	# closed, so close explicitly and check the result.
	close $html_out or die "Can't finish writing $opts{'d'}/index.html: $!\n";

	# Archive a single picture. Used as the callback of pQuery's each() in the
	# index page loop; the element to process is taken from $_.
	#
	# Steps: resolve the permalink, look up the full size image URL on the
	# '/full' page, read timestamp/message/type from the Twitpic.Config.set(...)
	# JSON on the picture page, download the image into the current directory
	# and append an entry to the index page behind $html_out.
	#
	# A picture whose pages cannot be fetched or parsed, or whose download
	# fails, is reported with warn and skipped, so that one broken entry does
	# not abort the whole run. Always returns 1.
	sub dump_one_twitpic {
		# Find the per-pic permalink from the div on the index page.
		my $pq_div    = pQuery($_);
		my $pq_anchor = $pq_div->find('div.user-photo')->find('a')->toHtml;
		unless (defined($pq_anchor) and length($pq_anchor)){
			warn "Skipping an index entry without a permalink anchor\n";
			return 1;
		}
		my $anchor_dom  = pQuery::DOM->fromHTML($pq_anchor);
		my $anchor_href = defined($anchor_dom) ? $anchor_dom->getAttribute('href') : undef;
		unless (defined($anchor_href)){
			warn "Skipping an index entry whose anchor has no href\n";
			return 1;
		}
		$anchor_href =~ s/^\///; # Leading slash, omg.

		# The href comes from a remote page and ends up in a local file name,
		# so only a plain identifier is accepted (no slashes, no dots).
		unless ($anchor_href =~ /\A\w[\w-]*\z/){
			warn "Skipping unexpected permalink '$anchor_href'\n";
			return 1;
		}

		# The two interesting URLs for the next step.
		my $url_thispic = 'http://twitpic.com/' . $anchor_href;
		my $url_fullpic = 'http://twitpic.com/' . $anchor_href . '/full';

		# Load the full size page for the full size IMG SRC
		my $pq_full_page = pQuery($url_fullpic);
		my $pq_full_img  = $pq_full_page->find('div#media-full')->find('img')->toHtml;
		my $pq_full_dom  = (defined($pq_full_img) and length($pq_full_img))
			? pQuery::DOM->fromHTML($pq_full_img)
			: undef;
		my $pq_full_src  = defined($pq_full_dom) ? $pq_full_dom->getAttribute('src') : undef;
		unless (defined($pq_full_src) and length($pq_full_src)){
			warn "Skipping $url_thispic: no full size image found\n";
			return 1;
		}

		# Back to the principal per-image page for obscure metadata
		my $img_page = get($url_thispic);
		unless (defined($img_page)){
			warn "Skipping $url_thispic: can't fetch the picture page\n";
			return 1;
		}
		# The metadata is the argument of a Twitpic.Config.set(...) call that
		# sits on one line; if several lines match, the last one wins.
		my $tpconfig_raw;
		foreach my $html_line (split /\n/, $img_page){
			if ($html_line =~ /Twitpic\.Config\.set\((.+)\);\s*$/){
				$tpconfig_raw = $1;
			}
		}
		unless (defined($tpconfig_raw)){
			warn "Skipping $url_thispic: no Twitpic.Config data on the page\n";
			return 1;
		}
		# decode_json dies on malformed input; catch that and skip the picture.
		my $tpconfig = eval { decode_json($tpconfig_raw) };
		unless (ref($tpconfig) eq 'HASH' and ref($tpconfig->{'media'}) eq 'HASH'){
			my $reason = $@ ? $@ : "unexpected structure\n";
			warn "Skipping $url_thispic: bad Twitpic.Config data: $reason";
			return 1;
		}
		my $timestamp = $tpconfig->{'media'}->{'timestamp'};
		my $message   = $tpconfig->{'media'}->{'message'};
		my $type      = $tpconfig->{'media'}->{'type'};
		$timestamp = '' unless defined($timestamp);
		$message   = '' unless defined($message);

		# The type becomes the file extension, so it gets the same scrutiny as
		# the permalink.
		unless (defined($type) and $type =~ /\A\w+\z/){
			warn "Skipping $url_thispic: missing or unexpected media type\n";
			return 1;
		}

		# This is where the image gets downloaded.
		print "$timestamp $message $url_thispic\n";
		my $file_name = $anchor_href . '.' . $type;
		my $status    = getstore($pq_full_src, $file_name); # Remember, chdir was done above.
		unless (is_success($status)){
			warn "Skipping $url_thispic: download of $pq_full_src failed with status $status\n";
			return 1;
		}

		# Add to the index page. Text taken from the remote JSON is escaped so
		# that markup inside a caption cannot break or alter the page.
		my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;');
		my $escape = sub {
			my ($text) = @_;
			$text =~ s/([&<>"])/$entity_for{$1}/g;
			return $text;
		};
		print $html_out "<b>" . $escape->($timestamp) . "</b><br />\n";
		print $html_out $escape->($message) . "<br />\n";
		print $html_out "<img src=\"pix/$file_name\" /><br />\n";
		print $html_out "<hr />\n";
		return 1;
	}