Skip to content

Instantly share code, notes, and snippets.

@mschmitt
Created September 17, 2014 19:35
Show Gist options
  • Save mschmitt/67fa9f56b63cd169eb4f to your computer and use it in GitHub Desktop.
Save mschmitt/67fa9f56b63cd169eb4f to your computer and use it in GitHub Desktop.
Improvised downloader (in Perl) for Twitpic. Outputs HTML with timestamps and messages, and a photo directory.
#!/usr/bin/perl -w
use strict;
use diagnostics;
use pQuery;
use LWP::Simple;
use JSON;
use Getopt::Std;
# A script to download a user's pics from twitpic before the shutdown
# in September 2014.
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Everything this script prints originates from a JSON object, so it is
# assumed to be UTF-8 throughout. Silly me.
binmode(STDOUT, ":utf8");
# Parse options: -u <user> and -d <output directory> are both required.
getopts('u:d:', \my %opts);
unless ($opts{'d'} and $opts{'u'}) {
    # A usage error is a failure: report it on STDERR and exit non-zero so
    # that a calling shell or script can tell that nothing was downloaded.
    print STDERR "Usage: $0 -u <user> -d <output directory>\n";
    exit 1;
}
# Open the output HTML page for writing. The three-argument form of open
# keeps the user-supplied directory name from being interpreted as part of
# the open mode and preserves any leading/trailing whitespace in the path.
my $index_path = "$opts{'d'}/index.html";
open my $html_out, '>', $index_path
    or die "Can't write to $index_path: $!\n";
binmode($html_out, ":utf8");
# Static page header; the quoted terminator disables interpolation because
# the text contains nothing to interpolate.
print $html_out <<'Done';
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
Done
# Create directory for the downloaded photos. A directory that already
# exists is fine; any other mkdir failure is reported with its reason
# instead of surfacing later as an unexplained chdir error.
my $pix_dir = "$opts{'d'}/pix";
unless (-d $pix_dir) {
    mkdir $pix_dir or die "Can't create $pix_dir: $!\n";
}
# From here on the current directory is the photo directory, so downloads
# can be stored under their bare file names.
chdir $pix_dir or die "Can't chdir to $pix_dir: $!\n";
# Walk the user's paginated photo listing, beginning with page 1, and hand
# every photo entry found on a page to dump_one_twitpic.
my $listing_base = 'http://twitpic.com/photos/' . $opts{'u'};
for (my $page_no = 1; ; $page_no++) {
    my $listing = pQuery($listing_base . '?page=' . $page_no);
    $listing->find('div.user-photo-wrap')->each(\&dump_one_twitpic);
    # A page whose pagination paragraph has no "Next" link is the last one.
    my $next_links = $listing->find('p.pagination')->find('a:contains("Next")');
    last if $next_links->size() == 0;
    # For a quick test run, put an unconditional "last;" here to stop after
    # the first page.
}
# Finish the index page.
print $html_out <<'Done';
<em>R.I.P. Twitpic, we will miss you. :-(</em> <br />
</body>
</html>
Done
# Buffered write errors (a full disk, for example) only surface when the
# handle is closed, so close it explicitly and check the result.
close $html_out
    or die "Can't finish writing $opts{'d'}/index.html: $!\n";
# Callback for the each() call on the photo listing: processes one photo.
#
# Input:   $_ holds one 'div.user-photo-wrap' element of a listing page.
# Effects: fetches the photo's "full" page and its permalink page, stores
#          the full-size image in the current directory as <id>.<type>,
#          prints a progress line to STDOUT and appends an entry to the
#          index page through the file-scoped $html_out handle.
# Errors:  a photo whose pages cannot be fetched or parsed is skipped with a
#          warning so that one bad entry does not abort the whole download.
sub dump_one_twitpic {
    # Find the per-pic permalink from the div on the index page.
    my $pq_div      = pQuery($_);
    my $anchor_html = $pq_div->find('div.user-photo')->find('a')->toHtml;
    my $anchor_dom  = defined $anchor_html
                    ? pQuery::DOM->fromHTML($anchor_html)
                    : undef;
    my $anchor_href = defined $anchor_dom
                    ? $anchor_dom->getAttribute('href')
                    : undef;
    unless (defined $anchor_href and length $anchor_href) {
        warn "Skipping a photo: no permalink found in its index entry\n";
        return;
    }
    $anchor_href =~ s{^/}{}; # Leading slash, omg.

    # The href becomes part of a local file name, so accept nothing but a
    # plain identifier (no slashes, dots or query strings).
    # NOTE(review): assumes Twitpic photo ids are alphanumeric - confirm.
    unless ($anchor_href =~ /\A[A-Za-z0-9_-]+\z/) {
        warn "Skipping a photo: unexpected permalink '$anchor_href'\n";
        return;
    }

    # The two interesting URLs for the next step.
    my $url_thispic = 'http://twitpic.com/' . $anchor_href;
    my $url_fullpic = $url_thispic . '/full';

    # Load the full size page for the full size IMG SRC.
    my $pq_full_page  = pQuery($url_fullpic);
    my $full_img_html = $pq_full_page->find('div#media-full')->find('img')->toHtml;
    my $full_img_dom  = defined $full_img_html
                      ? pQuery::DOM->fromHTML($full_img_html)
                      : undef;
    my $full_src      = defined $full_img_dom
                      ? $full_img_dom->getAttribute('src')
                      : undef;
    unless (defined $full_src and length $full_src) {
        warn "Skipping $url_thispic: no full-size image found\n";
        return;
    }

    # Back to the principal per-image page for obscure metadata.
    my $img_page = get($url_thispic);
    unless (defined $img_page) {
        warn "Skipping $url_thispic: could not fetch the photo page\n";
        return;
    }

    # The metadata is the JSON argument of a Twitpic.Config.set(...) call
    # that sits on a single line; if several lines match, the last one wins.
    my $tpconfig_raw;
    foreach my $line (split /\n/, $img_page) {
        if ($line =~ /Twitpic\.Config\.set\((.+)\);\s*$/) {
            $tpconfig_raw = $1;
        }
    }

    # decode_json dies on malformed input; treat that like missing metadata.
    my $tpconfig = defined $tpconfig_raw
                 ? eval { decode_json($tpconfig_raw) }
                 : undef;
    my $media = ref($tpconfig) eq 'HASH' ? $tpconfig->{'media'} : undef;
    unless (ref($media) eq 'HASH') {
        warn "Skipping $url_thispic: no usable Twitpic.Config metadata\n";
        return;
    }
    my $timestamp = defined $media->{'timestamp'} ? $media->{'timestamp'} : '';
    my $message   = defined $media->{'message'}   ? $media->{'message'}   : '';
    my $type      = $media->{'type'};

    # The type is used as the file extension, so it gets the same scrutiny
    # as the photo id.
    unless (defined $type and $type =~ /\A\w+\z/) {
        warn "Skipping $url_thispic: missing or unexpected media type\n";
        return;
    }

    # This is where the image gets downloaded.
    print "$timestamp $message $url_thispic\n";
    my $file_name = $anchor_href . '.' . $type;
    my $status    = getstore($full_src, $file_name); # Remember, chdir was done above.
    unless (is_success($status)) {
        # Keep going: the timestamp and message are still worth archiving
        # even when the image itself could not be stored.
        warn "Download of $full_src failed with HTTP status $status\n";
    }

    # Add to the index page. The text comes from a remote site, so escape it
    # before embedding it in HTML.
    # NOTE(review): assumes the JSON message is plain text rather than
    # pre-escaped HTML - confirm against real data.
    my $safe_timestamp = _escape_html($timestamp);
    my $safe_message   = _escape_html($message);
    print {$html_out} "<b>$safe_timestamp</b><br />\n";
    print {$html_out} "$safe_message<br />\n";
    print {$html_out} "<img src=\"pix/$file_name\" /><br />\n";
    print {$html_out} "<hr />\n";
    return;
}

# Returns a copy of the given text with the HTML metacharacters
# & < > " replaced by their entities.
sub _escape_html {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment