Created
September 17, 2014 19:35
-
-
Save mschmitt/67fa9f56b63cd169eb4f to your computer and use it in GitHub Desktop.
Improvised downloader (in Perl) for Twitpic. Outputs HTML with timestamps and messages, and a photo directory.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
use diagnostics; | |
use pQuery; | |
use LWP::Simple; | |
use JSON; | |
use Getopt::Std; | |
# A script to download a user's pics from twitpic before the shutdown | |
# in September 2014. | |
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
# Everything this script prints originates from a JSON object, so it is
# assumed to be UTF-8 throughout.
binmode(STDOUT, ":utf8");

# Parse options: -u <user> and -d <output directory> are both mandatory.
# getopts() returns false when it meets an unknown switch or a switch that
# lacks its argument, which is treated as a usage error as well.
my %opts;
my $opts_ok   = getopts('u:d:', \%opts);
my $have_dir  = defined($opts{'d'}) && length($opts{'d'});
my $have_user = defined($opts{'u'}) && length($opts{'u'});
unless ($opts_ok and $have_dir and $have_user) {
    # Misuse is an error: report it on STDERR and exit non-zero so that
    # calling shells and scripts can detect it.
    print STDERR "Usage: $0 -u <user> -d <output directory>\n";
    exit 1;
}
# Open the output HTML page for writing. The three-argument form of open()
# keeps the user-supplied directory name from being parsed as part of the
# open mode (and from having surrounding whitespace stripped).
my $index_path = "$opts{'d'}/index.html";
open my $html_out, '>', $index_path
    or die "Can't write to $index_path: $!\n";
binmode($html_out, ":utf8");
print {$html_out} <<Done;
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
Done

# Create the directory for the downloaded photos. A directory that already
# exists (e.g. from an earlier run) is reused; any other failure is fatal and
# reported with the system error.
my $pix_dir = "$opts{'d'}/pix";
unless (-d $pix_dir) {
    mkdir $pix_dir or die "Can't create $pix_dir: $!\n";
}

# All image files are stored relative to the current directory from here on.
chdir $pix_dir or die "Can't chdir to $pix_dir o.O: $!\n";
# Walk the user's paginated photo index, beginning with page 1, and hand
# every photo wrapper found on a page to dump_one_twitpic().
my $url_userhome = 'http://twitpic.com/photos/' . $opts{'u'};
my $page = 1;
for (;; $page++) {
    my $pq_index = pQuery($url_userhome . '?page=' . $page);
    $pq_index->find('div.user-photo-wrap')->each(\&dump_one_twitpic);

    # A page whose pagination has no "Next" link is the last one.
    my $next_links = $pq_index->find('p.pagination')->find('a:contains("Next")');
    last unless $next_links->size();

    # Break after first page (test only)
    # last;
}
# Finish the index page.
print {$html_out} <<Done;
<em>R.I.P. Twitpic, we will miss you. :-(</em> <br />
</body>
</html>
Done

# Close explicitly: errors from buffered writes (a full disk, for instance)
# only surface at close time and would go unreported at program exit.
close $html_out or die "Can't finish writing the index page: $!\n";
# Callback for pQuery's each(): archives one photo from an index page.
#
# Reads the current "div.user-photo-wrap" element from $_, resolves the
# photo's permalink and its full-size image URL, extracts timestamp, message
# and type from the Twitpic.Config.set(...) JSON on the permalink page, stores
# the image in the current directory and appends an entry to $html_out.
#
# A photo whose pages cannot be fetched or parsed is skipped with a warning
# instead of aborting the whole run. The sub always returns a true value
# (the original implicitly returned the result of its final print), in case
# each() gives a false return value a special meaning -- TODO confirm.
sub dump_one_twitpic {
    # Find the per-pic permalink from the div on the index page.
    my $pq_div      = pQuery($_);
    my $anchor_html = $pq_div->find('div.user-photo')->find('a')->toHtml;
    my $anchor_dom  = defined $anchor_html
                    ? pQuery::DOM->fromHTML($anchor_html)
                    : undef;
    my $anchor_href = ref($anchor_dom) ? $anchor_dom->getAttribute('href') : undef;
    unless (defined $anchor_href and length $anchor_href) {
        warn "Skipping an index entry without a permalink\n";
        return 1;
    }
    $anchor_href =~ s{^/}{};    # Leading slash, omg.

    # The two interesting URLs for the next step.
    my $url_thispic = 'http://twitpic.com/' . $anchor_href;
    my $url_fullpic = 'http://twitpic.com/' . $anchor_href . '/full';

    # Load the full size page for the full size IMG SRC
    my $pq_full_page  = pQuery($url_fullpic);
    my $full_img_html = $pq_full_page->find('div#media-full')->find('img')->toHtml;
    my $full_img_dom  = defined $full_img_html
                      ? pQuery::DOM->fromHTML($full_img_html)
                      : undef;
    my $full_src = ref($full_img_dom) ? $full_img_dom->getAttribute('src') : undef;
    unless (defined $full_src and length $full_src) {
        warn "Skipping $url_thispic: no full size image found\n";
        return 1;
    }

    # Back to the principal per-image page for obscure metadata
    my $img_page = get($url_thispic);
    unless (defined $img_page) {
        warn "Skipping $url_thispic: could not fetch the page\n";
        return 1;
    }

    # The metadata sits in a line of the form "Twitpic.Config.set({...});".
    # If several lines match, the last one wins.
    my $tpconfig_raw;
    foreach my $line (split /\n/, $img_page) {
        if ($line =~ /Twitpic\.Config\.set\((.+)\);$/) {
            $tpconfig_raw = $1;
        }
    }

    # decode_json() dies on malformed input; trap that and skip the photo.
    my $tpconfig = defined $tpconfig_raw
                 ? eval { decode_json($tpconfig_raw) }
                 : undef;
    unless (ref($tpconfig) eq 'HASH' and ref($tpconfig->{'media'}) eq 'HASH') {
        warn "Skipping $url_thispic: no usable Twitpic.Config data\n";
        return 1;
    }

    # Missing fields become empty strings instead of "uninitialized" warnings.
    my $media     = $tpconfig->{'media'};
    my $timestamp = defined $media->{'timestamp'} ? $media->{'timestamp'} : '';
    my $message   = defined $media->{'message'}   ? $media->{'message'}   : '';
    my $type      = defined $media->{'type'}      ? $media->{'type'}      : '';
    my $file_name = $anchor_href . '.' . $type;

    # This is where the image gets downloaded. The file lands in the current
    # directory; the main script chdir()s into the photo directory first.
    print "$timestamp $message $url_thispic\n";
    my $status = getstore($full_src, $file_name);
    unless (is_success($status)) {
        warn "Download of $full_src failed with HTTP status $status\n";
    }

    # Add to the index page. The values come from a remote site, so they are
    # escaped before being embedded in the markup.
    # NOTE(review): assumes the JSON carries plain text, not pre-escaped
    # HTML -- confirm.
    print {$html_out} "<b>" . _escape_html($timestamp) . "</b><br />\n";
    print {$html_out} _escape_html($message) . "<br />\n";
    print {$html_out} "<img src=\"pix/" . _escape_html($file_name) . "\" /><br />\n";
    print {$html_out} "<hr />\n";
    return 1;
}

# Escape the characters that are special in HTML text and in double-quoted
# attribute values; returns the escaped copy.
sub _escape_html {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment