#!/bin/sh
# Modified by Stan Schwertly to download locally rather than to send to Posterous.
# Github: http://github.com/Stantheman/Twitpic-Backup
# Copyright 2010 Tim "burndive" of http://burndive.blogspot.com/
# This software is licensed under the Creative Commons GNU GPL version 2.0 or later.
# License information: http://creativecommons.org/licenses/GPL/2.0/
# This script is a derivative of the original, obtained from here:
# http://tuxbox.blogspot.com/2010/03/twitpic-to-posterous-export-script.html
# Version 1.2 [add retry]
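#
# Usage: <this script> TP_NAME WORKING_DIR
#   TP_NAME     - the Twitpic username whose photos should be backed up ($1)
#   WORKING_DIR - an existing directory; images/, html/ and logs/ are created inside it ($2)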
RUN_DATE=`date +%F--%H-%M-%S`
TP_NAME=$1
WORKING_DIR=$2
IMG_DOWNLOAD=1
PREFIX=twitpic-$TP_NAME
HTML_OUT=$PREFIX-all-$RUN_DATE.html
#CURL_OPT='-f --retry 3 --retry-delay 5 --retry-max-time 60'
CURL_OPT='--retry 3 --retry-delay 5 --retry-max-time 60'
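# curl retry settings: retry up to 3 times, 5 seconds apart, for at most 60
# seconds overall. The commented-out variant above also passes -f (--fail),
# which makes curl exit with code 22 on HTTP errors >= 400; the retry checks
# further down test for that exit code.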
# Checks the user-supplied arguments
if [ -z "$TP_NAME" ]; then
  echo "You must supply a TP_NAME."
  exit 1
fi
if [ ! -d "$WORKING_DIR" ]; then
  echo "You must supply an existing WORKING_DIR."
  exit 1
fi
cd "$WORKING_DIR" || exit 1
# Checks for the directories it needs
if [ ! -d "images" ]; then
  mkdir images
fi
if [ ! -d "html" ]; then
  mkdir html
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi
PAGE=0
MAXRETRY=10
RETRY=0
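# Determine how many gallery pages to fetch by scraping the pagination links
# on http://twitpic.com/photos/<TP_NAME>: prefer the "Last" link's page number,
# fall back to the "Next" link, and default to a single page if neither exists.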
LAST=`curl http://twitpic.com/photos/${TP_NAME} \
  | grep "<a href=.*>Last<" \
  | sed "s/.*\?page=\([0-9]*\).*/\1/"`
if [ -z "$LAST" ]; then
  NEXT=`curl http://twitpic.com/photos/${TP_NAME} \
    | grep "<a href=.*>Next<" \
    | sed "s/.*\?page=\([0-9]*\).*/\1/"`
  if [ -z "$NEXT" ]; then
    PAGE=1
  else
    PAGE=$NEXT
  fi
else
  PAGE=$LAST
fi
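# Walk the gallery pages backwards from $PAGE down to 1, saving each listing
# page to html/$PREFIX-page-N.html. Pages already on disk are skipped; an HTTP
# failure (curl exit code 22) is retried up to $MAXRETRY times before moving on.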
while [ $PAGE -ne 0 ]; do
  echo "PAGE: $PAGE"
  FILENAME="html/$PREFIX-page-$PAGE.html"
  echo "FILENAME=" $FILENAME
  echo "0 curl http://twitpic.com/photos/${TP_NAME}?page=$PAGE -O $FILENAME"
  if [ ! -f "$FILENAME" ]; then
    echo "0"
    # wget http://twitpic.com/photos/${TP_NAME}?page=$PAGE -O $FILENAME
    echo "1 ${TP_NAME}?page=$PAGE -O $FILENAME"
    curl "http://twitpic.com/photos/${TP_NAME}?page=$PAGE" -o "$FILENAME" $CURL_OPT
    if [ $? -eq 22 -a $RETRY -le $MAXRETRY ]; then
      RETRY=`expr $RETRY + 1`
      sleep 1
    else
      RETRY=0
      PAGE=`expr $PAGE - 1`
    fi
  else
    RETRY=0
    PAGE=`expr $PAGE - 1`
  fi
done
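# Collect the photo IDs from every saved listing page: pull the short "/<id>"
# links out of the HTML, strip the slash, drop the literal string "sopapipa",
# then reverse-sort and de-duplicate the IDs for the download loop below.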
ALL_IDS=`cat html/$PREFIX-page-* \
  | grep -Eo "<a href=\"/[a-zA-Z0-9]+\">" \
  | grep -Eo "/[a-zA-Z0-9]+" \
  | grep -Eo "[a-zA-Z0-9]+" \
  | grep -v "sopapipa" \
  | sort -r | uniq | xargs`
COUNT=0
LOG_FILE=logs/$PREFIX-log-$RUN_DATE.txt
echo $ALL_IDS | tee -a $LOG_FILE
for ID in $ALL_IDS; do
  COUNT=`expr $COUNT + 1`
  echo "$ID: $COUNT" | tee -a $LOG_FILE
  echo "Processing $ID..."
  FULL_HTML="html/$PREFIX-$ID-full.html"
  # wget http://twitpic.com/$ID/full -O $FULL_HTML
  if [ ! -f "$FULL_HTML" ]; then
    RETRY=$MAXRETRY
    while [ $RETRY -ne 0 ]; do
      echo "2 " curl http://twitpic.com/$ID/full -O $FULL_HTML
      curl "http://twitpic.com/$ID/full" -o "$FULL_HTML" $CURL_OPT
      if [ $? -eq 22 ]; then
        RETRY=`expr $RETRY - 1`
        sleep 1
      else
        RETRY=0
      fi
    done
  fi
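  # Extract the full-size image URL from the saved /full page, work out the
  # file extension from the URL, and download the image into images/ unless it
  # is already present, again retrying HTTP failures up to $MAXRETRY times.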
  FULL_URL=`grep "<img src" $FULL_HTML | grep -Eo "src=\"[^\"]*\"" | grep -Eo "https://[^\"]*"`
  if [ "$IMG_DOWNLOAD" -eq 1 ]; then
    EXT=`echo "$FULL_URL" | grep -Eo "[a-zA-Z0-9]+\.[a-zA-Z0-9]+\?" | head -n1 | grep -Eo "\.[a-zA-Z0-9]+"`
    if [ -z "$EXT" ]; then
      EXT=`echo "$FULL_URL" | grep -Eo "\.[a-zA-Z0-9]+$"`
    fi
    FULL_FILE=$PREFIX-$ID-full$EXT
    # wget "$FULL_URL" -O "images/$FULL_FILE"
    if [ ! -f "images/$FULL_FILE" ]; then
      RETRY=$MAXRETRY
      while [ $RETRY -ne 0 ]; do
        echo "3 " curl "$FULL_URL" -O "images/$FULL_FILE"
        curl "$FULL_URL" -o "images/$FULL_FILE" $CURL_OPT
        if [ $? -eq 22 ]; then
          RETRY=`expr $RETRY - 1`
          sleep 1
        else
          RETRY=0
        fi
      done
    fi
  fi
done