Created January 1, 2014 17:49
Download every liked image on Tumblr to a folder. Requires Python 3, requests, and beautifulsoup4.
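For reference, a typical run might look like the following, assuming the dependencies are installed (e.g. pip install requests beautifulsoup4) and the destination folder already exists (the script lists it before downloading); the email address and path here are placeholders:

    python3 downtumblrlikes.py -e you@example.com -f /path/to/likes -t 8

Any of email, password, or folder left unset (on the command line or at the top of the script) is prompted for interactively, with the password read via getpass so it is not echoed.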
# Either input info here, use command-line arguments, or be asked in-program.
email = ""
password = ""
folder = ""  # remember to use / for directories, not \
threads = 8

########################################
##### downtumblrlikes.py v20140101 #####
########################################
###### DO NOT EDIT BELOW THIS LINE #####
########################################

import os, requests, time
from bs4 import BeautifulSoup as bs
from concurrent.futures import ThreadPoolExecutor


def downtumblrlikes(email, password, folder, threads):
    s = requests.Session()

    ## get form_key to login properly
    print('Getting form_key...')
    r = s.get('https://www.tumblr.com/login')
    formKey = r.text.split('<input type="hidden" name="form_key" value="')[1]
    formKey = formKey.split('"')[0]

    data = {'user[email]': email, 'user[password]': password, 'user[tos]': 1,
            'context': 'login', 'version': 'STANDARD', 'follow': '',
            'http_referer': 'https://www.tumblr.com/login', 'form_key': formKey}

    ## login
    print('Logging in...')
    r = s.post('https://www.tumblr.com/login', data=data)
    if 'logged_in' not in s.cookies or s.cookies['logged_in'] != '1':
        print('Error logging in.')
        return
    else:
        print('Successfully logged in.')

    ## clear credentials now that the session cookie carries the login
    data = {}
    email = ''
    password = ''

    ## get total pages (the like count is exposed on the first likes page)
    r = s.get('http://www.tumblr.com/likes/page/1')
    totalPages = int(bs(r.text, 'html.parser').select('a.likes div')[0]['data-count']) // 10 + 1

    ald = set(os.listdir(folder))  # already downloaded files
    totalBytes = 0
    sTime = time.time()

    def parsePage(page):
        imgs = []
        r = s.get('http://www.tumblr.com/likes/page/%d' % page,
                  headers={'X-Requested-With': 'XMLHttpRequest'})
        soup = bs(r.text, 'html.parser')

        ## get pics from page (either from script or html)
        posts = soup.find_all("div", class_=lambda x: x in ['is_photo', 'is_photoset'])
        for p in posts:
            script = p('script')
            if not script:  # plain photo post: the img tag holds the url
                url = p.select('div.post_media img')[0]['src']
                imgs.append(url)
            else:  # photo/photoset rendered by an inline script
                script = script[0].text
                urls = script.split('high_res: \'')
                if len(urls) == 2:  # 1 url for photo
                    urls = [urls[1].split('\'')[0]]
                else:  # many urls for photoset
                    urls = (u.split('"')[0].replace('\\/', '/') for u
                            in script.split('high_res":"')[1:])
                imgs.extend(urls)
        return set(imgs)

    def downloadImage(i):
        filename = i.split('/')[-1]
        if filename in ald:
            return -2  # file already exists
        r = s.get(i, stream=True)
        if r.status_code == 200:
            size = 0
            with open(folder + '/' + filename, 'wb') as file:
                for chunk in r.iter_content(1024 * 4):
                    file.write(chunk)
                size = os.fstat(file.fileno()).st_size
            return size  # file successfully downloaded
        else:
            return -1

    imgs = set()
    downloaded = set()
    existing = set()
    error = set()

    print("Grabbing img urls from %d pages:" % totalPages)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        for page, i in zip(range(1, totalPages + 1),
                           executor.map(parsePage, range(1, totalPages + 1))):
            imgs |= i
            print('\tParsed page %d' % page)

    ## download images
    print('Downloading %d images:' % len(imgs))
    with ThreadPoolExecutor(max_workers=threads) as executor:
        for img, result in zip(imgs, executor.map(downloadImage, imgs)):
            name = img.split('/')[-1]
            if result == -2:
                print('\tAlready exists: %s' % name)
                existing.add(img)
            elif result == -1:
                print('\tError downloading: %s' % name)
                error.add(img)
            else:
                print('\tDownloaded: %s' % name)
                totalBytes += result
                downloaded.add(img)

    eTime = time.time()

    print('Done.')
    print('Downloaded %d new images.' % len(downloaded))
    print('Already had %d images.' % len(existing))
    print('Error downloading %d images: \n\t%s' % (len(error), "\n\t".join(error)))
    print('Downloaded %0.2fMB in %0.2fs (%0.2fMb/s)' %
          ((totalBytes / 2**20), (eTime - sTime), 8 * (totalBytes / 2**20) / (eTime - sTime)))

if __name__ == "__main__":
    import argparse, getpass

    parser = argparse.ArgumentParser(description='Download tumblr likes.',
                                     prog='downtumblrlikes.py')
    parser.add_argument('-e', '--email', metavar='email')
    parser.add_argument('-p', '--password', metavar='pass')
    parser.add_argument('-f', '--folder', metavar='folder')
    parser.add_argument('-t', '--threads', type=int, metavar='threads')
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s 20140101')
    args = parser.parse_args()

    ## command-line arguments override the values set at the top of the file
    email = args.email if args.email is not None else email
    password = args.password if args.password is not None else password
    folder = args.folder if args.folder is not None else folder
    threads = args.threads if args.threads is not None else threads

    ## prompt for anything still missing
    if len(email) == 0:
        email = input('Email: ')
    if len(password) == 0:
        password = getpass.getpass('Password: ')
    if len(folder) == 0:
        folder = input('Folder: ')
    if threads < 1:
        threads = int(input('Threads: '))

    downtumblrlikes(email, password, folder, threads)
Rude! Anyway, I've just made a script for this, but it uses the consumer secret and key to communicate with the tumblr API. However, this seems like it would work for a much more general case. I imagine it was a PITA to get all the scraping and automating login just right.
Nice job. I'll post mine later so you can see how I went about it, if you're interested at all.
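For anyone curious, an API-based version along the lines the comment above describes might look roughly like this. It is a minimal sketch using the pytumblr client (one possible client, not necessarily what that commenter used); the four OAuth credentials are placeholders you would obtain by registering an app with Tumblr, and the 20-post page size is an assumption:

    import pytumblr

    # all four credentials are placeholders from registering an app with Tumblr
    client = pytumblr.TumblrRestClient(
        'CONSUMER_KEY', 'CONSUMER_SECRET',
        'OAUTH_TOKEN', 'OAUTH_SECRET')

    offset = 0
    while True:
        resp = client.likes(limit=20, offset=offset)  # the authenticated user's likes
        posts = resp.get('liked_posts', [])
        if not posts:
            break
        for post in posts:
            # photo posts carry a 'photos' list; each entry has an original_size url
            for photo in post.get('photos', []):
                print(photo['original_size']['url'])  # download here instead of printing
        offset += len(posts)

The trade-off is the opposite of the scraping approach: no form_key hunting or login automation, but you need app credentials and an OAuth token up front.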
wtf is this?
i want to download my likes not this bunch of crap.