Last active: August 29, 2015 14:03
Command-line script to download images from Reddit image posts
#!/usr/bin/env python2
import os
import getpass
import urllib2
from StringIO import StringIO
import time
import datetime
import argparse

import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA

def get_submissions(subreddit, count, filter):
    # Build an (anonymous) PRAW session and fetch submissions from the
    # requested subreddit using the selected category filter.
    r = praw.Reddit(user_agent=getpass.getuser() + '\'s reddit_dl')
    sr = r.get_subreddit(subreddit)
    return filter(sr, count)


def get_links(submissions):
    # Collect the link (URL) of every submission.
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res

def filter_for_imgur(urls):
    # Keep only links that point to (or can be resolved to) imgur images.
    res = []
    for url in urls:
        url = url.encode('ascii', 'ignore')
        if 'imgur' in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # we have to get the direct links here
                try:
                    response = urllib2.urlopen(url)
                except urllib2.HTTPError as e:
                    print url + ": \t" + str(e.code) + " " + e.msg
                    continue
                except urllib2.URLError as e:
                    print "Could not download " + url
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                # Scrape the imgur page (album or single image) for direct image links.
                soup = BeautifulSoup(response.read(), "html.parser")
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    res.append("http://" + link[2:])
        else:
            try:
                response = urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                print url + ": \t" + str(e.code) + " " + e.msg
                continue
            except urllib2.URLError as e:
                print "Could not download " + url
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res

def get_content_type(response):
    for header in response.info().headers:
        if header.startswith("Content-Type"):
            return header.split(":")[1]


def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"

def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default=10, type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()

def download_images(urls, directory):
    # Download every image URL into the target directory, prefixing file names
    # with a timestamp so repeated runs do not overwrite each other.
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print directory + " could not be found"
    for i, url in enumerate(urls):
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError as e:
            print url + ": \t" + str(e.code) + " " + e.msg
            continue
        except urllib2.URLError as e:
            print "Could not download " + url
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = StringIO(response.read())
            directory = directory if directory else ""
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.getvalue())
            actual += 1
            time.sleep(2)  # be nice to the server
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print "Could not read the following urls:"
        for url in not_read:
            print url
    return actual

def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }

def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print "Found " + str(len(urls)) + " reddit threads"
    urls = filter_for_imgur(urls)
    print "Found " + str(len(urls)) + " image links"
    actual = download_images(urls, args.output)
    print "Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# To use this, install the following python packages (for example with pip):
#   beautifulsoup4
#   praw
#   py3-progressbar
import os
import getpass
import urllib.request, urllib.error, urllib.parse
from io import BytesIO
import time
import datetime
import argparse

import praw
from bs4 import BeautifulSoup
from progressbar import ProgressBar, Percentage, Bar, ETA

def get_submissions(subreddit, count, filter):
    # Build an (anonymous) PRAW session and fetch submissions from the
    # requested subreddit using the selected category filter.
    r = praw.Reddit(user_agent=getpass.getuser() + '\'s reddit_dl')
    sr = r.get_subreddit(subreddit)
    return list(filter(sr, count))


def get_links(submissions):
    # Collect the link (URL) of every submission.
    res = []
    for sub in submissions:
        res.append(sub.url)
    return res

def filter_for_imgur(urls):
    # Keep only links that point to (or can be resolved to) imgur images.
    res = []
    for url in urls:
        if "imgur" in url:
            if url.endswith('jpg') or url.endswith('gif') or url.endswith('png'):
                res.append(url)
            else:  # we have to get the direct links here
                try:
                    response = urllib.request.urlopen(url)
                except urllib.error.HTTPError as e:
                    print(url + ": \t" + str(e.code) + " " + e.msg)
                    continue
                except urllib.error.URLError as e:
                    print("Could not download " + url)
                    continue
                if "image" in get_content_type(response):
                    res.append(url)
                    continue
                # Scrape the imgur page (album or single image) for direct image links.
                soup = BeautifulSoup(response.read(), "html.parser")
                image_container = soup.select("#image-container") if "/a/" in url else soup.select("#image")
                imgs = image_container[0].findChildren("img") if len(image_container) > 0 else []
                for img in imgs:
                    link = img.get("data-src") if img.get("data-src") else img.get("src")
                    if not link:
                        continue
                    res.append("http://" + link[2:])
        else:
            try:
                response = urllib.request.urlopen(url)
            except urllib.error.HTTPError as e:
                print(url + ": \t" + str(e.code) + " " + e.msg)
                continue
            except urllib.error.URLError as e:
                print("Could not download " + url)
                continue
            if "image" in get_content_type(response):
                res.append(url)
    return res

def get_content_type(response):
    return response.info().get("Content-Type")


def get_file_format(content_type):
    short = content_type.split("/")[1]
    if "jpg" in short or "jpeg" in short:
        return "jpg"
    elif "gif" in short:
        return "gif"
    else:
        return "png"

def parse_args():
    parser = argparse.ArgumentParser(description="Download Images from Reddit")
    parser.add_argument('subreddit', help="The subreddit to load images from")
    parser.add_argument('--count', '-c', default=10, type=int, help="Number of images (top posts first)")
    parser.add_argument('--output', '-o', default=".", action='store', help="The output directory for the images")
    parser.add_argument('--category', '-t', default="top",
                        choices=["top", "top-all", "top-day", "top-hour", "top-month", "top-week",
                                 "top-year", "con", "con-all", "con-day", "con-hour", "con-month", "con-week",
                                 "con-year", "hot", "new", "new-bydate", "new-byrising", "random", "rising"],
                        help="From which category do you want to download")
    return parser.parse_args()

def download_images(urls, directory):
    # Download every image URL into the target directory, prefixing file names
    # with a timestamp so repeated runs do not overwrite each other.
    actual = 0
    not_read = []
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    widgets = ["Downloading ", Percentage(), ' ', Bar(), ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=100).start()
    if directory and not directory.endswith("/"):
        directory += "/"
    if not os.path.isdir(directory):
        print(directory + " could not be found")
    for i, url in enumerate(urls):
        try:
            response = urllib.request.urlopen(url)
        except urllib.error.HTTPError as e:
            print(url + ": \t" + str(e.code) + " " + e.msg)
            continue
        except urllib.error.URLError as e:
            print("Could not download " + url)
            continue
        content_type = get_content_type(response)
        if "image" in content_type:
            percent = float(i + 1) / len(urls) * 100
            pbar.update(percent)
            image_data = BytesIO(response.read())
            directory = directory if directory else ""
            with open(directory + timestamp + "-image-" + str(i) + "." + get_file_format(content_type), 'wb') as f:
                f.write(image_data.getvalue())
            actual += 1
            time.sleep(2)  # be nice to the server
        else:
            not_read.append(url)
    pbar.finish()
    if len(not_read) > 0:
        print("Could not read the following urls:")
        for url in not_read:
            print(url)
    return actual

def get_filters():
    return {"top": lambda r, c: r.get_top(limit=c),
            "top-all": lambda r, c: r.get_top_from_all(limit=c),
            "top-day": lambda r, c: r.get_top_from_day(limit=c),
            "top-hour": lambda r, c: r.get_top_from_hour(limit=c),
            "top-month": lambda r, c: r.get_top_from_month(limit=c),
            "top-week": lambda r, c: r.get_top_from_week(limit=c),
            "top-year": lambda r, c: r.get_top_from_year(limit=c),
            "con": lambda r, c: r.get_controversial(limit=c),
            "con-all": lambda r, c: r.get_controversial_from_all(limit=c),
            "con-day": lambda r, c: r.get_controversial_from_day(limit=c),
            "con-hour": lambda r, c: r.get_controversial_from_hour(limit=c),
            "con-month": lambda r, c: r.get_controversial_from_month(limit=c),
            "con-week": lambda r, c: r.get_controversial_from_week(limit=c),
            "con-year": lambda r, c: r.get_controversial_from_year(limit=c),
            "hot": lambda r, c: r.get_hot(limit=c),
            "new": lambda r, c: r.get_new(limit=c),
            "new-bydate": lambda r, c: r.get_new_by_date(limit=c),
            "new-byrising": lambda r, c: r.get_new_by_rising(limit=c),
            "random": lambda r, c: r.get_random_submission(limit=c),
            "rising": lambda r, c: r.get_rising(limit=c),
            }

def main():
    args = parse_args()
    urls = get_links(get_submissions(args.subreddit, args.count, get_filters()[args.category]))
    print("Found " + str(len(urls)) + " reddit threads")
    urls = filter_for_imgur(urls)
    print("Found " + str(len(urls)) + " image links")
    actual = download_images(urls, args.output)
    print("Downloaded " + str(actual) + " images to " + (args.output if args.output else "current directory"))


if __name__ == "__main__":
    main()
Just ask here, so others can benefit from a solution. What's the error message?
Edit: So I just tested the script and it still works fine for me. Make sure you follow the steps from the first comment (install all the dependencies). In case you are using python3 instead of python2, I just added a new script for that (the dependencies of the python3 script are described in the code comment).
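For instance, assuming the Python 3 version is saved as its own file named reddit_dl3.py (that file name is just an assumption here), it can be run directly with the python3 interpreter:
python3 reddit_dl3.py --count 5 wallpapers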
This script can be used to download images from reddit posts.
Dependencies (all installable by pip)
beautifulsoup4
praw
progressbar
The other imports are part of the Python standard library; an example install command is shown below.
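For example, assuming pip is available on your PATH, all three packages can typically be installed in one call (exact package names on PyPI may vary slightly with your setup):
pip install beautifulsoup4 praw progressbar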
Usage
You can, for example, download wallpapers from /r/wallpapers.
Example:
./reddit_dl.py --output ~/wallpapers --count 10 --category top-all wallpapers
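As another illustration (the target directory is just an assumption), this would fetch the 20 newest image posts from /r/EarthPorn:
./reddit_dl.py --output ~/pictures --count 20 --category new EarthPorn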