Reddit Scraper 2.0 with Imgur API
#!/usr/local/bin/python3.5
import datetime
import os
import urllib.request

import praw
from imgurpython import ImgurClient
from sys import argv
# Imgur API credentials (redacted)
client_id = 'REDACTED'
client_secret = 'REDACTED'

# Reddit API credentials (redacted)
r_client_id = 'REDACTED'
r_client_secret = 'REDACTED'

# Get arguments from the command line
script, sub, lim, debug = argv
# Downloads a single image to the target directory, given its imgur ID
def download_image(img_id, directory, author):
    print(img_id + ': downloading...')
    response = urllib.request.urlopen('https://i.imgur.com/%s.png' % img_id)
    data = response.read()
    path = directory + author + '.png'
    with open(path, 'wb') as f:
        f.write(data)
# Downloads an album of images, given an imgur album ID
def download_album(album_id, directory, author):
    # Fetch the album's image objects from the Imgur API
    image_obj_list = client.get_album_images(album_id)
    # Collect image IDs for optional debug output
    id_list = []
    for i, img in enumerate(image_obj_list):
        print(img.id + ': downloading...')
        response = urllib.request.urlopen('https://i.imgur.com/%s.png' % img.id)
        data = response.read()
        path = directory + author + ' ' + str(i) + '.png'
        with open(path, 'wb') as f:
            f.write(data)
        id_list.append(img.id)
    # Optionally print the list of image IDs
    if debug == '1':
        print(id_list)
# Initialize the Imgur client and the PRAW Reddit instance
client = ImgurClient(client_id, client_secret)
r = praw.Reddit(client_id=r_client_id,
                client_secret=r_client_secret,
                user_agent='RedditScraper2.0 by /u/I_Am_Treebeard')

# Generate the list of submissions to scrape
subreddit = r.subreddit(sub)
link_list = subreddit.hot(limit=int(lim))

# Make a dated directory for the photos
directory = '/Users/j.mcc3093/Desktop/%s (%s)/' % (sub, datetime.date.today())
if not os.path.exists(directory):
    os.makedirs(directory)
# Main loop: download each submission's image or album
for link in link_list:
    # Get the URL and the author's name
    author = link.author.name
    url = link.url
    # Skip gifs
    if '.gif' in url:
        continue
    # Strip any query string left on the URL
    if '?' in url:
        url = url[:url.rfind('?')]
    # Album links contain '/a/'; everything after it is the album ID
    if '/a/' in url:
        album_id = url[url.rfind('/a/') + 3:]
        download_album(album_id, directory, author)
    # Otherwise treat any imgur link as a single image
    elif 'imgur' in url:
        img_id = url[url.rfind('.com/') + 5:].replace('.jpg', '')
        download_image(img_id, directory, author)
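
Usage: a minimal invocation sketch (the filename scraper.py and the subreddit are assumptions; the gist does not name the file):

    python3.5 scraper.py pics 25 1

The three arguments after the script name unpack into sub, lim, and debug: the subreddit to scrape, the number of hot posts to fetch, and a debug flag ('1' prints each album's image IDs after downloading).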