A script to return valid imgur URLs for images posted to a subreddit. Has a cache mode to ensure no repeated images.
#!/usr/bin/env python
"""
imgurURLs.py,
21 December 2012
dent earl, dent.earl (a) gmail com
a script to return valid imgur urls to subreddit posted images
based in large part on a script from Tankor Smash:
http://blog.tankorsmash.com/?p=266
"""
from argparse import ArgumentParser
import cPickle
import datetime
import json
import os
from pprint import pprint
import requests

def initArgs(parser):
    parser.add_argument('--limit', dest='limit', type=int, default=1,
                        help='number of URLs to output. default=%(default)s')
    parser.add_argument('--subreddit', dest='subreddit', type=str, default='cats',
                        help='subreddit to scrape. default=%(default)s')
    parser.add_argument('--page', dest='page', type=int, default=0,
                        help='imgur page to start searching')
    parser.add_argument('--novel', '--remember', dest='isNovel', default=False, action='store_true',
                        help=('Remembers which images are returned, tries not to return '
                              'previously seen images'))
    parser.add_argument('--nsfwOK', dest='isNSFWOK', default=False, action='store_true',
                        help='Normally NSFW images are ignored, this option allows them.')
    parser.add_argument('--forget', dest='isForget', default=False, action='store_true',
                        help='before going to imgur, forgets the cache.')

def getUrls(history, args):
    # get json object from the imgur gallery. the url can be appended with /month
    # or /week for more recent entries
    args.page -= 1
    urls = []  # list of image URL strings
    while len(urls) < args.limit:
        args.page += 1
        r = requests.get(r'http://imgur.com/r/%s/top/page/%d.json' % (args.subreddit, args.page))
        j = json.loads(r.text)  # creates a python dict from the JSON object
        for entry in j['data']:
            if len(urls) == args.limit:
                break
            name = entry['hash']  # get the raw image name
            ext = entry['ext']  # get the image extension (.jpg, .gif etc)
            if entry['nsfw'] and not args.isNSFWOK:
                continue
            url = r'http://imgur.com/%s%s' % (name, ext)
            if url not in history:
                # history will be empty if --novel is off
                urls.append(url)
                history.add(url)
    return urls

def reportUrls(urls, history, args):
    for u in urls:
        print u
    recordHistory(history, args)

def checkHistory(args):
    history = set([])
    if not args.isNovel:
        return history
    if os.path.exists(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle')):
        # the pickle is written in binary mode below, so read it in binary mode too
        f = open(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle'), 'rb')
        history = cPickle.load(f)
        f.close()
    return history

def recordHistory(history, args):
    if not args.isNovel:
        return
    f = open(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle'), 'wb')
    cPickle.dump(history, f, 1)  # protocol 0 is ascii, protocols 1 and 2 are binary
    f.close()

def forgetful(args):
    if args.isForget and os.path.exists(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle')):
        os.remove(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle'))

def main():
    parser = ArgumentParser()
    initArgs(parser)
    args = parser.parse_args()
    forgetful(args)
    history = checkHistory(args)
    urls = getUrls(history, args)
    reportUrls(urls, history, args)


if __name__ == '__main__':
    main()
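Example usage (a sketch, assuming the script is saved as imgurURLs.py, the requests library is installed, and a Python 2 interpreter is on the path; the flags are the ones defined in initArgs above):

    # print five image URLs from r/aww, remembering them so later runs skip repeats
    python imgurURLs.py --subreddit aww --limit 5 --novel

    # wipe the cache first, then fetch one URL from the default subreddit (cats)
    python imgurURLs.py --forget --novel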