A script to return valid imgur URLs for images posted to a subreddit. Has a cache mode to ensure no repeated images.
#!/usr/bin/env python
"""
imgurURLs.py,
21 December 2012
dent earl, dent.earl (a) gmail com
a script to return valid imgur urls to subreddit posted images
based in large part on a script from Tankor Smash:
http://blog.tankorsmash.com/?p=266
"""
from argparse import ArgumentParser
import cPickle
import datetime
import json
import os
from pprint import pprint
import requests

def initArgs(parser):
    parser.add_argument('--limit', dest='limit', type=int, default=1,
                        help='number of URLs to output. default=%(default)s')
    parser.add_argument('--subreddit', dest='subreddit', type=str, default='cats',
                        help='subreddit to scrape. default=%(default)s')
    parser.add_argument('--page', dest='page', type=int, default=0,
                        help='imgur page to start searching')
    parser.add_argument('--novel', '--remember', dest='isNovel', default=False, action='store_true',
                        help=('Remembers which images are returned, tries not to return '
                              'previously seen images'))
    parser.add_argument('--nsfwOK', dest='isNSFWOK', default=False, action='store_true',
                        help='Normally NSFW images are ignored, this option allows them.')
    parser.add_argument('--forget', dest='isForget', default=False, action='store_true',
                        help='before going to imgur, forgets the cache.')

def getUrls(history, args):
    # get json object from the imgur gallery. the url can be appended with /month
    # or /week for more recent entries
    args.page -= 1
    urls = []  # list of image URL strings
    while len(urls) < args.limit:
        args.page += 1
        r = requests.get(r'http://imgur.com/r/%s/top/page/%d.json' % (args.subreddit, args.page))
        j = json.loads(r.text)  # creates a python dict from the JSON object
        for entry in j['data']:
            if len(urls) == args.limit:
                break
            name = entry['hash']  # get the raw image name
            ext = entry['ext']  # get the image extension (.jpg, .gif etc)
            if entry['nsfw'] and not args.isNSFWOK:
                continue
            url = r'http://imgur.com/%s%s' % (name, ext)
            if url not in history:
                # history will be empty if --novel is off
                urls.append(url)
                history.add(url)
    return urls

def reportUrls(urls, history, args):
    for u in urls:
        print u
    recordHistory(history, args)

def checkHistory(args):
    history = set([])
    if not args.isNovel:
        return history
    if os.path.exists(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle')):
        # the pickle is written in binary mode below, so read it in binary mode too
        f = open(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle'), 'rb')
        history = cPickle.load(f)
        f.close()
    return history

def recordHistory(history, args):
    if not args.isNovel:
        return
    f = open(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle'), 'wb')
    cPickle.dump(history, f, 1)  # protocol 0 is ascii, protocols 1 and 2 are binary
    f.close()

def forgetful(args):
    if args.isForget and os.path.exists(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle')):
        os.remove(os.path.join(os.getcwd(), '.imgurURLsHistory.pickle'))

def main():
    parser = ArgumentParser()
    initArgs(parser)
    args = parser.parse_args()
    forgetful(args)
    history = checkHistory(args)
    urls = getUrls(history, args)
    reportUrls(urls, history, args)


if __name__ == '__main__':
    main()
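Example usage (a sketch, assuming the script is saved as imgurURLs.py, the requests library is installed, and a Python 2 interpreter is on the path; the flags are the ones defined in initArgs above):

    # print five image URLs from r/aww, remembering them so later runs skip repeats
    python imgurURLs.py --subreddit aww --limit 5 --novel

    # wipe the cache first, then fetch one URL from the default subreddit (cats)
    python imgurURLs.py --forget --novel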