Created
December 13, 2013 20:34
-
-
Save nikolak/7950905 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from time import time, sleep | |
| import requests | |
| import logging | |
| import json | |
| # submission dict layout initial idea | |
| # {"id": { | |
| # time:{ #unix timestamp, str | |
| # score:int, | |
| # upvotes:int, | |
| # downvotes:int | |
| # } | |
| # time:{...}etc | |
| # "last_check":float, # unix timestamp | |
| # "last_score":int # score on last_check | |
| # } | |
| # } | |
| from os import path | |
# First-run bootstrap: seed an empty JSON object so the json.load below succeeds.
if not path.exists("data"):
    with open("data", "w") as seed_file:
        seed_file.write("{}")
# Full DEBUG history goes to debug.log (truncated each run); a console
# handler mirrors INFO and above with a shorter format.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%H:%M:%S %d.%m',
    filename='debug.log',
    filemode='w',
)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s [%(levelname)-8s] - %(message)s', datefmt='%H:%M:%S')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
log = logging.getLogger('')
# Identify the script to reddit and define the two listing endpoints.
headers = {"user-agent": "/u/wub_wub post fetching script"}
api_url = "http://www.reddit.com/r/all/new/.json?limit=100"
# api_url="http://www.reddit.com/new/.json?limit=100"
api_by_id = "http://www.reddit.com/by_id/{}.json"  # {}=t3_{id},t3_{id}
with open("data", "r") as input_file:  # IOError if it doesn't exist
    submissions = json.loads(input_file.read())
log.info("Loaded {} submissions from file\n".format(len(submissions)))
def update_and_purge(purge=True, purge_time_min=10, score_diff=10):
    """Re-fetch every tracked submission (batches of 100) and record a
    fresh score snapshot keyed by the current unix timestamp.

    When ``purge`` is true, a submission checked more than
    ``purge_time_min`` minutes ago that has gained fewer than
    ``score_diff`` points since that check (and sits below 500 points)
    is dropped from tracking.

    Returns True on success, False if any HTTP request failed — the
    caller is expected to retry the whole pass.
    """
    removed_counter = 0
    old_counter = 0
    ids = [i for i in submissions.keys()]
    log.info("Updating {} submissions".format(len(ids)))
    for x in range(0, len(ids), 100):  # reddit's by_id accepts <=100 names
        items = ids[x:x + 100]
        items_url = api_by_id.format(",".join(items))
        request_time = time()
        # Floor division keeps the batch counter an int on Python 3 as well.
        log.info(
            "[UPDATING] Starting request... {} out of {}".format(
                (x // 100) + 1, (len(ids) // 100) + 1))
        try:
            r = requests.get(items_url, headers=headers, timeout=20)
        except requests.exceptions.RequestException:
            # Narrowed from a bare except: only network/HTTP failures
            # should trigger the retry path.
            log.warning("Updating failed, retrying...")
            return False
        r_json = r.json()
        sub_list = r_json['data']['children']
        for sub in sub_list:
            sub = sub['data']
            value = submissions[sub['name']]
            value[time()] = {
                "score": sub['score'],
                "upvotes": sub['ups'],
                "downvotes": sub['downs']
            }
            if purge and value['last_check'] is not None:  # fixme: not needed I think
                if time() - float(value['last_check']) > purge_time_min * 60:
                    if sub['score'] < value['last_score'] + score_diff and sub['score'] < 500:
                        submissions.pop(sub['name'])
                        removed_counter += 1
                        # BUG FIX: the entry was just removed; falling
                        # through to the bookkeeping below would KeyError.
                        continue
                    else:
                        old_counter += 1
            submissions[sub['name']]['last_check'] = time()
            submissions[sub['name']]['last_score'] = sub['score']
        # Be polite to the API: at most one request every ~2 seconds.
        if time() - request_time < 2:
            sleep(2)
    log.info("Updated {} submissions. Removed:{}. "
             "Old submissions updated:{}".format(len(ids) - removed_counter,
                                                 removed_counter,
                                                 old_counter))
    return True
def get_new_posts():
    """Fetch the newest ~100 submissions and start tracking unseen ones.

    Each new entry stores an initial snapshot keyed by the current unix
    timestamp plus the ``last_check``/``last_score`` bookkeeping fields.

    Returns True on success, False if the HTTP request failed (the
    caller retries).
    """
    log.info("[NEW] Getting new posts.")
    counter = 0
    try:
        r = requests.get(api_url, headers=headers, timeout=20)
    except requests.exceptions.RequestException:
        # Narrowed from a bare except: only network/HTTP failures should
        # trigger the retry path.
        log.warning("Getting new posts failed, retrying...")
        return False
    r_json = r.json()
    sub_list = r_json['data']['children']
    for sub in sub_list:
        sub = sub['data']
        if sub['name'] not in submissions:  # only track unseen submissions
            submissions[sub['name']] = {
                time(): {
                    "score": sub['score'],
                    "upvotes": sub['ups'],
                    "downvotes": sub['downs']
                },
                "last_check": time(),
                "last_score": sub['score']
            }
            counter += 1
    log.info("Added {} new submissions.".format(counter))
    return True
def save_to_file():
    """Persist the in-memory submission store to the ``data`` file as JSON."""
    serialized = json.dumps(submissions, indent=4)
    with open("data", "w") as out_file:
        out_file.write(serialized)
    log.info("Saved data to file")
def show_stats():
    """Print the id of the tracked submission with the most stored entries.

    NOTE(review): len() of a submission dict counts the two bookkeeping
    keys ("last_check"/"last_score") as well as the timestamped
    snapshots, so the item count is snapshots + 2.
    """
    most_items = 0
    submission_id = None
    for subid, value in submissions.items():
        if len(value) > most_items:
            most_items = len(value)
            submission_id = subid
    # Single-argument print() parses and behaves identically under
    # Python 2 and 3; the old `print "..."` statement is a SyntaxError
    # on Python 3.
    print("Submission ID :{}, Items:{}".format(submission_id, most_items))
def main():
    """Run the daemon loop: every ~minute pull new posts, every other
    run update/purge all tracked posts, and persist the store each pass.
    """
    run = 0
    while True:
        run += 1
        start = time()
        try:
            new_posts_done = False
            update_done = False
            log.info("\n{}\nRun: {} - Total submissions: {}\n".format("=" * 79,
                                                                      run,
                                                                      len(submissions)))
            # Retry until the fetch succeeds (workers return False on
            # network failure).
            while new_posts_done is not True:
                new_posts_done = get_new_posts()
                sleep(2)
            if run % 2 == 0:  # every other run (~2 min), update every post
                while update_done is not True:
                    update_done = update_and_purge(purge_time_min=30,
                                                   score_diff=20)
                    sleep(2)
        except Exception:
            # Narrowed from a bare except so Ctrl-C/SystemExit can stop
            # the loop; log.exception records the full traceback instead
            # of silently discarding it.
            log.exception("Error occured on run: {}".format(run))
        diff = time() - start
        save_to_file()
        if 60 - diff < 0:
            log.info("Skipping timeout, diff: {:.0f}".format(60 - diff))
        else:
            log.info("Sleeping timeout: {:.0f}".format(60 - diff))
            sleep(60 - diff)
# Run the fetch/update daemon loop when executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment