Skip to content

Instantly share code, notes, and snippets.

@nikolak
Created December 13, 2013 20:34
Show Gist options
  • Select an option

  • Save nikolak/7950905 to your computer and use it in GitHub Desktop.

Select an option

Save nikolak/7950905 to your computer and use it in GitHub Desktop.
from time import time, sleep
import requests
import logging
import json
# submission dict layout initial idea
# {"id": {
# time:{ #unix timestamp, str
# score:int,
# upvotes:int,
# downvotes:int
# }
# time:{...}etc
# "last_check":float, # unix timestamp
# "last_score":int # score on last_check
# }
# }
from os import path

# First run: seed the on-disk store with an empty JSON object so the
# unconditional load below never hits a missing file.
if not path.exists("data"):
    with open("data", "w") as seed:
        seed.write("{}")

# Root logger writes everything (DEBUG and up) to debug.log, truncated
# on each start.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%H:%M:%S %d.%m',
                    filename='debug.log',
                    filemode='w')

# Mirror INFO and above to the terminal with a shorter timestamp.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter(
    '%(asctime)s [%(levelname)-8s] - %(message)s', datefmt='%H:%M:%S'))
logging.getLogger('').addHandler(console)
log = logging.getLogger('')

headers = {"user-agent": "/u/wub_wub post fetching script"}
api_url = "http://www.reddit.com/r/all/new/.json?limit=100"
# api_url="http://www.reddit.com/new/.json?limit=100"
api_by_id = "http://www.reddit.com/by_id/{}.json"  # {}=t3_{id},t3_{id}

# Load previously collected submissions back into memory.
with open("data", "r") as input_file:
    submissions = json.load(input_file)
log.info("Loaded {} submissions from file\n".format(len(submissions)))
def update_and_purge(purge=True, purge_time_min=10, score_diff=10):
    """Re-fetch every tracked submission (in batches of 100) and append a
    new timestamped score snapshot to each entry.

    When ``purge`` is True, a submission whose last evaluation was more
    than ``purge_time_min`` minutes ago is removed unless its score grew
    by at least ``score_diff`` since then or it already exceeds 500.

    Returns:
        True on success; False if any batch request failed, so the caller
        can retry the whole pass.
    """
    removed_counter = 0
    old_counter = 0
    ids = list(submissions.keys())  # snapshot: entries may be popped below
    log.info("Updating {} submissions".format(len(ids)))
    # by_id accepts at most 100 fullnames per request.
    total_batches = (len(ids) + 99) // 100
    for x in range(0, len(ids), 100):
        items = ids[x:x + 100]
        items_url = api_by_id.format(",".join(items))
        request_time = time()
        log.info("[UPDATING] Starting request... {} out of {}".format(
            x // 100 + 1, total_batches))
        try:
            r = requests.get(items_url, headers=headers, timeout=20)
            r_json = r.json()  # raises ValueError on a non-JSON error page
        except (requests.RequestException, ValueError):
            # Narrowed from a bare except: only network/decode trouble
            # should trigger a retry (and never KeyboardInterrupt).
            log.warning("Updating failed, retrying...")
            return False
        for sub in r_json['data']['children']:
            sub = sub['data']
            value = submissions[sub['name']]
            value[time()] = {
                "score": sub['score'],
                "upvotes": sub['ups'],
                "downvotes": sub['downs']
            }
            if purge and value.get('last_check') is not None:
                if time() - float(value['last_check']) > purge_time_min * 60:
                    if (sub['score'] < value['last_score'] + score_diff
                            and sub['score'] < 500):
                        # Not growing fast enough and below the 500 bar.
                        submissions.pop(sub['name'])
                        removed_counter += 1
                    else:
                        # Survivor: reset the purge clock and score baseline.
                        old_counter += 1
                        value['last_check'] = time()
                        value['last_score'] = sub['score']
        if time() - request_time < 2:
            sleep(2)  # stay under reddit's request-rate limit
    log.info("Updated {} submissions. Removed:{}. "
             "Old submissions updated:{}".format(len(ids) - removed_counter,
                                                 removed_counter,
                                                 old_counter))
    return True
def get_new_posts():
    """Fetch the newest /r/all posts and add unseen ones to ``submissions``.

    Each new entry starts with one timestamped score snapshot plus
    ``last_check``/``last_score`` bookkeeping used by the purge logic.

    Returns:
        True on success; False if the request failed (caller retries).
    """
    log.info("[NEW] Getting new posts.")
    counter = 0
    try:
        r = requests.get(api_url, headers=headers, timeout=20)
        r_json = r.json()  # raises ValueError on a non-JSON error page
    except (requests.RequestException, ValueError):
        # Narrowed from a bare except: only network/decode trouble should
        # trigger a retry (and never KeyboardInterrupt).
        log.warning("Getting new posts failed, retrying...")
        return False
    for sub in r_json['data']['children']:
        sub = sub['data']
        if sub['name'] not in submissions:
            submissions[sub['name']] = {
                time(): {
                    "score": sub['score'],
                    "upvotes": sub['ups'],
                    "downvotes": sub['downs']
                },
                "last_check": time(),
                "last_score": sub['score']
            }
            counter += 1
    log.info("Added {} new submissions.".format(counter))
    return True
def save_to_file():
    """Persist the in-memory ``submissions`` mapping to the ``data`` file."""
    serialized = json.dumps(submissions, indent=4)
    with open("data", "w") as out_file:
        out_file.write(serialized)
    log.info("Saved data to file")
def show_stats(subs=None):
    """Print the submission with the most recorded items (snapshots plus
    bookkeeping keys).

    Args:
        subs: mapping of submission id -> entry dict; defaults to the
            module-level ``submissions`` (backward-compatible).

    Returns:
        ``(submission_id, item_count)``; ``(None, 0)`` when empty.
    """
    if subs is None:
        subs = submissions
    most_items = 0
    submission_id = None
    for subid, value in subs.items():
        if len(value) > most_items:
            most_items = len(value)
            submission_id = subid
    # print() works on both Python 2 and 3 for a single argument;
    # the original `print "..."` statement is a SyntaxError on Python 3.
    print("Submission ID :{}, Items:{}".format(submission_id, most_items))
    return submission_id, most_items
def main():
    """Poll reddit roughly once per minute, forever.

    Every run fetches new posts; every second run also refreshes and
    purges all tracked posts. Data is persisted to disk each iteration,
    and the loop sleeps out the remainder of the 60-second budget.
    """
    run = 0
    while True:
        run += 1
        start = time()
        try:
            new_posts_done = False
            update_done = False
            log.info("\n{}\nRun: {} - Total submissions: {}\n".format(
                "=" * 79, run, len(submissions)))
            # Retry until the fetch succeeds; each attempt is rate-limited.
            while not new_posts_done:
                new_posts_done = get_new_posts()
                sleep(2)
            if run % 2 == 0:  # every other run (~2 min), refresh all posts
                while not update_done:
                    update_done = update_and_purge(purge_time_min=30,
                                                   score_diff=20)
                    sleep(2)
        except Exception:
            # Was a bare except: that also ate KeyboardInterrupt/SystemExit.
            # log.exception records the full traceback in debug.log.
            log.exception("Error occurred on run: {}".format(run))
        diff = time() - start
        save_to_file()
        if 60 - diff < 0:
            log.info("Skipping timeout, diff: {:.0f}".format(60 - diff))
        else:
            log.info("Sleeping timeout: {:.0f}".format(60 - diff))
            sleep(60 - diff)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment