Last active
May 27, 2016 11:20
-
-
Save kylelk/81717c7cacf4794630ff to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# By Kyle Kersey
# 2014
#
import sqlite3
import sys

import praw
# SQLite database file that get_reddit_submissions() connects to.
database_path = "reddit_comments.db"
# Plain-text work queue: pop_task() reads and rewrites it, push_task()
# appends to it.
subreddit_queue_file = "subreddit_queue.txt"
def pop_task(queue_file=None):
    """Remove and return the first task in the queue file.

    The file is read in full and split on whitespace, so every
    whitespace-separated token counts as one task.  The remaining tasks are
    written back one per line.

    Args:
        queue_file: Path of the queue file.  Defaults to the module-level
            ``subreddit_queue_file``.

    Returns:
        The first task as a string, or ``None`` when the queue is empty or
        the queue file does not exist.

    Raises:
        IOError: If the file cannot be read or rewritten for any reason other
            than not existing.
    """
    import errno

    if queue_file is None:
        queue_file = subreddit_queue_file

    try:
        with open(queue_file, "r") as f:
            pending = f.read().split()
    except IOError as err:
        # A queue file that was never created is simply an empty queue.
        if err.errno != errno.ENOENT:
            raise
        return None

    if not pending:
        # Nothing to pop; leave the file untouched.
        return None

    task = pending.pop(0)
    with open(queue_file, "w") as f:
        f.write("\n".join(pending) + "\n")
    return task
def push_task(task, queue_file=None):
    """Append *task* as a new line at the end of the queue file.

    Args:
        task: Task string to enqueue.  pop_task() splits the file on
            whitespace, so a task containing whitespace would later be read
            back as several tasks.
        queue_file: Path of the queue file.  Defaults to the module-level
            ``subreddit_queue_file``.  The file is created if missing.
    """
    if queue_file is None:
        queue_file = subreddit_queue_file
    with open(queue_file, "a") as f:
        f.write(task + "\n")
# Value passed as ``limit`` to get_hot() when a task names a subreddit.
fetch_limit = 100
# Shared PRAW client used for every request; the string is the argument
# praw.Reddit is constructed with (presumably the user agent -- confirm
# against the PRAW version in use).
r = praw.Reddit('Comment Scraper 1.0 by u/kerseykyle')
# Left disabled by the original author:
# conn.text_factory = str
def save_submission(conn, submission):
    """Insert (or replace) one row describing *submission*.

    Executes ``INSERT OR REPLACE INTO reddit_submissions`` with 18 positional
    values, so the table must have 18 columns in the order listed below.
    Nothing is committed here; that is left to the caller.

    Args:
        conn: Object exposing ``execute(sql, parameters)``; the caller in
            this file passes a ``sqlite3`` connection.
        submission: Object exposing the attributes read below.  Most are
            stored via ``str()``, ``num_comments`` and ``score`` via
            ``int()``, and ``permalink``, ``selftext``, ``selftext_html``,
            ``title`` and ``url`` are stored unconverted.
    """
    row = (
        str(submission.id),
        str(submission.author),
        str(submission.domain),
        str(submission.is_self),
        int(submission.num_comments),
        str(submission.over_18),
        submission.permalink,
        int(submission.score),
        submission.selftext,
        submission.selftext_html,
        str(submission.subreddit),
        str(submission.subreddit_id),
        str(submission.thumbnail),
        submission.title,
        submission.url,
        str(submission.edited),
        str(submission.distinguished),
        str(submission.stickied),
    )
    # Derive the placeholder list from the row so the two cannot drift apart.
    placeholders = ",".join("?" * len(row))
    conn.execute(
        "INSERT OR REPLACE INTO reddit_submissions VALUES (%s)" % placeholders,
        row,
    )
def save_comment(conn, comment, submission_id):
    """Insert (or replace) one row describing *comment*.

    Executes ``INSERT OR REPLACE INTO reddit_comments`` with 14 positional
    values, so the table must have 14 columns in the order listed below.
    Nothing is committed here; that is left to the caller.

    Args:
        conn: Object exposing ``execute(sql, parameters)``; the caller in
            this file passes a ``sqlite3`` connection.
        comment: Object exposing the attributes read below.  ``created_utc``,
            ``created``, ``edited`` and ``score`` are stored via ``int()``;
            ``body`` and ``body_html`` are stored unconverted; the rest via
            ``str()``.
        submission_id: Identifier stored (via ``str()``) in the last column.

    Raises:
        AttributeError: If *comment* lacks any attribute read below.
            get_reddit_submissions() catches this to count items it could
            not save, so it must keep propagating.
    """
    row = (
        str(comment.id),
        str(comment.author),
        str(comment.name),
        comment.body,
        comment.body_html,
        int(comment.created_utc),
        int(comment.created),
        int(comment.edited),
        str(comment.link_id),
        str(comment.parent_id),
        int(comment.score),
        str(comment.subreddit),
        str(comment.subreddit_id),
        str(submission_id),
    )
    # Derive the placeholder list from the row so the two cannot drift apart.
    placeholders = ",".join("?" * len(row))
    conn.execute(
        "INSERT OR REPLACE INTO reddit_comments VALUES (%s)" % placeholders,
        row,
    )
def _report(message):
    """Print *message*, UTF-8 encoding it first if it is not a native ``str``.

    On Python 2 a ``unicode`` message is encoded before printing so that
    non-ASCII titles do not fail on an ASCII stdout; on Python 3 the text is
    printed as is.
    """
    if not isinstance(message, str):
        message = message.encode("UTF-8")
    print(message)


def _skipped_fraction(skipped, saved):
    """Return skipped / (skipped + saved) as a float, or 0.0 with no items."""
    total = skipped + saved
    if not total:
        return 0.0
    return float(skipped) / total


def get_reddit_submissions(task):
    """Fetch the submissions named by *task* and store them with their comments.

    A task starting with ``http://www.reddit.com/r/`` or
    ``https://www.reddit.com/r/`` is fetched as a single submission via
    ``r.get_submission`` and its comment tree is expanded with
    ``replace_more_comments`` before saving.  Any other task is treated as a
    subreddit name and its hot listing is read with ``limit=fetch_limit``.

    Each submission is written with save_submission() and each entry of its
    flattened comment tree with save_comment().  Entries for which
    save_comment() raises AttributeError are counted as skipped rather than
    aborting the run.

    The database connection is committed and closed however the function
    exits, so rows saved before a failure are kept.

    Args:
        task: Submission URL or subreddit name.

    Raises:
        SystemExit: With status 1 when interrupted by Ctrl-C, after printing
            the fraction of comment entries skipped so far.
    """
    link_prefixes = (
        "http://www.reddit.com/r/",
        "https://www.reddit.com/r/",
    )
    saved = 0
    skipped = 0
    submission_count = 0
    conn = sqlite3.connect(database_path)
    try:
        is_link = task.startswith(link_prefixes)
        if is_link:
            submissions = [r.get_submission(task)]
        else:
            submissions = r.get_subreddit(task).get_hot(limit=fetch_limit)

        for submission in submissions:
            save_submission(conn, submission)
            if is_link:
                submission.replace_more_comments(limit=None, threshold=0)
            flat_comments = praw.helpers.flatten_tree(submission.comments)
            submission_count += 1
            header = "[%d %s %s]" % (
                submission_count, submission.subreddit, submission.id)
            _report(u"%s %s" % (header, submission.title))
            for comment in flat_comments:
                try:
                    save_comment(conn, comment, submission.id)
                except AttributeError:
                    # Entry without the attributes save_comment() reads.
                    skipped += 1
                else:
                    saved += 1
    except KeyboardInterrupt:
        _report("%.2f" % _skipped_fraction(skipped, saved))
        sys.exit(1)
    finally:
        # Runs on success, on Ctrl-C (before SystemExit propagates) and on
        # unexpected errors alike, so the connection never leaks.
        conn.commit()
        conn.close()
    _report("comments inserted: %d, skipped: %d [%.2f%% skipped]" % (
        saved, skipped, _skipped_fraction(skipped, saved) * 100))
# Script body: enqueue the task given on the command line (if any), then
# drain the queue, scraping one task at a time.  Running without an argument
# simply processes whatever is already queued.
if len(sys.argv) > 1:
    push_task(sys.argv[1])

task = pop_task()
while task is not None:
    # Skip blank entries, but always fetch the next task so the loop advances.
    if task != "":
        get_reddit_submissions(task)
    task = pop_task()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment