Skip to content

Instantly share code, notes, and snippets.

@kylelk
Last active May 27, 2016 11:20
Show Gist options
  • Save kylelk/81717c7cacf4794630ff to your computer and use it in GitHub Desktop.
Save kylelk/81717c7cacf4794630ff to your computer and use it in GitHub Desktop.
#
# By Kyle Kersey
# 2014
#
import praw
import sqlite3
import sys
# SQLite database file that get_reddit_submissions() opens and writes to.
database_path = "reddit_comments.db"
# Text file holding the pending tasks: pop_task() reads and rewrites it,
# push_task() appends to it.
subreddit_queue_file = "subreddit_queue.txt"
def pop_task():
    """Remove and return the task at the head of the queue file.

    The queue is the whitespace-separated list of tokens stored in
    ``subreddit_queue_file``.  The first token is taken off and the
    remaining ones are written back, one per line.

    Returns:
        The first queued task as a string, or ``None`` when the queue file
        does not exist or holds no tasks.  In both of those cases the file
        is left untouched.

    Raises:
        IOError: if the queue file cannot be read or rewritten for any
            reason other than not existing.
    """
    import errno  # local import: only needed to recognise "file not found"

    try:
        with open(subreddit_queue_file, "r") as queue:
            pending = queue.read().split()
    except IOError as err:
        # A queue that was never created is simply an empty queue; every
        # other I/O failure is a real error and must stay visible.
        if err.errno != errno.ENOENT:
            raise
        return None

    if not pending:
        # Nothing to pop, so there is nothing to rewrite either.
        return None

    task = pending.pop(0)
    with open(subreddit_queue_file, "w") as queue:
        queue.write("".join(name + "\n" for name in pending))
    return task
def push_task(task):
    """Append ``task`` to the end of the queue file on a line of its own."""
    with open(subreddit_queue_file, "a") as queue:
        entry = task + '\n'
        queue.write(entry)
# Value passed as ``limit`` to get_hot() for each subreddit task.
fetch_limit = 100
# Shared PRAW client used by get_reddit_submissions(); the string is the first
# positional argument to praw.Reddit (presumably the user agent -- confirm
# against the installed PRAW version).
r = praw.Reddit('Comment Scraper 1.0 by u/kerseykyle')
# Left disabled by the author; there is no ``conn`` at module scope, so this
# line could not run here as written.
# conn.text_factory = str
def save_submission(conn, submission):
    """Insert or replace one row in ``reddit_submissions`` for ``submission``.

    Args:
        conn: object with an ``execute(sql, params)`` method; the caller in
            this file passes a ``sqlite3`` connection.
        submission: object exposing the attributes listed in ``layout``.

    The statement is executed but not committed; committing is left to the
    caller.
    """
    # (attribute name, converter) pairs in the positional order the INSERT
    # binds them.  A converter of None stores the attribute unchanged.
    layout = (
        ("id", str),
        ("author", str),
        ("domain", str),
        ("is_self", str),
        ("num_comments", int),
        ("over_18", str),
        ("permalink", None),
        ("score", int),
        ("selftext", None),
        ("selftext_html", None),
        ("subreddit", str),
        ("subreddit_id", str),
        ("thumbnail", str),
        ("title", None),
        ("url", None),
        ("edited", str),
        ("distinguished", str),
        ("stickied", str),
    )
    row = []
    for attr_name, convert in layout:
        raw = getattr(submission, attr_name)
        row.append(raw if convert is None else convert(raw))

    statement = (
        "\n"
        "INSERT OR REPLACE INTO reddit_submissions\n"
        "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)\n"
    )
    conn.execute(statement, row)
def save_comment(conn, comment, submission_id):
    """Insert or replace one row in ``reddit_comments`` for ``comment``.

    Args:
        conn: object with an ``execute(sql, params)`` method; the caller in
            this file passes a ``sqlite3`` connection.
        comment: object exposing the attributes listed in ``layout``.
        submission_id: id of the owning submission, stored as the last
            column after conversion with ``str``.

    Raises:
        AttributeError: if ``comment`` lacks one of the expected attributes.
            The caller in this file relies on this to skip such objects.

    The statement is executed but not committed.
    """
    # (attribute name, converter) pairs in the positional order the INSERT
    # binds them.  A converter of None stores the attribute unchanged.
    layout = (
        ("id", str),
        ("author", str),
        ("name", str),
        ("body", None),
        ("body_html", None),
        ("created_utc", int),
        ("created", int),
        ("edited", int),
        ("link_id", str),
        ("parent_id", str),
        ("score", int),
        ("subreddit", str),
        ("subreddit_id", str),
    )
    row = []
    for attr_name, convert in layout:
        raw = getattr(comment, attr_name)
        row.append(raw if convert is None else convert(raw))
    row.append(str(submission_id))

    statement = (
        "\n"
        "INSERT OR REPLACE INTO reddit_comments\n"
        "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)\n"
    )
    conn.execute(statement, row)
def _skipped_fraction(saved_count, skipped_count):
    """Return skipped / (saved + skipped), or 0.0 when nothing was seen."""
    seen = saved_count + skipped_count
    if not seen:
        return 0.0
    return float(skipped_count) / seen


def get_reddit_submissions(task):
    """Scrape one queued task into the SQLite database at ``database_path``.

    Args:
        task: either a reddit thread URL (``http://`` or ``https://`` followed
            by ``www.reddit.com/r/``), in which case that single submission
            is fetched and ``replace_more_comments`` is called on it before
            its comments are flattened, or anything else, which is treated
            as a subreddit name whose hot listing is fetched with
            ``limit=fetch_limit``.

    Every submission is stored with ``save_submission`` and every flattened
    comment with ``save_comment``.  A comment for which ``save_comment``
    raises ``AttributeError`` is counted as skipped instead of aborting the
    run (presumably these are unexpanded "more comments" placeholders --
    confirm against the PRAW version in use).

    One progress line is printed per submission and a summary line at the
    end.  On Ctrl-C the rows saved so far are committed, the skipped
    fraction is printed and the process exits with status 1.  The database
    connection is closed on every exit path.
    """
    link_prefixes = ("http://www.reddit.com/r/", "https://www.reddit.com/r/")
    is_link = task.startswith(link_prefixes)
    if is_link:
        submissions = [r.get_submission(task)]
    else:
        submissions = r.get_subreddit(task).get_hot(limit=fetch_limit)

    saved_count = 0
    skipped_count = 0
    submission_count = 0
    conn = sqlite3.connect(database_path)
    try:
        try:
            for submission in submissions:
                save_submission(conn, submission)
                if is_link:
                    submission.replace_more_comments(limit=None, threshold=0)
                flat_comments = praw.helpers.flatten_tree(submission.comments)
                submission_count += 1
                # Build the whole line as unicode and encode it once, so a
                # non-ASCII title cannot break printing on an ASCII stdout.
                progress = u"[%d %s %s] %s" % (submission_count,
                                               submission.subreddit,
                                               submission.id,
                                               submission.title)
                print(progress.encode("UTF-8"))
                for comment in flat_comments:
                    try:
                        save_comment(conn, comment, submission.id)
                    except AttributeError:
                        skipped_count += 1
                    else:
                        saved_count += 1
        except KeyboardInterrupt:
            # Keep whatever was scraped before the user interrupted the run.
            conn.commit()
            print("%.2f" % _skipped_fraction(saved_count, skipped_count))
            sys.exit(1)
        conn.commit()
    finally:
        # Also reached through sys.exit() above and on unexpected errors.
        conn.close()
    print("comments inserted: %d, skipped: %d (%.2f%% skipped)" % (
        saved_count,
        skipped_count,
        _skipped_fraction(saved_count, skipped_count) * 100))
# Script entry: queue the task named on the command line, then drain the
# queue file front to back, including anything left over from earlier runs.
if len(sys.argv) < 2:
    # sys.exit with a string prints it to stderr and exits with status 1.
    sys.exit("usage: %s <subreddit-name-or-thread-url>" % sys.argv[0])

task = sys.argv[1]
push_task(task)
task = pop_task()
# pop_task() splits the file on whitespace, so it never yields an empty
# string; None is the only end-of-queue signal.
while task is not None:
    get_reddit_submissions(task)
    task = pop_task()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment