Skip to content

Instantly share code, notes, and snippets.

@thezakman
Created May 17, 2021 03:36
Show Gist options
  • Save thezakman/33ff64e80de5a49a5dbe5f904c3cfafb to your computer and use it in GitHub Desktop.
Save thezakman/33ff64e80de5a49a5dbe5f904c3cfafb to your computer and use it in GitHub Desktop.
#######
# IMPORT PACKAGES
#######
import praw
import pandas as pd
# Create the authenticated Reddit API client. The empty strings below are
# placeholders: fill them in with the credentials of your own Reddit app.
reddit = praw.Reddit(
    client_id="",  # your client id
    client_secret="",  # your client secret
    user_agent="my user agent",  # user agent name
    username="",  # your reddit username
    password="",  # your reddit password
)

# Subreddits to scrape the data from.
sub = ['Askreddit']

# Scraping can follow various strategies (hot, top, ...). This script uses
# the keyword strategy: every keyword below is searched in every subreddit.
query = ['Gaming']

for s in sub:
    subreddit = reddit.subreddit(s)  # choose the subreddit

    for item in query:
        # Post data and comment data are collected in two separate
        # column-oriented dictionaries and written to two separate CSV files.
        # Every post/comment id is captured so the two datasets can be joined
        # later (comment_link_id / comment_parent_id reference those ids).
        post_dict = {
            "title": [],
            "score": [],
            "id": [],
            "url": [],
            "comms_num": [],
            "created": [],
            "body": [],
        }
        comments_dict = {
            "comment_id": [],
            "comment_parent_id": [],
            "comment_body": [],
            "comment_link_id": [],
        }

        # Search for the current keyword only. The search must receive the
        # single keyword `item`, not the whole `query` list, otherwise the
        # per-keyword output files would not match what was searched.
        for submission in subreddit.search(item, sort="top", limit=1):
            post_dict["title"].append(submission.title)
            post_dict["score"].append(submission.score)
            post_dict["id"].append(submission.id)
            post_dict["url"].append(submission.url)
            post_dict["comms_num"].append(submission.num_comments)
            post_dict["created"].append(submission.created)
            post_dict["body"].append(submission.selftext)

            # Access the comments on the post. replace_more() is called before
            # flattening the comment forest with list() so that the entries
            # iterated below are comments exposing id/parent_id/body/link_id.
            submission.comments.replace_more(limit=1)
            for comment in submission.comments.list():
                comments_dict["comment_id"].append(comment.id)
                comments_dict["comment_parent_id"].append(comment.parent_id)
                comments_dict["comment_body"].append(comment.body)
                comments_dict["comment_link_id"].append(comment.link_id)

        # One pair of CSV files per (subreddit, keyword) combination, written
        # to the current working directory.
        post_comments = pd.DataFrame(comments_dict)
        post_comments.to_csv(s + "_comments_" + item + "subreddit.csv")
        post_data = pd.DataFrame(post_dict)
        post_data.to_csv(s + "_" + item + "subreddit.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment