Simple Reddit EDA for NLP
import itertools

import pandas as pd
import praw
import spacy
from praw.models import Comment
client_id = "your actual client id"
client_secret = "your client secret"
user_agent = "your user agent"

reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
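# alternative (an assumption, not part of the original gist): read the
# credentials from environment variables so they never end up in version
# control; the variable names here are placeholders
# import os
# reddit = praw.Reddit(client_id=os.environ["REDDIT_CLIENT_ID"],
#                      client_secret=os.environ["REDDIT_CLIENT_SECRET"],
#                      user_agent=os.environ["REDDIT_USER_AGENT"])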
# reminder: text from Reddit is frequently NSFW
sub_name = "politics"
max_posts = 1000

sub = reddit.subreddit(sub_name)
comments = []
comments_growth = []

# requires the model to be installed first: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
for submission in sub.hot(limit=max_posts):
    # tokenize every top-level comment into a list of token strings; the
    # isinstance check skips the MoreComments placeholders praw inserts
    # for unexpanded comment threads (using strings rather than spaCy
    # tokens lets set() below count unique word types correctly)
    extracted = [[tok.text for tok in nlp(comm.body)]
                 for comm in submission.comments if isinstance(comm, Comment)]
    # append the extracted comments to our corpus
    comments += extracted
    # record the cumulative comment count after each post
    comments_growth.append(len(comments))
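# note: this only sees top-level comments; praw's documented
# submission.comments.replace_more(limit=0) call expands the MoreComments
# placeholders first if full comment trees are wanted, at the cost of
# extra API requests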
flatten_lists = lambda corpus: list(itertools.chain.from_iterable(corpus))

analysis = pd.DataFrame()
# cumulative token count after each post
analysis['corpus'] = pd.Series({i: len(flatten_lists(comments[:last_comment_index]))
                                for i, last_comment_index in enumerate(comments_growth)})
# cumulative vocabulary size (unique tokens) after each post
analysis['vocab'] = pd.Series({i: len(set(flatten_lists(comments[:last_comment_index])))
                               for i, last_comment_index in enumerate(comments_growth)})
ax = analysis.plot(title=f"Comment Growth in r/{sub_name}")
ax.set_xlabel("No. of Posts")
ax.set_ylabel("No. of Tokens")
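A possible follow-up, not part of the original gist: the two columns already computed give a rough lexical-diversity measure (the type-token ratio), and both the table and the figure can be saved for later. The file names below are placeholders.

# type-token ratio: vocabulary size over running token count, a rough
# lexical-diversity curve (this extension is an assumption, not original)
analysis['ttr'] = analysis['vocab'] / analysis['corpus']

# persist the table and the figure; file names are placeholders
analysis.to_csv("reddit_growth.csv", index_label="post")
ax.figure.savefig("reddit_growth.png", dpi=150)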