This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<script src="https://code.jquery.com/jquery-1.11.3.min.js"></script> | |
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css" rel="stylesheet"> | |
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script> | |
<link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css"> | |
<style> | |
a:hover { | |
text-decoration: none; | |
} | |
h1 { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import praw | |
from praw.models import Comment | |
import os

# Reddit API credentials for the PRAW client below.
# Each value is taken from an environment variable when that variable is set,
# so real secrets do not have to be written into this file. When it is not
# set, the original placeholder string is used and must be replaced by hand.
client_id = os.environ.get("praw_client_id", "your actual client id")
client_secret = os.environ.get("praw_client_secret", "your client secret")
user_agent = os.environ.get("praw_user_agent", "your user agent")

# Client object used for all later Reddit API calls.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
nlp = spacy.load('en_core_web_sm') | |
# comments is an array of strings we generated earlier | |
parsed_bodies = [nlp(comm) for comm in comments] | |
cleaned = [] | |
for doc in parsed_bodies: | |
current = [] | |
for token in doc: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Example comment containing two Reddit user mentions.
raw_comment = "I hate u/oaguy1 and u/example123"

# Matches a user mention written as "u/name" or "/u/name":
#   /?           optional leading slash, consumed as part of the mention
#                (it comes before the \b so that "/u/name" after a space or
#                at the start of the text does not leave a stray "/" behind)
#   \bu/         a "u/" that starts a word, so "menu/options" is not matched
#   [\w-]{3,20}  the name: 3 to 20 word characters or hyphens
#   (?![\w-])    the name must end here, so a name that ends in "-" is
#                matched whole and a name longer than 20 characters is
#                left untouched instead of being partially masked
reddit_rx = re.compile(r"/?\bu/[\w-]{3,20}(?![\w-])")

# returns "I hate USERNAME and USERNAME"
masked_comment = reddit_rx.sub("USERNAME", raw_comment)