Created
October 21, 2024 18:15
-
-
Save nwithan8/b355460495198b3c7a55a86e0eeed4ef to your computer and use it in GitHub Desktop.
Analyze live emotions of RedditCFB game thread
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import enum | |
from datetime import datetime | |
import dotenv | |
import praw | |
from textblob import TextBlob | |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
# Read the Reddit API settings from the local ".redditcfb_sentiment_analysis_env" file
config = dotenv.dotenv_values(".redditcfb_sentiment_analysis_env")

# Keyword arguments of praw.Reddit that are taken from the settings above.
# Each one is looked up under the upper-cased name with a "PRAW_" prefix (e.g. PRAW_CLIENT_ID).
_PRAW_SETTING_NAMES = ("client_id", "client_secret", "user_agent", "username", "password")

# Module-wide Reddit API client; all five settings are looked up eagerly, in the order listed above
reddit = praw.Reddit(**{name: config[f"PRAW_{name.upper()}"] for name in _PRAW_SETTING_NAMES})
class Analyzer(enum.Enum):
    """
    The sentiment analysis backends that can be used to score a comment.
    """
    TEXTBLOB = "textblob"  # Scores are produced by get_sentiment_textblob
    VADER = "vader"  # Scores are produced by get_sentiment_vader
# Scores between -NEUTRAL_SCORE_THRESHOLD and +NEUTRAL_SCORE_THRESHOLD fall in the neutral range
NEUTRAL_SCORE_THRESHOLD = 0.2

# Human-readable labels of the heat ranges
MOST_POSITIVE = "Extremely positive"
MORE_POSITIVE = "Very positive"
POSITIVE = "Positive"
NEUTRAL = "Neutral"
NEGATIVE = "Negative"
MORE_NEGATIVE = "Very negative"
MOST_NEGATIVE = "Extremely negative"

# Label -> (lower bound, upper bound) of the score range, listed from most negative to most positive
HEAT_RANGES = {
    MOST_NEGATIVE: (-1, -0.8),
    MORE_NEGATIVE: (-0.8, -0.6),
    NEGATIVE: (-0.6, -NEUTRAL_SCORE_THRESHOLD),
    NEUTRAL: (-NEUTRAL_SCORE_THRESHOLD, NEUTRAL_SCORE_THRESHOLD),
    POSITIVE: (NEUTRAL_SCORE_THRESHOLD, 0.6),
    MORE_POSITIVE: (0.6, 0.8),
    MOST_POSITIVE: (0.8, 1),
}

# Text of the summary; the {placeholders} are filled in with str.format()
SUMMARY_MESSAGE_TEMPLATE = (
    "\n"
    "Current Vibes, according to math:\n"
    "General feeling: {general_feeling} ({average_score})\n"
    "Positive comments: {positive_count}\n"
    "Negative comments: {negative_count}\n"
    "Most positive comment: {positive_feeling} ({max_score})\n"
    "Most negative comment: {negative_feeling} ({min_score})\n"
    "Standard deviation: {standard_deviation}\n"
)
def calculate_standard_deviation(values: list[float]) -> float:
    """
    Calculate the standard deviation of a list of values.

    The variance is divided by the number of values (not by one less), so this is the
    population standard deviation.

    :param values: The list of values
    :return: The standard deviation of the values, or 0.0 if the list is empty
    """
    n = len(values)
    if n == 0:
        # Nothing to measure the spread of; return 0.0 instead of dividing by zero below
        return 0.0

    mean = sum(values) / n
    variance = sum((x - mean) ** 2 for x in values) / n
    return variance ** 0.5
def get_heat_range(score: float) -> str:
    """
    Look up the label of the heat range that a sentiment score falls into.

    A range in HEAT_RANGES contains its lower bound but not its upper bound.

    :param score: The sentiment score
    :return: The label of the matching heat range, or 'unknown' if the score is in none of them
    """
    matching_label = next(
        (label for label, (low, high) in HEAT_RANGES.items() if low <= score < high),
        None,
    )
    if matching_label is not None:
        return matching_label

    # No range includes its upper bound, so a score of exactly 1 has to be handled separately
    return MOST_POSITIVE if score == 1 else 'unknown'
class SentimentAnalysisStats:
    """
    Statistics of a list of sentiment scores, all computed once when the object is created.
    """
    raw_scores: list[float]  # The scores exactly as they were passed in
    total_score: float  # Sum of the scores
    average_score: float  # Sum of the scores divided by their count
    min_score: float  # Lowest score
    max_score: float  # Highest score
    positive_count: int  # Number of scores above the neutral threshold
    negative_count: int  # Number of scores below the negated neutral threshold
    neutral_count: int  # Number of scores between the two, bounds included
    standard_deviation: float  # Result of calculate_standard_deviation for the scores

    def __init__(self, raw_scores: list[float], neutral_score_threshold: float = 0.2):
        """
        Compute the statistics of the given scores.

        :param raw_scores: The sentiment scores to summarize
        :param neutral_score_threshold: The threshold for considering a sentiment score as neutral
        """
        upper_neutral = neutral_score_threshold
        lower_neutral = -neutral_score_threshold

        self.raw_scores = raw_scores
        self.total_score = sum(raw_scores)
        self.average_score = self.total_score / len(raw_scores)
        self.min_score = min(raw_scores)
        self.max_score = max(raw_scores)
        self.positive_count = sum(1 for score in raw_scores if score > upper_neutral)
        self.negative_count = sum(1 for score in raw_scores if score < lower_neutral)
        self.neutral_count = sum(1 for score in raw_scores if lower_neutral <= score <= upper_neutral)
        self.standard_deviation = calculate_standard_deviation(values=raw_scores)
def get_sentiment_textblob(text: str) -> float:
    """
    Score the sentiment of a text with TextBlob.

    :param text: The text to analyze
    :return: The polarity that TextBlob reports for the text, a float between -1 (negative) and 1 (positive)
    """
    # Wrap the text in a TextBlob and read the polarity of its sentiment in one go
    return TextBlob(text).sentiment.polarity
def get_sentiment_vader(text: str, analyzer: SentimentIntensityAnalyzer) -> float:
    """
    Score the sentiment of a text with an existing VADER analyzer.

    :param text: The text to analyze
    :param analyzer: The sentiment analyzer that scores the text
    :return: The 'compound' score that the analyzer reports for the text, a float between -1 (negative) and
        1 (positive)
    """
    all_scores = analyzer.polarity_scores(text)
    # Only the 'compound' entry of the returned scores is used
    return all_scores['compound']
def get_sentiment_stats_from_comments(comments: list[praw.reddit.models.Comment],
                                      analyzer: Analyzer,
                                      neutral_score_threshold: float = 0.2) -> SentimentAnalysisStats:
    """
    Score the body of every comment with the chosen analyzer and summarize the scores.

    :param comments: The list of comments
    :param analyzer: The sentiment analyzer to use
    :param neutral_score_threshold: The threshold for considering a sentiment score as neutral
    :return: Statistics of the sentiment of the comments
    :raises ValueError: If the analyzer is neither Analyzer.TEXTBLOB nor Analyzer.VADER
    """
    if analyzer == Analyzer.TEXTBLOB:
        def score_text(body: str) -> float:
            return get_sentiment_textblob(text=body)
    elif analyzer == Analyzer.VADER:
        # A single VADER analyzer is created here and shared by all comments
        shared_vader_analyzer = SentimentIntensityAnalyzer()

        def score_text(body: str) -> float:
            return get_sentiment_vader(text=body, analyzer=shared_vader_analyzer)
    else:
        raise ValueError("Invalid sentiment analyzer")

    sentiment_scores = [score_text(comment.body) for comment in comments]
    return SentimentAnalysisStats(raw_scores=sentiment_scores, neutral_score_threshold=neutral_score_threshold)
def comment_is_recent(comment, time_limit_timestamp) -> bool:
    """
    Tell whether a comment was created at or after a cut-off timestamp.

    :param comment: The comment; only its created_utc attribute is read
    :param time_limit_timestamp: The cut-off timestamp that created_utc is compared against
    :return: True if the comment is recent, False otherwise
    """
    created_at = comment.created_utc
    # A comment created exactly at the cut-off still counts as recent
    return created_at >= time_limit_timestamp
def collect_comments(thread_id: str, comment_limit: int = 100, time_limit_minutes: int = 10) \
        -> list[praw.reddit.models.Comment]:
    """
    Collect recent comments from a Reddit thread.

    :param thread_id: The ID of the thread
    :param comment_limit: The maximum number of comments to fetch
    :param time_limit_minutes: The time limit in minutes to fetch comments
    :return: A list of the fetched comments that are within the time limit (empty if there are none)
    """
    import time  # Local import, only needed here

    # Best to set this BEFORE retrieving all the comments, to lower chances of getting comments older than the
    # time limit. time.time() is Unix time, the scale that a comment's created_utc is compared on; calling
    # .timestamp() on the naive datetime.utcnow() would read the UTC wall clock as local time and shift the
    # cut-off by the machine's UTC offset.
    current_time = time.time()
    time_limit_timestamp = current_time - time_limit_minutes * 60

    thread = reddit.submission(id=thread_id)
    # Request newest-first ordering; this has to be set before the comments are accessed for the first time
    thread.comment_sort = "new"

    # Fetch (up to) comment_limit of the most recent comments
    thread.comments.replace_more(limit=0)  # Remove the "MoreComments" objects
    comments = thread.comments.list()[:comment_limit]

    # Filter every comment against the time limit. The flattened list contains replies as well as top-level
    # comments, so it is not strictly chronological and its last item says nothing about the others.
    # Filtering unconditionally also returns an empty list for a thread without comments.
    return [comment for comment in comments if
            comment_is_recent(comment=comment, time_limit_timestamp=time_limit_timestamp)]
def get_snapshot_summary_of_thread(thread_id: str,
                                   analyzer: Analyzer,
                                   comment_limit: int = 100,
                                   time_limit_minutes: int = 10) -> str:
    """
    Get an analysis snapshot summary of a Reddit thread.

    :param thread_id: The ID of the thread
    :param analyzer: The sentiment analyzer to use
    :param comment_limit: The maximum number of comments to fetch
    :param time_limit_minutes: The time limit in minutes to fetch comments
    :return: A summary of the thread analysis, or a short notice if no comments were collected
    """
    comments: list[praw.reddit.models.Comment] = collect_comments(thread_id=thread_id, comment_limit=comment_limit,
                                                                  time_limit_minutes=time_limit_minutes)
    if not comments:
        return "No comments found in the thread."

    # Use the same threshold that defines the neutral heat range, so that the positive/negative counts
    # and the heat-range labels cannot drift apart
    sentiment_statistics = get_sentiment_stats_from_comments(comments=comments, analyzer=analyzer,
                                                             neutral_score_threshold=NEUTRAL_SCORE_THRESHOLD)

    summary_message = SUMMARY_MESSAGE_TEMPLATE.format(
        general_feeling=get_heat_range(sentiment_statistics.average_score),
        average_score=f"{sentiment_statistics.average_score:.3f}",
        positive_count=sentiment_statistics.positive_count,
        negative_count=sentiment_statistics.negative_count,
        positive_feeling=get_heat_range(sentiment_statistics.max_score),
        max_score=sentiment_statistics.max_score,
        negative_feeling=get_heat_range(sentiment_statistics.min_score),
        min_score=sentiment_statistics.min_score,
        standard_deviation=sentiment_statistics.standard_deviation
    )
    return summary_message
if __name__ == '__main__':
    # Example run: print a snapshot summary of one game thread
    summary = get_snapshot_summary_of_thread(
        thread_id='1g7p1ln',  # Georgia vs. Texas 2024 4th Quarter Game Thread
        # VADER seems to be more extreme (accurate) than TextBlob, but makes for more fun results
        analyzer=Analyzer.VADER,
        comment_limit=100,
        # In real-time, this would be 10 minutes, but for testing purposes, we set it to a large number to avoid
        # getting no comments
        time_limit_minutes=100000000,
    )
    print(summary)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output: