Last active
February 12, 2024 11:31
-
-
Save firmai/6a89400120fb9480c687ed719cc44a98 to your computer and use it in GitHub Desktop.
Reddit Sentiment.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Reddit Sentiment.ipynb", | |
"provenance": [], | |
"authorship_tag": "ABX9TyMOevKSlcC0kH7pjBSwnNvv", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/firmai/6a89400120fb9480c687ed719cc44a98/reddit-sentiment.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Starting (API/Raw Data)" | |
], | |
"metadata": { | |
"id": "d8YLsVeGw7NE" | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"You start with either raw data or an API that you have access to. Our purpose here is to build a numerical sentiment dataset from textual data and other features." | |
], | |
"metadata": { | |
"id": "0DUB2gmRw_nA" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Pinned to the release this notebook was last run with; %pip installs into the running kernel's environment.\n", | |
"%pip install praw==7.7.1" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "5bBa9QWkya5Z", | |
"outputId": "f2c385e9-237e-49e8-b7d1-b9c417ef051b" | |
}, | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Collecting praw\n", | |
" Downloading praw-7.7.1-py3-none-any.whl (191 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m191.0/191.0 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hCollecting prawcore<3,>=2.1 (from praw)\n", | |
" Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)\n", | |
"Collecting update-checker>=0.18 (from praw)\n", | |
" Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)\n", | |
"Requirement already satisfied: websocket-client>=0.54.0 in /usr/local/lib/python3.10/dist-packages (from praw) (1.7.0)\n", | |
"Requirement already satisfied: requests<3.0,>=2.6.0 in /usr/local/lib/python3.10/dist-packages (from prawcore<3,>=2.1->praw) (2.31.0)\n", | |
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (3.3.2)\n", | |
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (3.6)\n", | |
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2.0.7)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2024.2.2)\n", | |
"Installing collected packages: update-checker, prawcore, praw\n", | |
"Successfully installed praw-7.7.1 prawcore-2.4.0 update-checker-0.18.0\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon- and rule-based model for text sentiment analysis that is sensitive to both the polarity (positive/negative) and the intensity (strength) of emotion." | |
], | |
"metadata": { | |
"id": "9d7M7A_5JIx1" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"## This could take 5-10 minutes to load\n", | |
"import datetime as dt\n", | |
"import math\n", | |
"import os\n", | |
"from getpass import getpass\n", | |
"\n", | |
"import matplotlib.pyplot as plt\n", | |
"import nltk\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import praw\n", | |
"from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA\n", | |
"\n", | |
"\n", | |
"# Lexicon data for the VADER analyzer imported above as SIA.\n", | |
"nltk.download('vader_lexicon')\n", | |
"# NOTE(review): no cell in this notebook appears to use the stopwords corpus - confirm it is still needed.\n", | |
"nltk.download('stopwords')\n", | |
"\n" | |
], | |
"metadata": { | |
"id": "soTxrx2ZFSyM", | |
"outputId": "60833bbc-f20d-4362-c525-b887623795ca", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"[nltk_data] Downloading package vader_lexicon to /root/nltk_data...\n", | |
"[nltk_data] Package vader_lexicon is already up-to-date!\n", | |
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n", | |
"[nltk_data] Package stopwords is already up-to-date!\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 1 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"id": "6UWDPIBvtJHJ" | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"# Credentials come from the environment (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET), with an\n", | |
"# interactive prompt as a fallback, so they are never stored in the notebook.\n", | |
"# To get them, make a Reddit app: the client ID is in the top left corner and the client secret is given.\n", | |
"# NOTE(review): the key pair that used to be hardcoded in this cell has been published and should be revoked.\n", | |
"reddit = praw.Reddit(client_id=os.environ.get('REDDIT_CLIENT_ID') or getpass('Reddit client ID: '),\n", | |
"                     client_secret=os.environ.get('REDDIT_CLIENT_SECRET') or getpass('Reddit client secret: '),\n", | |
"                     user_agent='mathieudempsey',  # the username that the Reddit app is under\n", | |
"                     check_for_async=False)\n", | |
"\n", | |
"\n", | |
"\n", | |
"sub_reddits = reddit.subreddit('wallstreetbets')\n", | |
"stocks = [\"GME\", \"PTON\"]\n", | |
"\n", | |
"def commentSentiment(ticker, urlT):\n", | |
"    \"\"\"Return the average VADER sentiment label of a submission's comments.\n", | |
"\n", | |
"    Every comment body is scored with VADER's SentimentIntensityAnalyzer and mapped to a\n", | |
"    label from its compound score: 1 above 0.1, -1 below -0.1, otherwise 0. The result is\n", | |
"    the mean of those labels.\n", | |
"\n", | |
"    Args:\n", | |
"        ticker: Not used by the computation; kept for the existing call signature.\n", | |
"        urlT: URL of the Reddit submission whose comments are scored.\n", | |
"\n", | |
"    Returns:\n", | |
"        The mean label (between -1 and 1), or 0 when the submission cannot be loaded or\n", | |
"        has no comments to score.\n", | |
"    \"\"\"\n", | |
"    try:\n", | |
"        submission = reddit.submission(url=urlT)\n", | |
"        # Drop the 'load more comments' placeholders (they have no body) instead of\n", | |
"        # letting a single one zero out the whole submission.\n", | |
"        submission.comments.replace_more(limit=0)\n", | |
"        bodies = [comment.body for comment in submission.comments]\n", | |
"    except Exception:\n", | |
"        # Best effort: a submission that cannot be fetched is scored 0.\n", | |
"        return 0\n", | |
"\n", | |
"    if not bodies:\n", | |
"        return 0\n", | |
"\n", | |
"    sia = SIA()  # polarity_scores() returns neg/neu/pos scores plus a normalised 'compound' score\n", | |
"    labels = []\n", | |
"    for body in bodies:\n", | |
"        compound = sia.polarity_scores(body)['compound']\n", | |
"        if compound > 0.1:\n", | |
"            labels.append(1)\n", | |
"        elif compound < -0.1:\n", | |
"            labels.append(-1)\n", | |
"        else:\n", | |
"            labels.append(0)\n", | |
"\n", | |
"    # Average over every comment; the earlier index loop stopped one short, leaving the\n", | |
"    # last comment out of the sum while still dividing by the full count.\n", | |
"    return sum(labels) / len(labels)\n", | |
"\n", | |
"\n", | |
"def latestComment(ticker, urlT):\n", | |
"    \"\"\"Return the creation time of the newest comment on a submission.\n", | |
"\n", | |
"    Args:\n", | |
"        ticker: Not used; kept for the existing call signature.\n", | |
"        urlT: URL of the Reddit submission to inspect.\n", | |
"\n", | |
"    Returns:\n", | |
"        The largest created_utc value among the submission's comments, or 0 when the\n", | |
"        submission cannot be loaded or has no comments.\n", | |
"    \"\"\"\n", | |
"    try:\n", | |
"        submission = reddit.submission(url=urlT)\n", | |
"        # Placeholders for unloaded comments carry no created_utc; remove them so the\n", | |
"        # comments that were loaded can still be used.\n", | |
"        submission.comments.replace_more(limit=0)\n", | |
"        created = [comment.created_utc for comment in submission.comments]\n", | |
"    except Exception:\n", | |
"        return 0\n", | |
"\n", | |
"    # default=0 keeps the failure value for comment-less submissions, which previously\n", | |
"    # raised IndexError on the empty list.\n", | |
"    return max(created, default=0)\n", | |
"\n", | |
"\n", | |
"def get_date(date):\n", | |
"    \"\"\"Convert a POSIX timestamp into a naive datetime in the machine's local timezone.\"\"\"\n", | |
"    as_datetime = dt.datetime.fromtimestamp(date)\n", | |
"    return as_datetime\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"The next cell can be very slow and may take 10 minutes or so to run." | |
], | |
"metadata": { | |
"id": "_b1ook01aW61" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"\n", | |
"# Collect one record per matching submission, then save the table so later cells can reload it.\n", | |
"# Naming the columns up front keeps them present even when no submission is retained,\n", | |
"# so the date conversions below do not fail with a KeyError.\n", | |
"submission_columns = ['ticker', 'num_comments', 'comment_sentiment_average', 'latest_comment_date',\n", | |
"                      'score', 'upvote_ratio', 'date', 'domain', 'num_crossposts', 'author']\n", | |
"\n", | |
"submission_statistics = []\n", | |
"for ticker in stocks:\n", | |
"    for submission in reddit.subreddit('wallstreetbets').search(ticker, limit=130):  # Search submissions related to the ticker\n", | |
"        # Skip submissions whose domain is not the subreddit's own self-post domain.\n", | |
"        if submission.domain != \"self.wallstreetbets\":\n", | |
"            continue\n", | |
"        sentiment = commentSentiment(ticker, submission.url)\n", | |
"        # commentSentiment returns 0 when it cannot score a submission; an exactly\n", | |
"        # neutral average is indistinguishable from that and is dropped as well.\n", | |
"        if sentiment == 0:\n", | |
"            continue\n", | |
"        submission_statistics.append({\n", | |
"            'ticker': ticker,\n", | |
"            'num_comments': submission.num_comments,\n", | |
"            'comment_sentiment_average': sentiment,\n", | |
"            'latest_comment_date': latestComment(ticker, submission.url),\n", | |
"            'score': submission.score,\n", | |
"            'upvote_ratio': submission.upvote_ratio,\n", | |
"            'date': submission.created_utc,\n", | |
"            'domain': submission.domain,\n", | |
"            'num_crossposts': submission.num_crossposts,\n", | |
"            'author': submission.author,\n", | |
"        })\n", | |
"\n", | |
"dfSentimentStocks = pd.DataFrame(submission_statistics, columns=submission_columns)\n", | |
"\n", | |
"# Add datetime versions of the two raw timestamp columns.\n", | |
"dfSentimentStocks = dfSentimentStocks.assign(\n", | |
"    timestamp=dfSentimentStocks[\"date\"].apply(get_date),\n", | |
"    commentdate=dfSentimentStocks[\"latest_comment_date\"].apply(get_date),\n", | |
")\n", | |
"\n", | |
"# Sort into a new frame rather than in place so re-running the cell is harmless.\n", | |
"dfSentimentStocks = dfSentimentStocks.sort_values(\"latest_comment_date\", ascending=True, na_position='last')\n", | |
"\n", | |
"\n", | |
"dfSentimentStocks.to_csv('Reddit_Sentiment_Equity.csv', index=False)" | |
], | |
"metadata": { | |
"id": "9cch2LaNN-R8" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Prefer the CSV written by the scraping cell; fall back to the published copy when the\n", | |
"# local file is missing (OSError) or cannot be parsed with a commentdate column (ValueError).\n", | |
"try:\n", | |
"    dfSentimentStocks = pd.read_csv(\"Reddit_Sentiment_Equity.csv\", parse_dates=['commentdate'])\n", | |
"except (OSError, ValueError):\n", | |
"    dfSentimentStocks = pd.read_csv(\"https://storage.googleapis.com/public-quant/course//content/Reddit_Sentiment_Equity(1).csv\", parse_dates=['commentdate'])" | |
], | |
"metadata": { | |
"id": "RCTAf7phZRfE" | |
}, | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Drop the raw timestamp columns (their datetime versions, timestamp and commentdate, are kept).\n", | |
"# errors='ignore' lets this cell be re-run after the columns are already gone.\n", | |
"dfSentimentStocks = dfSentimentStocks.drop(columns=[\"latest_comment_date\", \"date\"], errors='ignore')" | |
], | |
"metadata": { | |
"id": "rEFl-NuiMWqE" | |
}, | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Index the rows by the commentdate column.\n", | |
"dfSentimentStocks = dfSentimentStocks.set_index(keys=\"commentdate\")" | |
], | |
"metadata": { | |
"id": "OTT8okKRYFjc" | |
}, | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Number of rows per ticker.\n", | |
"dfSentimentStocks.loc[:, \"ticker\"].value_counts()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Sn2QrqSCFfmO", | |
"outputId": "f63c026e-a4d6-4fd7-9272-05b09330eea9" | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"PTON 51\n", | |
"GME 31\n", | |
"Name: ticker, dtype: int64" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Keep only the PTON rows for the steps that follow.\n", | |
"dfSentimentStocks = dfSentimentStocks.loc[dfSentimentStocks[\"ticker\"].eq(\"PTON\")]" | |
], | |
"metadata": { | |
"id": "stzonsLTYyJ8" | |
}, | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Group the rows into weekly bins; the result is a Resampler, not yet a DataFrame.\n", | |
"dfSentimentStocks = dfSentimentStocks.resample(rule='W')" | |
], | |
"metadata": { | |
"id": "2se-A_I-ZN71" | |
}, | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Forward-fill so each weekly bin carries the most recent earlier observation.\n", | |
"dfSentimentStocks = dfSentimentStocks.ffill()" | |
], | |
"metadata": { | |
"id": "Y02Zi5-PY5EV" | |
}, | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Line plot of the average comment sentiment against the commentdate index.\n", | |
"dfSentimentStocks[\"comment_sentiment_average\"].plot()\n" | |
], | |
"metadata": { | |
"id": "PHdOYA89Y_N8", | |
"outputId": "cc900cfb-9310-4778-9996-69cda7e3a5e0", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 484 | |
} | |
}, | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<Axes: xlabel='commentdate'>" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 9 | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"<Figure size 640x480 with 1 Axes>" | |
], | |
"image/png": "\n" | |
}, | |
"metadata": {} | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment