# PRAW to interact with Reddit
import praw
# Install TextBlob if not already installed: "pip install -U textblob"
from textblob import TextBlob
import nltk
# Download the VADER lexicon if not already downloaded
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Create an object for VADER sentiment analysis
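# A minimal sketch of how the analyzer object could be used on one sample
# comment (the variable names and sample text below are illustrative, not
# from the original snippet):
sentiments = SentimentIntensityAnalyzer()
sample = "Reddit users seem really happy about this update!"
print(sentiments.polarity_scores(sample))   # VADER neg/neu/pos/compound scores
print(TextBlob(sample).sentiment.polarity)  # TextBlob polarity in [-1, 1]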
import pandas as pd
# Recommended TensorFlow version is <= 2.1.0, otherwise the F1 score function breaks
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from transformers import TFRobertaForSequenceClassification
from transformers import RobertaTokenizer
# Load your dataset
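# A minimal sketch of the load-and-tokenize step, assuming a CSV with 'text'
# and 'label' columns (file name, column names and split sizes are assumptions;
# the tokenizer call shown is the API of recent transformers releases):
df = pd.read_csv('reddit_data.csv')
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)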
import pandas as pd
# Recommended TensorFlow version is <= 2.1.0, otherwise the F1 score function breaks
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from transformers import TFRobertaForSequenceClassification
from transformers import RobertaTokenizer
import os
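# A minimal sketch of building and compiling the classifier (the checkpoint
# name, label count, hyperparameters and output path are illustrative assumptions):
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
save_dir = os.path.join('models', 'roberta_sentiment')  # hypothetical output path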
import praw
import pandas as pd
from transformers import RobertaTokenizer
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification
import tensorflow_datasets as tfds
reddit = praw.Reddit(client_id='client id',
                     client_secret='client secret',
                     user_agent='user agent')
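# A minimal sketch of pulling posts to score with the classifier (the
# subreddit name and the limit are illustrative assumptions):
titles = []
for submission in reddit.subreddit('wallstreetbets').hot(limit=10):
    titles.append(submission.title)
posts_df = pd.DataFrame({'title': titles})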
from transformers import TFAutoModelForTokenClassification, AutoTokenizer
import tensorflow as tf
import praw
import pandas as pd
model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# CoNLL-2003 NER tag set used by this checkpoint
label_list = ["O",                 # Outside of a named entity
              "B-MISC", "I-MISC",  # Miscellaneous entity
              "B-PER", "I-PER",    # Person's name
              "B-ORG", "I-ORG",    # Organisation
              "B-LOC", "I-LOC"]    # Location
from gensim.parsing.preprocessing import remove_stopwords
import gensim
from wordcloud import WordCloud
import numpy as np
import random
# Import the stopword list from gensim
# You can also add stopwords manually
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
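# A minimal sketch of building a word cloud from cleaned text (the input
# string and output file name are illustrative assumptions):
text = "reddit posts about markets stocks gains losses and everything in between"
cleaned = remove_stopwords(text.lower())
wordcloud = WordCloud(stopwords=gensim_stopwords, background_color='white',
                      width=800, height=400).generate(cleaned)
wordcloud.to_file('wordcloud.png')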
import spacy
from matplotlib import cm
import matplotlib.pyplot as plt
nlp = spacy.load('en_core_web_sm')
ner_collection = {"Location": [], "Person": [], "Date": [], "Quantity": [], "Organisation": []}
location = []
person = []
date = []
quantity = []
organisation = []
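# A minimal sketch of filling these lists from spaCy's entity labels (the
# input text is an illustrative assumption):
doc = nlp("Apple was founded by Steve Jobs in California on April 1, 1976.")
for ent in doc.ents:
    if ent.label_ == 'GPE':
        location.append(ent.text)
    elif ent.label_ == 'PERSON':
        person.append(ent.text)
    elif ent.label_ == 'DATE':
        date.append(ent.text)
    elif ent.label_ == 'QUANTITY':
        quantity.append(ent.text)
    elif ent.label_ == 'ORG':
        organisation.append(ent.text)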
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Load the train and test data
train_df = pd.read_csv('train.csv')
train_df['df_type'] = 'train'
test_df = pd.read_csv('test.csv')
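# A minimal sketch of combining both frames for shared EDA plots (the 'test'
# tag mirrors the 'train' tag above; the plot choice is an assumption):
test_df['df_type'] = 'test'
full_df = pd.concat([train_df, test_df], ignore_index=True)
print(Counter(full_df['df_type']))
sns.countplot(x='df_type', data=full_df)
plt.show()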
# Import the plotting and animation libraries
import pandas as pd
from matplotlib import cm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
import matplotlib.ticker as ticker
import matplotlib.animation as animation
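# A minimal sketch of an animated histogram using these imports (the random
# data and frame count are illustrative assumptions):
fig, ax = plt.subplots()
rng = np.random.default_rng(42)

def update(frame):
    ax.clear()
    ax.hist(rng.standard_normal(500), bins=30, color=cm.viridis(frame / 30))
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax.set_title(f'Frame {frame}')

anim = animation.FuncAnimation(fig, update, frames=30, interval=100)
plt.show()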
# Print min, max, median, first quartile, third quartile and the 90th percentile
# using .quantile()
for i in num_col:
    print(f'Min: {train[i].quantile(0)} First Quartile: {train[i].quantile(0.25)} '
          f'Median: {train[i].quantile(0.5)} Third Quartile: {train[i].quantile(0.75)} '
          f'Max: {train[i].quantile(1)} 90th Percentile: {train[i].quantile(0.9)}')
# Percentile helper for aggregating numeric columns by categorical groups
def percentile(n):
    def percentile_(x):
        return x.quantile(n)
    percentile_.__name__ = f'percentile_{int(n * 100)}'
    return percentile_
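# A minimal sketch of using the helper in a group-by aggregation (the
# 'category' and 'price' column names are illustrative assumptions):
summary = train.groupby('category')['price'].agg(
    ['min', percentile(0.25), 'median', percentile(0.75), 'max'])
print(summary)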