This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import transformers | |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline | |
from scripts.text_processing.preprocess_tweets_lite import TextCleaner | |
import pandas as pd | |
class SentimentAnalyzer(TextCleaner): | |
def __init__(self, model="cardiffnlp/twitter-roberta-base-sentiment-latest", emotion=False): | |
super().__init__(stop_words_remove=False) | |
self.model = None | |
self.tokenizer = None |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Calculate per-follower engagement metrics.
# An account with zero followers would yield inf (or NaN for 0/0) on division,
# and MinMaxScaler refuses infinite input. Mask zero follower counts to NaN so
# those rows stay missing instead of aborting the scaling step.
engagement_cols = ['favorite_count', 'retweet_count', 'quote_count', 'reply_count']
per_follower_cols = [col + '_pf' for col in engagement_cols]
nonzero_followers = tweets_df['follower_count'].where(tweets_df['follower_count'] != 0)
for col, pf_col in zip(engagement_cols, per_follower_cols):
    tweets_df[pf_col] = tweets_df[col] / nonzero_followers

# Normalise the per-follower metrics with min-max scaling; the scaled values
# overwrite the raw per-follower columns in place.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(tweets_df[per_follower_cols])
tweets_df[per_follower_cols] = scaled_values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LinearRegression | |
# Pair each short model label with the best estimator found by its tuner,
# giving a list of (name, estimator) tuples in the order xgb, rf, ridge.
models = [
    (label, search.best_estimator_)
    for label, search in (
        ('xgb', xgb_tuner),
        ('rf', rf_tuner),
        ('ridge', ridge_tuner),
    )
]
# Create the stacking model |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from skopt import BayesSearchCV | |
# Tune XGBoost model | |
xgb_tuner = BayesSearchCV( | |
xgb_pipeline, | |
xgb_param_grid, | |
cv=5, | |
scoring='neg_root_mean_squared_error', | |
n_iter=30, | |
n_jobs=-1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from skopt.space import Real, Integer, Categorical | |
# Parameter grid for XGBoost | |
xgb_param_grid = { | |
'regressor__learning_rate': Real(0.01, 0.3, prior='log-uniform'), | |
'regressor__n_estimators': Integer(50, 2000), | |
'regressor__max_depth': Integer(3, 50), | |
'regressor__min_child_weight': Integer(1, 20), | |
'regressor__gamma': Real(0, 5), | |
'regressor__subsample': Real(0.5, 1), |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.ensemble import RandomForestRegressor, StackingRegressor | |
from sklearn.linear_model import Ridge, LinearRegression | |
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler | |
from sklearn.model_selection import train_test_split | |
from xgboost import XGBRegressor | |
preprocessor = ColumnTransformer( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# define function to replace missing variables | |
import pandas as pd | |
def impute_missing(df, test=False): | |
if test == False: | |
id_df = df['Id'] | |
y = df['SalePrice'] | |
df = df.drop(columns=['Id', 'SalePrice']) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import streamlit as st | |
import pandas as pd | |
import folium | |
from folium.plugins import MarkerCluster | |
from streamlit_folium import folium_static | |
# Load the dashboard's tweet data from a CSV file into a DataFrame.
# NOTE(review): `path_to_csv` is not defined in the visible code - confirm it is set before this line runs.
tweets_dash_final =pd.read_csv(path_to_csv) | |
# Define a function to assign emojis based on emotion | |
def get_emoji(sentiment): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from nltk import word_tokenize, WordNetLemmatizer | |
from nltk.corpus import stopwords | |
class TextCleaner: | |
def __init__(self, stop_words=None, stop_words_remove=False): | |
self.stop_words_remove = stop_words_remove | |
if stop_words: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import torch | |
from sklearn.model_selection import train_test_split | |
from torch.utils.data import Dataset, DataLoader | |
from transformers import AutoTokenizer | |
class DataPipeline: | |
def __init__(self, df, target_col, text, model, save_data, random_state=42): | |
self.df = df |