Skip to content

Instantly share code, notes, and snippets.

View john-adeojo's full-sized avatar

John Adeojo john-adeojo

View GitHub Profile
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from scripts.text_processing.preprocess_tweets_lite import TextCleaner
import pandas as pd
class SentimentAnalyzer(TextCleaner):
def __init__(self, model="cardiffnlp/twitter-roberta-base-sentiment-latest", emotion=False):
super().__init__(stop_words_remove=False)
self.model = None
self.tokenizer = None
# Calculate per-follower engagement metrics.
# A zero follower count would make `count / 0` evaluate to +/-inf, and
# MinMaxScaler raises a ValueError on infinite input. Mapping a zero
# denominator to NaN turns those rows into missing values instead, which
# MinMaxScaler disregards in fit and maintains in transform.
follower_count = tweets_df['follower_count'].replace(0, float('nan'))

# Raw count column -> per-follower column derived from it.
per_follower_columns = {
    'favorite_count': 'favorite_count_pf',
    'retweet_count': 'retweet_count_pf',
    'quote_count': 'quote_count_pf',
    'reply_count': 'reply_count_pf',
}
for count_column, pf_column in per_follower_columns.items():
    tweets_df[pf_column] = tweets_df[count_column] / follower_count

# Normalise the per-follower metrics with a min-max scaler, column by column.
pf_columns = list(per_follower_columns.values())
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(tweets_df[pf_columns])
tweets_df[pf_columns] = scaled_values
from sklearn.linear_model import LinearRegression
# Pair each tuned search object with the short name used for its estimator.
tuned_searches = (
    ('xgb', xgb_tuner),
    ('rf', rf_tuner),
    ('ridge', ridge_tuner),
)
# Build the list of (name, best estimator) tuples from the tuned searches.
models = [(label, search.best_estimator_) for label, search in tuned_searches]
# Create the stacking model
from skopt import BayesSearchCV
# Tune XGBoost model
xgb_tuner = BayesSearchCV(
xgb_pipeline,
xgb_param_grid,
cv=5,
scoring='neg_root_mean_squared_error',
n_iter=30,
n_jobs=-1
from skopt.space import Real, Integer, Categorical
# Parameter grid for XGBoost
xgb_param_grid = {
'regressor__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
'regressor__n_estimators': Integer(50, 2000),
'regressor__max_depth': Integer(3, 50),
'regressor__min_child_weight': Integer(1, 20),
'regressor__gamma': Real(0, 5),
'regressor__subsample': Real(0.5, 1),
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
preprocessor = ColumnTransformer(
# define function to replace missing variables
import pandas as pd
def impute_missing(df, test=False):
if test == False:
id_df = df['Id']
y = df['SalePrice']
df = df.drop(columns=['Id', 'SalePrice'])
import streamlit as st
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from streamlit_folium import folium_static
# Load the CSV file at `path_to_csv` into a pandas DataFrame.
# NOTE(review): `path_to_csv` is not defined in the visible snippet — confirm it is set earlier.
tweets_dash_final =pd.read_csv(path_to_csv)
# Define a function to assign emojis based on emotion
def get_emoji(sentiment):
import re
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
class TextCleaner:
def __init__(self, stop_words=None, stop_words_remove=False):
self.stop_words_remove = stop_words_remove
if stop_words:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
class DataPipeline:
def __init__(self, df, target_col, text, model, save_data, random_state=42):
self.df = df