Skip to content

Instantly share code, notes, and snippets.

View audhiaprilliant's full-sized avatar
🎯
Focusing

Audhi Aprilliant audhiaprilliant

🎯
Focusing
View GitHub Profile
@audhiaprilliant
audhiaprilliant / twitter_data_viz_barplot_tweet_activities_part_two.R
Last active December 13, 2020 09:41
Twitter Data Visualization using ggplot2
# Total tweets of 2019-05-28
ggplot(data.hour.date1)+
geom_bar(aes(x = Hour,
y = Total.Tweets,
fill = I('red')),
stat = 'identity',
alpha = 0.75,
show.legend = FALSE)+
geom_hline(yintercept = mean(data.hour.date1$Total.Tweets),
col = I('black'),
@audhiaprilliant
audhiaprilliant / twitter_data_viz_density_sentiment.R
Last active December 13, 2020 09:38
Twitter Data Visualization using ggplot2
# JOKO WIDODO
# Keep only the tweets classified as Negative or Positive.
# `%in%` tests set membership for every row. The previous `==` against a
# length-2 vector recycled that vector, comparing odd rows with 'Negative'
# and even rows with 'Positive', which silently dropped matching rows.
df.score.1 <- subset(senti.jokowi, class %in% c('Negative','Positive'))
# Rename the columns for plotting.
# NOTE(review): assumes senti.jokowi has exactly three columns ordered as
# score, text, class - confirm against where senti.jokowi is built.
colnames(df.score.1) <- c('Score','Text','Sentiment')
# Data viz
ggplot(df.score.1)+
geom_density(aes(x = Score,
fill = Sentiment),
alpha = 0.75)+
xlim(c(-11,11))+
labs(title = 'Density Plot of Sentiment Scores',
@audhiaprilliant
audhiaprilliant / twitter_data_viz_barplot_sentiment.R
Created December 13, 2020 09:37
Twitter Data Visualization using ggplot2
# JOKO WIDODO
# Frequency table of the sentiment scores, as a two-column data frame
df.senti.score.1 <- data.frame(table(senti.jokowi$score))
names(df.senti.score.1) <- c('Score','Freq')
# Data pre-processing
# Go through character first so that the score labels themselves are parsed
# as numbers (a factor converted directly would yield its integer codes).
df.senti.score.1[['Score']] <- as.numeric(as.character(df.senti.score.1[['Score']]))
Score1 <- df.senti.score.1[['Score']]
# Sign of the first score; the value is not stored
sign(df.senti.score.1[1,1])
for (i in 1:nrow(df.senti.score.1)) {
sign.row = sign(df.senti.score.1[i,'Score'])
@audhiaprilliant
audhiaprilliant / twitter_data_viz_piechart_sentiment.R
Created December 13, 2020 09:43
Twitter Data Visualization using ggplot2
# JOKO WIDODO
# Count of tweets per sentiment class
df.senti.3 <- as.data.frame(table(senti.jokowi$class))
names(df.senti.3) <- c('Sentiment','Freq')
# Data pre-processing
# Prop     : share of each class in the total
# lab.ypos : midpoint of each class's slice on the cumulative scale,
#            computed after sorting by Sentiment in descending order
df.pie.1 <- df.senti.3 %>%
  mutate(Prop = Freq / sum(Freq)) %>%
  arrange(desc(Sentiment)) %>%
  mutate(lab.ypos = cumsum(Prop) - 0.5 * Prop)
# Data viz
@audhiaprilliant
audhiaprilliant / concave_function.py
Created December 13, 2020 09:49
Concave Function to Interpolate Stocks Data
# Function to insert the missing calendar dates into the date column
def imput_date(df: pd.DataFrame, col: str = 'Date'):
    """Expand ``df`` so that every calendar day between its first and last date has a row.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame whose ``col`` column holds dates as ``'%Y-%m-%d'`` strings.
        The first and last rows (by position) are taken as the start and end
        of the range, so the rows are expected to be in ascending date order.
    col : str, default 'Date'
        Name of the date column.

    Returns
    -------
    pd.DataFrame
        One row per calendar day from the first to the last date. ``col`` comes
        first, followed by the remaining columns of ``df``; days that were not
        present in ``df`` have NaN in those columns. An empty ``df`` is
        returned as an empty copy.

    Raises
    ------
    KeyError
        If ``col`` is not a column of a non-empty ``df``.
    ValueError
        If the first or last date does not match ``'%Y-%m-%d'``.
    """
    # Nothing to expand: there is no first/last date to build a range from
    if df.shape[0] == 0:
        return df.copy()
    # Positional access so the function also works when the index is not 0..n-1
    start = datetime.datetime.strptime(df[col].iloc[0], '%Y-%m-%d')
    end = datetime.datetime.strptime(df[col].iloc[-1], '%Y-%m-%d')
    # Every calendar day in the range, in the same string format as the input
    list_date = pd.date_range(start, end).strftime('%Y-%m-%d').tolist()
    pd_date = pd.DataFrame({col: pd.Series(list_date, dtype=object)})
    # Left join keeps every calendar day; days absent from df get NaN values
    df_date = pd.merge(pd_date, df, on=col, how='left')
    return df_date
@audhiaprilliant
audhiaprilliant / threshold_simulation.py
Created December 24, 2020 02:55
How to choose the optimal threshold for imbalanced classification
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for data simulation
from sklearn.datasets import make_classification # Create a synthetic dataframe
from sklearn.linear_model import LogisticRegression # Classification model
from sklearn.model_selection import train_test_split # Split the dataframe
from sklearn.metrics import roc_curve # Calculate the ROC curve
from sklearn.metrics import precision_recall_curve # Calculate the Precision-Recall curve
@audhiaprilliant
audhiaprilliant / threshold_roc.py
Created December 24, 2020 02:57
How to choose the optimal threshold for imbalanced classification
# Compute the ROC curve points from the test labels and the predictions
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
# Collect the ROC points in a data frame, one row per threshold
df_fpr_tpr = pd.DataFrame(dict(FPR=fpr, TPR=tpr, Threshold=thresholds))
df_fpr_tpr.head()
# Create the data viz
plotnine.options.figure_size = (8, 4.8)
(
@audhiaprilliant
audhiaprilliant / threshold_roc_gmean.py
Created December 24, 2020 02:58
How to choose the optimal threshold for imbalanced classification
# G-mean at every ROC threshold: sqrt(TPR * (1 - FPR))
gmean = np.sqrt(tpr * (1 - fpr))
# Position of the threshold with the largest G-mean
index = np.argmax(gmean)
# Threshold, G-mean, FPR and TPR at that position, rounded to 4 decimals
thresholdOpt, gmeanOpt, fprOpt, tprOpt = (
    round(values[index], ndigits = 4)
    for values in (thresholds, gmean, fpr, tpr)
)
print('Best Threshold: {} with G-Mean: {}'.format(thresholdOpt, gmeanOpt))
@audhiaprilliant
audhiaprilliant / threshold_roc_youden.py
Created December 24, 2020 03:00
How to choose the optimal threshold for imbalanced classification
# Calculate Youden's J statistic (TPR - FPR) at every ROC threshold
youdenJ = tpr - fpr
# Find the optimal threshold: the position where J is largest
index = np.argmax(youdenJ)
thresholdOpt = round(thresholds[index], ndigits = 4)
# Report the J statistic itself. This previously read gmean[index], which
# printed the G-mean under the "Youden J" label and raised NameError when
# the G-mean snippet had not been run first.
youdenJOpt = round(youdenJ[index], ndigits = 4)
fprOpt = round(fpr[index], ndigits = 4)
tprOpt = round(tpr[index], ndigits = 4)
print('Best Threshold: {} with Youden J statistic: {}'.format(thresholdOpt, youdenJOpt))
@audhiaprilliant
audhiaprilliant / threshold_precision_recall.py
Created December 24, 2020 03:02
How to choose the optimal threshold for imbalanced classification
# Compute the Precision-Recall curve from the test labels and the predictions
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
# Collect the curve in a data frame; the last precision/recall entry is
# dropped so both columns line up with `thresholds`
df_recall_precision = pd.DataFrame(
    dict(
        Precision=precision[:-1],
        Recall=recall[:-1],
        Threshold=thresholds,
    )
)
df_recall_precision.head()
# Create a data viz