Skip to content

Instantly share code, notes, and snippets.

View erykml's full-sized avatar

Eryk Lewinson erykml

View GitHub Profile
# `scipy.stats.stats` is a deprecated private namespace; import from the
# public `scipy.stats` module instead.
from scipy.stats import pearsonr

# For every column of X, compute the Pearson correlation between that column
# and the residuals of the fitted model `lin_reg`, and print the coefficient
# together with its p-value (both rounded to 4 decimal places).
# NOTE(review): X is presumably a pandas DataFrame and lin_reg a fitted
# statsmodels OLS result; both are defined outside this snippet -- confirm.
for column in X.columns:
    corr, p_value = pearsonr(X[column], lin_reg.resid)
    print(f'Variable: {column} --- correlation: {corr:.4f}, p-value: {p_value:.4f}')
%%R
# For each column of X, run cor.test() between that column and the residuals
# stored in `lin_reg`, then print the estimated correlation and its p-value.
# seq_len(ncol(X)) replaces 1:(dim(X)[2]) so that an X with zero columns
# skips the loop instead of iterating over c(1, 0).
for (i in seq_len(ncol(X))) {
  cor_test <- cor.test(X[, i], lin_reg$residuals)
  # paste() already separates its pieces with a single space by default.
  print(paste(
    "Variable:", colnames(X)[i],
    "--- correlation:", as.character(cor_test$estimate),
    ", p-value:", as.character(cor_test$p.value)
  ))
}
from scipy import stats
def normality_of_residuals_test(model):
'''
Function for drawing the normal QQ-plot of the residuals and running 4 statistical tests to
investigate the normality of residuals.
Arg:
* model - fitted OLS models from statsmodels
'''
%%R
library(tseries)
library(olsrr)
# Normal Q-Q plot of the model residuals with base graphics.
qqnorm(lin_reg$residuals)

# Alternative: wrap the residuals in a data frame and start a ggplot object
# that maps them to the `sample` aesthetic. No layers are added in the
# visible code.
df_resid <- data.frame(resid = lin_reg$residuals)
p <- ggplot(data = df_resid, mapping = aes(sample = resid))
# import packages
library(data.table)
library(dtplyr)
library(dplyr, warn.conflicts = FALSE)
library(microbenchmark)
library(stringi)
# Number of rows to simulate.
n <- 1e7

# Build a data.table of random records: 3-letter upper-case id and product
# codes, a date drawn from the daily sequence 2019-01-01 .. 2019-04-01, an
# integer amount in 1..10000, and a normally distributed price
# (mean 100, sd 20). Argument order is kept so the sequence of random draws
# is unchanged.
data_dt <- data.table(
  id = stri_rand_strings(n, 3, pattern = "[A-Z]"),
  product = stri_rand_strings(n, 3, pattern = "[A-Z]"),
  date = sample(
    seq(as.Date("2019/01/01"), as.Date("2019/04/01"), by = "day"),
    n,
    replace = TRUE
  ),
  amount = sample(1:10000, n, replace = TRUE),
  price = rnorm(n, mean = 100, sd = 20)
)
# Keep the rows whose amount is below 500 and order them by id.
# NOTE(review): data_dt_lazy is presumably a dtplyr lazy_dt created outside
# this snippet; the result is not assigned, so it is only printed -- confirm.
arrange(filter(data_dt_lazy, amount < 500), id)
mbm <- microbenchmark("dplyr" = {
result_df <- data_df %>%
filter(date < as.Date('2019-02-01')) %>%
select(c(id, product, date)) %>%
arrange(date)
},
"data.table" = {
result_dt <- data_dt[date < as.Date('2019-02-01'), .(id, product, date)][order(date)]
},
"dtplyr" = {
data_dt_lazy <- lazy_dt(data_dt, immutable=FALSE)
mbm <- microbenchmark("dplyr" = {
result_df <- data_df %>%
filter(amount >= 5000) %>%
mutate(order_value = amount * price)
},
"data.table" = {
result_dt <- data_dt[amount >= 5000][, order_value := amount * price]
},
data_dt_lazy <- lazy_dt(data_dt, immutable=FALSE)
mbm <- microbenchmark("dplyr" = {
result_df <- data_df %>%
filter(amount <= 4000) %>%
mutate(order_value = amount * price) %>%
group_by(id) %>%
summarise(avg_order_value = mean(order_value))
},
"data.table" = {