Eryk Lewinson erykml

Data Scientist, Author of the Python for Finance Cookbook (published by Packt).

erykml / corr_test.py

Created June 3, 2019 19:15

	# Import from the public namespace: `scipy.stats.stats` is a deprecated
	# private module path and emits a DeprecationWarning on recent SciPy.
	from scipy.stats import pearsonr

	# For every column of X, test its Pearson correlation with the residuals
	# of the fitted model and print the coefficient with its p-value.
	# `X` and `lin_reg` are expected to be defined earlier in the session.
	for column in X.columns:
	    corr, p_value = pearsonr(X[column], lin_reg.resid)
	    print(f'Variable: {column} --- correlation: {corr:.4f}, p-value: {p_value:.4f}')

erykml / corr_test_r.py

Created June 3, 2019 19:17

	%%R

	# For every column of X, run cor.test() against the residuals of lin_reg
	# and print the estimated correlation together with its p-value.
	# seq_len(ncol(X)) is an empty sequence when X has no columns, whereas
	# 1:(dim(X)[2]) would count down (1, 0) and index a non-existent column.
	for (i in seq_len(ncol(X))) {
	  cor_test <- cor.test(X[, i], lin_reg$residuals)
	  # paste() already separates its arguments with a single space by default
	  print(paste('Variable:', colnames(X)[i],
	              '--- correlation:', as.character(cor_test$estimate),
	              ', p-value:', as.character(cor_test$p.value)))
	}

erykml / normality.py

Created June 3, 2019 19:23

	from scipy import stats

	def normality_of_residuals_test(model):
	'''
	Function for drawing the normal QQ-plot of the residuals and running 4 statistical tests to
	investigate the normality of residuals.

	Arg:
	* model - fitted OLS models from statsmodels
	'''

erykml / normality_r.py

Created June 3, 2019 19:24

	%%R
	library(tseries)
	library(olsrr)

	# Draw a normal Q-Q plot of the fitted model's residuals with base graphics.
	qqnorm(lin_reg$residuals)

	# or: start a ggplot-based version. The residuals are wrapped in a
	# one-column data frame and mapped to the `sample` aesthetic; only the base
	# plot object is created on the lines visible here (layers are presumably
	# added further down).
	# NOTE(review): ggplot2 is not attached in this snippet - confirm it is loaded elsewhere.

	df_resid <- data.frame(resid = lin_reg$residuals)
	p <- ggplot(df_resid, aes(sample = resid))

erykml / import_packages.r

Created July 25, 2019 00:18

	# import packages

	library(data.table)
	library(dtplyr)
	library(dplyr, warn.conflicts = FALSE)
	library(microbenchmark)
	library(stringi)

erykml / generate_data.r

Created July 25, 2019 00:20

	# number of rows to simulate
	n <- 1e7

	# Build a data.table of n random rows, one column per line:
	#   id, product - random 3-character strings drawn from A-Z
	#   date        - a day sampled (with replacement) from 2019-01-01..2019-04-01
	#   amount      - an integer sampled (with replacement) from 1..10000
	#   price       - a normal draw with mean 100 and standard deviation 20
	# The columns are generated in the same order as before, so the sequence of
	# random draws is unchanged for a given seed.
	data_dt <- data.table(
	  id      = stri_rand_strings(n = n, length = 3, pattern = "[A-Z]"),
	  product = stri_rand_strings(n = n, length = 3, pattern = "[A-Z]"),
	  date    = sample(
	    x       = seq(from = as.Date('2019/01/01'), to = as.Date('2019/04/01'), by = "day"),
	    size    = n,
	    replace = TRUE
	  ),
	  amount  = sample(x = seq_len(10000), size = n, replace = TRUE),
	  price   = rnorm(n = n, mean = 100, sd = 20)
	)

erykml / preview.r

Created July 25, 2019 00:23

erykml / use_case_1.r

Created July 25, 2019 00:28

	mbm <- microbenchmark("dplyr" = {
	result_df <- data_df %>%
	filter(date < as.Date('2019-02-01')) %>%
	select(c(id, product, date)) %>%
	arrange(date)
	},
	"data.table" = {
	result_dt <- data_dt[date < as.Date('2019-02-01'), .(id, product, date)][order(date)]
	},
	"dtplyr" = {

erykml / use_case_2.r

Created July 25, 2019 00:36

	data_dt_lazy <- lazy_dt(data_dt, immutable=FALSE)

	mbm <- microbenchmark("dplyr" = {
	result_df <- data_df %>%
	filter(amount >= 5000) %>%
	mutate(order_value = amount * price)
	},
	"data.table" = {
	result_dt <- data_dt[amount >= 5000][, order_value := amount * price]
	},

erykml / use_case_3.r

Created July 25, 2019 00:39

	data_dt_lazy <- lazy_dt(data_dt, immutable=FALSE)

	mbm <- microbenchmark("dplyr" = {
	result_df <- data_df %>%
	filter(amount <= 4000) %>%
	mutate(order_value = amount * price) %>%
	group_by(id) %>%
	summarise(avg_order_value = mean(order_value))
	},
	"data.table" = {