flxw · January 1, 2018 20:16
diff --git a/user_order_relationship_feature_engineering.R b/user_order_relationship_feature_engineering.R
 # Load the project's data helpers and read the labelled ("known") data set.
 source("load_data.R")
 d = read_and_preprocess_data_file("data/BADS_WS1718_known.csv")
 library(dplyr)

 # Idea 1 (the plot shared earlier on WhatsApp):
 # Per user, relate the number of ordered items (item rows, not transactions)
 # to the number of returned items. Do high-volume users pick "wiser",
 # i.e. return a smaller share of what they order, than low-volume users?
 orders_and_returns_per_user = d %>%
   group_by(user_id) %>%
   summarize(nreturn = sum(return), norder = n()) %>%
   mutate(return_ratio = nreturn / norder) %>%
   arrange(norder)

 # One point per user: order volume on x, share of returned items on y.
 plot(
   x = orders_and_returns_per_user$norder,
   y = orders_and_returns_per_user$return_ratio,
   type = "p"
 )

 # Idea 2:
 # Do users "learn" over time which items fit, i.e. does the share of returned
 # items shrink as a user orders more? For every row we record the user's
 # running totals up to and including that item, in chronological order.
 d = arrange(d, order_date)

 # ave() applies FUN within each user_id group and writes the results back to
 # the original row positions, so no per-user scan or re-ordering is needed.
 d$cumulated_returns = ave(d$return, d$user_id, FUN = cumsum)

 # Running number of items the user has ordered so far (1, 2, 3, ... per user).
 d$cumulated_itemcount = ave(seq_len(nrow(d)), d$user_id, FUN = seq_along)

 # Running return ratio: returns so far divided by items ordered so far.
 # (Previously the numerator was divided by itself, which is always 1 or NaN.)
 d$volatile_return_ratio = d$cumulated_returns / d$cumulated_itemcount

 plot(x = d$cumulated_itemcount, y = d$cumulated_returns, type = "p")
	# Load the project's data helpers and read the labelled ("known") data set.
	source("load_data.R")
	d = read_and_preprocess_data_file("data/BADS_WS1718_known.csv")
	library(dplyr)

	# Idea 1 (the plot shared earlier on WhatsApp):
	# Per user, relate the number of ordered items (item rows, not transactions)
	# to the number of returned items. Do high-volume users pick "wiser",
	# i.e. return a smaller share of what they order, than low-volume users?
	orders_and_returns_per_user = d %>%
	  group_by(user_id) %>%
	  summarize(nreturn = sum(return), norder = n()) %>%
	  mutate(return_ratio = nreturn / norder) %>%
	  arrange(norder)

	# One point per user: order volume on x, share of returned items on y.
	plot(
	  x = orders_and_returns_per_user$norder,
	  y = orders_and_returns_per_user$return_ratio,
	  type = "p"
	)

	# Idea 2:
	# Do users "learn" over time which items fit, i.e. does the share of returned
	# items shrink as a user orders more? For every row we record the user's
	# running totals up to and including that item, in chronological order.
	d = arrange(d, order_date)

	# ave() applies FUN within each user_id group and writes the results back to
	# the original row positions, so no per-user scan or re-ordering is needed.
	d$cumulated_returns = ave(d$return, d$user_id, FUN = cumsum)

	# Running number of items the user has ordered so far (1, 2, 3, ... per user).
	d$cumulated_itemcount = ave(seq_len(nrow(d)), d$user_id, FUN = seq_along)

	# Running return ratio: returns so far divided by items ordered so far.
	# (Previously the numerator was divided by itself, which is always 1 or NaN.)
	d$volatile_return_ratio = d$cumulated_returns / d$cumulated_itemcount

	plot(x = d$cumulated_itemcount, y = d$cumulated_returns, type = "p")