hibernado · May 2, 2017 21:47
diff --git a/.gitignore b/.gitignore
 TES Data Science technical screen.pdf
 TESDataScienceData.csv
 *.RData
 *.Rhistory
diff --git a/analysis_2.txt b/analysis_2.txt
 # Analysis Task:

 # Are things popular because they are popular ?

 We could look at the conditional popularity as a plot.
 So we could look at what was popular last week / month 
 and compare it to what is popular to this week / month.
 We could bin the 'prior' popularity and compare it to 
 this period's binned popularity.

 To really understand this we might want to test.
 If we say that x resource has 10000 downloads in one group
 versus 4 in another group. What is the eventual download rate? 
 -> Probably ok to do this for things that are free (ethics!). 


 # Are thing popular because the author is popular ? 
 Here I suggest taking 'prolific' authors.
 Does the 'histogram' of downloads by author follow a similar shape? 
 Does it differ only by scale (mean & var) ? 
 We would probably need to account for different subjects and other factors. 



 # Are things popular because of the price ?
 I suspect that the interaction between price and popularity isn't simple / linear. 
 I'd imagine that free things generally have a lower download rate than things which cost a little. 
 Then I'd expect popularity to rapidly decrease for things which cost more. 
 -> a scatter plot of price to popularity

 At the same time I'd expect that things which cost more would have greater 'longevity' ? In 
 other words I'd expect that more expensive things are less popular but provide a more reliable stream
 of downloads/income. 


 # Are things popular because of the rating ? 

 I suspect that a resource's rating has an impact on the number of downloads.
 -> a scatter plot of rating to number of downloads. 
 I suspect that the first 3/4/5 ratings have a big impact on the eventual number of downloads.
 -> a scatter plot of the average ratings after 3/4/5 ratings v total downloads.
diff --git a/sql_1_1.sql b/sql_1_1.sql
 -- 1) Write a SQL query that will return the top downloaded 5 resources by subject for each month 
 -- since 1st September 2015. Only include Mathematics, English and History.


 with filter_raw_events as ( 

 	select 
 		date_trunc('month', e.eventTime) eventMonth
 		,r.id
 		,r.title
 		,r.subject
 		,count(1) event_count

 	from events.eventStream e
 		join content.resourceDetails r on e.assetId = r.id
 		where 1=1
 			and e.assetType = 'resource'
 			and e.event = 'download'
 			and e.eventTime >= '2015-09-01'::timestamp
 			and r.subject in ('Mathematics','English','History')

 	group by 1,2,3,4
 ), summarise_events as (

 	select
 		 eventMonth  
 		,r.title
 		,r.subject
 		,row_number() over (partition by eventMonth,subject order by event_count desc) rank_per_subject_per_month

 	from filter_raw_events
 )

 select *
 from summarise_events
 where rank_per_subject_per_month <= 5

diff --git a/sql_1_2.sql b/sql_1_2.sql
 -- 2) We consider active users to be users that have been active in the last 28 days. 
 -- Please write a SQL query to show the percentage of users in the dataset considered active, split by week of their first visit.

 with munge_raw_events as (

 	select 
 		date_trunc('week', min(e.eventTime)) firstVisitWeek
 		,max(case when trunc(e.eventTime) >= current_date - 28 then 1 end) isActive
 		,userId
 	from events.eventStream e
 	group by userId

 )

 select
 	firstVisitWeek
 	,100.0 * count(isActive) / sum(count(userId)) over () as percentActive

 from munge_raw_events
 group by firstVisitWeek

diff --git a/tes_interview.R b/tes_interview.R
 library(tidyverse)
 library(data.table)

 dir = '~/Documents/data_science/TES_interview/48ed0f570043a81687fa0317f8776df1/'

 setwd(dir)
 getwd()

 TesData = 'TESDataScienceData.csv'
 df <- read.csv(TesData)

 summary(df)
 nrow(df)


 #########################
 # DATA EXPLORATION 
 #########################

 # Approach: look at every column for evidence of a link to the total number of views
 # the goal is to gain an understanding of how different the total views are when 
 # split by a particular field (mean and var).

 # info_gender 

 qplot(data = df, x = info_gender, fill = info_gender)
 qplot(data = df, x = info_gender, y = total_views, geom = 'blank') + geom_boxplot()

 df %>%
  filter(total_views < 4000) %>% 
  qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6)

 df %>%
  filter(total_views < 4000) %>% 
  qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6) + 
  facet_wrap(~info_gender, scales = 'free')

 # -> info_gender does not appear to have much influence on the total number of views




 # advert_start_date / advert_end_date / advert_duration 
 df$advert_start_date <- as.Date(df$advert_start_date, '%d/%m/%Y')
 df$advert_end_date <- as.Date(df$advert_end_date, '%d/%m/%Y')
 df$advert_duration <- as.double(df$advert_end_date - df$advert_start_date)

 qplot(data = df, x = advert_start_date, geom = 'blank') + geom_bar()
 qplot(data = df, x = advert_end_date, geom = 'blank') + geom_bar()
 qplot(data = df, x = advert_duration, geom = 'blank') + geom_bar()

 df %>% 
  group_by(advert_duration) %>% 
  summarise( mn_views = mean(total_views), stdDev_views = sqrt(var(total_views))) %>% 
  select(advert_duration, mn_views,stdDev_views) %>% 
  # glimpse
  # filter( advert_duration < 40) %>% 
  gather(key = variable, value = value, mn_views,stdDev_views) %>% 
  qplot( data = ., x = advert_duration, y = value, colour = variable, geom = 'line')
 # clear link between advert duration and total_views

 df %>% 
  # filter(advert_duration < 40) %>% 
  qplot(data = ., x = advert_duration, y = total_views) + 
  geom_boxplot(aes(group = advert_duration)) +
  geom_smooth(method = 'lm', formula = y~ poly(x,2)) +
  geom_smooth(method = 'lm', formula = y~ exp(x)) +
  geom_smooth(method = 'lm', formula = y~ x)

 # advert duration has some impact on total views but less than I expected


 # job_category 

 qplot(data = df, x = job_category, y = total_views, geom = 'blank') + 
  geom_boxplot() + 
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
 # head of department & teacher have a big 'tail' in total views (not surprising). 
 # the rest have reasonably tight boxplots -> job_category might give reasonable prediction


 # workplace

 qplot(data = df, x = workplace, y = total_views, geom = 'blank') + 
  geom_boxplot() + 
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
 # apparently useless


 # subject

 qplot(data = df, x = subject, y = total_views, geom = 'blank') + 
  geom_boxplot() + 
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
 # some variability -> subject might give reasonable prediction
 # a lot less difference observed than expected.

 # country_group
 qplot(data = df, x = country_group, y = total_views, geom = 'blank') + 
  geom_boxplot() + 
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
 # big difference between internation / uk. Expected but probably not that useful ? 
 # still can provide good prediction



 # admin_level_2
 data.table(df) %>% 
  .[, .N, .(country_group, admin_level_2)]

 df %>% 
  filter(country_group != 'International') %>% 
  qplot(data = ., x = admin_level_2, y = total_views, geom = 'blank') + 
  geom_boxplot() + 
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
 # area does influence the number of views. 
 # london etc have long 'tails'. difficult

 # package
 qplot(data = df, x = package, y = total_views, geom = 'blank') + 
  geom_boxplot() + 
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
 # small difference between packages


 # salary_displayed
 qplot(data = df, x = salary_displayed, y = total_views, group = salary_displayed, geom = 'blank') + 
  geom_boxplot() + 
  theme(axis.text.x = element_text(hjust = 0, angle = -30))
 data.table(df) %>% 
  .[, mean(total_views), .(salary_displayed)]
 # definite difference. 
 # salary displayed --> lower views !

 # we don't have the school names ... but we do have their ids
 DT = data.table(df)
 DT[, cnt := length(unique(job_id)), school_id]
 length(unique(DT$job_id))
 length(unique(DT$school_id))

 DT %>% 
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter( cnt > 20 ) %>%  
  qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

 DT %>% 
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter(cnt < 20 & cnt > 15) %>%  
  qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

 DT %>% 
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter(cnt < 15 & cnt > 9) %>%  
  qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

 DT %>% 
  group_by(school_id) %>%
  mutate(cnt = n()) %>%
  filter(cnt < 9) %>%  
  # qplot( data = ., x = factor(1), y = total_views, geom = 'blank') + geom_boxplot()
  qplot( data = ., x = total_views, geom = 'blank') + geom_histogram()

 # predictable_schools <- factor(DT[, .N, school_id][N >= 10]$school_id)





 set.seed(100)
 rowsForTraining = 0.5
 index <- sample(x = seq(nrow(df)), size = floor(nrow(df) * rowsForTraining))

 train <- df[index,]
 test  <- df[-index,]

 train <- df[index,] %>% 
    filter(school_id %in% predictable_schools)
 test  <- df[-index,] %>%
    filter(school_id %in% predictable_schools)

 linReg <- lm(total_views ~ school_id,data = train)
 summary(linReg) # big impact. big proportion of the variation is explained

 # To make the model somewhat sane / understandable let's not use the school ids directly
 # let's classify the schools by mean view and bucket these into x bands

 # an alternative would be to do a regression per school. 




 #########################
 # MODEL
 #########################


 set.seed(100)

 qplot(data= df,x = advert_start_date)
 train <- df %>% filter(advert_start_date <  as.Date('2015-03-01'))
 test  <- df %>% filter(advert_start_date >= as.Date('2015-03-01'))


 DT <- data.table(train)
 school_jobs = DT[, .(job_cnt = .N), school_id] %>% filter(job_cnt > 3)

 train <- merge(train,school_jobs, by = 'school_id', all.x = T) %>%
  data.table
 train[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
 train[!is.na(job_cnt), school_for_reg := school_id]
 train[,.N, school_for_reg]

 linReg <- lm( total_views ~ school_for_reg + job_category + advert_duration + subject + country_group + admin_level_2 + salary_displayed
             ,data = train)
 summary(linReg) 


 #########################
 # 
 #########################


 test <- merge(test,school_jobs, by = 'school_id', all.x = T)
 test <- data.table(test) 
 test[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
 test[!is.na(job_cnt), school_for_reg := school_id]
 test <- data.frame(test)

 pred <- predict.lm(linReg,test)
 err <- abs(test['total_views'] - pred)$total_views
 mean(err)
	TES Data Science technical screen.pdf
	TESDataScienceData.csv
	*.RData
	*.Rhistory
	# Analysis Task:

	# Are things popular because they are popular ?

	We could look at the conditional popularity as a plot.
	So we could look at what was popular last week / month
	and compare it to what is popular to this week / month.
	We could bin the 'prior' popularity and compare it to
	this period's binned popularity.

	To really understand this we might want to test.
	If we say that x resource has 10000 downloads in one group
	versus 4 in another group. What is the eventual download rate?
	-> Probably ok to do this for things that are free (ethics!).


	# Are thing popular because the author is popular ?
	Here I suggest taking 'prolific' authors.
	Does the 'histogram' of downloads by author follow a similar shape?
	Does it differ only by scale (mean & var) ?
	We would probably need to account for different subjects and other factors.



	# Are things popular because of the price ?
	I suspect that the interaction between price and popularity isn't simple / linear.
	I'd imagine that free things generally have a lower download rate than things which cost a little.
	Then I'd expect popularity to rapidly decrease for things which cost more.
	-> a scatter plot of price to popularity

	At the same time I'd expect that things which cost more would have greater 'longevity' ? In
	other words I'd expect that more expensive things are less popular but provide a more reliable stream
	of downloads/income.


	# Are things popular because of the rating ?

	I suspect that a resource's rating has an impact on the number of downloads.
	-> a scatter plot of rating to number of downloads.
	I suspect that the first 3/4/5 ratings have a big impact on the eventual number of downloads.
	-> a scatter plot of the average ratings after 3/4/5 ratings v total downloads.
	-- 1) Write a SQL query that will return the top downloaded 5 resources by subject for each month
	-- since 1st September 2015. Only include Mathematics, English and History.


	with filter_raw_events as (

	select
	date_trunc('month', e.eventTime) eventMonth
	,r.id
	,r.title
	,r.subject
	,count(1) event_count

	from events.eventStream e
	join content.resourceDetails r on e.assetId = r.id
	where 1=1
	and e.assetType = 'resource'
	and e.event = 'download'
	and e.eventTime >= '2015-09-01'::timestamp
	and r.subject in ('Mathematics','English','History')

	group by 1,2,3,4
	), summarise_events as (

	select
	eventMonth
	,r.title
	,r.subject
	,row_number() over (partition by eventMonth,subject order by event_count desc) rank_per_subject_per_month

	from filter_raw_events
	)

	select *
	from summarise_events
	where rank_per_subject_per_month <= 5
	-- 2) We consider active users to be users that have been active in the last 28 days.
	-- Please write a SQL query to show the percentage of users in the dataset considered active, split by week of their first visit.

	with munge_raw_events as (

	select
	date_trunc('week', min(e.eventTime)) firstVisitWeek
	,max(case when trunc(e.eventTime) >= current_date - 28 then 1 end) isActive
	,userId
	from events.eventStream e
	group by userId

	)

	select
	firstVisitWeek
	,100.0 * count(isActive) / sum(count(userId)) over () as percentActive

	from munge_raw_events
	group by firstVisitWeek
	library(tidyverse)
	library(data.table)

	dir = '~/Documents/data_science/TES_interview/48ed0f570043a81687fa0317f8776df1/'

	setwd(dir)
	getwd()

	TesData = 'TESDataScienceData.csv'
	df <- read.csv(TesData)

	summary(df)
	nrow(df)


	#########################
	# DATA EXPLORATION
	#########################

	# Approach: look at every column for evidence of a link to the total number of views
	# the goal is to gain an understanding of how different the total views are when
	# split by a particular field (mean and var).

	# info_gender

	qplot(data = df, x = info_gender, fill = info_gender)
	qplot(data = df, x = info_gender, y = total_views, geom = 'blank') + geom_boxplot()

	df %>%
	filter(total_views < 4000) %>%
	qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6)

	df %>%
	filter(total_views < 4000) %>%
	qplot(data = ., x = total_views, fill = info_gender, geom = 'blank') + geom_histogram(alpha = 0.6) +
	facet_wrap(~info_gender, scales = 'free')

	# -> info_gender does not appear to have much influence on the total number of views




	# advert_start_date / advert_end_date / advert_duration
	df$advert_start_date <- as.Date(df$advert_start_date, '%d/%m/%Y')
	df$advert_end_date <- as.Date(df$advert_end_date, '%d/%m/%Y')
	df$advert_duration <- as.double(df$advert_end_date - df$advert_start_date)

	qplot(data = df, x = advert_start_date, geom = 'blank') + geom_bar()
	qplot(data = df, x = advert_end_date, geom = 'blank') + geom_bar()
	qplot(data = df, x = advert_duration, geom = 'blank') + geom_bar()

	df %>%
	group_by(advert_duration) %>%
	summarise( mn_views = mean(total_views), stdDev_views = sqrt(var(total_views))) %>%
	select(advert_duration, mn_views,stdDev_views) %>%
	# glimpse
	# filter( advert_duration < 40) %>%
	gather(key = variable, value = value, mn_views,stdDev_views) %>%
	qplot( data = ., x = advert_duration, y = value, colour = variable, geom = 'line')
	# clear link between advert duration and total_views

	df %>%
	# filter(advert_duration < 40) %>%
	qplot(data = ., x = advert_duration, y = total_views) +
	geom_boxplot(aes(group = advert_duration)) +
	geom_smooth(method = 'lm', formula = y~ poly(x,2)) +
	geom_smooth(method = 'lm', formula = y~ exp(x)) +
	geom_smooth(method = 'lm', formula = y~ x)

	# advert duration has some impact on total views but less than I expected


	# job_category

	qplot(data = df, x = job_category, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# head of department & teacher have a big 'tail' in total views (not surprising).
	# the rest have reasonably tight boxplots -> job_category might give reasonable prediction


	# workplace

	qplot(data = df, x = workplace, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# apparently useless


	# subject

	qplot(data = df, x = subject, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# some variability -> subject might give reasonable prediction
	# a lot less difference observed than expected.

	# country_group
	qplot(data = df, x = country_group, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# big difference between internation / uk. Expected but probably not that useful ?
	# still can provide good prediction



	# admin_level_2
	data.table(df) %>%
	.[, .N, .(country_group, admin_level_2)]

	df %>%
	filter(country_group != 'International') %>%
	qplot(data = ., x = admin_level_2, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# area does influence the number of views.
	# london etc have long 'tails'. difficult

	# package
	qplot(data = df, x = package, y = total_views, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	# small difference between packages


	# salary_displayed
	qplot(data = df, x = salary_displayed, y = total_views, group = salary_displayed, geom = 'blank') +
	geom_boxplot() +
	theme(axis.text.x = element_text(hjust = 0, angle = -30))
	data.table(df) %>%
	.[, mean(total_views), .(salary_displayed)]
	# definite difference.
	# salary displayed --> lower views !

	# we don't have the school names ... but we do have their ids
	DT = data.table(df)
	DT[, cnt := length(unique(job_id)), school_id]
	length(unique(DT$job_id))
	length(unique(DT$school_id))

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter( cnt > 20 ) %>%
	qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter(cnt < 20 & cnt > 15) %>%
	qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter(cnt < 15 & cnt > 9) %>%
	qplot( data = ., x = school_id, y = total_views, geom = 'blank') + geom_boxplot()

	DT %>%
	group_by(school_id) %>%
	mutate(cnt = n()) %>%
	filter(cnt < 9) %>%
	# qplot( data = ., x = factor(1), y = total_views, geom = 'blank') + geom_boxplot()
	qplot( data = ., x = total_views, geom = 'blank') + geom_histogram()

	# predictable_schools <- factor(DT[, .N, school_id][N >= 10]$school_id)





	set.seed(100)
	rowsForTraining = 0.5
	index <- sample(x = seq(nrow(df)), size = floor(nrow(df) * rowsForTraining))

	train <- df[index,]
	test <- df[-index,]

	train <- df[index,] %>%
	filter(school_id %in% predictable_schools)
	test <- df[-index,] %>%
	filter(school_id %in% predictable_schools)

	linReg <- lm(total_views ~ school_id,data = train)
	summary(linReg) # big impact. big proportion of the variation is explained

	# To make the model somewhat sane / understandable let's not use the school ids directly
	# let's classify the schools by mean view and bucket these into x bands

	# an alternative would be to do a regression per school.




	#########################
	# MODEL
	#########################


	set.seed(100)

	qplot(data= df,x = advert_start_date)
	train <- df %>% filter(advert_start_date < as.Date('2015-03-01'))
	test <- df %>% filter(advert_start_date >= as.Date('2015-03-01'))


	DT <- data.table(train)
	school_jobs = DT[, .(job_cnt = .N), school_id] %>% filter(job_cnt > 3)

	train <- merge(train,school_jobs, by = 'school_id', all.x = T) %>%
	data.table
	train[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
	train[!is.na(job_cnt), school_for_reg := school_id]
	train[,.N, school_for_reg]

	linReg <- lm( total_views ~ school_for_reg + job_category + advert_duration + subject + country_group + admin_level_2 + salary_displayed
	,data = train)
	summary(linReg)


	#########################
	#
	#########################


	test <- merge(test,school_jobs, by = 'school_id', all.x = T)
	test <- data.table(test)
	test[is.na(job_cnt), school_for_reg := 'few_jobs_logged']
	test[!is.na(job_cnt), school_for_reg := school_id]
	test <- data.frame(test)

	pred <- predict.lm(linReg,test)
	err <- abs(test['total_views'] - pred)$total_views
	mean(err)