cbare · October 24, 2013 22:37
diff --git a/clean_participant_data.R b/clean_participant_data.R
 ## A script to read participant data out of Synapse and
 ## (help) clean it up.
 ##
 ## J. Christopher Bare
 ## chris.bare@sagebase.org
 ## Oct. 24, 2013
 ############################################################

 ## fetch the two evaluation objects by their Synapse ids
 ev1 <- synGetEvaluation(1917695)
 ev2 <- synGetEvaluation(1917696)

 ## list the participants of each evaluation (limit=10000)
 par1 <- synGetParticipants(ev1@properties$id, limit=10000)
 par2 <- synGetParticipants(ev2@properties$id, limit=10000)

 ## user ids of everyone registered for the first evaluation
 user_ids_1 <- sapply(par1@results, function(p) p$userId)

 ## ids registered for the first evaluation but not the second;
 ## the original author notes the only discrepancy is himself,
 ## and looks that profile up right after
 setdiff(
     user_ids_1,
     sapply(par2@results, function(p) p$userId)
 )
 synGetUserProfile(377358)

 ## fetch one user profile object per participant id
 users <- lapply(user_ids_1, synGetUserProfile)

 ## helper for empty slots: a zero-length value (e.g. NULL) becomes NA,
 ## anything else is passed through unchanged
 nonempty <- function(x) {
     if (length(x) > 0) {
         return(x)
     }
     NA
 }

 ## build a data.frame with one row per user profile
 ##
 ## Every field goes through nonempty() so that a profile lacking any
 ## of them still yields a complete one-row data.frame with NA in the
 ## missing slot; a zero-length field would otherwise break the row
 ## and the rbind() below. stringsAsFactors=FALSE keeps the columns as
 ## plain character vectors, so rows combine without any factor-level
 ## bookkeeping.
 user_data <- do.call(rbind, lapply(users, function(user) {
     data.frame(
         id=nonempty(user$ownerId),
         displayName=nonempty(user$displayName),
         firstName=nonempty(user$firstName),
         lastName=nonempty(user$lastName),
         email=nonempty(user$email),
         position=nonempty(user$position),
         industry=nonempty(user$industry),
         organization=nonempty(user$company),
         location=nonempty(user$location),
         team=nonempty(user$teamName),
         stringsAsFactors=FALSE
     )
 }))

 ## write the assembled table to CSV, without a row-name column
 write.csv(user_data, file='tox_challenge_user_data.csv', row.names=FALSE)

 ## insert lots of manual scrubbing in the text editor here
 ## write.csv(participants, file='Desktop/tox_challenge_participants_scrubbed.csv', row.names=F)

 ## we now have nicely regularized locations
 participants <- read.csv('Desktop/tox_challenge_participants_scrubbed.csv', header=TRUE, stringsAsFactors=FALSE)

 ## split into country, city and, for US locations, state
 ##
 ## Locations are split on commas. as.character() guards against the
 ## column having been read as non-character (e.g. all NA), which
 ## strsplit() would reject.
 split_locations <- strsplit(as.character(participants$Location.inferred), ", *")

 ## A blank location splits to character(0); each extractor therefore
 ## returns NA_character_ explicitly, and vapply() enforces exactly one
 ## string per row so the new columns can never silently become lists.

 ## country: the last component
 participants$country <- vapply(split_locations, function(loc) {
     if (length(loc) > 0) loc[length(loc)] else NA_character_
 }, character(1))

 ## city: the first component, when there is more than one
 participants$city <- vapply(split_locations, function(loc) {
     if (length(loc) > 1) loc[1] else NA_character_
 }, character(1))

 ## state: the middle component of a three-part location ending in 'USA'
 participants$state <- vapply(split_locations, function(loc) {
     if (length(loc) == 3 && loc[3] == 'USA') loc[2] else NA_character_
 }, character(1))

 ## what an international crew we have!
 table(participants$country, useNA='ifany')
	## A script to read participant data out of Synapse and
	## (help) clean it up.
	##
	## J. Christopher Bare
	## chris.bare@sagebase.org
	## Oct. 24, 2013
	############################################################

	## fetch the two evaluation objects by their Synapse ids
	ev1 <- synGetEvaluation(1917695)
	ev2 <- synGetEvaluation(1917696)

	## list the participants of each evaluation (limit=10000)
	par1 <- synGetParticipants(ev1@properties$id, limit=10000)
	par2 <- synGetParticipants(ev2@properties$id, limit=10000)

	## user ids of everyone registered for the first evaluation
	user_ids_1 <- sapply(par1@results, function(p) p$userId)

	## ids registered for the first evaluation but not the second;
	## the original author notes the only discrepancy is himself,
	## and looks that profile up right after
	setdiff(
	    user_ids_1,
	    sapply(par2@results, function(p) p$userId)
	)
	synGetUserProfile(377358)

	## fetch one user profile object per participant id
	users <- lapply(user_ids_1, synGetUserProfile)

	## helper for empty slots: a zero-length value (e.g. NULL) becomes NA,
	## anything else is passed through unchanged
	nonempty <- function(x) {
	    if (length(x) > 0) {
	        return(x)
	    }
	    NA
	}

	## build a data.frame with one row per user profile
	##
	## Every field goes through nonempty() so that a profile lacking any
	## of them still yields a complete one-row data.frame with NA in the
	## missing slot; a zero-length field would otherwise break the row
	## and the rbind() below. stringsAsFactors=FALSE keeps the columns as
	## plain character vectors, so rows combine without any factor-level
	## bookkeeping.
	user_data <- do.call(rbind, lapply(users, function(user) {
	    data.frame(
	        id=nonempty(user$ownerId),
	        displayName=nonempty(user$displayName),
	        firstName=nonempty(user$firstName),
	        lastName=nonempty(user$lastName),
	        email=nonempty(user$email),
	        position=nonempty(user$position),
	        industry=nonempty(user$industry),
	        organization=nonempty(user$company),
	        location=nonempty(user$location),
	        team=nonempty(user$teamName),
	        stringsAsFactors=FALSE
	    )
	}))

	## write the assembled table to CSV, without a row-name column
	write.csv(user_data, file='tox_challenge_user_data.csv', row.names=FALSE)

	## insert lots of manual scrubbing in the text editor here
	## write.csv(participants, file='Desktop/tox_challenge_participants_scrubbed.csv', row.names=F)

	## we now have nicely regularized locations
	participants <- read.csv('Desktop/tox_challenge_participants_scrubbed.csv', header=TRUE, stringsAsFactors=FALSE)

	## split into country, city and, for US locations, state
	##
	## Locations are split on commas. as.character() guards against the
	## column having been read as non-character (e.g. all NA), which
	## strsplit() would reject.
	split_locations <- strsplit(as.character(participants$Location.inferred), ", *")

	## A blank location splits to character(0); each extractor therefore
	## returns NA_character_ explicitly, and vapply() enforces exactly one
	## string per row so the new columns can never silently become lists.

	## country: the last component
	participants$country <- vapply(split_locations, function(loc) {
	    if (length(loc) > 0) loc[length(loc)] else NA_character_
	}, character(1))

	## city: the first component, when there is more than one
	participants$city <- vapply(split_locations, function(loc) {
	    if (length(loc) > 1) loc[1] else NA_character_
	}, character(1))

	## state: the middle component of a three-part location ending in 'USA'
	participants$state <- vapply(split_locations, function(loc) {
	    if (length(loc) == 3 && loc[3] == 'USA') loc[2] else NA_character_
	}, character(1))

	## what an international crew we have!
	table(participants$country, useNA='ifany')
No results found