karthik · November 21, 2012 17:44
diff --git a/plyr_bug.R b/plyr_bug.R
 # Reproduction script for a plyr bug report: ldply() stalls at 100% when
 # combining a long list of small data frames. A chunked workaround follows.
 #
 # Download and open this data file, size = 1.4mb
 # https://dl.dropbox.com/u/2223411/results_list.rda
 library(plyr)  # provides ldply()
 load('results_list.rda')  # expected to define the `results` list used below

 class(results)
 # [1] "list"

 length(results)
 # [1] 72860

 results[1]
 # $SPEC5
 #   sites      means
 # 1    S4 0.06795411
 # 2    S5 0.07508747

 # I'm trying to condense this into a data.frame. plyr's ldply is key because
 # otherwise I would lose the simulation ids and cannot match these results
 # back to the original parameters.

 # NOTE: the next two calls are the reproduction itself; as reported below,
 # they may never return and have to be interrupted to reach the workaround.
 results_df <- ldply(results, data.frame, .progress = 'text')
 # freezes at 100%

 # ---------------------------------
 # This is the plyr bug.
 result_df <- ldply(results, rbind, .progress = 'text')
 # Stops at 100% and does not actually finish (I usually kill it after seeing
 # it stuck at 100% for several minutes). I see this behavior both on my
 # dual-core MacBook Pro and on my large cluster.

 # My workaround: run ldply() on chunks of the list and row-bind the pieces.
 # Chunk boundaries are derived from length(results) instead of being
 # hard-coded, so this also works for lists shorter than 60001 elements
 # (where `60001:length(results)` would count downward).
 chunk_size <- 20000  # the chunk size that completed in the original report
 chunk_id <- ceiling(seq_along(results) / chunk_size)
 chunks <- lapply(
   split(results, chunk_id),
   function(chunk) ldply(chunk, rbind, .progress = 'text')
 )
 # unname() keeps rbind() from prefixing row names with the chunk labels.
 result_df <- do.call(rbind, unname(chunks))
 dim(result_df)
 # [1] 145720      3
	# Reproduction script for a plyr bug report: ldply() stalls at 100% when
	# combining a long list of small data frames. A chunked workaround follows.
	#
	# Download and open this data file, size = 1.4mb
	# https://dl.dropbox.com/u/2223411/results_list.rda
	library(plyr)  # provides ldply()
	load('results_list.rda')  # expected to define the `results` list used below

	class(results)
	# [1] "list"

	length(results)
	# [1] 72860

	results[1]
	# $SPEC5
	#   sites      means
	# 1    S4 0.06795411
	# 2    S5 0.07508747

	# I'm trying to condense this into a data.frame. plyr's ldply is key because
	# otherwise I would lose the simulation ids and cannot match these results
	# back to the original parameters.

	# NOTE: the next two calls are the reproduction itself; as reported below,
	# they may never return and have to be interrupted to reach the workaround.
	results_df <- ldply(results, data.frame, .progress = 'text')
	# freezes at 100%

	# ---------------------------------
	# This is the plyr bug.
	result_df <- ldply(results, rbind, .progress = 'text')
	# Stops at 100% and does not actually finish (I usually kill it after seeing
	# it stuck at 100% for several minutes). I see this behavior both on my
	# dual-core MacBook Pro and on my large cluster.

	# My workaround: run ldply() on chunks of the list and row-bind the pieces.
	# Chunk boundaries are derived from length(results) instead of being
	# hard-coded, so this also works for lists shorter than 60001 elements
	# (where `60001:length(results)` would count downward).
	chunk_size <- 20000  # the chunk size that completed in the original report
	chunk_id <- ceiling(seq_along(results) / chunk_size)
	chunks <- lapply(
	  split(results, chunk_id),
	  function(chunk) ldply(chunk, rbind, .progress = 'text')
	)
	# unname() keeps rbind() from prefixing row names with the chunk labels.
	result_df <- do.call(rbind, unname(chunks))
	dim(result_df)
	# [1] 145720      3