Xachriel · December 15, 2015 08:29
diff --git a/a_merging_bench.R b/a_merging_bench.R
 #Using 64bit R 2.15.3
 #Packages we are going to use

 library(rjson)
 library(data.table)
 library(rbenchmark)

 # The working directory must contain the unpacked dump (or give a full path).
 # Only the first 3001 lines are read for this test; read more if you wish.
 eve <- readLines("kms-2013-01-06-s.json", n = 3001)

 # Keep the raw lines as a one-column matrix.
 eve <- matrix(eve, ncol = 1)

 # The first and last lines of the file are useless. Only the first one has to
 # be skipped here, because the last line is not reached when reading 3001
 # lines; when reading the whole file, drop the last line as well.
 # Every remaining line is parsed as one JSON document.
 eve.list <- lapply(eve[-1], fromJSON)

 # Flatten a (possibly nested) list into a single-row matrix.
 #
 # Args:
 #   eve.list: a list; nested elements are flattened with unlist(), so the
 #     column names are the names unlist() generates.
 #
 # Returns:
 #   A matrix with one row and one column per distinct name. Values that share
 #   a name are pasted into one comma-separated string stored under that name.
 #   Note that, despite the function's name, this is a matrix, not a data.frame.
 eve.data.frame <- function(eve.list) {
   flat <- unlist(eve.list)
   repeated <- unique(names(flat)[duplicated(names(flat))])

   if (length(repeated) > 0) {
     for (key in repeated) {
       same_key <- names(flat) %in% key
       # Assigning by name overwrites the first entry carrying that name.
       flat[key] <- paste(flat[same_key], collapse = ",")
     }
     # Only the first (now combined) entry of every name is kept.
     flat <- flat[!duplicated(names(flat))]
   }

   as.matrix(t(flat))
 }


 # Flatten every parsed record; this is slow when the whole dump was read.
 eve.frames <- lapply(eve.list, eve.data.frame)

 # Peek at two of the flattened records.
 eve.frames[[1]]
 eve.frames[[100]]

 # How many variables does each data line carry?
 N.variables <- sapply(eve.frames, ncol)
 table(N.variables)

 # data.table versions of the same records, used by the data.table benchmarks.
 # Open question from the author: is this conversion correct?
 eve.data.table <- lapply(eve.frames, as.data.table)

 #The functions used

 # Merge all tables of a list into one with a plain for loop.
 #
 # Args:
 #   data.list: non-empty list of objects that merge() accepts.
 #
 # Returns:
 #   The full outer join (all = TRUE, rows left unsorted) of all elements,
 #   merged from left to right. A one-element list yields that element as is.
 core.forLoop <- function(data.list) {
   if (length(data.list) == 0) stop("data.list is empty")

   complete <- data.list[[1]]
   # seq_along(...)[-1] is empty for a one-element list, whereas
   # 2:length(data.list) would count down (2, 1) and index past the end.
   for (i in seq_along(data.list)[-1]) {
     complete <- merge(complete, data.list[[i]],
                       all = TRUE,
                       sort = FALSE)
   }
   complete
 }

 # Merge all tables of a list into one with a plain for loop, joining each
 # pair explicitly on the columns they have in common.
 #
 # Args:
 #   data.list: non-empty list of tables that merge() accepts and whose
 #     column names are available through names().
 #
 # Returns:
 #   The full outer join (all = TRUE, rows left unsorted) of all elements,
 #   merged from left to right. A one-element list yields that element as is.
 data.table.forLoop <- function(data.list){
   if (length(data.list) == 0) stop("data.list is empty")

   complete <- data.list[[1]]
   # seq_along(...)[-1] is empty for a one-element list, whereas
   # 2:length(data.list) would count down (2, 1) and index past the end.
   for (i in seq_along(data.list)[-1]) {
     part <- data.list[[i]]
     complete <- merge(complete, part,
                       by = intersect(names(complete), names(part)),
                       all = TRUE,
                       sort = FALSE)
   }
   complete
 }

 # Merge all tables of a list into one by folding merge() over the list.
 #
 # Args:
 #   data.list: list of objects that merge() accepts.
 #
 # Returns:
 #   The full outer join (all = TRUE, rows left unsorted) of all elements,
 #   merged from left to right.
 core.Reduce.merge <- function(data.list) {
   outer_join <- function(left, right) {
     merge(left, right, all = TRUE, sort = FALSE)
   }
   Reduce(outer_join, data.list)
 }

 # Merge all tables of a list into one by folding merge() over the list,
 # joining each pair explicitly on the columns they have in common.
 #
 # Args:
 #   data.list: list of tables that merge() accepts and whose column names
 #     are available through names().
 #
 # Returns:
 #   The full outer join (all = TRUE, rows left unsorted) of all elements,
 #   merged from left to right.
 data.table.Reduce.merge <- function(data.list) {
   outer_join <- function(left, right) {
     shared <- intersect(names(left), names(right))
     merge(left, right, by = shared, all = TRUE, sort = FALSE)
   }
   Reduce(outer_join, data.list)
 }


 # Merge a list of tables in rounds: consecutive groups of at most `nparts`
 # tables are merged first, then the group results are merged the same way,
 # until a single table is left. Each individual merge is a full outer join
 # (all = TRUE, rows left unsorted).
 #
 # Args:
 #   data.list: non-empty list of objects that merge() accepts.
 #   nparts: maximum number of tables merged per group; a single number that
 #     is at least 2 (with groups of one table no round could shrink the list).
 #
 # Returns:
 #   The merged table. A one-element list yields that element as is.
 fast.merging <- function(data.list, nparts) {
   if (!is.list(data.list)) stop("data.list isn't a list")
   if (length(data.list) == 0) stop("data.list is empty")
   if (!is.numeric(nparts) || length(nparts) != 1 || is.na(nparts) ||
       nparts < 2) {
     stop("nparts must be a single number >= 2")
   }

   while (length(data.list) > 1) { # Loop until everything is merged
     # Consecutive index groups: 1..nparts, (nparts + 1)..(2 * nparts), ...
     # The last group simply holds whatever is left over.
     positions <- seq_along(data.list)
     groups <- unname(split(positions, ceiling(positions / nparts)))

     data.list <- lapply(groups, function(idx) {
       # Start from the group's first table and merge in only the remaining
       # ones; merging the first table with itself would multiply any
       # duplicated rows and cost an extra merge per group.
       part <- data.list[[idx[1]]]
       for (i in idx[-1]) {
         part <- merge(part, data.list[[i]], all = TRUE, sort = FALSE)
       }
       part
     })
   }

   data.list[[1]] # the merged table
 }


 #This will take a while, go to lunch or something like that.
 #Times the five merging functions defined above on the same records, with
 #replications=5 for every expression. The data.table variants are given the
 #data.table copies (eve.data.table); all other variants are given eve.frames.
 #fast.merging is run with groups of 10 tables.

 benchmark(core.forLoop(eve.frames), 
 		  data.table.forLoop(eve.data.table),
 		  core.Reduce.merge(eve.frames),
 		  data.table.Reduce.merge(eve.data.table),
 		  fast.merging(eve.frames, 10),
 		  replications=5)   

 		  
 #                                     test replications elapsed relative user.self sys.self user.child sys.child
 #1                core.forLoop(eve.frames)            5 1398.29    4.932   1393.98     0.06         NA        NA
 #3           core.Reduce.merge(eve.frames)            5 1405.77    4.958   1401.02     0.31         NA        NA
 #2      data.table.forLoop(eve.data.table)            5  797.76    2.814    794.61     0.09         NA        NA
 #4 data.table.Reduce.merge(eve.data.table)            5  801.93    2.829    799.91     0.16         NA        NA
 #5            fast.merging(eve.frames, 10)            5  283.51    1.000    282.95     0.00         NA        NA

 #If you are going to check whether the results are the same, remember to sort the rows and columns first,
 #basically data[order(data[, 1]), order(colnames(data))] for all data.
	#Using 64bit R 2.15.3
	#Packages we are going to use

	library(rjson)
	library(data.table)
	library(rbenchmark)

	# The working directory must contain the unpacked dump (or give a full path).
	# Only the first 3001 lines are read for this test; read more if you wish.
	eve <- readLines("kms-2013-01-06-s.json", n = 3001)

	# Keep the raw lines as a one-column matrix.
	eve <- matrix(eve, ncol = 1)

	# The first and last lines of the file are useless. Only the first one has to
	# be skipped here, because the last line is not reached when reading 3001
	# lines; when reading the whole file, drop the last line as well.
	# Every remaining line is parsed as one JSON document.
	eve.list <- lapply(eve[-1], fromJSON)

	# Flatten a (possibly nested) list into a single-row matrix.
	#
	# Args:
	#   eve.list: a list; nested elements are flattened with unlist(), so the
	#     column names are the names unlist() generates.
	#
	# Returns:
	#   A matrix with one row and one column per distinct name. Values that share
	#   a name are pasted into one comma-separated string stored under that name.
	#   Note that, despite the function's name, this is a matrix, not a data.frame.
	eve.data.frame <- function(eve.list) {
	  flat <- unlist(eve.list)
	  repeated <- unique(names(flat)[duplicated(names(flat))])

	  if (length(repeated) > 0) {
	    for (key in repeated) {
	      same_key <- names(flat) %in% key
	      # Assigning by name overwrites the first entry carrying that name.
	      flat[key] <- paste(flat[same_key], collapse = ",")
	    }
	    # Only the first (now combined) entry of every name is kept.
	    flat <- flat[!duplicated(names(flat))]
	  }

	  as.matrix(t(flat))
	}


	# Flatten every parsed record; this is slow when the whole dump was read.
	eve.frames <- lapply(eve.list, eve.data.frame)

	# Peek at two of the flattened records.
	eve.frames[[1]]
	eve.frames[[100]]

	# How many variables does each data line carry?
	N.variables <- sapply(eve.frames, ncol)
	table(N.variables)

	# data.table versions of the same records, used by the data.table benchmarks.
	# Open question from the author: is this conversion correct?
	eve.data.table <- lapply(eve.frames, as.data.table)

	#The functions used

	# Merge all tables of a list into one with a plain for loop.
	#
	# Args:
	#   data.list: non-empty list of objects that merge() accepts.
	#
	# Returns:
	#   The full outer join (all = TRUE, rows left unsorted) of all elements,
	#   merged from left to right. A one-element list yields that element as is.
	core.forLoop <- function(data.list) {
	  if (length(data.list) == 0) stop("data.list is empty")

	  complete <- data.list[[1]]
	  # seq_along(...)[-1] is empty for a one-element list, whereas
	  # 2:length(data.list) would count down (2, 1) and index past the end.
	  for (i in seq_along(data.list)[-1]) {
	    complete <- merge(complete, data.list[[i]],
	                      all = TRUE,
	                      sort = FALSE)
	  }
	  complete
	}

	# Merge all tables of a list into one with a plain for loop, joining each
	# pair explicitly on the columns they have in common.
	#
	# Args:
	#   data.list: non-empty list of tables that merge() accepts and whose
	#     column names are available through names().
	#
	# Returns:
	#   The full outer join (all = TRUE, rows left unsorted) of all elements,
	#   merged from left to right. A one-element list yields that element as is.
	data.table.forLoop <- function(data.list){
	  if (length(data.list) == 0) stop("data.list is empty")

	  complete <- data.list[[1]]
	  # seq_along(...)[-1] is empty for a one-element list, whereas
	  # 2:length(data.list) would count down (2, 1) and index past the end.
	  for (i in seq_along(data.list)[-1]) {
	    part <- data.list[[i]]
	    complete <- merge(complete, part,
	                      by = intersect(names(complete), names(part)),
	                      all = TRUE,
	                      sort = FALSE)
	  }
	  complete
	}

	# Merge all tables of a list into one by folding merge() over the list.
	#
	# Args:
	#   data.list: list of objects that merge() accepts.
	#
	# Returns:
	#   The full outer join (all = TRUE, rows left unsorted) of all elements,
	#   merged from left to right.
	core.Reduce.merge <- function(data.list) {
	  outer_join <- function(left, right) {
	    merge(left, right, all = TRUE, sort = FALSE)
	  }
	  Reduce(outer_join, data.list)
	}

	# Merge all tables of a list into one by folding merge() over the list,
	# joining each pair explicitly on the columns they have in common.
	#
	# Args:
	#   data.list: list of tables that merge() accepts and whose column names
	#     are available through names().
	#
	# Returns:
	#   The full outer join (all = TRUE, rows left unsorted) of all elements,
	#   merged from left to right.
	data.table.Reduce.merge <- function(data.list) {
	  outer_join <- function(left, right) {
	    shared <- intersect(names(left), names(right))
	    merge(left, right, by = shared, all = TRUE, sort = FALSE)
	  }
	  Reduce(outer_join, data.list)
	}


	# Merge a list of tables in rounds: consecutive groups of at most `nparts`
	# tables are merged first, then the group results are merged the same way,
	# until a single table is left. Each individual merge is a full outer join
	# (all = TRUE, rows left unsorted).
	#
	# Args:
	#   data.list: non-empty list of objects that merge() accepts.
	#   nparts: maximum number of tables merged per group; a single number that
	#     is at least 2 (with groups of one table no round could shrink the list).
	#
	# Returns:
	#   The merged table. A one-element list yields that element as is.
	fast.merging <- function(data.list, nparts) {
	  if (!is.list(data.list)) stop("data.list isn't a list")
	  if (length(data.list) == 0) stop("data.list is empty")
	  if (!is.numeric(nparts) || length(nparts) != 1 || is.na(nparts) ||
	      nparts < 2) {
	    stop("nparts must be a single number >= 2")
	  }

	  while (length(data.list) > 1) { # Loop until everything is merged
	    # Consecutive index groups: 1..nparts, (nparts + 1)..(2 * nparts), ...
	    # The last group simply holds whatever is left over.
	    positions <- seq_along(data.list)
	    groups <- unname(split(positions, ceiling(positions / nparts)))

	    data.list <- lapply(groups, function(idx) {
	      # Start from the group's first table and merge in only the remaining
	      # ones; merging the first table with itself would multiply any
	      # duplicated rows and cost an extra merge per group.
	      part <- data.list[[idx[1]]]
	      for (i in idx[-1]) {
	        part <- merge(part, data.list[[i]], all = TRUE, sort = FALSE)
	      }
	      part
	    })
	  }

	  data.list[[1]] # the merged table
	}


	#This will take a while, go to lunch or something like that.
	#Times the five merging functions defined above on the same records, with
	#replications=5 for every expression. The data.table variants are given the
	#data.table copies (eve.data.table); all other variants are given eve.frames.
	#fast.merging is run with groups of 10 tables.

	benchmark(core.forLoop(eve.frames),
	data.table.forLoop(eve.data.table),
	core.Reduce.merge(eve.frames),
	data.table.Reduce.merge(eve.data.table),
	fast.merging(eve.frames, 10),
	replications=5)


	# test replications elapsed relative user.self sys.self user.child sys.child
	#1 core.forLoop(eve.frames) 5 1398.29 4.932 1393.98 0.06 NA NA
	#3 core.Reduce.merge(eve.frames) 5 1405.77 4.958 1401.02 0.31 NA NA
	#2 data.table.forLoop(eve.data.table) 5 797.76 2.814 794.61 0.09 NA NA
	#4 data.table.Reduce.merge(eve.data.table) 5 801.93 2.829 799.91 0.16 NA NA
	#5 fast.merging(eve.frames, 10) 5 283.51 1.000 282.95 0.00 NA NA

	#If you are going to check whether the results are the same, remember to sort the rows and columns first,
	#basically data[order(data[, 1]), order(colnames(data))] for all data.