Last active
December 15, 2015 08:29
-
-
Save Xachriel/5230961 to your computer and use it in GitHub Desktop.
a merging bench
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Using 64bit R 2.15.3 | |
#Packages we are going to use | |
library(rjson) | |
library(data.table) | |
library(rbenchmark) | |
#Set your working directory to the folder holding the unpacked file, or add the path to the file name below.
#Only part of the data is read for this test; you can naturally read as much as you wish.
eve <- readLines("kms-2013-01-06-s.json", n = 3001)
# Keep the raw text as a one-column character matrix: one JSON line per row.
eve <- matrix(eve, ncol = 1)
# Original author's note: the first and last lines of the file are useless.
# The first one is skipped here; the last one is not among the lines read
# in this case, so nothing has to be dropped at the end.
# Every remaining line is parsed on its own as a JSON document.
eve.list <- lapply(eve[-1, 1], fromJSON)
#Handling data.frame | |
# Flatten one parsed record into a one-row matrix.
#
# eve.list: a (possibly nested) list; it is flattened with unlist().
# Returns:  a matrix with a single row and one column per distinct name of
#           the flattened vector. Values that share a name are pasted
#           together, comma separated, in the position of the first one.
eve.data.frame <- function(eve.list) {
  flat <- unlist(eve.list)
  repeated <- unique(names(flat)[duplicated(names(flat))])

  # Nothing to collapse: hand back the flattened vector as a single row.
  if (length(repeated) == 0) {
    return(as.matrix(t(flat)))
  }

  for (key in repeated) {
    # Assigning by name writes to the first element carrying that name.
    flat[key] <- paste(flat[names(flat) %in% key], collapse = ",")
  }

  # Drop the later occurrences now that the first one holds every value.
  flat <- flat[!duplicated(names(flat))]
  as.matrix(t(flat))
}
# Flatten every parsed record into a one-row matrix (see eve.data.frame).
# This takes a while if you read the whole data.
eve.frames <- lapply(eve.list, eve.data.frame)
# For example (printed for inspection)
eve.frames[[1]]
eve.frames[[100]]
# The number of variables in each data line.
# vapply() always yields an integer vector, whereas sapply() would return an
# empty list if eve.frames were empty.
N.variables <- vapply(eve.frames, ncol, integer(1))
table(N.variables)
# Converting the one-row matrices to data.tables.
# as.data.table() has a matrix method, so each element becomes a one-row
# data.table with one column per variable.
eve.data.table <- lapply(eve.frames, as.data.table)
#The functions used | |
# Outer-merge every element of a list, one after another, with merge().
#
# data.list: a non-empty list of objects merge() accepts (data frames, or
#            matrices, which merge() coerces to data frames).
# Returns:   the accumulated merge (all = TRUE, sort = FALSE) of all
#            elements; a single-element list returns that element unchanged.
# Errors:    stops when data.list is empty.
core.forLoop <- function(data.list) {
  if (length(data.list) == 0) stop("data.list is empty")
  complete <- data.list[[1]]
  # seq_along(...)[-1] is empty for a one-element list, whereas
  # 2:length(data.list) would count down c(2, 1) and index out of bounds.
  for (i in seq_along(data.list)[-1]) {
    complete <- merge(complete, data.list[[i]],
                      all = TRUE,
                      sort = FALSE)
  }
  complete
}
# Outer-merge every element of a list, one after another, joining each pair
# on the column names they share.
#
# data.list: a non-empty list of tables with names() (the script passes
#            data.tables).
# Returns:   the accumulated merge (all = TRUE, sort = FALSE) of all
#            elements; a single-element list returns that element unchanged.
# Errors:    stops when data.list is empty.
data.table.forLoop <- function(data.list) {
  if (length(data.list) == 0) stop("data.list is empty")
  complete <- data.list[[1]]
  # seq_along(...)[-1] is empty for a one-element list, whereas
  # 2:length(data.list) would count down c(2, 1) and index out of bounds.
  for (i in seq_along(data.list)[-1]) {
    part <- data.list[[i]]
    # The join columns are given explicitly as the names both tables share.
    complete <- merge(complete, part,
                      by = intersect(names(complete), names(part)),
                      all = TRUE,
                      sort = FALSE)
  }
  complete
}
# Outer-merge every element of a list by folding merge() over it.
#
# data.list: a list of objects merge() accepts.
# Returns:   the left-to-right fold of merge(all = TRUE, sort = FALSE);
#            Reduce() gives NULL for an empty list and the element itself
#            for a one-element list.
core.Reduce.merge <- function(data.list) {
  merge.pair <- function(merged, incoming) {
    merge(merged, incoming, all = TRUE, sort = FALSE)
  }
  Reduce(merge.pair, data.list)
}
# Outer-merge every element of a list by folding merge() over it, joining
# each pair on the column names they share.
#
# data.list: a list of tables with names() (the script passes data.tables).
# Returns:   the left-to-right fold of merge(all = TRUE, sort = FALSE);
#            Reduce() gives NULL for an empty list and the element itself
#            for a one-element list.
data.table.Reduce.merge <- function(data.list) {
  merge.pair <- function(merged, incoming) {
    shared <- intersect(names(merged), names(incoming))
    merge(merged, incoming, by = shared, all = TRUE, sort = FALSE)
  }
  Reduce(merge.pair, data.list)
}
# Outer-merge a list of tables in chunks instead of one long chain.
#
# Each pass splits data.list into consecutive chunks of at most nparts
# elements, merges every chunk sequentially, and replaces data.list with the
# chunk results. Passes repeat until a single table is left.
#
# data.list: a non-empty list of objects merge() accepts.
# nparts:    chunk size, a single number >= 2 (fractions are truncated).
#            A value of 1 would never shrink the list.
# Returns:   the merge (all = TRUE, sort = FALSE) of all elements; a
#            single-element list returns that element unchanged.
# Errors:    stops when data.list is not a list, is empty, or nparts is
#            not a single number >= 2.
fast.merging <- function(data.list, nparts) {
  if (!is.list(data.list)) stop("data.list isn't a list")
  if (length(data.list) == 0) stop("data.list is empty")
  if (!is.numeric(nparts) || length(nparts) != 1 || is.na(nparts) ||
      nparts < 2) {
    stop("nparts must be a single number >= 2")
  }
  nparts <- as.integer(nparts)

  while (length(data.list) > 1) { # Loop until everything is merged
    n <- length(data.list)
    starts <- seq(1, n, by = nparts)
    # The last chunk is shorter whenever nparts does not divide n.
    ends <- pmin(starts + nparts - 1L, n)

    data.list <- lapply(seq_along(starts), function(k) {
      idx <- starts[k]:ends[k] # starts[k] <= ends[k] always holds
      part <- data.list[[idx[1]]]
      # Start from the second element of the chunk: merging the first one
      # with itself is wasted work and multiplies duplicated rows.
      for (i in idx[-1]) {
        part <- merge(part, data.list[[i]], all = TRUE, sort = FALSE)
      }
      part
    })
  }
  data.list[[1]] # the merged table
}
# Time the five merging strategies, five replications each.
# This will take a while, go to lunch or something like that.
benchmark(
  core.forLoop(eve.frames),
  data.table.forLoop(eve.data.table),
  core.Reduce.merge(eve.frames),
  data.table.Reduce.merge(eve.data.table),
  fast.merging(eve.frames, 10),
  replications = 5
)
# test replications elapsed relative user.self sys.self user.child sys.child | |
#1 core.forLoop(eve.frames) 5 1398.29 4.932 1393.98 0.06 NA NA | |
#3 core.Reduce.merge(eve.frames) 5 1405.77 4.958 1401.02 0.31 NA NA | |
#2 data.table.forLoop(eve.data.table) 5 797.76 2.814 794.61 0.09 NA NA | |
#4 data.table.Reduce.merge(eve.data.table) 5 801.93 2.829 799.91 0.16 NA NA | |
#5 fast.merging(eve.frames, 10) 5 283.51 1.000 282.95 0.00 NA NA | |
#If you are going to check whether the results are the same, remember to sort your rows and columns first,
#basically data[order(data[, 1]), order(colnames(data))] for every result.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment