Skip to content

Instantly share code, notes, and snippets.

@Xachriel
Last active December 15, 2015 08:29
Show Gist options
  • Save Xachriel/5230961 to your computer and use it in GitHub Desktop.
Save Xachriel/5230961 to your computer and use it in GitHub Desktop.
a merging bench
#Using 64bit R 2.15.3
#Packages we are going to use
library(rjson)
library(data.table)
library(rbenchmark)
# Input: the unpacked dump is expected in the working directory (otherwise
# put the full path in the file name below). Only the first 3001 lines are
# read for this test; change `n` to read more.
eve <- matrix(readLines("kms-2013-01-06-s.json", n = 3001), ncol = 1)
# One single-element list per line. The first line is not a data record, so
# it is dropped right away.
eve.list <- apply(eve, 1, list)[-1]
# The last line of the file is not a data record either; it was not read in
# this case, so the following stays disabled:
# eve.list <- eve.list[-length(eve.list)]
# json => R objects, one per data line
eve.list <- lapply(eve.list, function(wrapped) fromJSON(unlist(wrapped)))
#Handling data.frame
# Flatten one parsed record into a one-row matrix.
#
# eve.list: a (possibly nested) list; unlist() supplies the element names
#           that become the column names.
# Returns a 1 x n matrix. Values whose flattened names collide are pasted
# together with "," and stored once, at the position of the first
# occurrence of that name.
eve.data.frame <- function(eve.list) {
  flat <- unlist(eve.list)
  repeated <- unique(names(flat)[duplicated(names(flat))])
  for (field in repeated) {
    same.name <- names(flat) %in% field
    # Assigning by name overwrites the first element carrying that name.
    flat[field] <- paste(flat[same.name], collapse = ",")
  }
  if (length(repeated) > 0) {
    # Drop the later occurrences; the first one now holds the joined value.
    flat <- flat[!duplicated(names(flat))]
  }
  as.matrix(t(flat))
}
# Flatten every parsed record into a one-row matrix
# (slow when the whole data file has been read).
eve.frames <- lapply(eve.list, function(record) eve.data.frame(record))
# Two of the results, for example
eve.frames[[1]]
eve.frames[[100]]
# How many variables does each data line carry?
N.variables <- sapply(eve.frames, function(m) dim(m)[2L])
table(N.variables)
# data.table copies of the same one-row tables, used by the data.table merges
eve.data.table <- lapply(eve.frames, function(m) as.data.table(m))
#Is this correct?
#The functions used
# Merge a list of tables into one with a plain for loop.
#
# Each element is merged onto the running result with merge(all = TRUE,
# sort = FALSE), i.e. rows without a match on either side are kept and the
# result is not sorted on the merge columns. merge() picks the join columns
# itself.
#
# data.list: non-empty list of objects that merge() accepts.
# Returns the merged table. A one-element list yields that element unchanged.
# Stops with an error when data.list is empty.
core.forLoop <- function(data.list) {
  if (length(data.list) == 0L) stop("data.list is empty")
  complete <- data.list[[1L]]
  # seq_along(...)[-1] is empty for a one-element list, whereas
  # 2:length(...) would count down (2, 1) and index past the end.
  for (i in seq_along(data.list)[-1L]) {
    complete <- merge(complete, data.list[[i]],
                      all = TRUE,
                      sort = FALSE)
  }
  complete
}
# Merge a list of tables into one with a plain for loop, naming the join
# columns explicitly.
#
# Same as the loop without `by`, except that every merge() call is given
# by = the column names shared by the running result and the next element.
# Rows without a match on either side are kept (all = TRUE) and the result
# is not sorted on the join columns (sort = FALSE).
#
# data.list: non-empty list of tables with column names (the benchmark
#            passes data.tables).
# Returns the merged table. A one-element list yields that element unchanged.
# Stops with an error when data.list is empty.
data.table.forLoop <- function(data.list) {
  if (length(data.list) == 0L) stop("data.list is empty")
  complete <- data.list[[1L]]
  # seq_along(...)[-1] is empty for a one-element list, whereas
  # 2:length(...) would count down (2, 1) and index past the end.
  for (i in seq_along(data.list)[-1L]) {
    part <- data.list[[i]]
    complete <- merge(complete, part,
                      by = intersect(names(complete), names(part)),
                      all = TRUE,
                      sort = FALSE)
  }
  complete
}
# Merge a list of tables into one by folding merge() over the list with
# Reduce(). Unmatched rows from both sides are kept (all = TRUE) and the
# result is not sorted on the merge columns (sort = FALSE).
#
# data.list: list of objects that merge() accepts.
# Returns whatever Reduce() yields for the list: the merged table when there
# are several elements.
core.Reduce.merge <- function(data.list) {
  outer.merge <- function(merged, nxt) {
    merge(merged, nxt, all = TRUE, sort = FALSE)
  }
  Reduce(outer.merge, data.list)
}
# Merge a list of tables into one by folding merge() over the list with
# Reduce(), passing the shared column names as `by` on every step.
# Unmatched rows from both sides are kept (all = TRUE) and the result is not
# sorted on the join columns (sort = FALSE).
#
# data.list: list of tables with column names (the benchmark passes
#            data.tables).
# Returns whatever Reduce() yields for the list: the merged table when there
# are several elements.
data.table.Reduce.merge <- function(data.list) {
  merge.on.shared <- function(merged, nxt) {
    shared <- intersect(names(merged), names(nxt))
    merge(merged, nxt, by = shared, all = TRUE, sort = FALSE)
  }
  Reduce(merge.on.shared, data.list)
}
# Merge a list of tables into one, chunk by chunk.
#
# The list is cut into consecutive chunks of at most `nparts` elements, each
# chunk is merged sequentially with merge(all = TRUE, sort = FALSE), and the
# per-chunk results form the list for the next round. This repeats until a
# single table is left, so most merges involve small tables.
#
# data.list: non-empty list of objects that merge() accepts.
# nparts:    chunk size, a single number >= 2. A value below 2 could never
#            shrink the list, so it is rejected instead of looping forever.
# Returns the merged table. A one-element list yields that element unchanged.
# Stops with an error when data.list is not a list, is empty, or when nparts
# is unusable for a list that actually needs merging.
fast.merging <- function(data.list, nparts) {
  if (!is.list(data.list)) stop("data.list isn't a list")
  if (length(data.list) == 0L) stop("data.list is empty")
  # Nothing to merge: hand the single element back untouched.
  if (length(data.list) == 1L) return(data.list[[1L]])
  if (!is.numeric(nparts) || length(nparts) != 1L || is.na(nparts) ||
      nparts < 2) {
    stop("nparts must be a single number >= 2")
  }

  outer.merge <- function(a, b) merge(a, b, all = TRUE, sort = FALSE)

  while (length(data.list) > 1L) { # loop until everything is merged
    # Elements 1..nparts get id 1, the next nparts get id 2, and so on;
    # the last chunk may be shorter.
    chunk.id <- ceiling(seq_along(data.list) / nparts)
    chunks <- unname(split(seq_along(data.list), chunk.id))
    # Each chunk is folded starting from its first element, so no element is
    # merged with itself (a self-merge would multiply duplicated rows).
    data.list <- lapply(chunks, function(idx) {
      Reduce(outer.merge, data.list[idx])
    })
  }
  data.list[[1L]] # the merged data frame
}
#This will take a while, go to lunch or something like that.
#Times the five merging strategies defined above on the same records:
#the two "core" variants and fast.merging get the one-row matrices
#(eve.frames), the two data.table variants get the data.table copies
#(eve.data.table). fast.merging is run with chunks of 10 elements.
#replications=5 asks benchmark() for five runs of each expression.
benchmark(core.forLoop(eve.frames),
data.table.forLoop(eve.data.table),
core.Reduce.merge(eve.frames),
data.table.Reduce.merge(eve.data.table),
fast.merging(eve.frames, 10),
replications=5)
# test replications elapsed relative user.self sys.self user.child sys.child
#1 core.forLoop(eve.frames) 5 1398.29 4.932 1393.98 0.06 NA NA
#3 core.Reduce.merge(eve.frames) 5 1405.77 4.958 1401.02 0.31 NA NA
#2 data.table.forLoop(eve.data.table) 5 797.76 2.814 794.61 0.09 NA NA
#4 data.table.Reduce.merge(eve.data.table) 5 801.93 2.829 799.91 0.16 NA NA
#5 fast.merging(eve.frames, 10) 5 283.51 1.000 282.95 0.00 NA NA
#If you are going to check whether the results are the same, remember to sort your rows and columns first,
#basically data[order(data[, 1]), order(colnames(data))] for all data.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment