Last active
June 8, 2017 03:23
-
-
Save malomarrec/3b351b07e38f1991f523827ad2d73224 to your computer and use it in GitHub Desktop.
"Slide" dataframe (merge two columns that are identical but are NAs in a partition of rows)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#This function "slides" dataframe blocks based on NA values | |
#Takes a dataframe and two column names | |
#On each row, one of these columns have to be NA | |
#The function returns a dataframe with a new column called <newname> with the combined non NA values of the first columns | |
#Different factor levels in both columns are dealt with by casting to character, then back to factor after the merge | |
#If newname is not specified, the first columns is overriden by the new column | |
#Optionnal parameters allows to drop the old columns | |
slide_df <- function(df,name1,name2,newname = NULL, drop = FALSE){ | |
require(dplyr) | |
if(is.null(newname)){ | |
newname = name1 | |
} | |
#Factor data needs to be converted to characters first in order no to lose levels | |
if(is.factor(df[,name1]) || is.factor(df[,name2])){ | |
df[,name1] <- as.character(df[,name1]) | |
df[,name2] <- as.character(df[,name2]) | |
} | |
slided_df = dplyr::mutate(df, !!newname := ifelse(is.na(df[,name1]),df[,name2],df[,name1])) | |
#If factor data, we need to cast character back to factor data | |
# if(is.factor(df[,name1]) || is.factor(df[,name2])){ | |
# df[,newname] <- as.factor(df[,newname]) | |
# } | |
if(drop){ | |
if(is.null(newname)){ | |
slided_df[,name1] <- NULL #dropping i only if a newname was specified, otherwise it contains the result | |
} | |
slided_df[,name2] <- NULL #dropping name_j | |
} | |
return(slided_df) | |
} | |
#Demo | |
a1 <- c(1,2,3,4,5,NA,NA,NA) | |
a2 <- c(NA,NA,NA,NA,NA,8,8,8) | |
b <- rep("a",8) | |
c1 <- c(1,2,3,4,5,NA,NA,NA) | |
c2 <- c(NA,NA,NA,NA,NA,'1','1','1') | |
b <- rep("a",8) | |
c1 <- c(1,2,3,4,5,NA,NA,NA) | |
c2 <- c(NA,NA,NA,NA,NA,'a','a','1') | |
df <- data.frame(a1 = a1,b = b, c1 = c1,a2=a2,c2=c2) | |
# a1 b c1 a2 c2 | |
# 1 1 a 1 NA <NA> | |
# 2 2 a 2 NA <NA> | |
# 3 3 a 3 NA <NA> | |
# 4 4 a 4 NA <NA> | |
# 5 5 a 5 NA <NA> | |
# 6 NA a NA 8 a | |
# 7 NA a NA 8 a | |
# 8 NA a NA 8 1 | |
> merge_cols(df,"a1","a2","a",drop=T) | |
merge_cols(df,"a1","a2","a",drop=T) | |
# a1 b c1 a2 c2 | |
# 1 1 a 1 NA <NA> | |
# 2 2 a 2 NA <NA> | |
# 3 3 a 3 NA <NA> | |
# 4 4 a 4 NA <NA> | |
# 5 5 a 5 NA <NA> | |
# 6 NA a NA 8 a | |
# 7 NA a NA 8 a | |
# 8 NA a NA 8 1 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment