Last active
December 23, 2015 13:48
-
-
Save kgturner/6644150 to your computer and use it in GitHub Desktop.
Squish multiple columns with non-overlapping values into a single column, and get rid of NAs. Also, squish multiple groups of columns together, over a whole data frame. See http://wp.me/p1Ye5e-aJ for more information. Useful following data extraction from WorldClim.org geoTIFF files (https://gist.github.com/kgturner/6643334)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Functions to squish groups of columns together | |
#Kathryn Turner Sept 20, 2013 | |
#With help from Andrew MacDonald @polesasunder | |
#given data that have many NAs and non-overlapping values | |
#across several columns, with similarly named columns within a group | |
> head(clim) | |
alt_07.tif alt_11.tif alt_12.tif alt_15.tif alt_16.tif alt_17.tif bio1_07.tif bio1_11.tif bio1_12.tif bio1_15.tif bio1_16.tif bio1_17.tif bio10_07.tif | |
BG001 NA NA NA NA 61 NA NA NA NA NA 121 NA NA | |
CA001 NA 24 NA NA NA NA NA 100 NA NA NA NA NA | |
CA008 NA NA 1308 NA NA NA NA NA 39 NA NA NA NA | |
CA009 NA NA 599 NA NA NA NA NA 73 NA NA NA NA | |
CA010 NA NA 935 NA NA NA NA NA 57 NA NA NA NA | |
GR001 NA NA NA NA 1 NA NA NA NA NA 159 NA NA | |
#Here, all columns that start with "alt" should be squished together, | |
#and all columns with "bio1" should be squished togther to eliminate NAs | |
#To squish a single group of columns | |
#dat is the dataframe, cols is the columns to be squished togther, | |
#newcol is the name of the new column of concentrated values. | |
#Returns dataframe with new column only. | |
squish <- function(dat=dat, cols=cols, newcol="newcol"){ | |
dat$temp <- NA | |
for (i in dat[,cols]){ | |
## where is temp NA? | |
ss <- which(is.na(dat$temp)) | |
## what are the values of i there? put them in temp | |
dat$temp[ss] <- i[ss] | |
} | |
names(dat)[names(dat)=="temp"] <- newcol | |
return(subset(dat,select=ncol(dat))) | |
} | |
#for one group of columns | |
clim.2 <- squish(dat=clim, cols=1:6, newcol="alt") | |
head(clim.2) | |
> clim.2 | |
alt | |
BG001 61 | |
CA001 24 | |
CA008 1308 | |
CA009 599 | |
CA010 935 | |
GR001 1 | |
#To squish multiple groups of colummns across a dataframe. | |
#List of names to give the new concentrated data columns, | |
#shared between all columns to be squished. | |
#dat is the dataframe, split is the character to split | |
#column names on. | |
squishsplit <- function(dat, split="_"){ | |
varnames <- colnames(dat) | |
splitvars <- strsplit(varnames, split) | |
squishvars <- sapply(splitvars,"[[",i=1) | |
return(squishvars) | |
} | |
#Breaks up new column names and feeds them to squish | |
#dat is the dataframe, squishvar is the list of names | |
#produced by squishsplit. | |
squishr <- function(dat, squishvar){ | |
return(squish(dat=dat, cols=grep(squishvar,names(dat)),newcol=squishvar)) | |
} | |
vars <- squishsplit(clim) | |
#lapply to each unique value of vars, and cbind into a pretty dataframe | |
test1 <- do.call(cbind,lapply(unique(vars),squishr,dat=clim)) | |
> head(test1) | |
alt bio1 bio10 bio11 bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19 bio2 bio3 bio4 bio5 bio6 bio7 bio8 bio9 | |
BG001 61 121 211 31 519 60 35 17 160 112 120 135 72 26 7087 260 -11 271 91 62 | |
CA001 24 100 161 41 1035 153 30 51 449 110 110 406 86 39 4700 224 7 217 48 161 | |
CA008 1308 39 141 -61 455 52 28 22 139 94 118 127 116 33 7893 236 -112 348 -54 -6 | |
CA009 599 73 178 -33 501 59 31 21 158 101 123 145 124 34 8165 277 -82 359 -25 130 | |
CA010 935 57 160 -45 561 67 34 21 180 114 136 166 115 33 8008 255 -93 348 -38 154 | |
GR001 1 159 255 65 445 54 17 29 153 67 69 128 112 34 7388 336 10 326 121 250 | |
#120 columns condensed down to 20! No NAs! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment