Created
March 23, 2017 17:47
-
-
Save sahilseth/3625dd6e9129bc1b1d46260c85f1e3b0 to your computer and use it in GitHub Desktop.
a function to split TCGA barcodes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Split a TCGA barcode into its component fields
#'
#' Parses a barcode such as `"TCGA-02-0001-01C-01D-0182-01"` by splitting it on
#' `split.char` and reading the fields by position: field 2 is the tissue
#' source site, field 3 the participant, the first two characters of field 4
#' the sample type, and field 5 the portion number followed by the analyte
#' letter.
#'
#' Fields that are missing or empty keep their default value (the string
#' `"NA"`, or `"00"` for the sample type) instead of becoming `NA_character_`.
#'
#' @param x A single barcode string. Only the first element of `x` is parsed;
#'   `x` itself is returned unchanged as `sampleid`.
#' @param split.char Separator between barcode fields. A one-character value
#'   is matched literally (so `"."` works); longer values are treated as
#'   regular expressions.
#' @return A named list with elements `sampleid`, `tissuesourcesite`,
#'   `participant`, `sampletype`, `programcode`, `nucleicacidtype` (`"DNA"`,
#'   `"RNA"` or `"NA"`), `portion`, `portionanalyte` and `error`. `error` is
#'   `1` when the tissue source site or the sample type is missing or is not
#'   exactly two characters long, and `0` otherwise.
#' @examples
#' split_tcga_barcode("TCGA-02-0001-01C-01D-0182-01")
#' split_tcga_barcode("TCGA.02.0001.01C.01D.0182.01", split.char = ".")
split_tcga_barcode <- function(x, split.char = "-") {
  # A one-character separator is taken literally so regex metacharacters such
  # as "." do not split on every character; longer separators keep the
  # original regular-expression behaviour.
  literal_split <- isTRUE(nchar(split.char) == 1L)
  fields <- strsplit(x, split.char, fixed = literal_split)[[1]]

  # TRUE when field `i` exists and is a non-empty string.
  has_field <- function(i) {
    value <- fields[i]
    !is.na(value) && nzchar(value)
  }

  # Default values, kept whenever the matching field is absent.
  tss <- "NA"
  participant <- "NA"
  # NOTE(review): programcode is never derived from the barcode and is always
  # returned as "NA"; field 1 (e.g. "TCGA") may be the intended source - confirm.
  programcode <- "NA"
  nucleic <- "NA"
  portion <- "NA"
  sampletype <- "00"
  # tooling needs this
  analyte <- "NA"
  error <- 0

  if (has_field(2L)) {
    tss <- fields[2]
  }
  if (has_field(3L)) {
    participant <- fields[3]
  }
  if (has_field(4L)) {
    # Field 4 is sample type (two characters) followed by the vial letter.
    sampletype <- substr(fields[4], 1, 2)
  }

  # Validation: TSS and sample type must be present and two characters long.
  if (!has_field(2L) || nchar(tss) != 2L ||
      !has_field(4L) || nchar(sampletype) != 2L) {
    error <- 1
  }

  # Field 5 is the portion number (leading digits) followed by the analyte
  # code (upper-case letters).
  if (has_field(5L)) {
    portion_digits <- sub("^([0-9]*).*$", "\\1", fields[5])
    analyte_letters <- sub("^[0-9]*([A-Z]*).*$", "\\1", fields[5])
    if (nzchar(portion_digits)) {
      portion <- portion_digits
    }
    if (nzchar(analyte_letters)) {
      analyte <- analyte_letters
    }
  }

  # Map the analyte code onto the nucleic acid type.
  if (analyte %in% c("D", "G", "W", "X")) {
    nucleic <- "DNA"
  } else if (analyte %in% c("H", "R", "T")) {
    nucleic <- "RNA"
  }

  list(
    sampleid = x,
    tissuesourcesite = tss,
    participant = participant,
    sampletype = sampletype,
    programcode = programcode,
    nucleicacidtype = nucleic,
    portion = portion,
    portionanalyte = analyte,
    error = error
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment