Last active
December 22, 2015 06:49
-
-
Save explodecomputer/6434065 to your computer and use it in GitHub Desktop.
Extract SNPs from 1kg data when it is split up into separate chromosomes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: | |
# R --no-save --args snplist.txt /ibscratch/wrayvisscher/imputation/arichu/data/imputed/chr\*/arichu_1kg_p1v3_\* outputfile < extractSnps.R | |
# - The first argument is the SNP list (just a list of rs SNP ids and nothing else) | |
# - The second argument specifies the root name for the genotype data (in binary plink format). Replace the chromosome number with "\*" | |
# - The third argument is the output file root name that you want to save everything to. | |
library(snpStats) | |
findChromosomes <- function(snp, plinkrt) | |
{ | |
cmd <- paste("grep -P -m 1 '", snp, "\t' ", plinkrt, ".bim | head -n 1 | cut -d \":\" -f 2 | cut -f 1", sep="") | |
cat(snp) | |
chr <- system(cmd, intern=TRUE) | |
cat(" ", chr, "\n") | |
return(chr) | |
} | |
findChromosomesAll <- function(snplist, plinkrt) | |
{ | |
chr <- snplist | |
for(i in 1:length(snplist)) | |
{ | |
a <- findChromosomes(snplist[i], plinkrt) | |
chr[i] <- ifelse(is.null(a), NA, a) | |
} | |
dat <- data.frame(snp = snplist, chr = as.numeric(chr)) | |
dat <- subset(dat, !is.na(chr)) | |
dat$snp <- as.character(dat$snp) | |
dat <- dat[order(dat$chr), ] | |
return(dat) | |
} | |
extractSnps <- function(snpnames, plinkrt) | |
{ | |
require(snpStats) | |
rawdata <- read.plink(bed=plinkrt, select.snps=snpnames) | |
return(rawdata) | |
} | |
extractSnpsAll <- function(snpdat, plinkrt) | |
{ | |
chr <- unique(snpdat$chr) | |
l <- length(chr) | |
cat(chr[1], ":", snpdat$snp[snpdat$chr == chr[1]], "\n") | |
dat <- extractSnps(snpdat$snp[snpdat$chr == chr[1]], gsub("\\*", chr[1], plinkrt)) | |
if(l == 1) return(dat) | |
for(i in 2:l) | |
{ | |
cat(chr[i], ":", snpdat$snp[snpdat$chr == chr[i]], "\n") | |
dat1 <- extractSnps(snpdat$snp[snpdat$chr == chr[i]], gsub("\\*", chr[i], plinkrt)) | |
dat$map <- rbind(dat$map, dat1$map) | |
dat$genotypes <- cbind(dat$genotypes, dat1$genotypes) | |
} | |
return(dat) | |
} | |
extractInfo <- function(snp, plinkrt) | |
{ | |
cmd <- paste("zgrep -P '", snp, " ' ", plinkrt, " | head -n 1 | cut -d \":\" -f 2", sep="") | |
info <- system(cmd, intern=TRUE) | |
print(info) | |
return(info) | |
} | |
extractInfoAll <- function(snpdat, plinkrt) | |
{ | |
l <- nrow(snpdat) | |
cmd <- paste("zcat ", gsub("\\*", "1", plinkrt), "_info.txt.gz | head -n 1", sep="") | |
print(cmd) | |
dat <- system(cmd, intern=TRUE) | |
for(i in 1:l) | |
{ | |
nom <- paste(gsub("\\*", snpdat$chr[i], plinkrt), "_info.txt.gz", sep="") | |
snp <- snpdat$snp[i] | |
cat(i, ":", snp, "\n") | |
dat <- c(dat, extractInfo(snp, nom)) | |
} | |
return(dat) | |
} | |
ar <- commandArgs(T) | |
snplistfile <- ar[1] | |
plinkrt <- ar[2] | |
output <- ar[3] | |
snplist <- scan(snplistfile, what="character") | |
snpdat <- findChromosomesAll(snplist, plinkrt) | |
info <- extractInfoAll(snpdat, plinkrt) | |
dat <- extractSnpsAll(snpdat, plinkrt) | |
write.table(info, file=paste(output, "_info.txt", sep=""), row=F, col=F, qu=F) | |
write.plink(file.base = output, | |
snps = dat$genotypes, | |
pedigree = dat$fam$pedigree, | |
id = dat$fam$member, | |
father = dat$fam$father, | |
mother = dat$fam$mother, | |
sex = dat$fam$sex, | |
phenotype = dat$fam$affected, | |
chromosome = dat$map$chromosome, | |
genetic.distance = dat$map$cM, | |
position = dat$map$position, | |
allele.1 = dat$map$allele.1, | |
allele.2 = dat$map$allele.2 | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment