Created
June 21, 2011 18:55
-
-
Save stephenturner/1038587 to your computer and use it in GitHub Desktop.
iona-pull-711-snps-priority-3.r
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
d <- query(" #new | |
SELECT | |
if(b.snp='.' OR b.snp IS NULL, CONCAT_WS('_','chr12',a.pos19), b.snp) as Locus_Name, | |
'SNP' as Target_Type, | |
NULL as Sequence, | |
a.chrom AS Chromosome, | |
a.pos19 as Coordinate, | |
'19' as Genome_Build_Version, | |
'HSapiensReferenceSequence' as Source, | |
'19' as Source_version, | |
'Forward' as Sequence_Orientation, | |
'Plus' as Plus_minus, | |
a.pos19, CONCAT('[',alleles,']') as alleles, priority, | |
bestmaf as priority_tiebreaker | |
FROM igf1.union_nomismatch_priority a | |
LEFT JOIN igf1.1000g_sites b on a.pos19=b.pos | |
LEFT JOIN igf1.gwas_overlap c on b.snp=c.snp | |
WHERE ((offtarget=0 and maxmaf>.05) OR (maf1000g>0.05)) | |
;") | |
dim(d) | |
dev.off() | |
hist(d$pos19, 100, col="black") | |
bins <- 100 | |
par(mfrow=c(2,1));hist(dold$pos19,bins, col=1);hist(d$pos19,bins, col=1) | |
i <- 1 | |
t1=Sys.time() | |
for (i in 1:nrow(d)) d$Sequence[i] <- getflank(d$pos19[i], alleles=d$alleles[i], offset=60) #this takes a while! | |
t2=Sys.time() | |
t2-t1 | |
sum(is.na(d$Sequence)) | |
#write.table(d,"igf1-snps-with-flanking-seq.csv",quote=F,row=F,sep=',') | |
rs_not_found <- query("select * from igf1.rs_not_found;",strings=F)$snp | |
rs_not_found <- rs_not_found[rs_not_found %in% d$Locus_Name] | |
d_seq <- d[grep("^rs",d$Locus_Name, invert=T),1:10] | |
d_id <- d[grep("^rs",d$Locus_Name, invert=F), 1:10] | |
lapply(list(d_seq,d_id,d),nrow); sum(rs_not_found %in% d$Locus_Name) | |
d_seq <- rbind(d_seq, d_id[d_id$Locus_Name %in% rs_not_found, ]) | |
d_seq$Locus_Name <- sub("^rs","snp-rs",d_seq$Locus_Name) | |
d_seq <- d_seq[order(d_seq$Coordinate), ] | |
d_id <- d_id[d_id$Locus_Name %nin% rs_not_found, ][1] | |
lapply(list(d_seq,d_id,d),nrow); sum(rs_not_found %in% d$Locus_Name) | |
write.table(d_seq, name("sequence priority 1-3, 151 snps.csv"), row=F, quote=F, sep=',') | |
write.table(d_id, name("identity priority 1-3, 275 snps.csv"), row=F, quote=F, sep=',') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment