Created
September 10, 2024 12:02
-
-
Save DrK-Lo/a3e7f0c442a8eb806dbf450cb6f597f8 to your computer and use it in GitHub Desktop.
This code in R takes a VCFR object as input and converts it to a 012 geno format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pl_vcf is read in with VCFR package | |
ext_vcfgt_num2 <- extract.gt(pl_vcf, as.numeric = FALSE) # extract the genotype matrix from the vcf fil | |
str(ext_vcfgt_num2) | |
# “0/0” --> 0 | |
# “0/1" or “1/0” --> 1 | |
# “1/1" is --> 2 | |
FirstAllele<- matrix(as.numeric(substring(ext_vcfgt_num2,1,1)), ncol=ncol(ext_vcfgt_num2)) #first character is first allele | |
SecondAllele<- matrix(as.numeric(substring(ext_vcfgt_num2,3,3)), ncol=ncol(ext_vcfgt_num2)) #third character is second allele | |
Genotype <- FirstAllele + SecondAllele | |
str(Genotype) | |
print(dim(Genotype)) | |
print(table(Genotype)) # counts of 0,1,2 | |
sum(is.na(Genotype)) # missing data | |
hist(colSums(is.na(Genotype)), breaks=seq(0,10000,10), main=paste(vcf_obj[i],“Missing loci per individual”)) #histogram of missing data per individual | |
hist(rowSums(is.na(Genotype)), breaks=seq(0,100,1), main=paste(vcf_obj[i],“Missing individuals per locus”)) #histogram of missing data per locus | |
colnames(Genotype) <- colnames(ext_vcfgt_num2) | |
rownames(Genotype) <- rownames(ext_vcfgt_num2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment