Last active
December 19, 2015 09:39
-
-
Save CnrLwlss/5935118 to your computer and use it in GitHub Desktop.
Example script demonstrating the use of BioPython to compare genomic sequences at a given gene location. See http://cnr.lwlss.net/seqConsensus
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://cnr.lwlss.net/seqConsensus | |
import os | |
from Bio import SeqIO | |
def checkMutations(ref_dict, roots, genedict,showSeq=False): | |
'''Comparing a reference sequence (ref_dict) with a series of sample sequences.''' | |
# Sample sequences are read in from .fastq files in a "pileups" directory. | |
# Filenames are constructed from a list of roots (e.g. filenames without extensions). | |
# A Python dictionary specifying genes to analyze connects gene labels (e.g. "HIS3") with a tuple of chromosome number, start position and stop position (e.g. (15,721946,722608)). | |
# The flag "showSeq" specifies whether sample and reference gene sequences should be printed to screen. | |
for root in roots: | |
# Read in the sample sequence from the pileups directory | |
fq_dict = SeqIO.index(os.path.join("pileups",root+"_cons.fq"), "fastq") | |
for gene in genedict: | |
print(gene) | |
chrom=genedict[gene][0] | |
start=genedict[gene][1] | |
end=genedict[gene][2] | |
# Get sequence at coords of gene | |
# Reverse complement if gene is on the Crick strand | |
if start>end: | |
refc=ref_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement() | |
sampc=fq_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement() | |
else: | |
refc=ref_dict[chrno[chrom]].seq[(start-1):(end)] | |
sampc=fq_dict[chrno[chrom]].seq[(start-1):(end)] | |
# Optionally print actual sequences to screen for visual comparison | |
if showSeq: | |
print("\nReference "+gene+" sequence:") | |
print(refc) | |
print("\n"+root+" Sample "+gene+" sequence:") | |
print(sampc) | |
# Check for SNPs in gene | |
diff=[1 if refc[i]!=sampc[i] else 0 for i in xrange(0,len(refc))] | |
print("Sample: "+root+" Gene: "+gene) | |
# If there are fewer than 5 SNPs in the gene, print a report about each one | |
if sum(diff)<5: | |
# Remember, chromosome coordinates are indexed from 1 | |
# Python lists are indexed from 0 | |
SNPlocs=[i for i,d in enumerate(diff) if d==1] | |
for SNPloc in SNPlocs: | |
a=SNPloc-5 | |
b=SNPloc+5 | |
print("Location of SNPs: "+str(SNPloc+1)) | |
print("Ref ("+str(a+1)+" to "+str(b+1)+"): "+refc[a:b]) | |
print("Smp ("+str(a+1)+" to "+str(b+1)+"): "+sampc[a:b]) | |
# Report number and fraction of SNPs in this gene | |
print("Number/percentage of differing nucleotides: "+str(sum(diff))+" "+str(100.0*float(sum(diff))/len(diff))) | |
# Read in reference genome (downloaded from SGD) | |
ref_dict = SeqIO.index(os.path.join("reference_SGD","S288C_reference_genome_R64-1-1_20110203","S288C_reference_sequence_R64-1-1_20110203.fsa"), "fasta") | |
# Assume alphabetical chromosome id order is the same as numerical order | |
# http://www.genome.jp/dbget-bin/www_bget?refseq+NC_001224 | |
# Build dictionary to translate chromosome number to id | |
chrids=ref_dict.keys() | |
chrids.sort() | |
chrno={} | |
print("Chromosome IDs:") | |
for i,chrid in enumerate(chrids): | |
if i<=15: | |
chrno[i+1]=chrid | |
# Print translation to screen | |
print(str(i+1)+": "+chrid) | |
else: | |
chrno["mtDNA"]=chrid | |
print("mtDNA: "+chrid) | |
# For each genen to be examined, specify gene coordinates (chromosome number, start, stop) | |
genedict={} | |
genedict["HIS3"]=(15,721946,722608) | |
genedict["BRR1"]=(16,672471,673496) | |
# List of file roots for names of samples to be analyzed | |
roots=[ | |
"SN7640178_10172_8183", | |
"SN7640178_10173_8265" | |
] | |
# Generate reports (prints to screen) | |
checkMutations(ref_dict,roots,genedict,showSeq=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment