Skip to content

Instantly share code, notes, and snippets.

@CnrLwlss
Last active December 19, 2015 09:39
Show Gist options
  • Save CnrLwlss/5935118 to your computer and use it in GitHub Desktop.
Save CnrLwlss/5935118 to your computer and use it in GitHub Desktop.
Example script demonstrating the use of BioPython to compare genomic sequences at a given gene location. See http://cnr.lwlss.net/seqConsensus
# http://cnr.lwlss.net/seqConsensus
import os
from Bio import SeqIO
def checkMutations(ref_dict, roots, genedict,showSeq=False):
'''Comparing a reference sequence (ref_dict) with a series of sample sequences.'''
# Sample sequences are read in from .fastq files in a "pileups" directory.
# Filenames are constructed from a list of roots (e.g. filenames without extensions).
# A Python dictionary specifying genes to analyze connects gene labels (e.g. "HIS3") with a tuple of chromosome number, start position and stop position (e.g. (15,721946,722608)).
# The flag "showSeq" specifies whether sample and reference gene sequences should be printed to screen.
for root in roots:
# Read in the sample sequence from the pileups directory
fq_dict = SeqIO.index(os.path.join("pileups",root+"_cons.fq"), "fastq")
for gene in genedict:
print(gene)
chrom=genedict[gene][0]
start=genedict[gene][1]
end=genedict[gene][2]
# Get sequence at coords of gene
# Reverse complement if gene is on the Crick strand
if start>end:
refc=ref_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement()
sampc=fq_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement()
else:
refc=ref_dict[chrno[chrom]].seq[(start-1):(end)]
sampc=fq_dict[chrno[chrom]].seq[(start-1):(end)]
# Optionally print actual sequences to screen for visual comparison
if showSeq:
print("\nReference "+gene+" sequence:")
print(refc)
print("\n"+root+" Sample "+gene+" sequence:")
print(sampc)
# Check for SNPs in gene
diff=[1 if refc[i]!=sampc[i] else 0 for i in xrange(0,len(refc))]
print("Sample: "+root+" Gene: "+gene)
# If there are fewer than 5 SNPs in the gene, print a report about each one
if sum(diff)<5:
# Remember, chromosome coordinates are indexed from 1
# Python lists are indexed from 0
SNPlocs=[i for i,d in enumerate(diff) if d==1]
for SNPloc in SNPlocs:
a=SNPloc-5
b=SNPloc+5
print("Location of SNPs: "+str(SNPloc+1))
print("Ref ("+str(a+1)+" to "+str(b+1)+"): "+refc[a:b])
print("Smp ("+str(a+1)+" to "+str(b+1)+"): "+sampc[a:b])
# Report number and fraction of SNPs in this gene
print("Number/percentage of differing nucleotides: "+str(sum(diff))+" "+str(100.0*float(sum(diff))/len(diff)))
# Read in reference genome (downloaded from SGD)
ref_dict = SeqIO.index(os.path.join("reference_SGD","S288C_reference_genome_R64-1-1_20110203","S288C_reference_sequence_R64-1-1_20110203.fsa"), "fasta")
# Assume alphabetical chromosome id order is the same as numerical order
# http://www.genome.jp/dbget-bin/www_bget?refseq+NC_001224
# Build dictionary to translate chromosome number to id
chrids=ref_dict.keys()
chrids.sort()
chrno={}
print("Chromosome IDs:")
for i,chrid in enumerate(chrids):
if i<=15:
chrno[i+1]=chrid
# Print translation to screen
print(str(i+1)+": "+chrid)
else:
chrno["mtDNA"]=chrid
print("mtDNA: "+chrid)
# For each genen to be examined, specify gene coordinates (chromosome number, start, stop)
genedict={}
genedict["HIS3"]=(15,721946,722608)
genedict["BRR1"]=(16,672471,673496)
# List of file roots for names of samples to be analyzed
roots=[
"SN7640178_10172_8183",
"SN7640178_10173_8265"
]
# Generate reports (prints to screen)
checkMutations(ref_dict,roots,genedict,showSeq=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment