CnrLwlss · December 19, 2015 09:39
diff --git a/CheckGenes.py b/CheckGenes.py
 # http://cnr.lwlss.net/seqConsensus

 import os
 from Bio import SeqIO

 def checkMutations(ref_dict, roots, genedict,showSeq=False):
    '''Comparing a reference sequence (ref_dict) with a series of sample sequences.'''
    # Sample sequences are read in from .fastq files in a "pileups" directory.
    # Filenames are constructed from a list of roots (e.g. filenames without extensions).
    # A Python dictionary specifying genes to analyze connects gene labels (e.g. "HIS3") with a tuple of chromosome number, start position and stop position (e.g. (15,721946,722608)).
    # The flag "showSeq" specifies whether sample and reference gene sequences should be printed to screen.
    for root in roots:
        # Read in the sample sequence from the pileups directory
        fq_dict = SeqIO.index(os.path.join("pileups",root+"_cons.fq"), "fastq")
        for gene in genedict:
            print(gene)
            chrom=genedict[gene][0]
            start=genedict[gene][1]
            end=genedict[gene][2]
            # Get sequence at coords of gene
            # Reverse complement if gene is on the Crick strand
            if start>end:
                refc=ref_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement()
                sampc=fq_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement()
            else:
                refc=ref_dict[chrno[chrom]].seq[(start-1):(end)]
                sampc=fq_dict[chrno[chrom]].seq[(start-1):(end)]
            # Optionally print actual sequences to screen for visual comparison
            if showSeq:
                print("\nReference "+gene+" sequence:")
                print(refc)
                print("\n"+root+" Sample "+gene+" sequence:")
                print(sampc)
            # Check for SNPs in gene
            diff=[1 if refc[i]!=sampc[i] else 0 for i in xrange(0,len(refc))]
            print("Sample: "+root+" Gene: "+gene)
            # If there are fewer than 5 SNPs in the gene, print a report about each one
            if sum(diff)<5:
                # Remember, chromosome coordinates are indexed from 1
                # Python lists are indexed from 0
                SNPlocs=[i for i,d in enumerate(diff) if d==1]
                for SNPloc in SNPlocs:
                    a=SNPloc-5
                    b=SNPloc+5
                    print("Location of SNPs: "+str(SNPloc+1))
                    print("Ref ("+str(a+1)+" to "+str(b+1)+"): "+refc[a:b])
                    print("Smp ("+str(a+1)+" to "+str(b+1)+"): "+sampc[a:b])
            # Report number and fraction of SNPs in this gene
            print("Number/percentage of differing nucleotides: "+str(sum(diff))+" "+str(100.0*float(sum(diff))/len(diff)))                   
            
 # Read in reference genome (downloaded from SGD)
 ref_dict = SeqIO.index(os.path.join("reference_SGD","S288C_reference_genome_R64-1-1_20110203","S288C_reference_sequence_R64-1-1_20110203.fsa"), "fasta")

 # Assume alphabetical chromosome id order is the same as numerical order
 # http://www.genome.jp/dbget-bin/www_bget?refseq+NC_001224
 # Build dictionary to translate chromosome number to id
 chrids=ref_dict.keys()
 chrids.sort()
 chrno={}
 print("Chromosome IDs:")
 for i,chrid in enumerate(chrids):
    if i<=15:
        chrno[i+1]=chrid
        # Print translation to screen
        print(str(i+1)+": "+chrid)
    else:
        chrno["mtDNA"]=chrid
        print("mtDNA: "+chrid)

 # For each genen to be examined, specify gene coordinates (chromosome number, start, stop)
 genedict={}
 genedict["HIS3"]=(15,721946,722608)
 genedict["BRR1"]=(16,672471,673496)

 # List of file roots for names of samples to be analyzed
 roots=[
    "SN7640178_10172_8183",
    "SN7640178_10173_8265"
 ]

 # Generate reports (prints to screen)
 checkMutations(ref_dict,roots,genedict,showSeq=True)
	# http://cnr.lwlss.net/seqConsensus

	import os
	from Bio import SeqIO

	def checkMutations(ref_dict, roots, genedict,showSeq=False):
	'''Comparing a reference sequence (ref_dict) with a series of sample sequences.'''
	# Sample sequences are read in from .fastq files in a "pileups" directory.
	# Filenames are constructed from a list of roots (e.g. filenames without extensions).
	# A Python dictionary specifying genes to analyze connects gene labels (e.g. "HIS3") with a tuple of chromosome number, start position and stop position (e.g. (15,721946,722608)).
	# The flag "showSeq" specifies whether sample and reference gene sequences should be printed to screen.
	for root in roots:
	# Read in the sample sequence from the pileups directory
	fq_dict = SeqIO.index(os.path.join("pileups",root+"_cons.fq"), "fastq")
	for gene in genedict:
	print(gene)
	chrom=genedict[gene][0]
	start=genedict[gene][1]
	end=genedict[gene][2]
	# Get sequence at coords of gene
	# Reverse complement if gene is on the Crick strand
	if start>end:
	refc=ref_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement()
	sampc=fq_dict[chrno[chrom]].seq[(end-1):(start)].reverse_complement()
	else:
	refc=ref_dict[chrno[chrom]].seq[(start-1):(end)]
	sampc=fq_dict[chrno[chrom]].seq[(start-1):(end)]
	# Optionally print actual sequences to screen for visual comparison
	if showSeq:
	print("\nReference "+gene+" sequence:")
	print(refc)
	print("\n"+root+" Sample "+gene+" sequence:")
	print(sampc)
	# Check for SNPs in gene
	diff=[1 if refc[i]!=sampc[i] else 0 for i in xrange(0,len(refc))]
	print("Sample: "+root+" Gene: "+gene)
	# If there are fewer than 5 SNPs in the gene, print a report about each one
	if sum(diff)<5:
	# Remember, chromosome coordinates are indexed from 1
	# Python lists are indexed from 0
	SNPlocs=[i for i,d in enumerate(diff) if d==1]
	for SNPloc in SNPlocs:
	a=SNPloc-5
	b=SNPloc+5
	print("Location of SNPs: "+str(SNPloc+1))
	print("Ref ("+str(a+1)+" to "+str(b+1)+"): "+refc[a:b])
	print("Smp ("+str(a+1)+" to "+str(b+1)+"): "+sampc[a:b])
	# Report number and fraction of SNPs in this gene
	print("Number/percentage of differing nucleotides: "+str(sum(diff))+" "+str(100.0*float(sum(diff))/len(diff)))

	# Read in reference genome (downloaded from SGD)
	ref_dict = SeqIO.index(os.path.join("reference_SGD","S288C_reference_genome_R64-1-1_20110203","S288C_reference_sequence_R64-1-1_20110203.fsa"), "fasta")

	# Assume alphabetical chromosome id order is the same as numerical order
	# http://www.genome.jp/dbget-bin/www_bget?refseq+NC_001224
	# Build dictionary to translate chromosome number to id
	chrids=ref_dict.keys()
	chrids.sort()
	chrno={}
	print("Chromosome IDs:")
	for i,chrid in enumerate(chrids):
	if i<=15:
	chrno[i+1]=chrid
	# Print translation to screen
	print(str(i+1)+": "+chrid)
	else:
	chrno["mtDNA"]=chrid
	print("mtDNA: "+chrid)

	# For each genen to be examined, specify gene coordinates (chromosome number, start, stop)
	genedict={}
	genedict["HIS3"]=(15,721946,722608)
	genedict["BRR1"]=(16,672471,673496)

	# List of file roots for names of samples to be analyzed
	roots=[
	"SN7640178_10172_8183",
	"SN7640178_10173_8265"
	]

	# Generate reports (prints to screen)
	checkMutations(ref_dict,roots,genedict,showSeq=True)