Skip to content

Instantly share code, notes, and snippets.

@indapa
Created June 3, 2012 21:22
Show Gist options
  • Save indapa/2865090 to your computer and use it in GitHub Desktop.
Save indapa/2865090 to your computer and use it in GitHub Desktop.
get the mean base composition for a fastq file after running fastq_stats.py from Galaxy
import numpy as np
from pylab import *
import sys
file=sys.argv[1]
fh=open(file,'r')
headerline=fh.readline()
As=[]
Cs=[]
Gs=[]
Ts=[]
Ns=[]
mtrx=[] # this will be a list of numpy arrays
for line in fh:
fields=line.strip().split('\t')
#get the base counts of A G T C Ns from the cycle number
basecounts= np.array( [int(x) for x in fields[13:18] ], dtype=float )
rowsum=np.sum(basecounts)
#get the % base composition
basecomp=np.divide(basecounts, rowsum)
#now append it to the list of numpy arrays
mtrx.append(basecomp)
#convert it to a proper numpy matrix
basecomptable=np.matrix(mtrx)
#get the mean percentage of each type of base across all cycles (where #cyclces equals read length)
basecompmeans=np.mean(basecomptable, axis=0)
row=basecompmeans[0,:].tolist()
for r in row:
outstr="\t".join(map(str,r))
print "\t".join([file, outstr])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment