Created
June 3, 2012 21:22
-
-
Save indapa/2865090 to your computer and use it in GitHub Desktop.
get the mean base composition for a fastq file after running fastq_stats.py from Galaxy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from pylab import * | |
import sys | |
file=sys.argv[1] | |
fh=open(file,'r') | |
headerline=fh.readline() | |
As=[] | |
Cs=[] | |
Gs=[] | |
Ts=[] | |
Ns=[] | |
mtrx=[] # this will be a list of numpy arrays | |
for line in fh: | |
fields=line.strip().split('\t') | |
#get the base counts of A G T C Ns from the cycle number | |
basecounts= np.array( [int(x) for x in fields[13:18] ], dtype=float ) | |
rowsum=np.sum(basecounts) | |
#get the % base composition | |
basecomp=np.divide(basecounts, rowsum) | |
#now append it to the list of numpy arrays | |
mtrx.append(basecomp) | |
#convert it to a proper numpy matrix | |
basecomptable=np.matrix(mtrx) | |
#get the mean percentage of each type of base across all cycles (where #cyclces equals read length) | |
basecompmeans=np.mean(basecomptable, axis=0) | |
row=basecompmeans[0,:].tolist() | |
for r in row: | |
outstr="\t".join(map(str,r)) | |
print "\t".join([file, outstr]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment