Created
February 19, 2012 22:43
-
-
Save xguse/1866279 to your computer and use it in GitHub Desktop.
Simple Python FastQ Parser class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ParseFastQ(object): | |
"""Returns a read-by-read fastQ parser analogous to file.readline()""" | |
def __init__(self,filePath,headerSymbols=['@','+']): | |
"""Returns a read-by-read fastQ parser analogous to file.readline(). | |
Exmpl: parser.next() | |
-OR- | |
Its an iterator so you can do: | |
for rec in parser: | |
... do something with rec ... | |
rec is tuple: (seqHeader,seqStr,qualHeader,qualStr) | |
""" | |
if filePath.endswith('.gz'): | |
self._file = gzip.open(filePath) | |
else: | |
self._file = open(filePath, 'rU') | |
self._currentLineNumber = 0 | |
self._hdSyms = headerSymbols | |
def __iter__(self): | |
return self | |
def next(self): | |
"""Reads in next element, parses, and does minimal verification. | |
Returns: tuple: (seqHeader,seqStr,qualHeader,qualStr)""" | |
# ++++ Get Next Four Lines ++++ | |
elemList = [] | |
for i in range(4): | |
line = self._file.readline() | |
self._currentLineNumber += 1 ## increment file position | |
if line: | |
elemList.append(line.strip('\n')) | |
else: | |
elemList.append(None) | |
# ++++ Check Lines For Expected Form ++++ | |
trues = [bool(x) for x in elemList].count(True) | |
nones = elemList.count(None) | |
# -- Check for acceptable end of file -- | |
if nones == 4: | |
raise StopIteration | |
# -- Make sure we got 4 full lines of data -- | |
assert trues == 4,\ | |
"** ERROR: It looks like I encountered a premature EOF or empty line.\n\ | |
Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (self._currentLineNumber) | |
# -- Make sure we are in the correct "register" -- | |
assert elemList[0].startswith(self._hdSyms[0]),\ | |
"** ERROR: The 1st line in fastq element does not start with '%s'.\n\ | |
Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (self._hdSyms[0],self._currentLineNumber) | |
assert elemList[2].startswith(self._hdSyms[1]),\ | |
"** ERROR: The 3rd line in fastq element does not start with '%s'.\n\ | |
Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (self._hdSyms[1],self._currentLineNumber) | |
# -- Make sure the seq line and qual line have equal lengths -- | |
assert len(elemList[1]) == len(elemList[3]), "** ERROR: The length of Sequence data and Quality data of the last record aren't equal.\n\ | |
Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (self._currentLineNumber) | |
# ++++ Return fatsQ data as tuple ++++ | |
return tuple(elemList) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment