New Pysiv API to support file validation
"""Unittests for P_Filter module"""
import logging
from pysiv.core.base import TestBase, TestModuleBase
from pysiv.core.utils import monkey_patch_task, monkey_patch_module
from pysiv.validators import FastaValidator, FastqValidator, CsvValidator
log = logging.getLogger(__name__)
_CSV_SUBREAD_FIELDS = 'MovieName,HoleNumber,Start,End,Length,PassedFilter'.split(',')
_CSV_READS_FIELDS = 'Movie,ReadId,#Bases,Readlength,ReadScore,SequencingZMW,Productivity,PassedFilter'.split(',')


@monkey_patch_module
class TestFilterModule(TestModuleBase):
    MODULE_NAME = 'P_Filter'


@monkey_patch_task
class TestFilterTask(TestBase):
    TASK_NAME = 'filter'
    # 'filtered_regions_fofn': 'filtered_regions.fofn'
    REGISTERED_DATA_FILES = {
        'filtered_subreads_fastq': FastqValidator('filtered_subreads.fastq'),
        'filtered_summary_csv': CsvValidator('filtered_summary.csv', fields=_CSV_READS_FIELDS),
    }
    REPORT_FILES = None
    MODULE_NAME = TestFilterModule.MODULE_NAME


@monkey_patch_task
class TestSubreadsTask(TestBase):
    TASK_NAME = 'subreads'
    REGISTERED_DATA_FILES = {
        'filtered_subreads_fasta': FastaValidator('filtered_subreads.fasta'),
        'filtered_subreads_fastq': FastqValidator('filtered_subreads.fastq'),
    }
    REPORT_FILES = None
    MODULE_NAME = TestFilterModule.MODULE_NAME


@monkey_patch_task
class TestSubreadSummaryTask(TestBase):
    """Expected filtered_subread_summary.csv header:

    MovieName,HoleNumber,Start,End,Length,PassedFilter
    """
    TASK_NAME = 'subreadSummary'
    REGISTERED_DATA_FILES = {
        'filtered_subread_summary_csv': CsvValidator('filtered_subread_summary.csv', fields=_CSV_SUBREAD_FIELDS),
    }
    MODULE_NAME = TestFilterModule.MODULE_NAME
    REPORT_FILES = None


# --- pysiv.validators: the validator implementations imported by the tests above ---
import abc
import json
import logging
from xml.etree.ElementTree import ElementTree

# pbcore reader classes (import path assumed from the reader names used below)
from pbcore.io import FastaReader, FastqReader, GffReader

log = logging.getLogger(__name__)

_REQUIRED_JSON_REPORT_KEYS = 'attributes id plotGroups tables'.split()


class ValidatorBase(object):
    __metaclass__ = abc.ABCMeta

    def __init__(self, path):
        if not isinstance(path, str):
            _d = dict(k=self.__class__.__name__, t=type(path), p=path)
            raise TypeError("{k} requires path '{p}' to be provided as a str. Got type {t}".format(**_d))
        self._path = path

    @property
    def path(self):
        return self._path

    @property
    def name(self):
        return self.__class__.__name__

    def validate(self):
        return self.validate_file(self.path)

    @abc.abstractmethod
    def validate_file(self, path):
        pass


def _validate_csv(path, required_fields):
    """Test to see if the required fields are in the header of the CSV file"""
    with open(path, 'r') as f:
        header = f.readline()

    msg = "Invalid CSV file."
    if ',' in header:
        header_fields = header.rstrip().split(',')
        for required_field in required_fields:
            if required_field not in header_fields:
                msg += " Unable to find header field '{f}' in {p}".format(f=required_field, p=path)
                log.error(msg)
                raise ValueError(msg)
    # Maybe should make sure there's more than one record in the CSV file?
    return True
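

# Illustrative only (not part of the module): a filtered_subread_summary.csv
# header line that would satisfy _validate_csv when the required fields are
# the subread-summary fields used in the tests above:
#
#   MovieName,HoleNumber,Start,End,Length,PassedFilter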


def _validate_json(path):
    """Simple smoke test to validate the JSON is well formed"""
    with open(path, 'r') as f:
        json.loads(f.read())
    return True


def _validate_json_report(path):
    """Smoke test to make sure a pbreports JSON file has the required
    root-level keys.
    """
    with open(path, 'r') as f:
        s = json.loads(f.read())

    for key in _REQUIRED_JSON_REPORT_KEYS:
        if key not in s:
            _d = dict(s=key, p=path)
            msg = "Unable to find {s} in {p}".format(**_d)
            log.error(msg)
            raise KeyError(msg)
    return True
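

# Illustrative only (not real pbreports output): a minimal document that passes
# _validate_json_report, since every required root-level key is present:
#
#   {"id": "example_report", "attributes": [], "plotGroups": [], "tables": []}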


def _validate_chemistry_mapping_xml(path):
    """Quick validation of a ChemistryMapping XML file"""
    t = ElementTree(file=path)
    msg = "Invalid chemistry mapping XML {p}".format(p=path)
    mapping_nodes = t.findall('Mapping')
    if len(mapping_nodes) == 0:
        msg += " Unable to find 'Mapping' in XML"
        raise ValueError(msg)
    else:
        required_keys = ["Movie", "SequencingChemistry"]
        for node in mapping_nodes:
            for key in required_keys:
                if node.find(key) is None:
                    msg += " Unable to find '{k}' in XML".format(k=key)
                    raise KeyError(msg)
    return True


class CsvValidator(ValidatorBase):

    def __init__(self, path, fields):
        super(CsvValidator, self).__init__(path)
        # list of expected CSV headers
        self.fields = fields

    def validate_file(self, path):
        return _validate_csv(path, self.fields)


class GzipValidator(ValidatorBase):

    def validate_file(self, path):
        # No content check is performed yet; any registered gzip file passes
        return True


class JsonValidator(ValidatorBase):

    def validate_file(self, path):
        with open(path, 'r') as f:
            json.loads(f.read())
        return True


class JsonReportValidator(ValidatorBase):
    """Smoke test for pbreports-created JSON files.

    This should be fleshed out more.
    """

    def validate_file(self, path):
        return _validate_json_report(path)


class XmlValidator(ValidatorBase):

    def validate_file(self, path):
        t = ElementTree(file=path)
        t.getroot()
        return True


class MappingChemistryXmlValidator(ValidatorBase):
    """The file should have this schema:

    <Map>
      <Mapping>
        <Movie>m130306_023456_42129_c100422252550000001523053002121396_s1_p0</Movie>
        <SequencingChemistry>C2</SequencingChemistry>
      </Mapping>
    </Map>
    """

    def validate_file(self, path):
        return _validate_chemistry_mapping_xml(path)


class _PbcoreReaderValidator(ValidatorBase):
    READER_CLASS = FastaReader

    def validate_file(self, path):
        total = 0
        with self.READER_CLASS(path) as f:
            for record in f:
                total += 1
        # if we've got here, assume the file is valid
        log.debug("{r} parsed {n} records from {p}".format(n=total, p=path, r=self.READER_CLASS.__name__))
        return True


class FastaValidator(_PbcoreReaderValidator):
    READER_CLASS = FastaReader


class FastqValidator(_PbcoreReaderValidator):
    READER_CLASS = FastqReader


class GffValidator(_PbcoreReaderValidator):
    READER_CLASS = GffReader
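
For reference, any of these validators can also be exercised on its own, outside the pysiv test harness. A minimal sketch, assuming pysiv is importable and a filtered_summary.csv exists in the working directory:

from pysiv.validators import CsvValidator

fields = 'Movie,ReadId,#Bases,Readlength,ReadScore,SequencingZMW,Productivity,PassedFilter'.split(',')
validator = CsvValidator('filtered_summary.csv', fields=fields)
# validate() raises ValueError if a required header field is missing, otherwise returns True
assert validator.validate()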