Last active
August 29, 2015 13:56
-
-
Save mpkocher/8951962 to your computer and use it in GitHub Desktop.
New Pysiv API to support file Validation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Unittests for P_Filter module""" | |
import logging | |
from pysiv.core.base import TestBase, TestModuleBase | |
from pysiv.core.utils import monkey_patch_task, monkey_patch_module | |
from pysiv.validators import FastaValidator, FastqValidator, CsvValidator | |
log = logging.getLogger(__name__) | |
# Expected header columns of the per-subread summary CSV
# (used by TestSubreadSummaryTask below).
_CSV_SUBREAD_FIELDS = 'MovieName,HoleNumber,Start,End,Length,PassedFilter'.split(',')
# Expected header columns of the per-read filtered summary CSV
# (used by TestFilterTask below).
_CSV_READS_FIELDS = 'Movie,ReadId,#Bases,Readlength,ReadScore,SequencingZMW,Productivity,PassedFilter'.split(',')
@monkey_patch_module
class TestFilterModule(TestModuleBase):
    """Groups the pysiv validation tests for the 'P_Filter' module.

    The task test classes below reference MODULE_NAME from here so the
    module name is defined in exactly one place.
    """
    MODULE_NAME = 'P_Filter'
@monkey_patch_task
class TestFilterTask(TestBase):
    """Output-file validation for the P_Filter 'filter' task.

    REGISTERED_DATA_FILES maps a file id to a validator instance
    constructed with the expected output file name.
    """
    TASK_NAME = 'filter'
    # NOTE(review): filtered_regions.fofn validation is disabled below
    # (commented out, no validator wired up) — confirm this is intentional.
    # 'filtered_regions_fofn': 'filtered_regions.fofn'
    REGISTERED_DATA_FILES = {'filtered_subreads_fastq': FastqValidator('filtered_subreads.fastq'),
                             'filtered_summary_csv': CsvValidator('filtered_summary.csv', fields=_CSV_READS_FIELDS)}
    # No JSON report files are checked for this task.
    REPORT_FILES = None
    MODULE_NAME = TestFilterModule.MODULE_NAME
@monkey_patch_task
class TestSubreadsTask(TestBase):
    """Output-file validation for the P_Filter 'subreads' task.

    Checks that the filtered subread FASTA and FASTQ outputs parse
    cleanly via the pbcore-backed validators.
    """
    TASK_NAME = 'subreads'
    REGISTERED_DATA_FILES = {'filtered_subreads_fasta': FastaValidator('filtered_subreads.fasta'),
                             'filtered_subreads_fastq': FastqValidator('filtered_subreads.fastq')}
    # No JSON report files are checked for this task.
    REPORT_FILES = None
    MODULE_NAME = TestFilterModule.MODULE_NAME
@monkey_patch_task
class TestSubreadSummaryTask(TestBase):
    """Output-file validation for the P_Filter 'subreadSummary' task.

    Expected CSV header columns (see _CSV_SUBREAD_FIELDS):
    MovieName,HoleNumber,Start,End,Length,PassedFilter

    NOTE(review): the original docstring listed the read-level columns
    (Movie,ReadId,#Bases,...) which belong to _CSV_READS_FIELDS, not to
    the subread summary CSV actually validated here.
    """
    TASK_NAME = 'subreadSummary'
    REGISTERED_DATA_FILES = {'filtered_subread_summary_csv': CsvValidator('filtered_subread_summary.csv', fields=_CSV_SUBREAD_FIELDS)}
    MODULE_NAME = TestFilterModule.MODULE_NAME
    # No JSON report files are checked for this task.
    REPORT_FILES = None
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Root-level keys every pbreports-style JSON report must define
# (checked by _validate_json_report).
_REQUIRED_JSON_REPORT_KEYS = 'attributes id plotGroups tables'.split()
class ValidatorBase(object):
    """Base class for single-file validators.

    Holds the path supplied at construction time; subclasses implement
    validate_file(), which validate() applies to that stored path.
    """
    __metaclass__ = abc.ABCMeta  # py2-style ABC declaration

    def __init__(self, path):
        # Fail fast on anything that is not a plain string path.
        if isinstance(path, str):
            self._path = path
        else:
            raise TypeError(
                "{k} require path '{p}' to be provided as a str. Got type {t}".format(
                    k=self.__class__.__name__, p=path, t=type(path)))

    @property
    def path(self):
        """Path handed in at construction time."""
        return self._path

    @property
    def name(self):
        """Class name, exposed for reporting."""
        return self.__class__.__name__

    def validate(self):
        """Run the subclass validator against the stored path."""
        return self.validate_file(self.path)

    @abc.abstractmethod
    def validate_file(self, path):
        """Validate the file at `path`; return True or raise."""
        pass
def _validate_csv(path, required_fields):
    """Validate that every required field appears in the CSV header row.

    :param path: path to the CSV file
    :param required_fields: iterable of column names expected in the header
    :raises ValueError: if the header is not comma-delimited, or if any
        required field is missing from it
    :return: True when the header contains every required field
    """
    with open(path, 'r') as f:
        header = f.readline()
    msg = "Invalid CSV file."
    if ',' not in header:
        # BUG FIX: the original prepared `msg` but silently returned True
        # when the header had no comma, so a clearly non-CSV file passed
        # validation. Treat it as an error instead.
        msg += " No comma-delimited header found in {p}".format(p=path)
        log.error(msg)
        raise ValueError(msg)
    header_fields = header.rstrip().split(',')
    for required_field in required_fields:
        if required_field not in header_fields:
            msg += "Unable to find header field '{f}' in {p}".format(f=required_field, p=path)
            log.error(msg)
            raise ValueError(msg)
    # Maybe should make sure there's more than one record in the CSV file?
    return True
def _validate_json(path):
    """Smoke test that the file at `path` parses as JSON.

    Any parse error from the json module propagates to the caller.

    :param path: path to the JSON file
    :return: True if the document parsed
    """
    with open(path, 'r') as f:
        # Parse only; the original bound the result to an unused local.
        json.load(f)
    return True
def _validate_json_report(path):
    """Check that a pbreports JSON file defines all required root keys.

    Logs and raises KeyError on the first missing key; returns True when
    every key in _REQUIRED_JSON_REPORT_KEYS is present.
    """
    with open(path, 'r') as f:
        report_d = json.loads(f.read())
    for required_key in _REQUIRED_JSON_REPORT_KEYS:
        if required_key in report_d:
            continue
        msg = "Unable to find {s} in {p}".format(s=required_key, p=path)
        log.error(msg)
        raise KeyError(msg)
    return True
def _validate_chemistry_mapping_xml(path):
    """Sanity-check a ChemistryMapping XML file.

    Requires at least one 'Mapping' element, and each Mapping must have
    'Movie' and 'SequencingChemistry' children.
    """
    tree = ElementTree(file=path)
    msg = "Invalid chemistry mapping XML {p}".format(p=path)
    mapping_nodes = tree.findall('Mapping')
    if not mapping_nodes:
        raise ValueError(msg + " Unable to find 'Mapping' in XML")
    for node in mapping_nodes:
        for key in ("Movie", "SequencingChemistry"):
            if node.find(key) is None:
                raise KeyError(msg + " Unable to find '{k}' in XML".format(k=key))
    return True
class CsvValidator(ValidatorBase):
    """Validates that a CSV file's header contains a given set of columns."""

    def __init__(self, path, fields):
        super(CsvValidator, self).__init__(path)
        # `fields`: column names that must all appear in the CSV header.
        self.fields = fields

    def validate_file(self, path):
        """Delegate to the module-level CSV header check."""
        return _validate_csv(path, self.fields)
class GzipValidator(ValidatorBase):
    """Minimal gzip file validation.

    The original implementation accepted any path unconditionally (a
    stub that validated nothing). This version verifies the two-byte
    gzip magic number (0x1f 0x8b, per RFC 1952) so obviously corrupt or
    mislabelled files are rejected; it does not decompress the stream.
    """

    # First two bytes of every gzip member (RFC 1952, section 2.3.1).
    GZIP_MAGIC = b'\x1f\x8b'

    def validate_file(self, path):
        """Return True if `path` starts with the gzip magic bytes.

        :raises ValueError: if the file is shorter than two bytes or
            does not start with the gzip magic number
        """
        with open(path, 'rb') as f:
            magic = f.read(2)
        if magic != self.GZIP_MAGIC:
            msg = "Invalid gzip file {p}. Bad magic bytes {m!r}".format(p=path, m=magic)
            log.error(msg)
            raise ValueError(msg)
        return True
class JsonValidator(ValidatorBase):
    """Smoke-tests that a file parses as JSON."""

    def validate_file(self, path):
        """Parse the file; any JSON error propagates to the caller."""
        with open(path, 'r') as f:
            json.load(f)
        return True
class JsonReportValidator(ValidatorBase):
    """Smoke test for pbreports-generated JSON files.

    Only the required root-level keys are checked; deeper structure is
    not (yet) validated.
    """

    def validate_file(self, path):
        return _validate_json_report(path)
class XmlValidator(ValidatorBase):
    """Smoke-tests that a file is well-formed XML."""

    def validate_file(self, path):
        """Parse the XML; ElementTree raises on malformed input.

        getroot() is called only to exercise the parsed tree — the
        original bound it to an unused local `r`, removed here.
        """
        ElementTree(file=path).getroot()
        return True
class MappingChemistryXmlValidator(ValidatorBase):
    """Validates a chemistry-mapping XML document with this scheme:

    <Map><Mapping><Movie>m130306_..._s1_p0</Movie>
    <SequencingChemistry>C2</SequencingChemistry></Mapping></Map>
    """

    def validate_file(self, path):
        """Delegate to the module-level chemistry-mapping XML check."""
        return _validate_chemistry_mapping_xml(path)
class _PbcoreReaderValidator(ValidatorBase):
    """Validates a file by streaming it through a pbcore reader.

    Iterating every record without an exception is taken as proof the
    file is well-formed; subclasses only override READER_CLASS.
    """
    READER_CLASS = FastaReader  # default; subclasses override

    def validate_file(self, path):
        with self.READER_CLASS(path) as reader:
            n_records = sum(1 for _ in reader)
        # If we get here the full parse succeeded.
        log.debug("{r} parsed {n} Records from {p}".format(
            n=n_records, p=path, r=self.READER_CLASS.__name__))
        return True
class FastaValidator(_PbcoreReaderValidator):
    # Streams the file through pbcore's FastaReader to confirm it parses.
    READER_CLASS = FastaReader
class FastqValidator(_PbcoreReaderValidator):
    # Streams the file through pbcore's FastqReader to confirm it parses.
    READER_CLASS = FastqReader
class GffValidator(_PbcoreReaderValidator):
    # Streams the file through pbcore's GffReader to confirm it parses.
    READER_CLASS = GffReader
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment