lomereiter · February 24, 2013 19:03
diff --git a/gff3_gtf_sketch.d b/gff3_gtf_sketch.d
 unittest {
 import std.conv;
 import bio.gff3.reader;
 import bio.gtf.reader;
 import bio.gff3.writer;
 import bio.gff3.validator;
  
 /// Usage sketch of the proposed GFF3/GTF API.
 ///
 /// Opens "data/test.gff3" and "data/test.gtf", attaches a validator to the
 /// GFF3 reader, asserts on the kind of the first record, writes a modified copy
 /// of every record (followed by the FASTA records) to "<filename>.modified",
 /// and finally iterates over features, printing each feature's parent id.
 void main() {
   // writeln is used at the end of this function; none of the surrounding
   // imports visibly provides it, so bring it in locally.
   import std.stdio : writeln;

   // Keeps all comments and pragmas -- this is cheap.
   // If user doesn't need them -- std.algorithm.filter to the rescue!
   //
   // One simple constructor with filename is enough.
   auto gff = new Gff3Reader("data/test.gff3");
   
   // GtfReader should also be present in bio.gtf.reader
   //
   // These two should probably be inherited from some abstract GffGtfReader,
   // and only call set_input_... damn, I can't recall what this method is called, 
   // that's why two classes are needed! *looks at the code* set_data_format --
   // in the constructor with different parameters.
   auto gtf = new GtfReader("data/test.gtf");
   
   auto validator = new Gff3Validator();
   validator.setStringency(ValidationStringency.silent);
   validator.setOutputFilename("validation.log"); // print there instead of stderr
   // enum ValidationStringency {
   //    silent,   /// skip validation altogether
   //    lenient,  /// print warnings
   //    strict    /// throw exceptions
   // }
   
   gff.setValidator(validator); // will be used in later calls
                                // default stringency should be 'strict'
   
   auto records = gff.records;
   
   // Inspect the first element of the range declared above. (The original
   // sketch referred to an undeclared 'record' here.)
   assert(records.front.is_comment);
   assert(!records.front.is_pragma);
   assert(!records.front.is_regular);
   
   records = gff.records; // gff.records gives a brand-new range each time
   
   // Having such trivial things might seem weird, but this makes for consistent syntax,
   // as for binary formats one can't just convert each record to a string and print it.
   // Among other advantages, Gff3Writer may use fewer allocations, 
   // and users won't forget to print a newline in some place.
   {
     auto w = new Gff3Writer(gff.filename ~ ".modified");
     // Runs when this block is left, i.e. after the record loop below (or if
     // it throws): append the FASTA records and close the writer.
     scope(exit) {
       foreach (fr; gff.fasta_records)
         w.writeFastaRecord(fr);
       w.close();
     }
   
     foreach (record; records) {
       record.start += 1;   // Should be integer, not string!
                            // If parsing time is a big deal - store it in string internally
                            // but provide access via property.
                            // Same for end.
   
       record.strand = '+'; // Strand should be of type 'char' and checked for sanity 
                            // (only '+', '-', '.', '?' values should be allowed).
                            //
                            // Or, better yet, give strand a type and use 'alias this' magic:
                            // struct Strand { char strand; alias strand this; 
                            //  bool is_unknown() @property const { ... }      <- '?'
                            //  bool is_undefined() @property const { ... } }  <- '.'
                            // And then implement property of this type.
   
       // Explicit conversion is fine, but the word 'attributes' is way too long to type.
       //
       // It's easy to use opIndexAssign to delegate this to the dictionary.
       // Clearly, it should have at least two overloads -- for strings and arrays of strings.
       record["Awwww1"] = to!string(12345); 
       record["Awwww2"] = ["1", "2", "3"];
       assert(record["Awwww1"] == "12345"); // checks that length is 1 and compares first element
       // i.e. opEquals should also have two overloads -- for strings and arrays of strings
   
       w.writeRecord(record);
     }
   }
   
   // How to access features? 
   // I looked at FeatureRange, and almost all of its methods 
   // do nothing but just delegate to underlying record range.
   //
   // Therefore, it makes sense to have a simple function
   // which will turn record range into a feature range, with signature
   // auto groupIntoFeatures(R)(R records, bool link=false, size_t cache_size=1024);
   
   auto features = gff.records.groupIntoFeatures();
   
   // provide a few shortcuts as well in Gff3Reader interface
   features = gff.features;               // no cache size parameter for not linked features
   features = gff.linkedFeatures();       // use defaults
   features = gff.linkedFeatures(15000);  // increased cache size
   
   foreach (feature; features) {
     writeln(feature.parent.id);
   }
 }
	unittest {
	import std.conv;
	import bio.gff3.reader;
	import bio.gtf.reader;
	import bio.gff3.writer;
	import bio.gff3.validator;

	/// Usage sketch of the proposed GFF3/GTF API.
	///
	/// Opens "data/test.gff3" and "data/test.gtf", attaches a validator to the
	/// GFF3 reader, asserts on the kind of the first record, writes a modified copy
	/// of every record (followed by the FASTA records) to "<filename>.modified",
	/// and finally iterates over features, printing each feature's parent id.
	void main() {
		// writeln is used at the end of this function; none of the surrounding
		// imports visibly provides it, so bring it in locally.
		import std.stdio : writeln;

		// Keeps all comments and pragmas -- this is cheap.
		// If user doesn't need them -- std.algorithm.filter to the rescue!
		//
		// One simple constructor with filename is enough.
		auto gff = new Gff3Reader("data/test.gff3");

		// GtfReader should also be present in bio.gtf.reader
		//
		// These two should probably be inherited from some abstract GffGtfReader,
		// and only call set_input_... damn, I can't recall what this method is called,
		// that's why two classes are needed! *looks at the code* set_data_format --
		// in the constructor with different parameters.
		auto gtf = new GtfReader("data/test.gtf");

		auto validator = new Gff3Validator();
		validator.setStringency(ValidationStringency.silent);
		validator.setOutputFilename("validation.log"); // print there instead of stderr
		// enum ValidationStringency {
		//    silent,   /// skip validation altogether
		//    lenient,  /// print warnings
		//    strict    /// throw exceptions
		// }

		gff.setValidator(validator); // will be used in later calls
		                             // default stringency should be 'strict'

		auto records = gff.records;

		// Inspect the first element of the range declared above. (The original
		// sketch referred to an undeclared 'record' here.)
		assert(records.front.is_comment);
		assert(!records.front.is_pragma);
		assert(!records.front.is_regular);

		records = gff.records; // gff.records gives a brand-new range each time

		// Having such trivial things might seem weird, but this makes for consistent syntax,
		// as for binary formats one can't just convert each record to a string and print it.
		// Among other advantages, Gff3Writer may use fewer allocations,
		// and users won't forget to print a newline in some place.
		{
			auto w = new Gff3Writer(gff.filename ~ ".modified");
			// Runs when this block is left, i.e. after the record loop below (or if
			// it throws): append the FASTA records and close the writer.
			scope(exit) {
				foreach (fr; gff.fasta_records)
					w.writeFastaRecord(fr);
				w.close();
			}

			foreach (record; records) {
				record.start += 1; // Should be integer, not string!
				// If parsing time is a big deal - store it in string internally
				// but provide access via property.
				// Same for end.

				record.strand = '+'; // Strand should be of type 'char' and checked for sanity
				// (only '+', '-', '.', '?' values should be allowed).
				//
				// Or, better yet, give strand a type and use 'alias this' magic:
				// struct Strand { char strand; alias strand this;
				//  bool is_unknown() @property const { ... }      <- '?'
				//  bool is_undefined() @property const { ... } }  <- '.'
				// And then implement property of this type.

				// Explicit conversion is fine, but the word 'attributes' is way too long to type.
				//
				// It's easy to use opIndexAssign to delegate this to the dictionary.
				// Clearly, it should have at least two overloads -- for strings and arrays of strings.
				record["Awwww1"] = to!string(12345);
				record["Awwww2"] = ["1", "2", "3"];
				assert(record["Awwww1"] == "12345"); // checks that length is 1 and compares first element
				// i.e. opEquals should also have two overloads -- for strings and arrays of strings

				w.writeRecord(record);
			}
		}

		// How to access features?
		// I looked at FeatureRange, and almost all of its methods
		// do nothing but just delegate to underlying record range.
		//
		// Therefore, it makes sense to have a simple function
		// which will turn record range into a feature range, with signature
		// auto groupIntoFeatures(R)(R records, bool link=false, size_t cache_size=1024);

		auto features = gff.records.groupIntoFeatures();

		// provide a few shortcuts as well in Gff3Reader interface
		features = gff.features; // no cache size parameter for not linked features
		features = gff.linkedFeatures(); // use defaults
		features = gff.linkedFeatures(15000); // increased cache size

		foreach (feature; features) {
			writeln(feature.parent.id);
		}
	}