/**
 * The module bed is a set of functions to parse delimiter-separated text formats.
 * Supported formats:
 *   - .mat matrix files
 *   - .bed UCSC files
 * To parse a plain .csv file use std.csv directly.
 */
module bed;

import std.conv;
import std.stdio;
import std.csv;
import std.traits;
import std.file;
import std.array;
import std.algorithm;
import std.range;
import std.string;
import std.exception;

/**
 * matrixReader
 * Load a matrix from a text file, one matrix row per line.
 * Params:
 *      filePath  = path to the file which contains the matrix
 *      separator = delimiter used in the file to separate columns, tab by default
 * Returns:
 *      A 2D array holding one inner array per line of the file. Empty cells
 *      (produced by consecutive delimiters) are dropped, so an empty line
 *      yields an empty row.
 * Throws:
 *      ConvException when a cell can not be converted to T, and whatever
 *      std.stdio.File throws when the file can not be opened.
 */
T[][] matrixReader( T )( string filePath, string separator = "\t" ){
    File matrixFile = File( filePath, "r" );
    T[][] matrix;
    foreach( line; matrixFile.byLine() ){
        char[] row = line;
        // byLine only removes '\n'; drop the '\r' left over by CRLF files so
        // the last cell of each row still converts.
        if( row.length > 0 && row[$ - 1] == '\r' )
            row = row[0 .. $ - 1];
        // split does not merge consecutive delimiters, hence the filter.
        // to!T copies each cell, so reusing the byLine buffer is safe.
        matrix ~= array( map!(to!T)( filter!"!a.empty"( row.split( separator ) ) ) );
    }
    return matrix;
}

// Record layouts used by bedReader. The numeric suffix is the index of the
// last column held by the struct (BedDataN has columns 0 .. N).
//
// NOTE(review): the layouts are kept unchanged for compatibility. itemRgb is
// declared as size_t[3] and blockSizes / blockStarts as a single size_t, while
// BED files usually store these columns as comma separated lists; confirm that
// std.csv can convert such columns before relying on BedData8 and above.

struct BedData3{
    string  chrom;          // 0
    size_t  chromStart;     // 1
    size_t  chromEnd;       // 2
    string  name;           // 3
}

struct BedData4{
    string  chrom;          // 0
    size_t  chromStart;     // 1
    size_t  chromEnd;       // 2
    string  name;           // 3
    size_t  score;          // 4
}

struct BedData5{
    string  chrom;          // 0
    size_t  chromStart;     // 1
    size_t  chromEnd;       // 2
    string  name;           // 3
    size_t  score;          // 4
    char    strand;         // 5
}

struct BedData6{
    string  chrom;          // 0
    size_t  chromStart;     // 1
    size_t  chromEnd;       // 2
    string  name;           // 3
    size_t  score;          // 4
    char    strand;         // 5
    size_t  thickStart;     // 6
}

struct BedData7{
    string  chrom;          // 0
    size_t  chromStart;     // 1
    size_t  chromEnd;       // 2
    string  name;           // 3
    size_t  score;          // 4
    char    strand;         // 5
    size_t  thickStart;     // 6
    size_t  thickEnd;       // 7
}

struct BedData8{
    string      chrom;      // 0
    size_t      chromStart; // 1
    size_t      chromEnd;   // 2
    string      name;       // 3
    size_t      score;      // 4
    char        strand;     // 5
    size_t      thickStart; // 6
    size_t      thickEnd;   // 7
    size_t[3]   itemRgb;    // 8
}

struct BedData9{
    string      chrom;      // 0
    size_t      chromStart; // 1
    size_t      chromEnd;   // 2
    string      name;       // 3
    size_t      score;      // 4
    char        strand;     // 5
    size_t      thickStart; // 6
    size_t      thickEnd;   // 7
    size_t[3]   itemRgb;    // 8
    size_t      blockCount; // 9
}

struct BedData10{
    string      chrom;      // 0
    size_t      chromStart; // 1
    size_t      chromEnd;   // 2
    string      name;       // 3
    size_t      score;      // 4
    char        strand;     // 5
    size_t      thickStart; // 6
    size_t      thickEnd;   // 7
    size_t[3]   itemRgb;    // 8
    size_t      blockCount; // 9
    size_t      blockSizes; // 10
}

struct BedData11{
    string      chrom;      // 0
    size_t      chromStart; // 1
    size_t      chromEnd;   // 2
    string      name;       // 3
    size_t      score;      // 4
    char        strand;     // 5
    size_t      thickStart; // 6
    size_t      thickEnd;   // 7
    size_t[3]   itemRgb;    // 8
    size_t      blockCount; // 9
    size_t      blockSizes; // 10
    size_t      blockStarts;// 11
}

/**
 * Header information of a bed file: the "browser" lines plus a few values
 * of the "track" line.
 */
struct BedMetadata{
    string name;
    string description;
    size_t visibility;
    string itemRgb;
    size_t browserStart;
    size_t browserEnd;
    string chromosome;
    string hide;

    /**
     * Render the metadata back as bed header lines.
     * A section is only emitted when all the fields it needs are set
     * (non empty strings, non zero numbers).
     */
    string toString() const {
        string result = "";
        if( chromosome != "" && browserStart != 0 && browserEnd != 0 )
            result ~= "browser position %s:%d-%d\n".format( chromosome, browserStart, browserEnd );
        if( hide != "" )
            result ~= "browser hide %s\n".format( hide );
        if( name != "" && description != "" && visibility != 0 )
            result ~= "track name=%s description=%s visibility=%d\n".format( name, description, visibility );
        if( itemRgb != "" )
            result ~= "itemRgb=\"%s\"".format( itemRgb );
        return result;
    }
}

/// Attributes of a "track" line; attributes missing from the line keep their default value.
struct TrackLine{
    string      name;
    string      description;
    string      type;
    size_t      visibility;
    size_t[3]   color;
    string      itemRgb;
    size_t[3]   colorByStrand;
    size_t      useScore;
    string      group;
    string      db;
    size_t      offset;
    size_t      maxItems;
    string      url;
    string      htmlUrl;
    string      bigDataUrl;
}

/// A parsed bed file: header information plus the list of records of type T.
struct Bed( T ){
    BedMetadata metadata;
    TrackLine   trackLine;
    T[]         bedDataList;
}

/// Split a track line into its key=value attributes; values may be double quoted.
private string[string] parseTrackAttributes( in char[] line ){
    static bool isBlank( char c ){
        return c == ' ' || c == '\t' || c == '\r' || c == '\n';
    }

    string[string] attributes;
    immutable size_t total = line.length;
    size_t i = 0;
    while( i < total ){
        while( i < total && isBlank( line[i] ) )
            i++;
        if( i >= total )
            break;

        immutable size_t keyStart = i;
        while( i < total && line[i] != '=' && !isBlank( line[i] ) )
            i++;
        if( i >= total || line[i] != '=' )
            continue;               // bare word such as the leading "track"
        string key = line[keyStart .. i].idup;
        i++;                        // skip '='

        size_t valueStart;
        size_t valueEnd;
        if( i < total && line[i] == '"' ){
            // quoted value: runs up to the closing quote, or to the end of
            // the line when the quote is never closed
            i++;
            valueStart = i;
            while( i < total && line[i] != '"' )
                i++;
            valueEnd = i;
            if( i < total )
                i++;                // skip the closing quote
        }
        else{
            // bare value: runs up to the next blank
            valueStart = i;
            while( i < total && !isBlank( line[i] ) )
                i++;
            valueEnd = i;
        }
        attributes[key] = line[valueStart .. valueEnd].idup;
    }
    return attributes;
}

/// Convert a "r,g,b" text into its three components; throws when it does not hold exactly three numbers.
private size_t[3] parseRgb( in char[] text ){
    size_t[3] rgb;
    auto parts = text.strip.splitter( ',' ).array;
    enforce( parts.length == 3, "Malformed RGB value: " ~ text.idup );
    foreach( i, part; parts )
        rgb[i] = to!size_t( part.strip );
    return rgb;
}

/// Convert a visibility given either as a number or as a UCSC display mode name.
private size_t parseVisibility( string value ){
    switch( value.toLower ){
        case "hide":    return 0;
        case "dense":   return 1;
        case "full":    return 2;
        case "pack":    return 3;
        case "squish":  return 4;
        default:        return to!size_t( value );
    }
}

/// Parse the "chrom:start-end" part of a "browser position" line into metadata.
private void parseBrowserPosition( in char[] position, ref BedMetadata metadata ){
    auto text = position.strip;
    immutable ptrdiff_t colon = text.indexOf( ':' );
    enforce( colon > 0, "Malformed metadata line" );

    // only the first word after the colon is the coordinate range
    auto span = text[colon + 1 .. $];
    immutable ptrdiff_t blank = span.indexOf( ' ' );
    if( blank >= 0 )
        span = span[0 .. blank];

    immutable ptrdiff_t dash = span.indexOf( '-' );
    enforce( dash > 0 && cast(size_t) dash + 1 < span.length, "Malformed metadata line" );

    metadata.chromosome     = text[0 .. colon].strip.idup;
    metadata.browserStart   = to!size_t( span[0 .. dash] );
    metadata.browserEnd     = to!size_t( span[dash + 1 .. $] );
}

/**
 * trackLineReader
 * Parse a bed "track" line made of key=value attributes.
 * Params:
 *      trackLine = the whole line, for instance
 *                  track name="reads" description="Paired reads" visibility=2
 * Returns:
 *      A TrackLine where each attribute found in the line is set. Values
 *      may be written with or without double quotes. Unknown attributes are
 *      ignored.
 * Throws:
 *      ConvException when a numeric attribute (visibility, useScore, offset,
 *      maxItems, color components) is not a number, Exception when a color
 *      does not have three components.
 */
TrackLine trackLineReader( in char[] trackLine ){
    TrackLine result;
    string[string] attributes = parseTrackAttributes( trackLine );

    // value of a text attribute, null when the attribute is absent
    string text( string key ){
        auto found = key in attributes;
        return found is null ? null : *found;
    }

    result.name         = text( "name" );
    result.description  = text( "description" );
    result.type         = text( "type" );
    result.itemRgb      = text( "itemRgb" );
    result.group        = text( "group" );
    result.db           = text( "db" );
    result.url          = text( "url" );
    result.htmlUrl      = text( "htmlUrl" );
    result.bigDataUrl   = text( "bigDataUrl" );

    if( auto value = "visibility" in attributes )
        result.visibility = parseVisibility( *value );
    if( auto value = "color" in attributes )
        result.color = parseRgb( *value );
    if( auto value = "colorByStrand" in attributes ){
        // colorByStrand may list two colors separated by a blank, but the
        // field can only hold one: keep the first.
        string first = (*value).strip;
        immutable ptrdiff_t blank = first.indexOf( ' ' );
        if( blank >= 0 )
            first = first[0 .. blank];
        result.colorByStrand = parseRgb( first );
    }
    if( auto value = "useScore" in attributes )
        result.useScore = to!size_t( *value );
    if( auto value = "offset" in attributes )
        result.offset = to!size_t( *value );
    if( auto value = "maxItems" in attributes )
        result.maxItems = to!size_t( *value );

    return result;
}

/**
 * bedReader
 * Load a bed file: comments, "browser" lines, the "track" line and the records.
 * Params:
 *      T         = record type, one field per column (see the BedDataN structs)
 *      filePath  = path to the bed file
 *      delimiter = column delimiter of the records, tab by default
 * Returns:
 *      A Bed!T holding the metadata, the track line and one T per data line.
 * Throws:
 *      FileException when filePath does not exist or is not a file,
 *      Exception when a browser line is malformed, and the exceptions of
 *      std.csv / std.conv when a data line does not match T.
 */
auto bedReader( T = BedData3 )( in char[] filePath, char delimiter='\t' ){
    if( !filePath.exists )
        throw new FileException( filePath, "file does not exist" );
    else if( !filePath.isFile )
        throw new FileException( filePath, "not a regular file" );

    File    bedFile = File( to!string(filePath), "r" );
    Bed!T   bedInstance;

    const string browserToken1  = "browser position";
    const string browserToken2  = "browser hide";
    const string browserToken   = "browser ";
    const string trackToken     = "track ";

    foreach( char[] rawLine; bedFile.byLine() ){
        char[] line = rawLine;
        // byLine only removes '\n'; drop the '\r' left over by CRLF files
        if( line.length > 0 && line[$ - 1] == '\r' )
            line = line[0 .. $ - 1];

        if( line.empty || line.startsWith( '#' ) )          // empty line or comment
            continue;
        else if( line.startsWith( browserToken1 ) )
            parseBrowserPosition( line[browserToken1.length .. $], bedInstance.metadata );
        else if( line.startsWith( browserToken2 ) ){
            if( line.length > browserToken2.length + 1 )
                bedInstance.metadata.hide = line[browserToken2.length + 1 .. $].idup;
            else
                throw new Exception( "Malformed metadata line" );
        }
        else if( line.startsWith( browserToken ) )          // other browser directives are not stored
            continue;
        else if( line.startsWith( trackToken ) ){
            bedInstance.trackLine = trackLineReader( line );
            // mirror the values BedMetadata.toString prints
            bedInstance.metadata.name           = bedInstance.trackLine.name;
            bedInstance.metadata.description    = bedInstance.trackLine.description;
            bedInstance.metadata.visibility     = bedInstance.trackLine.visibility;
            bedInstance.metadata.itemRgb        = bedInstance.trackLine.itemRgb;
        }
        else{
            // data in csv format; idup because byLine reuses its buffer
            foreach( record; csvReader!T( line.idup, delimiter ) )
                bedInstance.bedDataList ~= record;
        }
    }
    return bedInstance;
}