Last active
June 13, 2018 14:22
-
-
Save haxiomic/a78ced3c70f5db065b0b5679b6fdb9e1 to your computer and use it in GitHub Desktop.
Parses genomic feature files in the gff3 format – purely a line-parser in that it does not build up objects but instead calls callbacks for each line-type
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* # GFF3 File format | |
* https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md | |
*/ | |
export type LineCallbacks = { | |
// directives | |
onVersion: (versionString: string) => void, | |
onSequenceRegion: (seqId: string | null, start: number | null, end: number | null) => void, | |
onFeatureOntology: (uri: string) => void, | |
onAttributeOntology: (uri: string) => void, | |
onSourceOntology: (uri: string) => void, | |
onSpecies: (ncbiTaxonomyUri: string) => void, | |
onGenomeBuild: (source: string, buildName: string) => void, | |
onFeatureGroupTermination: () => void, // ### | |
onFastaStart: () => void, | |
onComment: (comment: string) => void, | |
onFeature: ( | |
seqId: string, | |
source: string | null, | |
type: string, | |
start: number | null, | |
end: number | null, | |
score: number | null, | |
strand: Strand, | |
phase: Phase | null, | |
attributes: Attributes, | |
) => void, | |
// error handling | |
onInvalidFeature: (line: string, reason: string) => void, | |
onInvalidAttribute: (assignment: string, reason: string) => void, | |
onUnknownDirective: (name: string, parameter: string | null) => void, | |
onInvalidDirective: (content: string, reason: string) => void, | |
// once the ##FASTA directive has been encountered the rest of the file is FASTA content | |
onFastaChunk: (string: string) => void, | |
onComplete: () => void | |
} | |
export type Attributes = { | |
id?: string, | |
name?: string, | |
aliases?: Array<string>, | |
parentIds?: Array<string>, | |
target?: { // Target=EST23 1 21 | |
id: string, | |
start: number, | |
end: number, | |
strand?: Strand, | |
}, | |
gap?: string, | |
derivesFromId?: string, | |
notes?: Array<string>, | |
dbxrefs?: Array<string>, | |
ontologyTerms?: Array<string>, | |
isCircular: boolean, | |
custom: { [key: string]: Array<string> } | |
} | |
export enum Strand { | |
None, | |
Unknown, | |
Positive, | |
Negative, | |
} | |
export type Phase = number; | |
export class GFF3LineParser { | |
protected callbacks: LineCallbacks = { | |
onVersion: (versionString: string) => {}, | |
onSequenceRegion: (seqId: string, start: number, end: number) => {}, | |
onFeatureOntology: (uri: string) => {}, | |
onAttributeOntology: (uri: string) => {}, | |
onSourceOntology: (uri: string) => {}, | |
onSpecies: (ncbiTaxonomyUri: string) => {}, | |
onGenomeBuild: (source: string, buildName: string) => {}, | |
onFeatureGroupTermination: () => {}, // ### | |
onFastaStart: () => {}, | |
onComment: (comment: string) => {}, | |
onFeature: ( | |
seqId: string, | |
source: string | null, | |
type: string, | |
start: number | null, | |
end: number | null, | |
score: number | null, | |
strand: Strand, | |
phase: Phase | null, | |
attributes: Attributes, | |
) => {}, | |
// error handling | |
onUnknownDirective: (name: string, parameter: string | null) => {}, | |
onInvalidDirective: (content: string, reason: string) => {}, | |
onInvalidFeature: (line: string, reason: string) => {}, | |
onInvalidAttribute: (assignment: string, reason: string) => {}, | |
onFastaChunk: (string: string) => {}, | |
onComplete: () => {} | |
}; | |
// parse state | |
protected lineNumber = 0; | |
protected fastaMode: boolean; | |
protected incompleteLineBuffer: string; | |
constructor(callbacks: Partial<LineCallbacks>) { | |
this.callbacks = { | |
...this.callbacks, | |
...callbacks | |
} | |
this.reset(); | |
} | |
parseChunk = (string: string) => { | |
if (this.fastaMode) { | |
this.callbacks.onFastaChunk(string); | |
return; | |
} | |
let lineStart = 0; | |
for (let i = 0; i < string.length; i++) { | |
let char = string.charAt(i); | |
if (char === '\n') { | |
this.parseLine(this.incompleteLineBuffer + string.substring(lineStart, i), this.lineNumber++); | |
lineStart = i + 1; | |
this.incompleteLineBuffer = ''; | |
} | |
} | |
this.incompleteLineBuffer = string.substring(lineStart); | |
} | |
end = () => { | |
this.parseLine(this.incompleteLineBuffer, this.lineNumber++); | |
this.callbacks.onComplete(); | |
} | |
reset = () => { | |
this.lineNumber = 0; | |
this.incompleteLineBuffer = ''; | |
this.fastaMode = false; | |
} | |
protected parseLine(line: string, lineNumber: number) { | |
// empty lines are allowed and skipped | |
if (line === '') return; | |
// if line starts with a # it's a meta line – a comment or a directive | |
if (line[0] === '#') { | |
this.parseMeta(line); | |
} else { | |
// parse line | |
let columns = line.split('\t'); | |
if (columns.length !== 9) { | |
this.callbacks.onInvalidFeature(line, `Expected 9 tab-separated columns, got ${columns.length}`); | |
} else { | |
this.callbacks.onFeature( | |
// seqId | |
decodeURIComponent(columns[0]), | |
// source | |
this.parseOptional(columns[1]) === null ? null : decodeURIComponent(columns[1]), | |
// type | |
decodeURIComponent(columns[2]), | |
// start | |
this.parseOptional(columns[3]) === null ? null : parseInt(columns[3]), | |
// end | |
this.parseOptional(columns[4]) === null ? null : parseInt(columns[4]), | |
// score | |
this.parseOptional(columns[5]) === null ? null : parseFloat(columns[5]), | |
// strand | |
this.parseStrand(this.parseOptional(columns[6])), | |
// phase | |
this.parseOptional(columns[7]) === null ? null : parseInt(columns[7]), | |
// attributes | |
this.parseAttributes(this.parseOptional(columns[8])) | |
); | |
} | |
} | |
} | |
protected parseOptional(field: string): string | null { | |
return field === '.' ? null : field; | |
} | |
protected parseStrand(field: string | null): Strand { | |
switch (field) { | |
case '+': return Strand.Positive; | |
case '-': return Strand.Negative; | |
case '?': return Strand.Unknown; | |
default: return Strand.None; | |
} | |
} | |
protected parseAttributes(field: string | null): Attributes { | |
// create empty attributes object | |
let attributes: Attributes = { | |
isCircular: false, | |
custom: {} | |
}; | |
// field can be null | |
if (field == null) { | |
return attributes; | |
} | |
let assignments = field.split(';'); | |
for (let assignment of assignments) { | |
try { | |
let e = assignment.indexOf('='); | |
if (e === -1) { | |
throw `Assignment must contain a '=' character`; | |
} | |
let tag = decodeURIComponent(assignment.substring(0, e)).trim(); | |
let values = assignment.substring(e + 1).split(',').map(decodeURIComponent); | |
// tags are case sensitive | |
switch (tag) { | |
case 'ID': { | |
attributes.id = values[0]; | |
break; | |
} | |
case 'Name': { | |
attributes.name = values[0]; | |
break; | |
} | |
case 'Alias': { | |
attributes.aliases = values; | |
break; | |
} | |
case 'Parent': { | |
attributes.parentIds = values; | |
break; | |
} | |
case 'Target': { | |
let result = values[0].match(/([^\s]+)\s+(\d+)\s+(\d+)(\s+([+-]))?/); | |
if (result !== null) { | |
attributes.target = { | |
id: result[1], | |
start: parseInt(result[2]), | |
end: parseInt(result[3]), | |
strand: this.parseStrand(result[5]), | |
}; | |
} else { | |
throw 'Could not parse target format'; | |
} | |
break; | |
} | |
case 'Gap': { | |
attributes.gap = values[0]; | |
break; | |
} | |
case 'Derives_from': { | |
attributes.derivesFromId = values[0]; | |
break; | |
} | |
case 'Note': { | |
attributes.notes = values; | |
break; | |
} | |
case 'Dbxref': { | |
attributes.dbxrefs = values; | |
break; | |
} | |
case 'Ontology_term': { | |
attributes.ontologyTerms = values; | |
break; | |
} | |
case 'Is_circular': { | |
attributes.isCircular = (values[0].toLowerCase().trim()) === 'true'; | |
break; | |
} | |
default: { | |
attributes.custom[tag] = values; | |
break; | |
} | |
} | |
} catch (e) { | |
this.callbacks.onInvalidAttribute(assignment, e); | |
} | |
} | |
return attributes; | |
} | |
// #... | |
protected parseMeta(line: string) { | |
if (line[1] === '#') { | |
// if a meta starts with ## then it's a directive | |
this.parseDirective(line); | |
} else { | |
this.callbacks.onComment(line.substr(1)); | |
} | |
} | |
// ##... | |
protected parseDirective(line: string) { | |
let content = line.substring(2); | |
let namePattern = /^([^\s]+)(\s+(.*))?/; | |
let result = namePattern.exec(content); | |
if (result === null) { | |
this.callbacks.onInvalidDirective(content, 'Invalid directive name'); | |
return; | |
} | |
let name: string = result[1].toLowerCase(); | |
let parameter: string | null = result[3]; | |
try { | |
switch (name) { | |
case '#': { | |
if (parameter != null) throw 'Feature termination directive must not have any parameter'; | |
this.callbacks.onFeatureGroupTermination(); | |
break; | |
} | |
case 'gff-version': { | |
if (parameter == null) throw 'Missing version string'; | |
this.callbacks.onVersion(parameter.trim()); | |
break; | |
} | |
case 'sequence-region': { | |
let seqId: string | null = null; | |
let start: number | null = null; | |
let end: number | null = null; | |
let match = (parameter || '').match(/^([^\s]+)(\s+(\d+))?(\s+(\d+))?/); | |
if (match !== null) { | |
seqId = decodeURIComponent(match[1]); | |
start = match[3] === undefined ? null : parseInt(match[3]); | |
end = match[5] === undefined ? null : parseInt(match[5]); | |
} | |
this.callbacks.onSequenceRegion(seqId, start, end); | |
break; | |
} | |
case 'feature-ontology': { | |
if (parameter == null) throw 'Missing URI'; | |
this.callbacks.onFeatureOntology(parameter); | |
break; | |
} | |
case 'attribute-ontology': { | |
if (parameter == null) throw 'Missing URI'; | |
this.callbacks.onAttributeOntology(parameter); | |
break; | |
} | |
case 'source-ontology': { | |
if (parameter == null) throw 'Missing URI'; | |
this.callbacks.onSourceOntology(parameter); | |
break; | |
} | |
case 'species': { | |
if (parameter == null) throw 'Missing species'; | |
this.callbacks.onSpecies(decodeURIComponent(parameter)); | |
break; | |
} | |
case 'genome-build': { | |
if (parameter == null) throw 'Missing source and build name'; | |
let parts = (parameter || '').split(/\s+/); | |
this.callbacks.onGenomeBuild(decodeURIComponent(parts[0]), decodeURIComponent(parts[1])); | |
break; | |
} | |
case '#': { // ### | |
this.callbacks.onFeatureGroupTermination(); | |
break; | |
} | |
case 'fasta': { | |
this.fastaMode = true; | |
this.callbacks.onFastaStart(); | |
break; | |
} | |
default: { | |
this.callbacks.onUnknownDirective(name, parameter); | |
break; | |
} | |
} | |
} catch (reason) { | |
this.callbacks.onInvalidDirective(content, reason); | |
} | |
} | |
} | |
export default GFF3LineParser; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment