Created
April 8, 2018 11:16
-
-
Save thanhleviet/5efe525ef80b46669726fae324ca5839 to your computer and use it in GitHub Desktop.
De-homopolymer in nim-lang
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Some proc are stolen from | |
# https://github.com/jhbadger/nimbioseq | |
import sequtils, strutils, math, tables, osproc, streams, os, typetraits | |
type Record* = object | |
## This type represents a genetic sequence with optional quality | |
id*: string | |
description*: string | |
quality*: string | |
sequence*: string | |
proc length*(self:Record): int = | |
## returns length of sequence | |
self.sequence.len() | |
proc check_app*(appname: string): bool = | |
## Check app existed? | |
var command = "type" | |
var process = startProcess(command, args=[appname], options={poUsePath}) | |
var line = TaintedString("") | |
var exited = true | |
while process.outputStream.readLine(line): | |
if line.find("not found") > -1: | |
exited = false | |
process.close | |
return exited | |
iterator compressedLines*(filename: string): string = | |
## iterator to read lines of a (maybe) compressed text file transparently | |
var command = "none" | |
if filename.find(".gz") > -1: | |
command = "gzcat" | |
elif filename.find(".bz2") > -1: | |
command = "bzcat" | |
if command == "none": | |
for line in lines filename: | |
yield line | |
else: | |
var process = startProcess(command, args=[filename], options={poUsePath}) | |
var line = TaintedString("") | |
while process.outputStream.readLine(line): | |
yield line | |
process.close | |
iterator readFasta*(filename: string): Record = | |
## iterator to iterate over the FASTA records in a file | |
var s = Record(id:nil, description:"", sequence:"") | |
var seqLines = @[""] | |
for line in compressedLines filename: | |
if line[0] == '>': | |
if s.id != nil: | |
s.sequence = seqLines.join | |
yield s | |
s.id = nil | |
s.description = "" | |
s.sequence = "" | |
seqLines = @[] | |
var fields = split(line[1..len(line)-1], ' ', 1) | |
if len(fields) > 1: | |
(s.id, s.description) = fields | |
else: | |
s.id = fields[0] | |
else: | |
seqLines.add(line) | |
if s.id != nil: | |
s.sequence = seqLines.join | |
yield s | |
iterator readFastq*(filename:string): Record = | |
## iterator to iterate over the FASTQ records in a file | |
var s = Record(id:nil, description:"", quality: "", sequence:"") | |
var lineNum = 0 | |
for line in compressedLines filename: | |
if lineNum == 0: | |
if s.id != nil: | |
yield s | |
s.id = nil | |
s.description = "" | |
s.sequence = "" | |
s.quality = "" | |
var fields = split(line[1..len(line)-1], ' ', 1) | |
if len(fields) > 1: | |
(s.id, s.description) = fields | |
else: | |
s.id = fields[0] | |
elif lineNum == 1: | |
s.sequence = line | |
elif lineNum == 3: | |
s.quality = line | |
lineNum = (lineNum + 1) mod 4 | |
if s.id != nil: | |
yield s | |
iterator readSeqs*(filename:string):Record = | |
for line in compressedLines filename: | |
if line[0] == '>': | |
for s in readFasta(filename): | |
yield s | |
break | |
elif line[0] == '@': | |
for s in readFastq(filename): | |
yield s | |
break | |
else: | |
echo "I don't know what type of file is " & filename | |
proc dehomo*(input:Record): Record = | |
var i = 0 | |
var dehomo = Record(id:nil, description:"", quality: "", sequence:"") | |
dehomo.id = input.id | |
dehomo.description = input.description | |
while i < input.length: | |
if i == 0: | |
dehomo.sequence.add(input.sequence[i]) | |
dehomo.quality.add(input.quality[i]) | |
else: | |
if input.sequence[i] != input.sequence[i-1]: | |
dehomo.sequence.add(input.sequence[i]) | |
dehomo.quality.add(input.quality[i]) | |
i += 1 | |
return dehomo | |
proc dehomopolymerate(input: string, output: string, summary = false) = | |
let progName = split(getAppFilename(), "/")[getAppFileName().count("/")] | |
if input != nil: | |
var o = system.open(output, fmWrite) | |
var s_sum = 0 | |
var dehomo_sum = 0 | |
var num_reads = 0 | |
for s in readSeqs(input): | |
var o_dehomo: Record | |
o_dehomo = dehomo(s) | |
o.writeline("@",o_dehomo.id," ", o_dehomo.description) | |
o.writeline(o_dehomo.sequence) | |
o.writeline("+") | |
o.writeline(o_dehomo.quality) | |
s_sum += s.length | |
dehomo_sum += o_dehomo.length | |
num_reads += 1 | |
o.close() | |
if summary: | |
echo "INPUT: ", s_sum, " bases" | |
echo "OUTPUT: ", dehomo_sum, " bases" | |
echo "Total reads: ", num_reads | |
else: | |
echo progName & ": needs file name" | |
# Need to have cligen installed, with nimble | |
when isMainModule: import cligen;dispatch(dehomopolymerate) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment