Skip to content

Instantly share code, notes, and snippets.

@Buttonwood
Last active August 29, 2015 14:01
Show Gist options
  • Select an option

  • Save Buttonwood/703bd319467e38227a2e to your computer and use it in GitHub Desktop.

Select an option

Save Buttonwood/703bd319467e38227a2e to your computer and use it in GitHub Desktop.
FastQC
// #=============================================================================
// # FileName: fastqdup.cpp
// # Desc: A short program for duplication reads statistics.
// # Author: tanhao
// # Email: tanhao2013@gmail.com
// # HomePage: http://buttonwood.github.io
// # Version: 0.0.1
// # LastChange: 2014-04-15 10:37:49
// # History:
// #=============================================================================
// gcc -Wall fastqdup.cpp -lstdc++ -lgzstream -lz -o fastqdup
#include <stdio.h>
#include <bitset>
#include <string>
#include <iostream>
#include <fstream>
#include <functional>
#include <algorithm>
#include <map>
#include <gzstream.h>
#include <unordered_map>
//#include <unordered_set>
//#include <boost/unordered_map.hpp>
using namespace std;
string seq2bit(const string &str){
string res = "";
for(size_t i = 0; i < str.length(); ++i)
{
if(str[i] == 'A' || str[i] == 'a'){
res += "00";
}
if(str[i] == 'T' || str[i] == 't'){
res += "01";
}
if(str[i] == 'C' || str[i] == 'c'){
res += "10";
}
if(str[i] == 'G' || str[i] == 'g'){
res += "11";
}
if(str[i] == 'N' || str[i] == 'n'){
res += "00";
}
}
return res;
}
void usage(){
cout << "Usage:\n\tfastqdup read1.fq[.gz] read2.fq[.gz] clean1.fq clean2.fq dup.tab 1>log 2>err\n" << endl;
exit(1);
}
int main(int argc, char const *argv[])
{
if (argc<5){
usage();
exit(1);
}
igzstream r1(argv[1]);
igzstream r2(argv[2]);
ofstream c1(argv[3]);
ofstream c2(argv[4]);
ofstream s1(argv[5]);
typedef bitset<100> basebit;
//typedef boost::unordered_map<basebit, long> simap;
typedef map<basebit, long> simap;
simap mymap;
string read1, seq1, qual1, read2, seq2, qual2, line;
long num = 0;
while(getline(r1, read1, '\n')) {
num++;
getline(r2, read2, '\n');
getline(r1, seq1, '\n');
getline(r2, seq2, '\n');
getline(r1, line, '\n');
getline(r2, line, '\n');
getline(r1, qual1, '\n');
getline(r2, qual2, '\n');
line = seq1 + seq2;
string t = seq2bit(line);
basebit f(t);
simap::const_iterator got = mymap.find(f);
if ( got == mymap.end() ){
mymap.insert(make_pair(f, num));
c1 << read1 << "\n" << seq1 << "\n" << "+\n" << qual1 << endl;
c2 << read2 << "\n" << seq2 << "\n" << "+\n" << qual2 << endl;
}else{
s1 << got->second << "\t" << num << endl;
}
}
r1.close();
r2.close();
c1.close();
c2.close();
s1.close();
return 0;
}
#=============================================================================
# FileName: fastqdup.py
# Desc: A short script for fastq duplication statistics.
# Author: tanhao
# Email: tanhao2013@gmail.com
# HomePage: http://buttonwood.github.io
# Version: 0.0.1
# LastChange: 2014-04-15 11:52:31
# History:
#=============================================================================
import argparse
import gzip
# def byLineReader(filename):
# if filename.endswith(".gz"):
# f = gzip.open(filename,'r')
# else:
# f = open(filename,'r')
# line = f.readline().rstrip("\n")
# while line:
# yield line
# line = f.readline().rstrip("\n")
# f.close()
# yield None
def main(args):
prefix = args.read1.split("\.")[0]
suffix = "clean.dup.gz"
if args.suffix:
suffix = args.suffix + ".dup.gz"
clean1 = prefix + ".R1." + suffix
clean2 = prefix + ".R2." + suffix
statis = prefix + ".dup.stat"
if not args.read1.endswith(".gz"):
a = open(args.read1,'r')
b = open(args.read2,'r')
else:
a = gzip.open(args.read1,'r')
b = gzip.open(args.read2,'r')
# a = byLineReader(args.read1)
# b = byLineReader(args.read2)
c = gzip.open(clean1,'w')
d = gzip.open(clean2,'w')
e = open(statis, "w")
dup = {}
num = 0
while 1:
num += 1
f1_tag = a.readline().rstrip("\n")
if not f1_tag:
break
f1_seq = a.readline().rstrip("\n")
a.readline().rstrip("\n")
f1_qua = a.readline().rstrip("\n")
f2_tag = b.readline().rstrip("\n")
f2_seq = b.readline().rstrip("\n")
b.readline().rstrip("\n")
f2_qua = b.readline().rstrip("\n")
pe_seq = f1_seq + f2_seq
kseq = hash(pe_seq)
if kseq in dup:
e.write(repr(dup[kseq]) + "\t" + repr(num) + "\n")
else:
dup[kseq] = num
c.write(f1_tag + "\n")
c.write(f1_seq + "\n")
c.write("+\n")
c.write(f1_qua + "\n")
d.write(f2_tag + "\n")
d.write(f2_seq + "\n")
d.write("+\n")
d.write(f2_qua + "\n")
for ifile in [a,b,c,d,e]:
ifile.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
formatter_class=argparse.RawTextHelpFormatter,
description="""
A script for fastq duplication statistics.
Input ->
A pair of reads, also support for gz files.
Output ->
Clean PE reads; and a stat file records for duplication
reads locations for checking up.
Examples ->
python fastqdup.py -r1 r1.fq -r2 r2.fq -s clean
""")
parser.add_argument(
"-r1", "--read1",
required=True,
help="input read1 file [must]")
parser.add_argument(
"-r2", "--read2",
required=True,
help="input read2 file [must]")
parser.add_argument(
"-s", "--suffix",
required=False,
help="output files suffix [Default:clean]")
args = parser.parse_args()
main(args)
#================ Float like a butterfly! Stand like a buttonwood! ================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment