Last active
August 29, 2015 14:01
-
-
Save Buttonwood/703bd319467e38227a2e to your computer and use it in GitHub Desktop.
FastQC
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // #============================================================================= | |
| // # FileName: fastqdup.cpp | |
| // # Desc: A short program for duplication reads statistics. | |
| // # Author: tanhao | |
| // # Email: tanhao2013@gmail.com | |
| // # HomePage: http://buttonwood.github.io | |
| // # Version: 0.0.1 | |
| // # LastChange: 2014-04-15 10:37:49 | |
| // # History: | |
| // #============================================================================= | |
| // gcc -Wall fastqdup.cpp -lstdc++ -lgzstream -lz -o fastqdup | |
| #include <stdio.h> | |
| #include <bitset> | |
| #include <string> | |
| #include <iostream> | |
| #include <fstream> | |
| #include <functional> | |
| #include <algorithm> | |
| #include <map> | |
| #include <gzstream.h> | |
| #include <unordered_map> | |
| //#include <unordered_set> | |
| //#include <boost/unordered_map.hpp> | |
| using namespace std; | |
| string seq2bit(const string &str){ | |
| string res = ""; | |
| for(size_t i = 0; i < str.length(); ++i) | |
| { | |
| if(str[i] == 'A' || str[i] == 'a'){ | |
| res += "00"; | |
| } | |
| if(str[i] == 'T' || str[i] == 't'){ | |
| res += "01"; | |
| } | |
| if(str[i] == 'C' || str[i] == 'c'){ | |
| res += "10"; | |
| } | |
| if(str[i] == 'G' || str[i] == 'g'){ | |
| res += "11"; | |
| } | |
| if(str[i] == 'N' || str[i] == 'n'){ | |
| res += "00"; | |
| } | |
| } | |
| return res; | |
| } | |
| void usage(){ | |
| cout << "Usage:\n\tfastqdup read1.fq[.gz] read2.fq[.gz] clean1.fq clean2.fq dup.tab 1>log 2>err\n" << endl; | |
| exit(1); | |
| } | |
| int main(int argc, char const *argv[]) | |
| { | |
| if (argc<5){ | |
| usage(); | |
| exit(1); | |
| } | |
| igzstream r1(argv[1]); | |
| igzstream r2(argv[2]); | |
| ofstream c1(argv[3]); | |
| ofstream c2(argv[4]); | |
| ofstream s1(argv[5]); | |
| typedef bitset<100> basebit; | |
| //typedef boost::unordered_map<basebit, long> simap; | |
| typedef map<basebit, long> simap; | |
| simap mymap; | |
| string read1, seq1, qual1, read2, seq2, qual2, line; | |
| long num = 0; | |
| while(getline(r1, read1, '\n')) { | |
| num++; | |
| getline(r2, read2, '\n'); | |
| getline(r1, seq1, '\n'); | |
| getline(r2, seq2, '\n'); | |
| getline(r1, line, '\n'); | |
| getline(r2, line, '\n'); | |
| getline(r1, qual1, '\n'); | |
| getline(r2, qual2, '\n'); | |
| line = seq1 + seq2; | |
| string t = seq2bit(line); | |
| basebit f(t); | |
| simap::const_iterator got = mymap.find(f); | |
| if ( got == mymap.end() ){ | |
| mymap.insert(make_pair(f, num)); | |
| c1 << read1 << "\n" << seq1 << "\n" << "+\n" << qual1 << endl; | |
| c2 << read2 << "\n" << seq2 << "\n" << "+\n" << qual2 << endl; | |
| }else{ | |
| s1 << got->second << "\t" << num << endl; | |
| } | |
| } | |
| r1.close(); | |
| r2.close(); | |
| c1.close(); | |
| c2.close(); | |
| s1.close(); | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #============================================================================= | |
| # FileName: fastqdup.py | |
| # Desc: A short script for fastq duplication statistics. | |
| # Author: tanhao | |
| # Email: tanhao2013@gmail.com | |
| # HomePage: http://buttonwood.github.io | |
| # Version: 0.0.1 | |
| # LastChange: 2014-04-15 11:52:31 | |
| # History: | |
| #============================================================================= | |
| import argparse | |
| import gzip | |
| # def byLineReader(filename): | |
| # if filename.endswith(".gz"): | |
| # f = gzip.open(filename,'r') | |
| # else: | |
| # f = open(filename,'r') | |
| # line = f.readline().rstrip("\n") | |
| # while line: | |
| # yield line | |
| # line = f.readline().rstrip("\n") | |
| # f.close() | |
| # yield None | |
| def main(args): | |
| prefix = args.read1.split("\.")[0] | |
| suffix = "clean.dup.gz" | |
| if args.suffix: | |
| suffix = args.suffix + ".dup.gz" | |
| clean1 = prefix + ".R1." + suffix | |
| clean2 = prefix + ".R2." + suffix | |
| statis = prefix + ".dup.stat" | |
| if not args.read1.endswith(".gz"): | |
| a = open(args.read1,'r') | |
| b = open(args.read2,'r') | |
| else: | |
| a = gzip.open(args.read1,'r') | |
| b = gzip.open(args.read2,'r') | |
| # a = byLineReader(args.read1) | |
| # b = byLineReader(args.read2) | |
| c = gzip.open(clean1,'w') | |
| d = gzip.open(clean2,'w') | |
| e = open(statis, "w") | |
| dup = {} | |
| num = 0 | |
| while 1: | |
| num += 1 | |
| f1_tag = a.readline().rstrip("\n") | |
| if not f1_tag: | |
| break | |
| f1_seq = a.readline().rstrip("\n") | |
| a.readline().rstrip("\n") | |
| f1_qua = a.readline().rstrip("\n") | |
| f2_tag = b.readline().rstrip("\n") | |
| f2_seq = b.readline().rstrip("\n") | |
| b.readline().rstrip("\n") | |
| f2_qua = b.readline().rstrip("\n") | |
| pe_seq = f1_seq + f2_seq | |
| kseq = hash(pe_seq) | |
| if kseq in dup: | |
| e.write(repr(dup[kseq]) + "\t" + repr(num) + "\n") | |
| else: | |
| dup[kseq] = num | |
| c.write(f1_tag + "\n") | |
| c.write(f1_seq + "\n") | |
| c.write("+\n") | |
| c.write(f1_qua + "\n") | |
| d.write(f2_tag + "\n") | |
| d.write(f2_seq + "\n") | |
| d.write("+\n") | |
| d.write(f2_qua + "\n") | |
| for ifile in [a,b,c,d,e]: | |
| ifile.close() | |
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser( | |
| formatter_class=argparse.RawTextHelpFormatter, | |
| description=""" | |
| A script for fastq duplication statistics. | |
| Input -> | |
| A pair of reads, also support for gz files. | |
| Output -> | |
| Clean PE reads; and a stat file records for duplication | |
| reads locations for checking up. | |
| Examples -> | |
| python fastqdup.py -r1 r1.fq -r2 r2.fq -s clean | |
| """) | |
| parser.add_argument( | |
| "-r1", "--read1", | |
| required=True, | |
| help="input read1 file [must]") | |
| parser.add_argument( | |
| "-r2", "--read2", | |
| required=True, | |
| help="input read2 file [must]") | |
| parser.add_argument( | |
| "-s", "--suffix", | |
| required=False, | |
| help="output files suffix [Default:clean]") | |
| args = parser.parse_args() | |
| main(args) | |
| #================ Float like a butterfly! Stand like a buttonwood! ================ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment