Last active
May 6, 2018 17:04
-
-
Save seb-mueller/900b0a4263782fe4f9b7d85edb1115bd to your computer and use it in GitHub Desktop.
Contamination checking for fasta/fastq files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
export BLASTDB=/data/public_data/NCBI/NCBI_all/ #and specify the db as -db nt |or -db /data/public_data/NCBI/NCBI_nt/nt | |
export PATH=/applications/UCSC-tools/:/applications/ncbi-blast+/ncbi-blast-2.2.30+/bin/:$PATH | |
#subsetting and converting into fasta | |
file=sample.fq.gz | |
filefa=${file%.fq.gz}_subset.fa | |
n=400000 | |
zcat $file | head -n $n > ${file%.fq.gz}_subset.fq | |
fastqToFa ${file%.fq.gz}_subset.fq $filefa | |
#blasting and summarizing | |
#filefa=file.fa | |
blastn -num_threads 10 -evalue 0.001 -outfmt '7 std scomnam sacc staxids qacc sallseqid sallgi sscinames scomnames sblastnames sskingdoms salltitles' -num_alignments 5 -db nt -query $filefa -out ${filefa}.blast | |
#head ${filefa}.blast | |
# BLASTN 2.6.0+ | |
# Query: NB501820:38:H22JFAFXY:1:11101:22479:1239 2:N:0:GGACTCCT | |
# Database: nt | |
# Fields: query acc.ver, subject acc.ver, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score, subject acc., subject tax ids, query acc., subject ids, subject gis, subject sci names, subject com names, subject blast names, subject super kingdoms, subject titles | |
# 5 hits found | |
#NB501820:38:H22JFAFXY:1:11101:22479:1239 NG_054891.1 97.260 73 2 0 1 73 4356 4284 1.70e-25 124 NG_054891 9606 NB501820:38:H22JFAFXY:1:11101:22479:1239 gi|1198817830|ref|NG_054891.1| 1198817830 Homo sapiens human primates Eukaryota Homo sapiens integrin subunit beta 1 binding protein 2 (ITGB1BP2), RefSeqGene on chromosome X | |
#NB501820:38:H22JFAFXY:1:11101:22479:1239 NG_046742.1 97.260 73 2 0 1 73 22439 22367 1.70e-25 124 NG_046742 9606 NB501820:38:H22JFAFXY:1:11101:22479:1239 gi|1000814597|ref|NG_046742.1| 1000814597 Homo sapiens human primates Eukaryota Homo sapiens non-POU domain containing octamer binding (NONO), RefSeqGene on chromosome X | |
less ${filefa}.blast | sort -u -k1,1 --merge | grep -v "# BLAST" | cut -f19 | sort | uniq -c | sort -r -n > ${filefa}.blast.processed | |
#head ${filefa}.blast.processed | |
# 2 human | |
# 1 house mouse |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment