Skip to content

Instantly share code, notes, and snippets.

@l-modolo
l-modolo / reads_length_dist.sh
Created October 31, 2013 09:31
Compute the read-length distribution of a FASTQ file
#!/bin/sh
# Compute the read-length distribution of a FASTQ file.
#
# Usage:  reads_length_dist.sh [file.fastq]
# Output: one "length count" line per distinct read length, in ascending
#         order of length.
# With no argument the script reads ./file.fastq.

#######################################
# Print the read-length histogram of one FASTQ file.
# Arguments: $1 - path to a FASTQ file (4 lines per record)
# Outputs:   "length count" lines on stdout, shortest length first
# Returns:   awk's exit status (non-zero if the file cannot be read)
#######################################
reads_length_dist() {
  # The sequence is the 2nd line of every 4-line record. awk's "for (k in a)"
  # order is unspecified, so walk 0..max explicitly to get a stable order.
  awk '
    NR % 4 == 2 {
      n = length($0)
      count[n]++
      if (n > max) max = n
    }
    END {
      for (n = 0; n <= max; n++)
        if (n in count) print n, count[n]
    }
  ' "$1"
}

reads_length_dist "${1:-file.fastq}"
@l-modolo
l-modolo / qsub.pbs
Last active April 11, 2016 14:30
qsub template for torque
# Job header template for qsub (Torque). Each "#PBS" line below is a qsub
# option given as a script directive (see the qsub manual for exact semantics):
#   -q q1day   destination queue
#   -j oe      join the error stream into the output stream
#   -o PATH    where the joined job output is written
#   -N task    job name
#   -m n       mail option "n"
#   -l ...     resource request (memory, node count, processors per node)
#   -S PATH    shell used for the job
# NOTE(review): "working_directory", "task", "memory_size" and "core_number"
# look like placeholders to be replaced before submitting - confirm.
#PBS -q q1day
#PBS -j oe
#PBS -o $HOME/working_directory/task.log
#PBS -N task
#PBS -m n
#PBS -l mem=memory_sizegb,nodes=1:ppn=core_number
#PBS -S /bin/bash
# If more than 65gb is needed (presumably of memory - confirm), the resource
# request to use is:
# -l nodes=1:ppn=1:bigmem
@l-modolo
l-modolo / cd-hit output file parser
Created December 3, 2013 13:41
Add the cluster number to each hit of a CD-HIT output file and remove the cluster-number header lines
#!/bin/sh
# Prefix every hit line of a CD-HIT ".clstr" file with the number of the
# cluster it belongs to, and drop the ">Cluster N" header lines.
#
# Usage:  script [file.hit.clstr]
# Output: "<cluster number><TAB><original hit line>" on stdout.
# With no argument the script reads ./file.hit.clstr.

#######################################
# Annotate the hits of one .clstr file with their cluster number.
# Arguments: $1 - path to the .clstr file
# Outputs:   annotated hit lines on stdout
# Returns:   awk's exit status (non-zero if the file cannot be read)
#######################################
add_cluster_number() {
  # awk reads the file directly; no intermediate "cat" process is needed.
  # A header line has ">Cluster" as its first field and the number as its
  # second; remember the number and print nothing for the header itself.
  awk '
    $1 == ">Cluster" { cluster = $2; next }
    { print cluster "\t" $0 }
  ' "$1"
}

add_cluster_number "${1:-file.hit.clstr}"
@l-modolo
l-modolo / split_concataned_paired_end_fastq.sh
Last active August 29, 2015 13:56
Split a paired-end FASTQ file in the case where the reads from the two ends are concatenated (not interleaved)
#!/bin/sh
# Split a FASTQ file in two for the case where both mates of a pair are
# concatenated into a single read (not interleaved).
#
# Reads intput.fastq.gz and appends the first half of every read to
# intput-1.fastq and the second half to intput-2.fastq. The "intput" spelling
# is kept on purpose: these are the file names existing users rely on.

#######################################
# Split concatenated paired-end FASTQ records read from stdin.
# Arguments: $1 - file receiving the first half of each read
#            $2 - file receiving the second half of each read
# Outputs:   4-line FASTQ records APPENDED to $1 and $2 (existing content is
#            kept, so remove stale output files before re-running)
#######################################
split_concatenated_pairs() {
  # awk strings are 1-indexed: the first half is substr(s, 1, half). Starting
  # at 0 makes gawk and mawk return only half-1 characters, which silently
  # drops the last base (and quality) of the first mate.
  # With an odd read length the extra base goes to the second mate.
  awk -v out1="$1" -v out2="$2" '
    BEGIN { OFS = "\n" }
    {
      name = $0
      getline seq
      getline name2
      getline phred
      half = int(length(seq) / 2)
      print name, substr(seq, 1, half), name2, substr(phred, 1, half) >> out1
      print name, substr(seq, half + 1), name2, substr(phred, half + 1) >> out2
    }
  '
}

gzip -dc intput.fastq.gz |
  split_concatenated_pairs intput-1.fastq intput-2.fastq
@l-modolo
l-modolo / fastq_by_size.sh
Last active August 29, 2015 13:57
Keep only the reads of at least 20 bp from a FASTQ file
# Keep only the FASTQ records whose sequence line is at least 20 characters
# long: reads input.fastq, writes the surviving 4-line records to output.fastq.
awk '
  BEGIN { OFS = "\n" }
  {
    id = $0
    getline bases
    getline plus
    getline quals
    if (length(bases) < 20) next
    print id, bases, plus, quals
  }
' < input.fastq > output.fastq
@l-modolo
l-modolo / base_number_fqgz.sh
Last active August 27, 2020 15:47
base_number_fqgz.sh
#!/bin/sh
# Count the number of bases in a gzip-compressed FASTQ file.
#
# Usage:  base_number_fqgz.sh file.fastq.gz
# Output: the total length of all sequence lines, on stdout.

#######################################
# Print the total number of bases of one .fastq.gz file.
# Arguments: $1 - path to the gzip-compressed FASTQ file
# Outputs:   the base count on stdout ("0" for an empty file)
# Returns:   2 when no file is given
#######################################
count_bases() {
  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    printf 'usage: base_number_fqgz.sh file.fastq.gz\n' >&2
    return 2
  fi
  # The input is the first argument ($1); $0 is the path of this script.
  # The sequence is the 2nd line of every 4-line record; "+ 0" forces a
  # numeric result so an empty input prints 0 rather than an empty line.
  gzip -dc -- "$1" |
    awk 'NR % 4 == 2 { bases += length($0) } END { print bases + 0 }'
}

count_bases "$@"
@l-modolo
l-modolo / fasta_length.sh
Created May 12, 2014 09:38
list of sequence_id sequence_length from a fasta file
#!/bin/sh
# Print a "sequence_id sequence_length" list for a FASTA file.
#
# Usage:  fasta_length.sh [file.fasta]
# Output: one line per record: the header line without its leading ">",
#         a space, then the total length of the record's sequence lines.
# With no argument the script reads ./file.fasta.

#######################################
# List the records of one FASTA file with their sequence length.
# Arguments: $1 - path to the FASTA file
# Outputs:   "header length" lines on stdout; nothing for an empty file
# Returns:   awk's exit status (non-zero if the file cannot be read)
#######################################
fasta_length() {
  # A record is printed when the NEXT header (or the end of input) is
  # reached. "seen" stays unset until the first header, so no blank " " line
  # is emitted before the first record or for an input without any record.
  awk '
    /^>/ {
      if (seen) print substr(id, 2) " " len
      id = $0
      len = 0
      seen = 1
      next
    }
    { len += length($0) }
    END { if (seen) print substr(id, 2) " " len }
  ' "$1"
}

fasta_length "${1:-file.fasta}"
@l-modolo
l-modolo / fasta2upper.sh
Created May 14, 2014 09:04
Convert FASTA sequences to uppercase, useful for nhmmer
#!/bin/sh
# Upper-case the sequence lines of file.fasta and write the result to
# file_upper.fasta (useful for nhmmer). Header lines, i.e. those starting
# with ">", are copied unchanged.
awk '{
  if ($0 ~ /^>/) {
    print
  } else if ($0 ~ /^[^>]/) {
    print toupper($0)
  }
}' file.fasta > file_upper.fasta
@l-modolo
l-modolo / remove_special.sh
Last active September 18, 2019 17:28
Remove all non-ASCII characters from file/folder names in a file tree
#!/bin/sh
# Remove all non-ASCII characters from file/folder names in a file tree
# (the tree rooted at the current directory, ".").
#
# Pipeline stages:
#   1. convmv -f utf8 -t ASCII -r .  is run recursively; 2>&1 merges its
#      stderr into the pipe so its diagnostics can be filtered.
#   2. grep ascii  keeps only the lines containing "ascii".
#   3. perl  deletes the fixed text "ascii doesn't cover all needed
#      characters for: " and keeps the rest of each line.
#   4. awk  runs, for every remaining line, the shell command
#        mv <line> "$(echo <line> | uconv -t ASCII -x nfd -c )"
#      through system().
#
# NOTE(review): presumably convmv only reports here and renames nothing
# itself, the actual rename being done by mv - confirm against convmv's docs.
# NOTE(review): <line> is pasted into the shell command without added quotes,
# so this assumes convmv prints each name already quoted - confirm. Names
# containing shell metacharacters would be interpreted by the shell.
# NOTE(review): "uconv -t ASCII -x nfd -c" presumably decomposes accented
# characters and drops whatever ASCII cannot represent - confirm.
convmv -f utf8 -t ASCII -r . 2>&1 | grep ascii | perl -pe "s/ascii doesn't cover all needed characters for: //g" | awk '{system("mv "$0" \"$(echo "$0" | uconv -t ASCII -x nfd -c )\"")}'
@l-modolo
l-modolo / back.sh
Created March 23, 2015 22:39
Alternative to renaming an old file with a .back suffix: append a timestamp to its name instead
#!/bin/sh
# Alternative to renaming an old file with a ".back" suffix: rename it with a
# timestamp suffix instead.
# usage : back.sh file_name
# Result: file_name is renamed to file_name.MM-DD-YY_HH:MM:SS

#######################################
# Rename a file by appending the current date and time to its name.
# Arguments: $1 - file (or directory) to rename
# Returns:   2 when no name is given, otherwise mv's exit status
#######################################
back() {
  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    printf 'usage : back.sh file_name\n' >&2
    return 2
  fi
  # "--" keeps a name starting with "-" from being parsed as an mv option;
  # the whole destination is quoted so it always stays a single word.
  mv -- "$1" "$1$(date +".%m-%d-%y_%T")"
}

back "$@"