l-modolo’s gists

l-modolo / reads_length_dist.sh

Created October 31, 2013 09:31

compute reads length distribution from a fastq file

	#!/bin/sh
	# Compute the read length distribution of a FASTQ file.
	#
	# Usage:  reads_length_dist.sh [file.fastq]
	# Output: one "length count" pair per line, sorted by increasing length.
	#
	# Only the second line of every 4-line record is measured, so sequences
	# wrapped over several lines are not supported.
	reads_length_dist() {
	  # Fall back to the historical hard-coded name when no path is given.
	  rld_fastq=${1:-file.fastq}
	  if [ ! -r "$rld_fastq" ]; then
	    printf 'reads_length_dist: cannot read %s\n' "$rld_fastq" >&2
	    return 1
	  fi
	  # awk visits array keys in an unspecified order, hence the final sort.
	  awk '
	    NR % 4 == 2 { lengths[length($0)]++ }
	    END { for (len in lengths) print len, lengths[len] }
	  ' "$rld_fastq" | sort -n
	}
	reads_length_dist "$@"

l-modolo / qsub.pbs

Last active April 11, 2016 14:30

qsub template for torque

	# Job header template for qsub (Torque). The values below (queue, log path,
	# job name, memory_size, core_number) look like placeholders to adapt before
	# submitting. Explanatory comments are kept on their own lines so that the
	# directive lines themselves stay untouched.
	#
	# -q: queue the job is submitted to.
	#PBS -q q1day
	# -j oe: merge the job's stderr into its stdout.
	#PBS -j oe
	# -o: path of the job output file.
	# NOTE(review): whether $HOME is expanded inside a directive line depends on
	# the scheduler setup - confirm on the target cluster.
	#PBS -o $HOME/working_directory/task.log
	# -N: job name.
	#PBS -N task
	# -m n: do not send any mail about the job.
	#PBS -m n
	# -l: requested resources; replace memory_size (the "gb" suffix is the unit)
	# and core_number with actual numbers.
	#PBS -l mem=memory_sizegb,nodes=1:ppn=core_number
	# -S: shell used to run the job script.
	#PBS -S /bin/bash
	# if more than 65gb (presumably of memory - confirm), use this resource
	# request, which adds the bigmem node property:
	# -l nodes=1:ppn=1:bigmem

l-modolo / cd-hit output file parser

Created December 3, 2013 13:41

Add the cluster number to each hit of a cd-hit output file, while removing the cluster number lines

	#!/bin/sh
	# Prefix every hit line of a cd-hit .clstr file with the number of the
	# cluster it belongs to (tab-separated) and drop the ">Cluster N" lines.
	#
	# Usage:  script [file.hit.clstr]
	# Output: "<cluster number><TAB><original hit line>" on stdout.
	add_cluster_number() {
	  # Fall back to the historical hard-coded name when no path is given.
	  acn_clstr=${1:-file.hit.clstr}
	  if [ ! -r "$acn_clstr" ]; then
	    printf 'add_cluster_number: cannot read %s\n' "$acn_clstr" >&2
	    return 1
	  fi
	  # awk reads the file directly: no cat, and a real pipeline is not needed.
	  awk '
	    $1 == ">Cluster" { cluster = $2; next }
	    { print cluster "\t" $0 }
	  ' "$acn_clstr"
	}
	add_cluster_number "$@"

l-modolo / split_concataned_paired_end_fastq.sh

Last active August 29, 2015 13:56

Split a paired-end fastq file in the case where the reads from the two ends are concatenated (not interleaved)

	#!/bin/sh
	# Split a gzipped FASTQ file in two files for the case where the two mates
	# of each pair are concatenated in a single read (not interleaved).
	#
	# Usage: split_concataned_paired_end_fastq.sh [input.fastq.gz [output_prefix]]
	#   input.fastq.gz defaults to "intput.fastq.gz", output_prefix to "intput".
	#   The first half of every read goes to <output_prefix>-1.fastq, the second
	#   half to <output_prefix>-2.fastq. Both files are appended to, so remove
	#   them before re-running on the same input.
	split_paired_end_fastq() {
	  spe_in=${1:-intput.fastq.gz}
	  spe_prefix=${2:-intput}
	  if [ ! -r "$spe_in" ]; then
	    printf 'split_paired_end_fastq: cannot read %s\n' "$spe_in" >&2
	    return 1
	  fi
	  gzip -dc -- "$spe_in" |
	    awk -v out1="${spe_prefix}-1.fastq" -v out2="${spe_prefix}-2.fastq" '
	      BEGIN { OFS = "\n" }
	      {
	        name = $0
	        # A record is 4 lines; stop instead of writing a record made of
	        # stale lines when the input is truncated.
	        if ((getline seq) <= 0 || (getline name2) <= 0 || (getline phred) <= 0) {
	          print "truncated FASTQ record at line " NR > "/dev/stderr"
	          exit 1
	        }
	        half = int(length(seq) / 2)
	        # substr() positions are 1-based; with a start of 0, gawk and mawk
	        # return one character fewer, which lost the last base of mate 1.
	        print name, substr(seq, 1, half), name2, substr(phred, 1, half) >> out1
	        print name, substr(seq, half + 1), name2, substr(phred, half + 1) >> out2
	      }'
	}
	split_paired_end_fastq "$@"

l-modolo / fastq_by_size.sh

Last active August 29, 2015 13:57

Keep only the reads of at least 20 bp from a fastq file

#!/bin/sh
# Keep only the FASTQ records whose sequence is at least min_length bases long.
#
# Usage: fastq_by_size.sh [min_length [input.fastq [output.fastq]]]
#   Defaults: min_length = 20, input.fastq, output.fastq (overwritten).
fastq_by_size() {
  fbs_min=${1:-20}
  fbs_in=${2:-input.fastq}
  fbs_out=${3:-output.fastq}
  case $fbs_min in
    '' | *[!0-9]*)
      printf 'fastq_by_size: min_length must be a non-negative integer\n' >&2
      return 2
      ;;
  esac
  if [ ! -r "$fbs_in" ]; then
    printf 'fastq_by_size: cannot read %s\n' "$fbs_in" >&2
    return 1
  fi
  awk -v min="$fbs_min" '
    BEGIN { OFS = "\n"; min += 0 }
    {
      header = $0
      # A record is 4 lines; a failed getline would leave the previous
      # record in these variables, so stop instead of printing stale data.
      if ((getline seq) <= 0 || (getline qheader) <= 0 || (getline qseq) <= 0) {
        print "truncated FASTQ record at line " NR > "/dev/stderr"
        exit 1
      }
      if (length(seq) >= min) print header, seq, qheader, qseq
    }' < "$fbs_in" > "$fbs_out"
}
fastq_by_size "$@"

l-modolo / base_number_fqgz.sh

Last active August 27, 2020 15:47

base_number_fqgz.sh

	#!/bin/sh
	# Count the number of bases in a gzipped FASTQ file (fastq.gz).
	#
	# Usage:  base_number_fqgz.sh file.fastq.gz
	# Output: the total length of the sequence lines (line 2 of every 4-line
	#         record) on stdout; 0 for an empty file.
	base_number_fqgz() {
	  if [ "$#" -lt 1 ]; then
	    printf 'usage: base_number_fqgz.sh file.fastq.gz\n' >&2
	    return 2
	  fi
	  if [ ! -r "$1" ]; then
	    printf 'base_number_fqgz: cannot read %s\n' "$1" >&2
	    return 1
	  fi
	  # The input is "$1" (first argument); "$0" is the path of this script.
	  # printf "%.0f" prints 0 when there is no read and avoids the exponent
	  # notation some awk implementations use for large totals.
	  gzip -dc -- "$1" |
	    awk 'NR % 4 == 2 { total += length($0) } END { printf "%.0f\n", total }'
	}
	base_number_fqgz "$@"

l-modolo / fasta_length.sh

Created May 12, 2014 09:38

list of sequence_id sequence_length from a fasta file

	#!/bin/sh
	# Return a list of "sequence_id sequence_length" lines from a FASTA file.
	# sequence_id is the whole header line without its leading ">".
	#
	# Usage: fasta_length.sh [file.fasta]
	fasta_length() {
	  # Fall back to the historical hard-coded name when no path is given.
	  fl_fasta=${1:-file.fasta}
	  if [ ! -r "$fl_fasta" ]; then
	    printf 'fasta_length: cannot read %s\n' "$fl_fasta" >&2
	    return 1
	  fi
	  awk '
	    # Print the record collected so far; does nothing before the first
	    # header, so no blank entry is written at the top or for an empty file.
	    function flush_record() {
	      if (have_record) print substr(sequence_id, 2) " " sequence_length
	    }
	    /^>/ { flush_record(); sequence_id = $0; sequence_length = 0; have_record = 1; next }
	    have_record { sequence_length += length($0) }
	    END { flush_record() }
	  ' "$fl_fasta"
	}
	fasta_length "$@"

l-modolo / fasta2upper.sh

Created May 14, 2014 09:04

Convert fasta sequences to uppercase, useful for nhmmer

	#!/bin/sh
	# Convert the sequences of file.fasta to uppercase (useful for nhmmer) and
	# write the result to file_upper.fasta. Header lines (starting with ">") are
	# copied unchanged; empty lines match neither rule and are dropped.
	awk '{
	  if (/^>/) {
	    print
	  } else if (/^[^>]/) {
	    print toupper($0)
	  }
	}' file.fasta > file_upper.fasta

l-modolo / remove_special.sh

Last active September 18, 2019 17:28

Remove all non-ASCII characters from file/folder names in a file tree

	#!/bin/sh
	# Remove all non-ASCII characters from file/folder names in a file tree
	# (the tree rooted at the current directory).
	#
	# Requires convmv, perl and uconv.
	#
	# Steps:
	#   1. run convmv on the tree and keep the lines of its output (stdout and
	#      stderr merged) that mention "ascii";
	#   2. strip the "ascii doesn't cover all needed characters for: " prefix,
	#      which leaves the offending name (presumably already quoted by
	#      convmv - confirm against its actual output);
	#   3. for every remaining line, run: mv <name> "<name filtered by uconv>".
	#
	# WARNING: each line is pasted as-is into a shell command by awk's
	# system(); a name containing shell metacharacters such as $(...) or
	# backquotes would be executed. Only use on trees whose names you trust.
	convmv -f utf8 -t ASCII -r . 2>&1 |
	  grep ascii |
	  perl -pe "s/ascii doesn't cover all needed characters for: //g" |
	  awk '{system("mv "$0" \"$(echo "$0" | uconv -t ASCII -x nfd -c )\"")}'

l-modolo / back.sh

Created March 23, 2015 22:39

alternative to renaming old file with a .back

	#!/bin/sh
	# Alternative to renaming an old file with a .back suffix: append a
	# timestamp of the form ".MM-DD-YY_HH:MM:SS" to its name instead.
	# usage : back.sh file_name
	back() {
	  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
	    printf 'usage: back.sh file_name\n' >&2
	    return 2
	  fi
	  # The whole target is quoted and "--" ends option parsing, so names
	  # containing spaces or starting with a dash are renamed correctly.
	  mv -- "$1" "$1$(date +".%m-%d-%y_%T")"
	}
	back "$@"

Laurent Modolo l-modolo