marcelm · September 28, 2022 09:50
diff --git a/split.awk b/split.awk
 # Split reads in a FASTQ file at adapter occurrences
 #
 # Run:
 #  cutadapt -O 100 --times=1000 -g MYADAPTERSEQ --info-file=info.txt -o /dev/null reads.fastq.gz
 #
 # Then:
 #  awk -F "\t" -f split.awk info.txt | gzip > split.fastq.gz


 # Relevant info file fields:
 #
 #  $1 read name
 #  $2 no. of errrors, -1 if no adapter found
 #  $5 sequence before adapter match
 #  $7 sequence after adapter match
 #  $9 qualities before adapter match
 #  $11 qualities after adapter match

 function print_fastq_record(name, seq, qual, n) {
    printf("@%s_%d\n%s\n+\n%s\n", name, n, seq, qual)
 }


 # Counter for read suffixes (_1, _2, _3)
 BEGIN {
    n = 1
 }

 # New read: output the last fragment of the previous read
 ($2 == -1 || $1 != read_name) && seq != "" {
    print_fastq_record(read_name, seq, qual, n);
    n = 1
 }

 # Skip records without adapter matches
 $2 == -1 { next }

 # This is done for all records
 {
    print_fastq_record($1, $5, $9, n)
    n += 1
    read_name = $1
    seq = $7
    qual = $11
 }

 # Output the last fragment of the last read
 END {
    if (seq != "") {
        print_fastq_record(read_name, seq, qual, n)
    }
 }
	# Split reads in a FASTQ file at adapter occurrences
	#
	# Run:
	# cutadapt -O 100 --times=1000 -g MYADAPTERSEQ --info-file=info.txt -o /dev/null reads.fastq.gz
	#
	# Then:
	# awk -F "\t" -f split.awk info.txt \| gzip > split.fastq.gz


	# Relevant info file fields:
	#
	# $1 read name
	# $2 no. of errrors, -1 if no adapter found
	# $5 sequence before adapter match
	# $7 sequence after adapter match
	# $9 qualities before adapter match
	# $11 qualities after adapter match

	function print_fastq_record(name, seq, qual, n) {
	printf("@%s_%d\n%s\n+\n%s\n", name, n, seq, qual)
	}


	# Counter for read suffixes (_1, _2, _3)
	BEGIN {
	n = 1
	}

	# New read: output the last fragment of the previous read
	($2 == -1 \|\| $1 != read_name) && seq != "" {
	print_fastq_record(read_name, seq, qual, n);
	n = 1
	}

	# Skip records without adapter matches
	$2 == -1 { next }

	# This is done for all records
	{
	print_fastq_record($1, $5, $9, n)
	n += 1
	read_name = $1
	seq = $7
	qual = $11
	}

	# Output the last fragment of the last read
	END {
	if (seq != "") {
	print_fastq_record(read_name, seq, qual, n)
	}
	}