Last active
March 7, 2024 09:45
-
-
Save jelber2/53a95d44667cf1a8d84af6cde6931817 to your computer and use it in GitHub Desktop.
Stitch FASTQ reads shredded with shred.sh from BBTools back together
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tested on Julialang v.1.10.2 , DataStructures v0.18.16, and FASTX v2.1.4 | |
# Usage | |
# $ julia stitch-fastq.jl chr20.herro.fasta.Q30.recal.shred.fastq > chr20.herro.fasta.Q30.recal.fastq | |
# | |
import Pkg; Pkg.add("FASTX") | |
import Pkg; Pkg.add("DataStructures") | |
using DataStructures | |
using FASTX | |
function process_fastq_file(filename::String) | |
reader = FASTQReader(open(filename)) | |
sequencesS = Dict{String, String}() | |
sequencesQ = Dict{String, String}() | |
sequences1 = Dict{String, String}() | |
sequences2 = Dict{String, String}() | |
for record in reader | |
# Get identifier | |
main_id = string(identifier(record)) | |
seq = sequence(record) | |
qual = quality(record) | |
sequencesS[main_id] = seq | |
sequencesQ[main_id] = qual | |
end | |
close(reader) | |
sorted_pairs1 = OrderedDict(sort(collect(pairs(sequencesS)))) | |
sorted_pairs2 = OrderedDict(sort(collect(pairs(sequencesQ)))) | |
for (id, seq) in sorted_pairs1 | |
main_id = split(id, '_')[1] | |
# Aggregate sequences by the main identifier | |
if haskey(sequences1, main_id) | |
sequences1[main_id] *= seq | |
else | |
sequences1[main_id] = seq | |
end | |
end | |
for (id, qual) in sorted_pairs2 | |
main_id = split(id, '_')[1] | |
# Aggregate sequences by the main identifier | |
if haskey(sequences2, main_id) | |
sequences2[main_id] *= qual | |
else | |
sequences2[main_id] = qual | |
end | |
end | |
# Print the aggregated sequences | |
for id in keys(sequences1) | |
seq = sequences1[id] | |
qual = sequences2[id] | |
println("@$id") | |
println(seq) | |
println("+") | |
println(qual) | |
end | |
end | |
# Check if a filename is passed as a command line argument | |
if length(ARGS) < 1 | |
println("Usage: julia ", PROGRAM_FILE, " <filename.fastq>") | |
return | |
end | |
# Example usage with command line argument | |
process_fastq_file(ARGS[1]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment