jelber2 · March 7, 2024 09:45
diff --git a/stitch-fasta.jl b/stitch-fasta.jl
 # tested on Julialang v.1.10.2, DataStructures v0.18.16, and FASTX v2.1.4
 # Usage
 # $ julia stitch-fasta.jl chr20.herro.fasta.Q30.recal.shred.fasta > chr20.herro.fasta.Q30.recal.fasta
 #
 import Pkg; Pkg.add("FASTX")
 import Pkg; Pkg.add("DataStructures")
 using DataStructures
 using FASTX

 function process_fasta_file(filename::String)
    reader = FASTAReader(open(filename))
    sequences = Dict{String, String}()
    sequences2 = Dict{String, String}()

    for record in reader
        # Get identifier
        main_id = string(identifier(record))
        seq = sequence(record)
        sequences[main_id] = seq
    end

    close(reader)
        
    sorted_pairs = OrderedDict(sort(collect(pairs(sequences))))

    for (id, seq) in sorted_pairs
        main_id = split(id, '_')[1]
        # Aggregate sequences by the main identifier
        if haskey(sequences2, main_id)
            sequences2[main_id] *= seq
        else
            sequences2[main_id] = seq
        end
    end

    # Print the aggregated sequences
    for (id, seq) in sequences2
        println(">$id")
        println(seq)
    end
 end

 # Check if a filename is passed as a command line argument
 if length(ARGS) < 1
    println("Usage: julia ", PROGRAM_FILE, " <filename.fasta>")
    return
 end
 # Example usage with command line argument
 process_fasta_file(ARGS[1])
	# tested on Julialang v.1.10.2, DataStructures v0.18.16, and FASTX v2.1.4
	# Usage
	# $ julia stitch-fasta.jl chr20.herro.fasta.Q30.recal.shred.fasta > chr20.herro.fasta.Q30.recal.fasta
	#
	import Pkg; Pkg.add("FASTX")
	import Pkg; Pkg.add("DataStructures")
	using DataStructures
	using FASTX

	function process_fasta_file(filename::String)
	reader = FASTAReader(open(filename))
	sequences = Dict{String, String}()
	sequences2 = Dict{String, String}()

	for record in reader
	# Get identifier
	main_id = string(identifier(record))
	seq = sequence(record)
	sequences[main_id] = seq
	end

	close(reader)

	sorted_pairs = OrderedDict(sort(collect(pairs(sequences))))

	for (id, seq) in sorted_pairs
	main_id = split(id, '_')[1]
	# Aggregate sequences by the main identifier
	if haskey(sequences2, main_id)
	sequences2[main_id] *= seq
	else
	sequences2[main_id] = seq
	end
	end

	# Print the aggregated sequences
	for (id, seq) in sequences2
	println(">$id")
	println(seq)
	end
	end

	# Check if a filename is passed as a command line argument
	if length(ARGS) < 1
	println("Usage: julia ", PROGRAM_FILE, " <filename.fasta>")
	return
	end
	# Example usage with command line argument
	process_fasta_file(ARGS[1])