jelber2 · August 22, 2024 13:22
diff --git a/shredBAM-pairs.jl b/shredBAM-pairs.jl
 # Revisions 1 & 2 use the same read name read1 and read2, this
 # revision 3 appends _1 and _2 to read1 and read2, respectively.
 #
 # Basic idea is to take a long read alignment file that, then shred the long reads into shreds, keeping their alignment information
 # output hi-c-like read-pairs, interleaved in a BAM file!
 #
 # So for example,
 # ----------------------- 1long read
 # 1r1 2r1 3r1  3r2 2r2 1r2
 # results in        1read1, 1read2
 #                   2read1, 2read2
 #                   3read1, 3read2
 #
 #
 # Example usage to convert a normal long-read BAM file to shredded BAM:
 #
 # cat <(samtools view -H test.bam) \
 #     <(echo -e "@PG\tID:shredBAM-pairs.jl\tPN:shredBAM-pairs.jl\tVN:0.01\tCL:julia shredBAM-pairs.jl test.bam 500") \
 #     <(julia /home/jelber43/anonymous/shredBAM-pairs.jl anonymous.fasta.gz-against-finalasm.bp.p_ctg.ul.fa.gz.bam 500) | \
 # samtools view -Sb -@8 > test.shredBAM-pairs.jl.500.bam
 #
 #
 using XAM
 using Base.Iterators: partition
 using ArgParse

 function parse_commandline()
    s = ArgParseSettings()

    @add_arg_table s begin
        "input_file"
            help = "Path to the input BAM file"
            required = true
        "shred_length"
            help = "Length of the shredded segments"
            required = true
            arg_type = Int
    end

    return parse_args(s)
 end

 function split_cigar(cigar_string::String, split_length::Int)
    # Regular expression to match CIGAR operations
    cigar_pattern = r"(\d+)([MIDNSHP=X])"
    operations = collect(eachmatch(cigar_pattern, cigar_string))
    
    # Pre-allocate the split_cigars array with an estimated size
    split_cigars = String[]
    sizehint!(split_cigars, length(operations))

    current_cigar = ""
    current_length = 0

    for op in operations
        length = parse(Int, op.captures[1])
        operation = op.captures[2]
        while length > 0
            if operation == "D" || operation == "H"
                current_cigar *= "$(length)$(operation)"
                break
            else
                add_length = min(length, split_length - current_length)
                current_cigar *= "$(add_length)$(operation)"
                length -= add_length
                current_length += add_length
                
                if current_length == split_length
                    push!(split_cigars, current_cigar)
                    current_cigar = ""
                    current_length = 0
                end
            end
        end
    end

    if current_cigar != ""
        push!(split_cigars, current_cigar)
    end

    return split_cigars
 end

 function split_BAM_records(input_file::String, shred_length::Int)
    reader = open(BAM.Reader, input_file)
    record = BAM.Record()
    counter = 0
    while !eof(reader)
        empty!(record)
        read!(reader, record)
        if BAM.ismapped(record)
            if BAM.isprimaryalignment(record)
                # Define your logic to break the record into smaller parts
                # For example, split the record based on some criteria
                smaller_sequences = break_into_smaller_sequences(record, shred_length)
                smaller_qualities = break_into_smaller_qualities(record, shred_length)
                smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length)
                smaller_templates = template_names(record, shred_length)
                smaller_pos = new_positions(record, shred_length)
                # Write the new smaller records to the output file
                if iseven(length(smaller_sequences))
                    number_pairs = length(smaller_sequences)  ÷ 2
                    for i in 1:number_pairs
                        counter = counter + 1
                        println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i])
                        println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i+1], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i+1], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i+1], "\t", smaller_qualities[length(smaller_sequences)-i+1])
                        print(stderr, "\rWrote $(counter[]) BAM record pairs.")
                        flush(stderr)
                    end
                else
                    number_pairs = (length(smaller_sequences)-1) ÷ 2
                    for i in 1:number_pairs
                        counter = counter + 1
                        println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i])
                        println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i], "\t", smaller_qualities[length(smaller_sequences)-i])
                        print(stderr, "\rWrote $(counter[]) BAM record pairs.")
                        flush(stderr)
                    end
                end
            end
        end
    end
    close(reader)
 end

 function main()
    parsed_args = parse_commandline()
    input_file = parsed_args["input_file"]
    output_file = parsed_args["output_file"]
    make_pairs(input_file, output_file)
 end

 function template_names(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # template name
        new_template_name = string(BAM.tempname(record)) * "_" * string(i)
        # Add the new record to the array
        push!(new_records, new_template_name)
    end

    return new_records
 end

 function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # Calculate the start and end of the segment
        start_pos = (i - 1) * segment_length + 1
        end_pos = min(i * segment_length, full_length)

        # Extract the segment sequence
        segment_sequence = full_sequence[start_pos:end_pos]

        # Add the new record to the array
        push!(new_records, segment_sequence)
    end

    return new_records
 end

 function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # Calculate the start and end of the segment
        start_pos = (i - 1) * segment_length + 1
        end_pos = min(i * segment_length, full_length)

        # Extract the segment sequence
        segment_sequence = full_sequence[start_pos:end_pos]

        # Add the new record to the array
        push!(new_records, segment_sequence)
    end

    return new_records
 end

 function break_into_smaller_qualities(record::BAM.Record, segment_length::Int)
    full_sequence = BAM.quality(record)
    full_length = length(full_sequence)
    offset = 33
    num_segments = ceil(Int, full_length / segment_length)

    new_records = []
    
    for i in 1:num_segments
        start_pos = (i - 1) * segment_length + 1
        end_pos = min(i * segment_length, full_length)

        new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]]
        push!(new_records, join(new_chars)) 
    end

    return new_records
 end

 function new_positions(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # Calculate the start and end of the segment
        start_pos = (i - 1) * segment_length + BAM.position(record)

        # Add the new record to the array
        push!(new_records, start_pos)
    end

    return new_records
 end

 function main()
    parsed_args = parse_commandline()
    input_file = parsed_args["input_file"]
    shred_length = parsed_args["shred_length"]

    split_BAM_records(input_file, shred_length)
 end

 main()
	# Revisions 1 & 2 use the same read name read1 and read2, this
	# revision 3 appends _1 and _2 to read1 and read2, respectively.
	#
	# Basic idea is to take a long read alignment file that, then shred the long reads into shreds, keeping their alignment information
	# output hi-c-like read-pairs, interleaved in a BAM file!
	#
	# So for example,
	# ----------------------- 1long read
	# 1r1 2r1 3r1 3r2 2r2 1r2
	# results in 1read1, 1read2
	# 2read1, 2read2
	# 3read1, 3read2
	#
	#
	# Example usage to convert a normal long-read BAM file to shredded BAM:
	#
	# cat <(samtools view -H test.bam) \
	# <(echo -e "@PG\tID:shredBAM-pairs.jl\tPN:shredBAM-pairs.jl\tVN:0.01\tCL:julia shredBAM-pairs.jl test.bam 500") \
	# <(julia /home/jelber43/anonymous/shredBAM-pairs.jl anonymous.fasta.gz-against-finalasm.bp.p_ctg.ul.fa.gz.bam 500) \| \
	# samtools view -Sb -@8 > test.shredBAM-pairs.jl.500.bam
	#
	#
	using XAM
	using Base.Iterators: partition
	using ArgParse

	function parse_commandline()
	s = ArgParseSettings()

	@add_arg_table s begin
	"input_file"
	help = "Path to the input BAM file"
	required = true
	"shred_length"
	help = "Length of the shredded segments"
	required = true
	arg_type = Int
	end

	return parse_args(s)
	end

	function split_cigar(cigar_string::String, split_length::Int)
	# Regular expression to match CIGAR operations
	cigar_pattern = r"(\d+)([MIDNSHP=X])"
	operations = collect(eachmatch(cigar_pattern, cigar_string))

	# Pre-allocate the split_cigars array with an estimated size
	split_cigars = String[]
	sizehint!(split_cigars, length(operations))

	current_cigar = ""
	current_length = 0

	for op in operations
	length = parse(Int, op.captures[1])
	operation = op.captures[2]
	while length > 0
	if operation == "D" \|\| operation == "H"
	current_cigar *= "$(length)$(operation)"
	break
	else
	add_length = min(length, split_length - current_length)
	current_cigar *= "$(add_length)$(operation)"
	length -= add_length
	current_length += add_length

	if current_length == split_length
	push!(split_cigars, current_cigar)
	current_cigar = ""
	current_length = 0
	end
	end
	end
	end

	if current_cigar != ""
	push!(split_cigars, current_cigar)
	end

	return split_cigars
	end

	function split_BAM_records(input_file::String, shred_length::Int)
	reader = open(BAM.Reader, input_file)
	record = BAM.Record()
	counter = 0
	while !eof(reader)
	empty!(record)
	read!(reader, record)
	if BAM.ismapped(record)
	if BAM.isprimaryalignment(record)
	# Define your logic to break the record into smaller parts
	# For example, split the record based on some criteria
	smaller_sequences = break_into_smaller_sequences(record, shred_length)
	smaller_qualities = break_into_smaller_qualities(record, shred_length)
	smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length)
	smaller_templates = template_names(record, shred_length)
	smaller_pos = new_positions(record, shred_length)
	# Write the new smaller records to the output file
	if iseven(length(smaller_sequences))
	number_pairs = length(smaller_sequences) ÷ 2
	for i in 1:number_pairs
	counter = counter + 1
	println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i])
	println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i+1], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i+1], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i+1], "\t", smaller_qualities[length(smaller_sequences)-i+1])
	print(stderr, "\rWrote $(counter[]) BAM record pairs.")
	flush(stderr)
	end
	else
	number_pairs = (length(smaller_sequences)-1) ÷ 2
	for i in 1:number_pairs
	counter = counter + 1
	println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i])
	println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i], "\t", smaller_qualities[length(smaller_sequences)-i])
	print(stderr, "\rWrote $(counter[]) BAM record pairs.")
	flush(stderr)
	end
	end
	end
	end
	end
	close(reader)
	end

	function main()
	parsed_args = parse_commandline()
	input_file = parsed_args["input_file"]
	output_file = parsed_args["output_file"]
	make_pairs(input_file, output_file)
	end

	function template_names(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# template name
	new_template_name = string(BAM.tempname(record)) * "_" * string(i)
	# Add the new record to the array
	push!(new_records, new_template_name)
	end

	return new_records
	end

	function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# Calculate the start and end of the segment
	start_pos = (i - 1) * segment_length + 1
	end_pos = min(i * segment_length, full_length)

	# Extract the segment sequence
	segment_sequence = full_sequence[start_pos:end_pos]

	# Add the new record to the array
	push!(new_records, segment_sequence)
	end

	return new_records
	end

	function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# Calculate the start and end of the segment
	start_pos = (i - 1) * segment_length + 1
	end_pos = min(i * segment_length, full_length)

	# Extract the segment sequence
	segment_sequence = full_sequence[start_pos:end_pos]

	# Add the new record to the array
	push!(new_records, segment_sequence)
	end

	return new_records
	end

	function break_into_smaller_qualities(record::BAM.Record, segment_length::Int)
	full_sequence = BAM.quality(record)
	full_length = length(full_sequence)
	offset = 33
	num_segments = ceil(Int, full_length / segment_length)

	new_records = []

	for i in 1:num_segments
	start_pos = (i - 1) * segment_length + 1
	end_pos = min(i * segment_length, full_length)

	new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]]
	push!(new_records, join(new_chars))
	end

	return new_records
	end

	function new_positions(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# Calculate the start and end of the segment
	start_pos = (i - 1) * segment_length + BAM.position(record)

	# Add the new record to the array
	push!(new_records, start_pos)
	end

	return new_records
	end

	function main()
	parsed_args = parse_commandline()
	input_file = parsed_args["input_file"]
	shred_length = parsed_args["shred_length"]

	split_BAM_records(input_file, shred_length)
	end

	main()