jelber2 · March 11, 2024 13:09
diff --git a/shredBAM.jl b/shredBAM.jl
 # previous versions allowed to generate directly from BAM, but there seemed to have been some troubles 
 # between versions of XAM, so this version is working so far for its intended purpose on https://github.com/brendanofallon/jovian
 # also, this version is rather fast
 #
 #
 # note: ignores quality scores at the moment - fixed in revision#4
 # note2: outputs to STDOUT a SAM file without a header
 # tested with julialang v1.10.2 and XAM v0.4.0
 # $ julia shredBAM.jl input.bam 300 > test.sam.noheader
 #
 # to add a header from the original BAM file that was shredded
 # and add on a PG line
 # $ SHRED_LENGTH=300
 # $ INPUT=input.bam
 # $ INPUT2=test.sam.noheader
 # this is the command to make the BAM file
 # $ cat <(samtools view -H $INPUT) <(echo -e "@PG\tID:shredBAM.jl\tPN:shredBAM.jl\tVN:0.16\tCL:julia shredBAM.jl $INPUT $SHRED_LENGTH") $INPUT2 |samtools sort > test.bam
 #
 using XAM
 using Base.Iterators: partition
 using ArgParse

 function parse_commandline()
    s = ArgParseSettings()

    @add_arg_table s begin
        "input_file"
            help = "Path to the input BAM file"
            required = true
        "shred_length"
            help = "Length of the shredded segments"
            required = true
            arg_type = Int
    end

    return parse_args(s)
 end

 function split_cigar(cigar_string::String, split_length::Int)
    # Regular expression to match CIGAR operations
    cigar_pattern = r"(\d+)([MIDNSHP=X])"
    operations = collect(eachmatch(cigar_pattern, cigar_string))
    
 # Pre-allocate the split_cigars array with an estimated size
 split_cigars = String[]
 sizehint!(split_cigars, length(operations))

 current_cigar = ""
 current_length = 0

 for op in operations
    length = parse(Int, op.captures[1])
    operation = op.captures[2]
    while length > 0
        if operation == "D" || operation == "H"
            current_cigar *= "$(length)$(operation)"
            break
        else
            add_length = min(length, split_length - current_length)
            current_cigar *= "$(add_length)$(operation)"
            length -= add_length
            current_length += add_length
            
            if current_length == split_length
                push!(split_cigars, current_cigar)
                current_cigar = ""
                current_length = 0
            end
        end
    end
 end

 if current_cigar != ""
    push!(split_cigars, current_cigar)
 end

 return split_cigars
 end

 function split_BAM_records(input_file::String, shred_length::Int)
    reader = open(BAM.Reader, input_file)
    record = BAM.Record()
    while !eof(reader)
        empty!(record)
        read!(reader, record)
        if BAM.ismapped(record)
            if BAM.isprimaryalignment(record)
                # Define your logic to break the record into smaller parts
                # For example, split the record based on some criteria
                smaller_sequences = break_into_smaller_sequences(record, shred_length)
                smaller_qualities = break_into_smaller_qualities(record, shred_length)
                smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length)
                smaller_templates = template_names(record, shred_length)
                smaller_pos = new_positions(record, shred_length)
                # Write the new smaller records to the output file
                for (small_seq, small_cigar, small_temp, small_pos, small_qualities) in zip(smaller_sequences, smaller_cigars, smaller_templates, smaller_pos, smaller_qualities)
                    println("$small_temp", "\t", 0, "\t", BAM.refname(record), "\t", "$small_pos", "\t", BAM.mappingquality(record), "\t", "$small_cigar", "\t*\t0\t0\t", "$small_seq", "\t", "$small_qualities")
                end
            end
        end
    end
    close(reader)
 end

 function template_names(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # template name
        new_template_name = string(BAM.tempname(record)) * "_" * string(i)
        # Add the new record to the array
        push!(new_records, new_template_name)
    end

    return new_records
 end

 function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # Calculate the start and end of the segment
        start_pos = (i - 1) * segment_length + 1
        end_pos = min(i * segment_length, full_length)

        # Extract the segment sequence
        segment_sequence = full_sequence[start_pos:end_pos]

        # Add the new record to the array
        push!(new_records, segment_sequence)
    end

    return new_records
 end

 function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # Calculate the start and end of the segment
        start_pos = (i - 1) * segment_length + 1
        end_pos = min(i * segment_length, full_length)

        # Extract the segment sequence
        segment_sequence = full_sequence[start_pos:end_pos]

        # Add the new record to the array
        push!(new_records, segment_sequence)
    end

    return new_records
 end

 function break_into_smaller_qualities(record::BAM.Record, segment_length::Int)
    full_sequence = BAM.quality(record)
    full_length = length(full_sequence)
    offset = 33
    num_segments = ceil(Int, full_length / segment_length)

    new_records = []
    
    for i in 1:num_segments
        start_pos = (i - 1) * segment_length + 1
        end_pos = min(i * segment_length, full_length)

        new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]]
        push!(new_records, join(new_chars)) 
    end

    return new_records
 end

 function new_positions(record::BAM.Record, segment_length::Int)
    # Get the full sequence of the read, including unaligned regions
    full_sequence = BAM.sequence(record)
    full_length = length(full_sequence)

    # Calculate the number of segments
    num_segments = ceil(Int, full_length / segment_length)

    # Placeholder for new records
    new_records = []

    # Generate new records for each segment
    for i in 1:num_segments
        # Calculate the start and end of the segment
        start_pos = (i - 1) * segment_length + BAM.position(record)

        # Add the new record to the array
        push!(new_records, start_pos)
    end

    return new_records
 end

 function main()
    parsed_args = parse_commandline()
    input_file = parsed_args["input_file"]
    shred_length = parsed_args["shred_length"]

    split_BAM_records(input_file, shred_length)
 end

 main()
	# previous versions allowed to generate directly from BAM, but there seemed to have been some troubles
	# between versions of XAM, so this version is working so far for its intended purpose on https://github.com/brendanofallon/jovian
	# also, this version is rather fast
	#
	#
	# note: ignores quality scores at the moment - fixed in revision#4
	# note2: outputs to STDOUT a SAM file without a header
	# tested with julialang v1.10.2 and XAM v0.4.0
	# $ julia shredBAM.jl input.bam 300 > test.sam.noheader
	#
	# to add a header from the original BAM file that was shredded
	# and add on a PG line
	# $ SHRED_LENGTH=300
	# $ INPUT=input.bam
	# $ INPUT2=test.sam.noheader
	# this is the command to make the BAM file
	# $ cat <(samtools view -H $INPUT) <(echo -e "@PG\tID:shredBAM.jl\tPN:shredBAM.jl\tVN:0.16\tCL:julia shredBAM.jl $INPUT $SHRED_LENGTH") $INPUT2 \|samtools sort > test.bam
	#
	using XAM
	using Base.Iterators: partition
	using ArgParse

	function parse_commandline()
	s = ArgParseSettings()

	@add_arg_table s begin
	"input_file"
	help = "Path to the input BAM file"
	required = true
	"shred_length"
	help = "Length of the shredded segments"
	required = true
	arg_type = Int
	end

	return parse_args(s)
	end

	function split_cigar(cigar_string::String, split_length::Int)
	# Regular expression to match CIGAR operations
	cigar_pattern = r"(\d+)([MIDNSHP=X])"
	operations = collect(eachmatch(cigar_pattern, cigar_string))

	# Pre-allocate the split_cigars array with an estimated size
	split_cigars = String[]
	sizehint!(split_cigars, length(operations))

	current_cigar = ""
	current_length = 0

	for op in operations
	length = parse(Int, op.captures[1])
	operation = op.captures[2]
	while length > 0
	if operation == "D" \|\| operation == "H"
	current_cigar *= "$(length)$(operation)"
	break
	else
	add_length = min(length, split_length - current_length)
	current_cigar *= "$(add_length)$(operation)"
	length -= add_length
	current_length += add_length

	if current_length == split_length
	push!(split_cigars, current_cigar)
	current_cigar = ""
	current_length = 0
	end
	end
	end
	end

	if current_cigar != ""
	push!(split_cigars, current_cigar)
	end

	return split_cigars
	end

	function split_BAM_records(input_file::String, shred_length::Int)
	reader = open(BAM.Reader, input_file)
	record = BAM.Record()
	while !eof(reader)
	empty!(record)
	read!(reader, record)
	if BAM.ismapped(record)
	if BAM.isprimaryalignment(record)
	# Define your logic to break the record into smaller parts
	# For example, split the record based on some criteria
	smaller_sequences = break_into_smaller_sequences(record, shred_length)
	smaller_qualities = break_into_smaller_qualities(record, shred_length)
	smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length)
	smaller_templates = template_names(record, shred_length)
	smaller_pos = new_positions(record, shred_length)
	# Write the new smaller records to the output file
	for (small_seq, small_cigar, small_temp, small_pos, small_qualities) in zip(smaller_sequences, smaller_cigars, smaller_templates, smaller_pos, smaller_qualities)
	println("$small_temp", "\t", 0, "\t", BAM.refname(record), "\t", "$small_pos", "\t", BAM.mappingquality(record), "\t", "$small_cigar", "\t*\t0\t0\t", "$small_seq", "\t", "$small_qualities")
	end
	end
	end
	end
	close(reader)
	end

	function template_names(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# template name
	new_template_name = string(BAM.tempname(record)) * "_" * string(i)
	# Add the new record to the array
	push!(new_records, new_template_name)
	end

	return new_records
	end

	function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# Calculate the start and end of the segment
	start_pos = (i - 1) * segment_length + 1
	end_pos = min(i * segment_length, full_length)

	# Extract the segment sequence
	segment_sequence = full_sequence[start_pos:end_pos]

	# Add the new record to the array
	push!(new_records, segment_sequence)
	end

	return new_records
	end

	function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# Calculate the start and end of the segment
	start_pos = (i - 1) * segment_length + 1
	end_pos = min(i * segment_length, full_length)

	# Extract the segment sequence
	segment_sequence = full_sequence[start_pos:end_pos]

	# Add the new record to the array
	push!(new_records, segment_sequence)
	end

	return new_records
	end

	function break_into_smaller_qualities(record::BAM.Record, segment_length::Int)
	full_sequence = BAM.quality(record)
	full_length = length(full_sequence)
	offset = 33
	num_segments = ceil(Int, full_length / segment_length)

	new_records = []

	for i in 1:num_segments
	start_pos = (i - 1) * segment_length + 1
	end_pos = min(i * segment_length, full_length)

	new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]]
	push!(new_records, join(new_chars))
	end

	return new_records
	end

	function new_positions(record::BAM.Record, segment_length::Int)
	# Get the full sequence of the read, including unaligned regions
	full_sequence = BAM.sequence(record)
	full_length = length(full_sequence)

	# Calculate the number of segments
	num_segments = ceil(Int, full_length / segment_length)

	# Placeholder for new records
	new_records = []

	# Generate new records for each segment
	for i in 1:num_segments
	# Calculate the start and end of the segment
	start_pos = (i - 1) * segment_length + BAM.position(record)

	# Add the new record to the array
	push!(new_records, start_pos)
	end

	return new_records
	end

	function main()
	parsed_args = parse_commandline()
	input_file = parsed_args["input_file"]
	shred_length = parsed_args["shred_length"]

	split_BAM_records(input_file, shred_length)
	end

	main()
No results found