Skip to content

Instantly share code, notes, and snippets.

@jelber2
Last active August 22, 2024 13:22
Show Gist options
  • Save jelber2/01511d3b47c6f9ab610a9a599603733a to your computer and use it in GitHub Desktop.
Save jelber2/01511d3b47c6f9ab610a9a599603733a to your computer and use it in GitHub Desktop.
shreds a BAM file into non-overlapping pairs keeping the alignment positions of each shred
# Revisions 1 & 2 use the same read name read1 and read2, this
# revision 3 appends _1 and _2 to read1 and read2, respectively.
#
# Basic idea is to take a long read alignment file that, then shred the long reads into shreds, keeping their alignment information
# output hi-c-like read-pairs, interleaved in a BAM file!
#
# So for example,
# ----------------------- 1long read
# 1r1 2r1 3r1 3r2 2r2 1r2
# results in 1read1, 1read2
# 2read1, 2read2
# 3read1, 3read2
#
#
# Example usage to convert a normal long-read BAM file to shredded BAM:
#
# cat <(samtools view -H test.bam) \
# <(echo -e "@PG\tID:shredBAM-pairs.jl\tPN:shredBAM-pairs.jl\tVN:0.01\tCL:julia shredBAM-pairs.jl test.bam 500") \
# <(julia /home/jelber43/anonymous/shredBAM-pairs.jl anonymous.fasta.gz-against-finalasm.bp.p_ctg.ul.fa.gz.bam 500) | \
# samtools view -Sb -@8 > test.shredBAM-pairs.jl.500.bam
#
#
using XAM
using Base.Iterators: partition
using ArgParse
function parse_commandline()
s = ArgParseSettings()
@add_arg_table s begin
"input_file"
help = "Path to the input BAM file"
required = true
"shred_length"
help = "Length of the shredded segments"
required = true
arg_type = Int
end
return parse_args(s)
end
function split_cigar(cigar_string::String, split_length::Int)
# Regular expression to match CIGAR operations
cigar_pattern = r"(\d+)([MIDNSHP=X])"
operations = collect(eachmatch(cigar_pattern, cigar_string))
# Pre-allocate the split_cigars array with an estimated size
split_cigars = String[]
sizehint!(split_cigars, length(operations))
current_cigar = ""
current_length = 0
for op in operations
length = parse(Int, op.captures[1])
operation = op.captures[2]
while length > 0
if operation == "D" || operation == "H"
current_cigar *= "$(length)$(operation)"
break
else
add_length = min(length, split_length - current_length)
current_cigar *= "$(add_length)$(operation)"
length -= add_length
current_length += add_length
if current_length == split_length
push!(split_cigars, current_cigar)
current_cigar = ""
current_length = 0
end
end
end
end
if current_cigar != ""
push!(split_cigars, current_cigar)
end
return split_cigars
end
function split_BAM_records(input_file::String, shred_length::Int)
reader = open(BAM.Reader, input_file)
record = BAM.Record()
counter = 0
while !eof(reader)
empty!(record)
read!(reader, record)
if BAM.ismapped(record)
if BAM.isprimaryalignment(record)
# Define your logic to break the record into smaller parts
# For example, split the record based on some criteria
smaller_sequences = break_into_smaller_sequences(record, shred_length)
smaller_qualities = break_into_smaller_qualities(record, shred_length)
smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length)
smaller_templates = template_names(record, shred_length)
smaller_pos = new_positions(record, shred_length)
# Write the new smaller records to the output file
if iseven(length(smaller_sequences))
number_pairs = length(smaller_sequences) ÷ 2
for i in 1:number_pairs
counter = counter + 1
println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i])
println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i+1], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i+1], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i+1], "\t", smaller_qualities[length(smaller_sequences)-i+1])
print(stderr, "\rWrote $(counter[]) BAM record pairs.")
flush(stderr)
end
else
number_pairs = (length(smaller_sequences)-1) ÷ 2
for i in 1:number_pairs
counter = counter + 1
println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i])
println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i], "\t", smaller_qualities[length(smaller_sequences)-i])
print(stderr, "\rWrote $(counter[]) BAM record pairs.")
flush(stderr)
end
end
end
end
end
close(reader)
end
function main()
parsed_args = parse_commandline()
input_file = parsed_args["input_file"]
output_file = parsed_args["output_file"]
make_pairs(input_file, output_file)
end
function template_names(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# template name
new_template_name = string(BAM.tempname(record)) * "_" * string(i)
# Add the new record to the array
push!(new_records, new_template_name)
end
return new_records
end
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# Calculate the start and end of the segment
start_pos = (i - 1) * segment_length + 1
end_pos = min(i * segment_length, full_length)
# Extract the segment sequence
segment_sequence = full_sequence[start_pos:end_pos]
# Add the new record to the array
push!(new_records, segment_sequence)
end
return new_records
end
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# Calculate the start and end of the segment
start_pos = (i - 1) * segment_length + 1
end_pos = min(i * segment_length, full_length)
# Extract the segment sequence
segment_sequence = full_sequence[start_pos:end_pos]
# Add the new record to the array
push!(new_records, segment_sequence)
end
return new_records
end
function break_into_smaller_qualities(record::BAM.Record, segment_length::Int)
full_sequence = BAM.quality(record)
full_length = length(full_sequence)
offset = 33
num_segments = ceil(Int, full_length / segment_length)
new_records = []
for i in 1:num_segments
start_pos = (i - 1) * segment_length + 1
end_pos = min(i * segment_length, full_length)
new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]]
push!(new_records, join(new_chars))
end
return new_records
end
function new_positions(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# Calculate the start and end of the segment
start_pos = (i - 1) * segment_length + BAM.position(record)
# Add the new record to the array
push!(new_records, start_pos)
end
return new_records
end
function main()
parsed_args = parse_commandline()
input_file = parsed_args["input_file"]
shred_length = parsed_args["shred_length"]
split_BAM_records(input_file, shred_length)
end
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment