Skip to content

Instantly share code, notes, and snippets.

@jelber2
Last active March 11, 2024 13:09
Show Gist options
  • Save jelber2/47129820373474a768dacabc0e686fdc to your computer and use it in GitHub Desktop.
Save jelber2/47129820373474a768dacabc0e686fdc to your computer and use it in GitHub Desktop.
Shred alignments in a BAM file (output is a SAM file without header) to make long read alignments into short-read-like
# previous versions allowed to generate directly from BAM, but there seemed to have been some troubles
# between versions of XAM, so this version is working so far for its intended purpose on https://github.com/brendanofallon/jovian
# also, this version is rather fast
#
#
# note: ignores quality scores at the moment - fixed in revision#4
# note2: outputs to STDOUT a SAM file without a header
# tested with julialang v1.10.2 and XAM v0.4.0
# $ julia shredBAM.jl input.bam 300 > test.sam.noheader
#
# to add a header from the original BAM file that was shredded
# and add on a PG line
# $ SHRED_LENGTH=300
# $ INPUT=input.bam
# $ INPUT2=test.sam.noheader
# this is the command to make the BAM file
# $ cat <(samtools view -H $INPUT) <(echo -e "@PG\tID:shredBAM.jl\tPN:shredBAM.jl\tVN:0.16\tCL:julia shredBAM.jl $INPUT $SHRED_LENGTH") $INPUT2 |samtools sort > test.bam
#
using XAM
using Base.Iterators: partition
using ArgParse
function parse_commandline()
s = ArgParseSettings()
@add_arg_table s begin
"input_file"
help = "Path to the input BAM file"
required = true
"shred_length"
help = "Length of the shredded segments"
required = true
arg_type = Int
end
return parse_args(s)
end
function split_cigar(cigar_string::String, split_length::Int)
# Regular expression to match CIGAR operations
cigar_pattern = r"(\d+)([MIDNSHP=X])"
operations = collect(eachmatch(cigar_pattern, cigar_string))
# Pre-allocate the split_cigars array with an estimated size
split_cigars = String[]
sizehint!(split_cigars, length(operations))
current_cigar = ""
current_length = 0
for op in operations
length = parse(Int, op.captures[1])
operation = op.captures[2]
while length > 0
if operation == "D" || operation == "H"
current_cigar *= "$(length)$(operation)"
break
else
add_length = min(length, split_length - current_length)
current_cigar *= "$(add_length)$(operation)"
length -= add_length
current_length += add_length
if current_length == split_length
push!(split_cigars, current_cigar)
current_cigar = ""
current_length = 0
end
end
end
end
if current_cigar != ""
push!(split_cigars, current_cigar)
end
return split_cigars
end
function split_BAM_records(input_file::String, shred_length::Int)
reader = open(BAM.Reader, input_file)
record = BAM.Record()
while !eof(reader)
empty!(record)
read!(reader, record)
if BAM.ismapped(record)
if BAM.isprimaryalignment(record)
# Define your logic to break the record into smaller parts
# For example, split the record based on some criteria
smaller_sequences = break_into_smaller_sequences(record, shred_length)
smaller_qualities = break_into_smaller_qualities(record, shred_length)
smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length)
smaller_templates = template_names(record, shred_length)
smaller_pos = new_positions(record, shred_length)
# Write the new smaller records to the output file
for (small_seq, small_cigar, small_temp, small_pos, small_qualities) in zip(smaller_sequences, smaller_cigars, smaller_templates, smaller_pos, smaller_qualities)
println("$small_temp", "\t", 0, "\t", BAM.refname(record), "\t", "$small_pos", "\t", BAM.mappingquality(record), "\t", "$small_cigar", "\t*\t0\t0\t", "$small_seq", "\t", "$small_qualities")
end
end
end
end
close(reader)
end
function template_names(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# template name
new_template_name = string(BAM.tempname(record)) * "_" * string(i)
# Add the new record to the array
push!(new_records, new_template_name)
end
return new_records
end
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# Calculate the start and end of the segment
start_pos = (i - 1) * segment_length + 1
end_pos = min(i * segment_length, full_length)
# Extract the segment sequence
segment_sequence = full_sequence[start_pos:end_pos]
# Add the new record to the array
push!(new_records, segment_sequence)
end
return new_records
end
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# Calculate the start and end of the segment
start_pos = (i - 1) * segment_length + 1
end_pos = min(i * segment_length, full_length)
# Extract the segment sequence
segment_sequence = full_sequence[start_pos:end_pos]
# Add the new record to the array
push!(new_records, segment_sequence)
end
return new_records
end
function break_into_smaller_qualities(record::BAM.Record, segment_length::Int)
full_sequence = BAM.quality(record)
full_length = length(full_sequence)
offset = 33
num_segments = ceil(Int, full_length / segment_length)
new_records = []
for i in 1:num_segments
start_pos = (i - 1) * segment_length + 1
end_pos = min(i * segment_length, full_length)
new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]]
push!(new_records, join(new_chars))
end
return new_records
end
function new_positions(record::BAM.Record, segment_length::Int)
# Get the full sequence of the read, including unaligned regions
full_sequence = BAM.sequence(record)
full_length = length(full_sequence)
# Calculate the number of segments
num_segments = ceil(Int, full_length / segment_length)
# Placeholder for new records
new_records = []
# Generate new records for each segment
for i in 1:num_segments
# Calculate the start and end of the segment
start_pos = (i - 1) * segment_length + BAM.position(record)
# Add the new record to the array
push!(new_records, start_pos)
end
return new_records
end
function main()
parsed_args = parse_commandline()
input_file = parsed_args["input_file"]
shred_length = parsed_args["shred_length"]
split_BAM_records(input_file, shred_length)
end
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment