Last active
March 11, 2024 13:09
-
-
Save jelber2/47129820373474a768dacabc0e686fdc to your computer and use it in GitHub Desktop.
Shred alignments in a BAM file (output is a SAM file without header) to make long read alignments into short-read-like
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# previous versions allowed to generate directly from BAM, but there seemed to have been some troubles | |
# between versions of XAM, so this version is working so far for its intended purpose on https://github.com/brendanofallon/jovian | |
# also, this version is rather fast | |
# | |
# | |
# note: ignores quality scores at the moment - fixed in revision#4 | |
# note2: outputs to STDOUT a SAM file without a header | |
# tested with julialang v1.10.2 and XAM v0.4.0 | |
# $ julia shredBAM.jl input.bam 300 > test.sam.noheader | |
# | |
# to add a header from the original BAM file that was shredded | |
# and add on a PG line | |
# $ SHRED_LENGTH=300 | |
# $ INPUT=input.bam | |
# $ INPUT2=test.sam.noheader | |
# this is the command to make the BAM file | |
# $ cat <(samtools view -H $INPUT) <(echo -e "@PG\tID:shredBAM.jl\tPN:shredBAM.jl\tVN:0.16\tCL:julia shredBAM.jl $INPUT $SHRED_LENGTH") $INPUT2 |samtools sort > test.bam | |
# | |
using XAM | |
using Base.Iterators: partition | |
using ArgParse | |
function parse_commandline() | |
s = ArgParseSettings() | |
@add_arg_table s begin | |
"input_file" | |
help = "Path to the input BAM file" | |
required = true | |
"shred_length" | |
help = "Length of the shredded segments" | |
required = true | |
arg_type = Int | |
end | |
return parse_args(s) | |
end | |
function split_cigar(cigar_string::String, split_length::Int) | |
# Regular expression to match CIGAR operations | |
cigar_pattern = r"(\d+)([MIDNSHP=X])" | |
operations = collect(eachmatch(cigar_pattern, cigar_string)) | |
# Pre-allocate the split_cigars array with an estimated size | |
split_cigars = String[] | |
sizehint!(split_cigars, length(operations)) | |
current_cigar = "" | |
current_length = 0 | |
for op in operations | |
length = parse(Int, op.captures[1]) | |
operation = op.captures[2] | |
while length > 0 | |
if operation == "D" || operation == "H" | |
current_cigar *= "$(length)$(operation)" | |
break | |
else | |
add_length = min(length, split_length - current_length) | |
current_cigar *= "$(add_length)$(operation)" | |
length -= add_length | |
current_length += add_length | |
if current_length == split_length | |
push!(split_cigars, current_cigar) | |
current_cigar = "" | |
current_length = 0 | |
end | |
end | |
end | |
end | |
if current_cigar != "" | |
push!(split_cigars, current_cigar) | |
end | |
return split_cigars | |
end | |
function split_BAM_records(input_file::String, shred_length::Int) | |
reader = open(BAM.Reader, input_file) | |
record = BAM.Record() | |
while !eof(reader) | |
empty!(record) | |
read!(reader, record) | |
if BAM.ismapped(record) | |
if BAM.isprimaryalignment(record) | |
# Define your logic to break the record into smaller parts | |
# For example, split the record based on some criteria | |
smaller_sequences = break_into_smaller_sequences(record, shred_length) | |
smaller_qualities = break_into_smaller_qualities(record, shred_length) | |
smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length) | |
smaller_templates = template_names(record, shred_length) | |
smaller_pos = new_positions(record, shred_length) | |
# Write the new smaller records to the output file | |
for (small_seq, small_cigar, small_temp, small_pos, small_qualities) in zip(smaller_sequences, smaller_cigars, smaller_templates, smaller_pos, smaller_qualities) | |
println("$small_temp", "\t", 0, "\t", BAM.refname(record), "\t", "$small_pos", "\t", BAM.mappingquality(record), "\t", "$small_cigar", "\t*\t0\t0\t", "$small_seq", "\t", "$small_qualities") | |
end | |
end | |
end | |
end | |
close(reader) | |
end | |
function template_names(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# template name | |
new_template_name = string(BAM.tempname(record)) * "_" * string(i) | |
# Add the new record to the array | |
push!(new_records, new_template_name) | |
end | |
return new_records | |
end | |
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# Calculate the start and end of the segment | |
start_pos = (i - 1) * segment_length + 1 | |
end_pos = min(i * segment_length, full_length) | |
# Extract the segment sequence | |
segment_sequence = full_sequence[start_pos:end_pos] | |
# Add the new record to the array | |
push!(new_records, segment_sequence) | |
end | |
return new_records | |
end | |
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# Calculate the start and end of the segment | |
start_pos = (i - 1) * segment_length + 1 | |
end_pos = min(i * segment_length, full_length) | |
# Extract the segment sequence | |
segment_sequence = full_sequence[start_pos:end_pos] | |
# Add the new record to the array | |
push!(new_records, segment_sequence) | |
end | |
return new_records | |
end | |
function break_into_smaller_qualities(record::BAM.Record, segment_length::Int) | |
full_sequence = BAM.quality(record) | |
full_length = length(full_sequence) | |
offset = 33 | |
num_segments = ceil(Int, full_length / segment_length) | |
new_records = [] | |
for i in 1:num_segments | |
start_pos = (i - 1) * segment_length + 1 | |
end_pos = min(i * segment_length, full_length) | |
new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]] | |
push!(new_records, join(new_chars)) | |
end | |
return new_records | |
end | |
function new_positions(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# Calculate the start and end of the segment | |
start_pos = (i - 1) * segment_length + BAM.position(record) | |
# Add the new record to the array | |
push!(new_records, start_pos) | |
end | |
return new_records | |
end | |
function main() | |
parsed_args = parse_commandline() | |
input_file = parsed_args["input_file"] | |
shred_length = parsed_args["shred_length"] | |
split_BAM_records(input_file, shred_length) | |
end | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment