Last active
August 22, 2024 13:22
-
-
Save jelber2/01511d3b47c6f9ab610a9a599603733a to your computer and use it in GitHub Desktop.
shreds a BAM file into non-overlapping pairs keeping the alignment positions of each shred
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Revisions 1 & 2 use the same read name read1 and read2, this | |
# revision 3 appends _1 and _2 to read1 and read2, respectively. | |
# | |
# Basic idea is to take a long read alignment file that, then shred the long reads into shreds, keeping their alignment information | |
# output hi-c-like read-pairs, interleaved in a BAM file! | |
# | |
# So for example, | |
# ----------------------- 1long read | |
# 1r1 2r1 3r1 3r2 2r2 1r2 | |
# results in 1read1, 1read2 | |
# 2read1, 2read2 | |
# 3read1, 3read2 | |
# | |
# | |
# Example usage to convert a normal long-read BAM file to shredded BAM: | |
# | |
# cat <(samtools view -H test.bam) \ | |
# <(echo -e "@PG\tID:shredBAM-pairs.jl\tPN:shredBAM-pairs.jl\tVN:0.01\tCL:julia shredBAM-pairs.jl test.bam 500") \ | |
# <(julia /home/jelber43/anonymous/shredBAM-pairs.jl anonymous.fasta.gz-against-finalasm.bp.p_ctg.ul.fa.gz.bam 500) | \ | |
# samtools view -Sb -@8 > test.shredBAM-pairs.jl.500.bam | |
# | |
# | |
using XAM | |
using Base.Iterators: partition | |
using ArgParse | |
function parse_commandline() | |
s = ArgParseSettings() | |
@add_arg_table s begin | |
"input_file" | |
help = "Path to the input BAM file" | |
required = true | |
"shred_length" | |
help = "Length of the shredded segments" | |
required = true | |
arg_type = Int | |
end | |
return parse_args(s) | |
end | |
function split_cigar(cigar_string::String, split_length::Int) | |
# Regular expression to match CIGAR operations | |
cigar_pattern = r"(\d+)([MIDNSHP=X])" | |
operations = collect(eachmatch(cigar_pattern, cigar_string)) | |
# Pre-allocate the split_cigars array with an estimated size | |
split_cigars = String[] | |
sizehint!(split_cigars, length(operations)) | |
current_cigar = "" | |
current_length = 0 | |
for op in operations | |
length = parse(Int, op.captures[1]) | |
operation = op.captures[2] | |
while length > 0 | |
if operation == "D" || operation == "H" | |
current_cigar *= "$(length)$(operation)" | |
break | |
else | |
add_length = min(length, split_length - current_length) | |
current_cigar *= "$(add_length)$(operation)" | |
length -= add_length | |
current_length += add_length | |
if current_length == split_length | |
push!(split_cigars, current_cigar) | |
current_cigar = "" | |
current_length = 0 | |
end | |
end | |
end | |
end | |
if current_cigar != "" | |
push!(split_cigars, current_cigar) | |
end | |
return split_cigars | |
end | |
function split_BAM_records(input_file::String, shred_length::Int) | |
reader = open(BAM.Reader, input_file) | |
record = BAM.Record() | |
counter = 0 | |
while !eof(reader) | |
empty!(record) | |
read!(reader, record) | |
if BAM.ismapped(record) | |
if BAM.isprimaryalignment(record) | |
# Define your logic to break the record into smaller parts | |
# For example, split the record based on some criteria | |
smaller_sequences = break_into_smaller_sequences(record, shred_length) | |
smaller_qualities = break_into_smaller_qualities(record, shred_length) | |
smaller_cigars = split_cigar(string(BAM.cigar(record)), shred_length) | |
smaller_templates = template_names(record, shred_length) | |
smaller_pos = new_positions(record, shred_length) | |
# Write the new smaller records to the output file | |
if iseven(length(smaller_sequences)) | |
number_pairs = length(smaller_sequences) ÷ 2 | |
for i in 1:number_pairs | |
counter = counter + 1 | |
println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i]) | |
println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i+1], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i+1], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i+1], "\t", smaller_qualities[length(smaller_sequences)-i+1]) | |
print(stderr, "\rWrote $(counter[]) BAM record pairs.") | |
flush(stderr) | |
end | |
else | |
number_pairs = (length(smaller_sequences)-1) ÷ 2 | |
for i in 1:number_pairs | |
counter = counter + 1 | |
println("seq_$(counter)_1", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[i], "\t*\t0\t0\t", smaller_sequences[i], "\t", smaller_qualities[i]) | |
println("seq_$(counter)_2", "\t", Int(BAM.flags(record)), "\t", BAM.refname(record), "\t", smaller_pos[length(smaller_sequences)-i], "\t", BAM.mappingquality(record), "\t", smaller_cigars[length(smaller_sequences)-i], "\t*\t0\t0\t", smaller_sequences[length(smaller_sequences)-i], "\t", smaller_qualities[length(smaller_sequences)-i]) | |
print(stderr, "\rWrote $(counter[]) BAM record pairs.") | |
flush(stderr) | |
end | |
end | |
end | |
end | |
end | |
close(reader) | |
end | |
function main() | |
parsed_args = parse_commandline() | |
input_file = parsed_args["input_file"] | |
output_file = parsed_args["output_file"] | |
make_pairs(input_file, output_file) | |
end | |
function template_names(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# template name | |
new_template_name = string(BAM.tempname(record)) * "_" * string(i) | |
# Add the new record to the array | |
push!(new_records, new_template_name) | |
end | |
return new_records | |
end | |
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# Calculate the start and end of the segment | |
start_pos = (i - 1) * segment_length + 1 | |
end_pos = min(i * segment_length, full_length) | |
# Extract the segment sequence | |
segment_sequence = full_sequence[start_pos:end_pos] | |
# Add the new record to the array | |
push!(new_records, segment_sequence) | |
end | |
return new_records | |
end | |
function break_into_smaller_sequences(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# Calculate the start and end of the segment | |
start_pos = (i - 1) * segment_length + 1 | |
end_pos = min(i * segment_length, full_length) | |
# Extract the segment sequence | |
segment_sequence = full_sequence[start_pos:end_pos] | |
# Add the new record to the array | |
push!(new_records, segment_sequence) | |
end | |
return new_records | |
end | |
function break_into_smaller_qualities(record::BAM.Record, segment_length::Int) | |
full_sequence = BAM.quality(record) | |
full_length = length(full_sequence) | |
offset = 33 | |
num_segments = ceil(Int, full_length / segment_length) | |
new_records = [] | |
for i in 1:num_segments | |
start_pos = (i - 1) * segment_length + 1 | |
end_pos = min(i * segment_length, full_length) | |
new_chars = [Char(UInt32(x) + offset) for x in @view full_sequence[start_pos:end_pos]] | |
push!(new_records, join(new_chars)) | |
end | |
return new_records | |
end | |
function new_positions(record::BAM.Record, segment_length::Int) | |
# Get the full sequence of the read, including unaligned regions | |
full_sequence = BAM.sequence(record) | |
full_length = length(full_sequence) | |
# Calculate the number of segments | |
num_segments = ceil(Int, full_length / segment_length) | |
# Placeholder for new records | |
new_records = [] | |
# Generate new records for each segment | |
for i in 1:num_segments | |
# Calculate the start and end of the segment | |
start_pos = (i - 1) * segment_length + BAM.position(record) | |
# Add the new record to the array | |
push!(new_records, start_pos) | |
end | |
return new_records | |
end | |
function main() | |
parsed_args = parse_commandline() | |
input_file = parsed_args["input_file"] | |
shred_length = parsed_args["shred_length"] | |
split_BAM_records(input_file, shred_length) | |
end | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment