Last active
July 15, 2024 09:46
-
-
Save jelber2/4053b3c10a495c331e5420c54b9b1d6c to your computer and use it in GitHub Desktop.
Takes reads processed by shredBAM.jl and then makes hi-c pairs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# revision 2, now supports multi-core processing | |
# Basic idea is to take a long read alignment file that has been shredded with shredBAM.jl, then | |
# output hi-c-like read-pairs | |
# | |
# So for example, | |
# ----------------------- 1long read | |
# 1r1 2r1 3r1 3r2 2r2 1r2 | |
# results in 1read1, 1read2 | |
# 2read1, 2read2 | |
# 3read1, 3read2 | |
# in an interleaved FASTA file ready, hopefully for BWA-MEM or BWA-MEM2 | |
# pretty slow at the moment | |
# see https://gist.github.com/jelber2/47129820373474a768dacabc0e686fdc for running shredBAM.jl, which is also pretty slow | |
# | |
# note, need to name sort the BAM file with samtools sort -@threads -n input > input.name.sorted.bam !!FIRST!! | |
# | |
# Usage | |
# julia long-reads-shredded-2-hi-c-pairs.jl hg002.herro.Q30.sam1.3.noSoftClip.22.shredBAM.jl.150.name.sorted.bam makepairs.fasta & | |
# | |
# | |
using XAM | |
using DataFrames | |
using DataFramesMeta | |
using ArgParse | |
using Base.Threads | |
function parse_commandline() | |
s = ArgParseSettings() | |
@add_arg_table s begin | |
"input_file" | |
help = "Path to the input BAM file" | |
required = true | |
"output_file" | |
help = "Path to the output FASTA file" | |
required = true | |
end | |
return parse_args(s) | |
end | |
function make_pairs(input_file::String, output_file::String) | |
counter1 = Atomic{Int}(0) | |
reads = DataFrame(read_id=String[], sequence=String[]) | |
reader = open(BAM.Reader, input_file) | |
record = BAM.Record() | |
while !eof(reader) | |
empty!(record) | |
read!(reader, record) | |
if BAM.ismapped(record) | |
if BAM.isprimaryalignment(record) | |
push!(reads.read_id, BAM.tempname(record)) | |
push!(reads.sequence, string(BAM.sequence(record))) | |
end | |
end | |
atomic_add!(counter1, 1) | |
print("\rRead $(counter1[]) BAM records.") | |
flush(stdout) | |
end | |
close(reader) | |
println("") | |
reads = transform(reads, :read_id => ByRow(x -> replace(x, r"_\d+$" => "")) => :read_id_new) | |
reads2 = unique(reads.read_id_new) | |
# Open output file | |
output_file = open(output_file, "w") | |
# Pre-allocate a buffer for each thread | |
buffers = [IOBuffer() for _ in 1:nthreads()] | |
local_buffer_lock = ReentrantLock() | |
relevant_reads_lock = ReentrantLock() | |
n_rows_lock = ReentrantLock() | |
counter = Atomic{Int}(0) | |
relevant_reads_lock = ReentrantLock() | |
counter_lock = ReentrantLock() | |
Threads.@threads :static for read_id_new in reads2 | |
local_buffer = buffers[threadid()] | |
relevant_reads = lock(relevant_reads_lock) do | |
reads[reads.read_id_new .== read_id_new, :] | |
end | |
n_rows = size(relevant_reads, 1) | |
if iseven(n_rows) | |
number_pairs = n_rows ÷ 2 | |
for i in 1:number_pairs | |
local_counter = lock(counter_lock) do | |
atomic_add!(counter, 1) | |
counter[] | |
end | |
write(local_buffer, ">seq_$(local_counter)_1\n") | |
write(local_buffer, relevant_reads[i, :sequence], "\n") | |
write(local_buffer, ">seq_$(local_counter)_2\n") | |
write(local_buffer, relevant_reads[n_rows-i+1, :sequence], "\n") | |
print("\rWrote $(counter[]) FASTA record pairs.") | |
flush(stdout) | |
end | |
else | |
number_pairs = (n_rows-1) ÷ 2 | |
for i in 1:number_pairs | |
local_counter = lock(counter_lock) do | |
atomic_add!(counter, 1) | |
counter[] | |
end | |
write(local_buffer, ">seq_$(local_counter)_1\n") | |
write(local_buffer, relevant_reads[i, :sequence], "\n") | |
write(local_buffer, ">seq_$(local_counter)_2\n") | |
write(local_buffer, relevant_reads[n_rows-i, :sequence], "\n") | |
print("\rWrote $(counter[]) FASTA record pairs.") | |
flush(stdout) | |
end | |
end | |
end | |
# Write all buffers to file | |
for buffer in buffers | |
write(output_file, take!(buffer)) | |
end | |
close(output_file) | |
end | |
function main() | |
parsed_args = parse_commandline() | |
input_file = parsed_args["input_file"] | |
output_file = parsed_args["output_file"] | |
make_pairs(input_file, output_file) | |
end | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment