Last active
March 26, 2024 10:04
-
-
Save jelber2/7163185d5e3f6ff81f8bf4620e0a3740 to your computer and use it in GitHub Desktop.
Plot changes in read length distribution of Nanopore Dorado called bases
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using XAM | |
using ArgParse | |
using DataFrames | |
using Dates | |
using Plots | |
using StatsBase | |
using Plots.PlotMeasures | |
function parse_commandline() | |
s = ArgParseSettings() | |
@add_arg_table s begin | |
"--input_file", "-i" | |
help = "Path to the input BAM file" | |
required = true | |
"--read_lengths", "-r" | |
help = "Read lengths interval size (try maybe 5000)" | |
arg_type = Int | |
required = true | |
"--fps", "-f" | |
help = "Frames per second for the animation (try maybe 5)" | |
arg_type = Int | |
required = true | |
"--xlims_max", "-x" | |
help = "Maximum value for xlims (try maybe 200000)" | |
arg_type = Int | |
required = true | |
"--output_file", "-o" | |
help = "Output file name for the animation (try output.gif)" | |
required = true | |
"--log10" | |
help = "Use log10 scale for the plot" | |
action = :store_true | |
end | |
return parse_args(s) | |
end | |
# Function to find the length of the shortest value in the running sum | |
function findShortestInRunningSum(values) | |
# Sort the values in descending order | |
sorted_values = sort(values, rev=true) | |
# Calculate the total sum | |
total_sum = sum(sorted_values) | |
# Calculate half of the total sum | |
half_sum = total_sum / 2 | |
# Initialize running sum and the length of the shortest value | |
running_sum = 0 | |
shortest_value = Inf | |
# Iterate through the sorted values | |
for value in sorted_values | |
running_sum += value | |
# If the running sum is equal to or greater than half of the total sum | |
if running_sum >= half_sum | |
# Update the shortest length if the current value's length is shorter | |
if value < shortest_value | |
shortest_value = value | |
end | |
break | |
end | |
end | |
return shortest_value | |
end | |
function get_BAM_records(input_file::String) | |
# Initialize an empty DataFrame with the desired column names | |
df = DataFrame(Column1 = Int[], Column2 = Any[]) | |
reader = open(BAM.Reader, input_file) | |
record = BAM.Record() | |
counter = 0 | |
println() | |
println("Reading BAM records.") | |
while !eof(reader) | |
empty!(record) | |
read!(reader, record) | |
# Append each record's BAM.seqlength(record) and values(record)[7] to the DataFrame | |
push!(df, [BAM.seqlength(record), values(record)[7]]) | |
counter += 1 | |
# Update the counter display | |
print("\rRead $counter BAM records.") | |
flush(stdout) | |
end | |
println("Done reading BAM records.") | |
println() | |
close(reader) | |
return df | |
end | |
function process_BAM_records(df::DataFrame, read_lengths_interval_size::Int, xlims_max::Int, use_log10=false) | |
println("Begin plotting.") | |
# Read the data | |
ez1_orig = df | |
# Convert the second column to DateTime | |
ez1_orig.Column2 = [DateTime(split(dt, "+")[1], "yyyy-mm-ddTHH:MM:SS.sss") for dt in ez1_orig.Column2] | |
# Convert the first datetime to the start of the day | |
start_time = floor(ez1_orig.Column2[1], Day) | |
# Create a sequence of times from the start of the day to the end of the day, with intervals of 15 minutes | |
intervals = start_time:Hour(1):floor(maximum(ez1_orig.Column2), Day) + Day(1) - Minute(1) | |
# Create a new column named interval in ez1_orig to store the assigned intervals | |
ez1_orig.interval = similar(ez1_orig.Column2, DateTime) | |
# Assign each datetime in ez1_orig.Column2 to the nearest interval | |
for i in 1:nrow(ez1_orig) | |
index = searchsorted(intervals, ez1_orig.Column2[i]).stop | |
if index == 0 | |
ez1_orig.interval[i] = intervals[1] | |
elseif index > 1 && (ez1_orig.Column2[i] - intervals[index-1]) < (intervals[index] - ez1_orig.Column2[i]) | |
ez1_orig.interval[i] = intervals[index-1] | |
else | |
ez1_orig.interval[i] = intervals[index] | |
end | |
end | |
# Initialize an empty DataFrame to store the results | |
results = DataFrame() | |
sort!(ez1_orig, :interval) | |
j = 0 | |
for hour in unique(ez1_orig.interval) | |
test = [] | |
filtered_df = ez1_orig[ez1_orig.interval .== hour, :] | |
read_lengths = 0:read_lengths_interval_size:ceil(maximum(ez1_orig.Column1)/read_lengths_interval_size)*read_lengths_interval_size | |
for i in 1:length(read_lengths)-1 | |
range_start = read_lengths[i] | |
range_end = read_lengths[i+1] | |
sum_in_range = sum(filtered_df.Column1[(filtered_df.Column1 .> range_start) .& (filtered_df.Column1 .<= range_end)]) | |
push!(test, sum_in_range) | |
end | |
test3 = DataFrame( | |
number_of_bases = test, | |
read_length = read_lengths[1:end-1], | |
hour = fill(hour, length(test)) | |
) | |
if isempty(results) | |
results = test3 | |
else | |
j += 1 | |
sum_row = results.number_of_bases[j:j+nrow(test3)-1] + test3.number_of_bases | |
j += nrow(test3) - 1 | |
test3.number_of_bases .= sum_row | |
append!(results, test3, promote=true) | |
end | |
end | |
sort!(results, :hour) | |
# Create the animation | |
anim = @animate for hour in unique(results.hour) | |
filtered_results = results[results.hour .== hour, :] | |
create_plot(filtered_results, results, ez1_orig, xlims_max, use_log10) | |
end | |
return anim | |
println("Done plotting.") | |
end | |
# Create a plot for each frame | |
function create_plot(filtered_results::DataFrame, results::DataFrame, ez1_orig::DataFrame, xlims_max::Int, use_log10=false) | |
if use_log10 | |
p = plot(log10.(filtered_results.read_length), filtered_results.number_of_bases, group=filtered_results.hour, seriestype=:bar, | |
xlabel="Log10 Read length", ylabel="Number of Bases", | |
title="$(round(sum(ez1_orig.Column1/1000000000), digits=2)) Gbp output, red line N50=$(round(findShortestInRunningSum(ez1_orig.Column1)/1000, digits=2)) Kbp", | |
xlims=(log10(minimum(filtered_results.read_length)), log10(xlims_max)), legend=false, ylims=(0, maximum(results.number_of_bases))) | |
else | |
p = plot(filtered_results.read_length, filtered_results.number_of_bases, group=filtered_results.hour, seriestype=:bar, | |
xlabel="Read length", ylabel="Number of Bases", | |
title="$(round(sum(ez1_orig.Column1/1000000000), digits=2)) Gbp output, red line N50=$(round(findShortestInRunningSum(ez1_orig.Column1)/1000, digits=2)) Kbp", | |
xlims=(0, xlims_max), legend=false, ylims=(0, maximum(results.number_of_bases))) | |
end | |
if use_log10 | |
vline!([log10(findShortestInRunningSum(ez1_orig.Column1))], color=:red, linestyle=:dash, linewidth=1) | |
else | |
vline!([findShortestInRunningSum(ez1_orig.Column1)], color=:red, linestyle=:dash, linewidth=1) | |
end | |
return p | |
end | |
function main() | |
parsed_args = parse_commandline() | |
input_file = parsed_args["input_file"] | |
read_lengths_interval_size = parsed_args["read_lengths"] | |
fps = parsed_args["fps"] | |
xlims_max = parsed_args["xlims_max"] | |
output_file = parsed_args["output_file"] | |
use_log10=parsed_args["log10"] | |
df = get_BAM_records(input_file) | |
anim = process_BAM_records(df, read_lengths_interval_size, xlims_max, use_log10) | |
# Save the animation with the specified output file name | |
gif(anim, output_file, fps=fps) | |
end | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment