Last active
May 8, 2024 12:42
-
-
Save jelber2/547eb590b6d8eb3ce61eac16948b3256 to your computer and use it in GitHub Desktop.
Plots Quality Score Density Distribution per Hour of a Sequencing Run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# note: the Dorado BAM file produced by basecalling POD5 files needs to be indexed with "samtools index input_file" | |
# note2: something seems really strange about the qualtiy scores, I have checked several times, but they seem correct | |
# I am not sure why they are reporting that for example reads with Q38 average quality scores. | |
# EDIT: this is fixed, see https://www.biostars.org/p/295932/#295936 for better formula for average Phred Score | |
# tested with julia-1.10.3 and XAM.jl-0.40 | |
using XAM | |
using ArgParse | |
using DataFrames | |
using Dates | |
using Plots | |
using Statistics | |
using Plots.PlotMeasures | |
using Base.MathConstants | |
using StatsBase | |
using KernelDensity | |
function parse_commandline() | |
s = ArgParseSettings() | |
@add_arg_table s begin | |
"--input_file", "-i" | |
help = "Path to the input BAM file" | |
required = true | |
"--fps", "-f" | |
help = "Frames per second for the animation (try maybe 5)" | |
arg_type = Int | |
required = true | |
"--output_file", "-o" | |
help = "Output file name for the animation (try output.gif)" | |
required = true | |
end | |
return parse_args(s) | |
end | |
function get_BAM_records(input_file::String) | |
println() | |
println("Reading BAM Index for preallocating df size.") | |
input_file_index=string(input_file, ".bai") | |
try | |
total_records = XAM.BAM.BAI(input_file_index).n_no_coors | |
catch e | |
println("Please use samtools index as $input_file_index is not available in the same directory as $input_file.") | |
end | |
total_records = XAM.BAM.BAI(input_file_index).n_no_coors | |
println("Total records in the BAM file: $total_records") | |
df = DataFrame(col1=Vector{Int}(undef, total_records), col2=Vector{Any}(undef, total_records)) | |
reader = open(BAM.Reader, input_file) | |
record = BAM.Record() | |
counter = 0 | |
println() | |
println("Reading BAM records for adding to df.") | |
for i in 1:total_records | |
empty!(record) | |
read!(reader, record) | |
df.col1[i] = Int(round(-10*log10(sum(y->10^(Int(Char(y))/-10), BAM.quality(record))/length(BAM.quality(record))), RoundNearestTiesUp)) | |
df.col2[i] = values(record)[7] | |
counter += 1 | |
print("\rRead $counter BAM records.") | |
flush(stdout) | |
end | |
println() | |
println("Done reading BAM records for adding to df.") | |
close(reader) | |
return df | |
end | |
function process_BAM_records(df::DataFrame) | |
println("Begin plotting.") | |
# Read the data | |
ez1_orig = df | |
# Convert the second column to DateTime | |
ez1_orig.col2 = [DateTime(split(dt, "+")[1], "yyyy-mm-ddTHH:MM:SS.sss") for dt in ez1_orig.col2] | |
# Convert the first datetime to the start of the day | |
start_time = floor(ez1_orig.col2[1], Day) | |
# Create a sequence of times from the start of the day to the end of the day, with intervals of 15 minutes | |
intervals = start_time:Hour(1):floor(maximum(ez1_orig.col2), Day) + Day(1) - Minute(1) | |
# Create a new column named interval in ez1_orig to store the assigned intervals | |
ez1_orig.interval = similar(ez1_orig.col2, DateTime) | |
# Assign each datetime in ez1_orig.Column2 to the nearest interval | |
for i in 1:nrow(ez1_orig) | |
index = searchsorted(intervals, ez1_orig.col2[i]).stop | |
if index == 0 | |
ez1_orig.interval[i] = intervals[1] | |
elseif index > 1 && (ez1_orig.col2[i] - intervals[index-1]) < (intervals[index] - ez1_orig.col2[i]) | |
ez1_orig.interval[i] = intervals[index-1] | |
else | |
ez1_orig.interval[i] = intervals[index] | |
end | |
end | |
result = combine(groupby(df, :interval), :col1 => (x -> kde(x, boundary=(0,40), npoints=10000)) => :density) | |
sort!(result, :interval) | |
density = Float64[] | |
for i in 1:nrow(result) | |
row = result[i, :] | |
density = push!(density, maximum(row.density.density)) | |
end | |
# Create an empty plot | |
plt = plot(xlim = (0, 40), ylim = (0,maximum(density)), | |
title = "Hourly Average Read Quality", xlabel = "Quality Score", ylabel = "Density", | |
size=(800,600), legend=false) | |
# Create the animation | |
anim = @animate for i in 1:nrow(result) | |
animate_density!(plt, result, i, density) | |
end | |
end | |
# Define the animation function | |
function animate_density!(plt, result, frame, density) | |
interval = result.interval[frame] | |
row = result[frame, :] | |
plot(plt, row.density.x, row.density.density, | |
title = "\"Hourly\" Average Read Quality, Interval $interval", | |
fillrange = 0, fillalpha = 0.35, linecolor = :blue, | |
linealpha = 0.5, linewidth = 2, | |
xlims = (0, 40), ylims = (0, maximum(density)), xlabel = "Quality Score", ylabel = "Density") | |
end | |
function main() | |
parsed_args = parse_commandline() | |
input_file = parsed_args["input_file"] | |
fps = parsed_args["fps"] | |
output_file = parsed_args["output_file"] | |
df = get_BAM_records(input_file) | |
anim = process_BAM_records(df) | |
# Save the animation with the specified output file name | |
gif(anim, output_file, fps=fps, show_msg=false) | |
end | |
main() |
Author
jelber2
commented
May 8, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment