Skip to content

Instantly share code, notes, and snippets.

@jelber2
Last active May 8, 2024 12:42
Show Gist options
  • Save jelber2/547eb590b6d8eb3ce61eac16948b3256 to your computer and use it in GitHub Desktop.
Save jelber2/547eb590b6d8eb3ce61eac16948b3256 to your computer and use it in GitHub Desktop.
Plots Quality Score Density Distribution per Hour of a Sequencing Run
# note: the Dorado BAM file produced by basecalling POD5 files needs to be indexed with "samtools index input_file"
# note2: something seems really strange about the qualtiy scores, I have checked several times, but they seem correct
# I am not sure why they are reporting that for example reads with Q38 average quality scores.
# EDIT: this is fixed, see https://www.biostars.org/p/295932/#295936 for better formula for average Phred Score
# tested with julia-1.10.3 and XAM.jl-0.40
using XAM
using ArgParse
using DataFrames
using Dates
using Plots
using Statistics
using Plots.PlotMeasures
using Base.MathConstants
using StatsBase
using KernelDensity
function parse_commandline()
s = ArgParseSettings()
@add_arg_table s begin
"--input_file", "-i"
help = "Path to the input BAM file"
required = true
"--fps", "-f"
help = "Frames per second for the animation (try maybe 5)"
arg_type = Int
required = true
"--output_file", "-o"
help = "Output file name for the animation (try output.gif)"
required = true
end
return parse_args(s)
end
function get_BAM_records(input_file::String)
println()
println("Reading BAM Index for preallocating df size.")
input_file_index=string(input_file, ".bai")
try
total_records = XAM.BAM.BAI(input_file_index).n_no_coors
catch e
println("Please use samtools index as $input_file_index is not available in the same directory as $input_file.")
end
total_records = XAM.BAM.BAI(input_file_index).n_no_coors
println("Total records in the BAM file: $total_records")
df = DataFrame(col1=Vector{Int}(undef, total_records), col2=Vector{Any}(undef, total_records))
reader = open(BAM.Reader, input_file)
record = BAM.Record()
counter = 0
println()
println("Reading BAM records for adding to df.")
for i in 1:total_records
empty!(record)
read!(reader, record)
df.col1[i] = Int(round(-10*log10(sum(y->10^(Int(Char(y))/-10), BAM.quality(record))/length(BAM.quality(record))), RoundNearestTiesUp))
df.col2[i] = values(record)[7]
counter += 1
print("\rRead $counter BAM records.")
flush(stdout)
end
println()
println("Done reading BAM records for adding to df.")
close(reader)
return df
end
function process_BAM_records(df::DataFrame)
println("Begin plotting.")
# Read the data
ez1_orig = df
# Convert the second column to DateTime
ez1_orig.col2 = [DateTime(split(dt, "+")[1], "yyyy-mm-ddTHH:MM:SS.sss") for dt in ez1_orig.col2]
# Convert the first datetime to the start of the day
start_time = floor(ez1_orig.col2[1], Day)
# Create a sequence of times from the start of the day to the end of the day, with intervals of 15 minutes
intervals = start_time:Hour(1):floor(maximum(ez1_orig.col2), Day) + Day(1) - Minute(1)
# Create a new column named interval in ez1_orig to store the assigned intervals
ez1_orig.interval = similar(ez1_orig.col2, DateTime)
# Assign each datetime in ez1_orig.Column2 to the nearest interval
for i in 1:nrow(ez1_orig)
index = searchsorted(intervals, ez1_orig.col2[i]).stop
if index == 0
ez1_orig.interval[i] = intervals[1]
elseif index > 1 && (ez1_orig.col2[i] - intervals[index-1]) < (intervals[index] - ez1_orig.col2[i])
ez1_orig.interval[i] = intervals[index-1]
else
ez1_orig.interval[i] = intervals[index]
end
end
result = combine(groupby(df, :interval), :col1 => (x -> kde(x, boundary=(0,40), npoints=10000)) => :density)
sort!(result, :interval)
density = Float64[]
for i in 1:nrow(result)
row = result[i, :]
density = push!(density, maximum(row.density.density))
end
# Create an empty plot
plt = plot(xlim = (0, 40), ylim = (0,maximum(density)),
title = "Hourly Average Read Quality", xlabel = "Quality Score", ylabel = "Density",
size=(800,600), legend=false)
# Create the animation
anim = @animate for i in 1:nrow(result)
animate_density!(plt, result, i, density)
end
end
# Define the animation function
function animate_density!(plt, result, frame, density)
interval = result.interval[frame]
row = result[frame, :]
plot(plt, row.density.x, row.density.density,
title = "\"Hourly\" Average Read Quality, Interval $interval",
fillrange = 0, fillalpha = 0.35, linecolor = :blue,
linealpha = 0.5, linewidth = 2,
xlims = (0, 40), ylims = (0, maximum(density)), xlabel = "Quality Score", ylabel = "Density")
end
function main()
parsed_args = parse_commandline()
input_file = parsed_args["input_file"]
fps = parsed_args["fps"]
output_file = parsed_args["output_file"]
df = get_BAM_records(input_file)
anim = process_BAM_records(df)
# Save the animation with the specified output file name
gif(anim, output_file, fps=fps, show_msg=false)
end
main()
@jelber2
Copy link
Author

jelber2 commented May 8, 2024

WGS_HG002_EZ1_10kb_SUP_NEW bam quality

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment