# New in revision 2: multithreaded support - up to twice as fast as the single-threaded revision 1.
# Invoke with:
# julia --threads 16 Plot_Animated_Nanopore_Quality_x_Length_2D_Contours.jl --input_file test.bam --fps 0.7 --output_file test.gif
# Needs many threads to benefit (up to 16; above 16 threads, the cores-vs-time plot saturates).
using XAM
using ArgParse
using DataFrames
using Dates
using Statistics
using Base.MathConstants
using StatsBase
using StatsPlots
using KernelDensity
using Base.Threads
function parse_commandline()
    s = ArgParseSettings()
    @add_arg_table s begin
        "--input_file", "-i"
            help = "Path to the input BAM file"
            required = true
        "--fps", "-f"
            help = "Frames per second for the animation (try maybe 0.35)"
            arg_type = Float64
            required = true
        "--output_file", "-o"
            help = "Output file name for the animation (try output.gif)"
            required = true
    end
    return parse_args(s)
end
function get_BAM_records(input_file::String)
    println()
    println("Reading BAM index to preallocate the df size.")
    input_file_index = string(input_file, ".bai")
    local total_records
    try
        total_records = XAM.BAM.BAI(input_file_index).n_no_coors
    catch e
        println("Please use samtools index, as $input_file_index is not available in the same directory as $input_file.")
        println()
        exit(1)
    end
    println("Total records in the BAM file: $total_records")
    # col1 = average read quality, col2 = read start time (auxiliary field), col3 = read length
    df = DataFrame(col1=Vector{Int}(undef, total_records), col2=Vector{Any}(undef, total_records), col3=Vector{Int}(undef, total_records))
    reader = open(BAM.Reader, input_file)
    counter = Atomic{Int}(0)
    reader_lock = ReentrantLock()
    println()
    println("Reading BAM records to add to the df.")
    Threads.@threads for i in 1:total_records
        local_record = BAM.Record()
        # the reader is not thread-safe, so serialize reads behind a lock
        lock(reader_lock)
        read!(reader, local_record)
        unlock(reader_lock)
        # average quality = Phred score of the mean per-base error probability
        df.col1[i] = Int(round(-10 * log10(sum(y -> 10^(Int(y) / -10), BAM.quality(local_record)) / length(BAM.quality(local_record))), RoundNearestTiesUp))
        # the 7th auxiliary value is assumed to hold the read start time
        df.col2[i] = values(local_record)[7]
        df.col3[i] = BAM.seqlength(local_record)
        atomic_add!(counter, 1)
        print("\rRead $(counter[]) BAM records.")
        flush(stdout)
    end
    println()
    println("Done reading BAM records.")
    close(reader)
    return df
end
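
# Worked example (illustration only, hypothetical values): the average quality in
# get_BAM_records is the Phred score of the mean per-base error probability,
# Q = -10*log10(mean(10 .^ (q ./ -10))), not the arithmetic mean of the Phred scores,
# so low-quality bases pull the average down more than mean(q) would.
let q = [20, 30, 10]
    p = mean(10 .^ (q ./ -10))                              # mean error probability ≈ 0.037
    q_avg = Int(round(-10 * log10(p), RoundNearestTiesUp))  # 14, versus mean(q) == 20
end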
function process_BAM_records(df::DataFrame)
    println("Begin plotting.")
    ez1_orig = df
    # Convert the second column (start-time strings) to DateTime, dropping the UTC offset
    ez1_orig.col2 = [DateTime(split(dt, "+")[1], "yyyy-mm-ddTHH:MM:SS.sss") for dt in ez1_orig.col2]
    # Floor the earliest datetime to the start of its day (rows are not in time order after threaded reading)
    start_time = floor(minimum(ez1_orig.col2), Day)
    # Create a sequence of hourly intervals from the start of the first day to the end of the last day
    intervals = start_time:Hour(1):floor(maximum(ez1_orig.col2), Day) + Day(1) - Minute(1)
    # New column in ez1_orig to store the assigned intervals
    ez1_orig.interval = similar(ez1_orig.col2, DateTime)
    # Assign each datetime in ez1_orig.col2 to the nearest interval
    for i in 1:nrow(ez1_orig)
        index = searchsorted(intervals, ez1_orig.col2[i]).stop
        if index == 0
            ez1_orig.interval[i] = intervals[1]
        elseif index > 1 && (ez1_orig.col2[i] - intervals[index-1]) < (intervals[index] - ez1_orig.col2[i])
            ez1_orig.interval[i] = intervals[index-1]
        else
            ez1_orig.interval[i] = intervals[index]
        end
    end
    sort!(ez1_orig, :interval)
    # Plot read length on a log10 scale
    ez1_orig.col3 = log10.(ez1_orig.col3)
    # Create an empty base plot
    plt = plot(xlim = (0, 40), ylim = (0, 6),
               title = "Hourly Read Quality by Length", xlabel = "Average Read Quality Phred Score",
               ylabel = "Read Length Bases", size = (800, 600), legend = true)
    # Compute a 2D KDE of quality x log10(length) for each hourly interval
    result = combine(groupby(ez1_orig, :interval)) do subdf
        x = subdf.col1
        y = subdf.col3
        xmin, xmax = 0, 40
        ymin, ymax = 0, 6  # adjust the range for col3 based on your data
        kde_data = kde((x, y); boundary = ((xmin, xmax), (ymin, ymax)), npoints = (100, 100))
        return kde_data
    end
    # Create the animation, one frame per interval
    anim = @animate for i in 1:nrow(result)
        animate_density!(plt, result, i, ez1_orig)
    end
    return anim
end
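
# Illustration (hypothetical timestamps, not part of the pipeline): the
# interval-assignment loop above snaps each read's start time to the nearest
# hourly grid point. searchsorted returns an empty range whose .stop is the
# index of the last grid point <= t, so comparing distances to the two
# neighboring grid points picks the nearer one.
let grid = DateTime(2024, 1, 1):Hour(1):DateTime(2024, 1, 1, 23)
    t = DateTime(2024, 1, 1, 2, 40)                  # 02:40 lies between 02:00 and 03:00
    i = searchsorted(grid, t).stop                   # index of 02:00
    nearest = (t - grid[i]) < (grid[i+1] - t) ? grid[i] : grid[i+1]  # 03:00
end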
# Plot one frame: the 2D KDE for the given interval
function animate_density!(plt, result, frame, ez1_orig)
    interval = result.interval[frame]
    row = result[frame, :]
    # row.x1 holds the BivariateKDE returned by combine for this interval
    plot(plt, row.x1,
         title = "\"Hourly\" Avg Read Qual by Length, Interval $interval",
         xlims = (0, 40), ylims = (0, 6), xlabel = "Average Read Quality Phred Score",
         ylabel = "Read Length Bases",
         yticks = ([1, 2, 3, 4, 5, 6], ["10", "100", "1000", "10000", "100000", "1000000"]))
end
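
# Illustration (synthetic data, not part of the pipeline): each frame above
# renders a KernelDensity.BivariateKDE object through a plot recipe. A roughly
# equivalent manual rendering, for reference; k.density is
# length(k.x) x length(k.y), so it is transposed for Plots' contour(x, y, z)
# convention.
let x = randn(500) .* 5 .+ 20, y = rand(500) .* 6
    k = kde((x, y); boundary = ((0, 40), (0, 6)), npoints = (100, 100))
    contour(k.x, k.y, permutedims(k.density))
end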
function main()
    parsed_args = parse_commandline()
    input_file = parsed_args["input_file"]
    fps = parsed_args["fps"]
    output_file = parsed_args["output_file"]
    df = get_BAM_records(input_file)
    anim = process_BAM_records(df)
    # Save the animation with the specified output file name
    gif(anim, output_file, fps = fps, show_msg = false)
end
main()
This is what the same data looks like side by side after error-correcting the SUP Nanopore reads with Herro (https://github.com/lbcb-sci/herro), error-correcting with k=19 using brutal rewrite (https://github.com/natir/br), two rounds of overlap-based error correction with peregrine-2021 (https://github.com/cschin/peregrine-2021), and error-correcting with k=21 using DeChat (https://github.com/LuoGroup2023/DeChat) (skipping the hifiasm step).

Phred scores for the SUP_Herro-pg_asm2x-dechat reads were estimated by aligning these HG002 reads to hg002v1.0.1.fasta (https://github.com/marbl/HG002) and recalibrating with BBTools/BBMap's calctruequality.sh followed by bbduk.sh recalibrate (https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/).
The script can use 1-16 threads; beyond 16, additional cores yield no further speedup (performance saturates).