Last active
May 28, 2024 10:24
-
-
Save jelber2/45225fa0937b59f72bbdddf8d694eb70 to your computer and use it in GitHub Desktop.
Using Nanopore Reads from HG002, estimate the phase switching with hapmers and readmers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get HG002 | |
# wget https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/hg002v1.0.1.fasta.gz | |
# gunzip hg002v1.0.1.fasta.gz | |
# | |
# Use seqtk to get maternal and paternal sequences | |
# https://github.com/lh3/seqtk | |
# seqtk seq -l0 hg002v1.0.1.fasta|paste - - |fgrep "MATERNAL" |tr '\t' '\n'|seqtk seq -l60 > hg002v1.0.1.maternal.fasta & | |
# seqtk seq -l0 hg002v1.0.1.fasta|paste - - |fgrep "PATERNAL" |tr '\t' '\n'|seqtk seq -l60 > hg002v1.0.1.paternal.fasta & | |
# | |
# use samtools to index haplotypes | |
# samtools index hg002v1.0.1.maternal.fasta | |
# samtools index hg002v1.0.1.paternal.fasta | |
# | |
# Use minimap2 and samtools to map un-corrected SUP reads to each haplotype separately | |
# minimap2 --eqx --secondary=no -t 248 -Y -c -ax lr:hq hg002v1.0.1.maternal.fasta WGS_HG002_EZ1_10kb_SUP_NEW.fastq.gz |samtools sort -@248 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.maternal.bam | |
# minimap2 --eqx --secondary=no -t 248 -Y -c -ax lr:hq hg002v1.0.1.paternal.fasta WGS_HG002_EZ1_10kb_SUP_NEW.fastq.gz |samtools sort -@248 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.bam | |
# | |
# Use samtools and minimap2's paftools.js to get sam2paf and retain | |
# only mapping quality Q60 and primary alignments of the following PAF columns in that order | |
# [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop] | |
# these are columns 1,3,4,5,6,8, and 9 in PAF file | |
# https://github.com/lh3/miniasm/blob/master/PAF.md | |
# samtools view -h -q60 -F 0x100 -@4 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.bam|paftools.js sam2paf - |cut -f 1,3-6,8-9 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.paf | |
# samtools view -h -q60 -F 0x100 -@4 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.bam|paftools.js sam2paf - |cut -f 1,3-6,8-9 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.paf | |
# | |
# get fai index for SUP reads | |
# samtools fasta -@16 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.bam > WGS_HG002_EZ1_10kb_SUP.fasta && \ | |
# samtools faidx WGS_HG002_EZ1_10kb_SUP.fasta | |
# | |
# make sure there is only occurrence of each read_id | |
# sort -uk1,1 --parallel=48 -S 300G WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.paf > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.unique.paf | |
# sort -uk1,1 --parallel=48 -S 300G WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.paf > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.unique.paf | |
# | |
# output is a tab-separated file with columns: | |
# read_id number_matching_hapmers_from_maternal_haplotype number_matching_hapmers_from_paternal_haplotype | |
# | |
# I theorized that one should have mostly matching hapmers for a single haplotype if no phase_switching, but "the proof | |
# is in the pudding" | |
# script works, just not sure if the actual concept works | |
using BioSequences | |
using FASTX | |
using Kmers | |
using CSV | |
using DataFrames | |
using Base.Threads | |
using ArgParse | |
function parse_commandline() | |
s = ArgParseSettings() | |
@add_arg_table s begin | |
"--maternal_paf_file", "-m" | |
help = "path to maternal paf file" | |
required = true | |
"--paternal_paf_file", "-p" | |
help = "path to paternal paf file" | |
required = true | |
"--nanopore_fasta", "-n" | |
help = "path to nanopore fasta file (SUP.fasta)" | |
required = true | |
"--maternal_fasta", "-i" | |
help = "path to maternal fasta file (hg002v1.0.1.maternal.fasta)" | |
required = true | |
"--paternal_fasta", "-j" | |
help = "path to paternal fasta file (hg002v1.0.1.paternal.fasta)" | |
required = true | |
"--kmer_length", "-k" | |
arg_type = Int64 | |
help = "for example 15" | |
required = true | |
"--output_file", "-o" | |
help = "Output file name (WGS_HG002_EZ1_10kb_SUP.kmer-matches.txt)" | |
required = true | |
end | |
return parse_args(s) | |
end | |
function phase_switches(maternal_paf_file::String, | |
paternal_paf_file::String, | |
nanopore_fasta::String, | |
maternal_fasta::String, | |
paternal_fasta::String, | |
kmer_length::Int64) | |
println() | |
println("Reading maternal paf file...") | |
maternal_paf = CSV.read(maternal_paf_file, | |
DataFrame; | |
delim='\t', | |
header=false, | |
types=[String, UInt32, UInt32, String, String, UInt32, UInt32]) | |
rename!(maternal_paf, [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop]) | |
println("Done reading maternal paf file.") | |
println() | |
println("Reading paternal paf file...") | |
paternal_paf = CSV.read(paternal_paf_file, | |
DataFrame; | |
delim='\t', | |
header=false, | |
types=[String, UInt32, UInt32, String, String, UInt32, UInt32]) | |
rename!(paternal_paf, [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop]) | |
println("Done reading paternal paf file.") | |
# Function to extract the chromosome part using regular expressions | |
function extract_chr_id(id::String) | |
return match.(r"chr\w+_", id) | |
end | |
println() | |
println("Extracting maternal chr ids...") | |
# Apply the function to create a new column for matching | |
maternal_paf[!, :chr_id] = extract_chr_id.(maternal_paf.ref_id) | |
maternal_paf[!, :chr_id] = [m.match for m in maternal_paf.chr_id] | |
println("Done extracting maternal chr ids.") | |
println() | |
println("Extracting paternal chr ids...") | |
paternal_paf[!, :chr_id] = extract_chr_id.(paternal_paf.ref_id) | |
paternal_paf[!, :chr_id] = [m.match for m in paternal_paf.chr_id] | |
println("Done extracting paternal chr ids.") | |
println() | |
println("Performing leftjoin on maternal and paternal paf files...") | |
pafs = leftjoin(maternal_paf, paternal_paf, on = [:query_id, :chr_id, :query_start, :query_stop], makeunique=true) | |
dropmissing!(pafs) | |
println("Done performing leftjoin on maternal and paternal paf files.") | |
paternal_paf=nothing | |
maternal_paf=nothing | |
println() | |
println("Reading read FASTA Index for preallocating dictionary size.") | |
nanopore_fasta_index = string(nanopore_fasta, ".fai") | |
total_records = length(FASTX.FASTA.Index(nanopore_fasta_index).lengths) | |
println("There are $total_records read FASTA records") | |
nanopore_dict = Dict{String, String}() | |
reader = open(FASTA.Reader, nanopore_fasta) | |
record = FASTA.Record() | |
counter = Atomic{Int}(0) | |
reader_lock = ReentrantLock() | |
nanopore_lock = ReentrantLock() | |
println() | |
println("Reading read FASTA records for adding to dictionary.") | |
Threads.@threads for i in 1:total_records | |
local_record = FASTA.Record() | |
lock(reader_lock) | |
read!(reader, local_record) | |
unlock(reader_lock) | |
lock(nanopore_lock) | |
nanopore_dict[identifier(local_record)] = sequence(local_record) | |
unlock(nanopore_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rRead $(counter[]) FASTA records.") | |
flush(stdout) | |
end | |
end | |
println() | |
println("Done reading read FASTA records for adding to dictionary.") | |
close(reader) | |
println() | |
println("Reading maternal FASTA Index for preallocating dictionary size.") | |
maternal_fasta_index = string(maternal_fasta, ".fai") | |
total_records_maternal = length(FASTX.FASTA.Index(maternal_fasta_index).lengths) | |
println("There are $total_records_maternal FASTA records") | |
maternal_dict = Dict{String, String}() | |
reader = open(FASTA.Reader, maternal_fasta) | |
record = FASTA.Record() | |
counter = Atomic{Int}(0) | |
reader_lock = ReentrantLock() | |
maternal_lock = ReentrantLock() | |
println() | |
println("Reading maternal FASTA records for adding to dictionary.") | |
Threads.@threads for i in 1:total_records_maternal | |
local_record = FASTA.Record() | |
lock(reader_lock) | |
read!(reader, local_record) | |
unlock(reader_lock) | |
lock(maternal_lock) | |
maternal_dict[identifier(local_record)] = sequence(local_record) | |
unlock(maternal_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rRead $(counter[]) FASTA records.") | |
flush(stdout) | |
end | |
end | |
println() | |
println("Done reading FASTA records for adding to dictionary.") | |
close(reader) | |
println() | |
println("Reading paternal FASTA Index for preallocating dictionary size.") | |
paternal_fasta_index = string(paternal_fasta, ".fai") | |
total_records_paternal = length(FASTX.FASTA.Index(paternal_fasta_index).lengths) | |
println("There are $total_records_paternal FASTA records") | |
paternal_dict = Dict{String, String}() | |
reader = open(FASTA.Reader, paternal_fasta) | |
record = FASTA.Record() | |
counter = Atomic{Int}(0) | |
reader_lock = ReentrantLock() | |
paternal_lock = ReentrantLock() | |
println() | |
println("Reading paternal FASTA records for adding to dictionary.") | |
Threads.@threads for i in 1:total_records_paternal | |
local_record = FASTA.Record() | |
lock(reader_lock) | |
read!(reader, local_record) | |
unlock(reader_lock) | |
lock(paternal_lock) | |
paternal_dict[identifier(local_record)] = sequence(local_record) | |
unlock(paternal_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rRead $(counter[]) FASTA records.") | |
flush(stdout) | |
end | |
end | |
println() | |
println("Done reading paternal FASTA records for adding to dictionary.") | |
close(reader) | |
switches = DataFrame(read_id=Vector{String}(undef, nrow(pafs)), | |
maternal_hapmers_matching=Vector{UInt64}(undef, nrow(pafs)), | |
paternal_hapmers_matching=Vector{UInt64}(undef, nrow(pafs))) | |
counter = Atomic{Int}(0) | |
Threads.@threads for i in 1:nrow(pafs) | |
switches_lock = ReentrantLock() | |
# nanopore SUP sequences | |
nanopore_start = Int(pafs.query_start[i]) + 1 | |
nanopore_stop = Int(pafs.query_stop[i]) | |
nanopore_seq = LongSequence{DNAAlphabet{2}}(nanopore_dict[pafs.query_id[i]][nanopore_start:nanopore_stop]) | |
if pafs.strand[i] == "-" | |
reverse_complement!(nanopore_seq) | |
end | |
# maternal sequences | |
maternal_start = Int(pafs.ref_start[i]) + 1 | |
maternal_stop = Int(pafs.ref_stop[i]) | |
maternal_seq = LongSequence{DNAAlphabet{2}}(maternal_dict[pafs.ref_id[i]][maternal_start:maternal_stop]) | |
# maternal sequences | |
paternal_start = Int(pafs.ref_start_1[i]) + 1 | |
paternal_stop = Int(pafs.ref_stop_1[i]) | |
paternal_seq = LongSequence{DNAAlphabet{2}}(paternal_dict[pafs.ref_id_1[i]][paternal_start:paternal_stop]) | |
# make an empty vector of kmer_length with dna alphabet 2 | |
nanopore_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}() | |
# add the k-mers to the vector | |
for i in EveryKmer(nanopore_seq, Val(kmer_length)) | |
push!(nanopore_kmers, i[2]) | |
end | |
# make an empty vector of kmer_length with dna alphabet 2 | |
maternal_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}() | |
# add the k-mers to the vector | |
for i in EveryKmer(maternal_seq, Val(kmer_length)) | |
push!(maternal_kmers, i[2]) | |
end | |
# make an empty vector of kmer_length with dna alphabet 2 | |
paternal_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}() | |
# add the k-mers to the vector | |
for i in EveryKmer(paternal_seq, Val(kmer_length)) | |
push!(paternal_kmers, i[2]) | |
end | |
# Find unique elements in each vector | |
maternal_hapmers = setdiff(maternal_kmers, paternal_kmers) | |
paternal_hapmers = setdiff(paternal_kmers, maternal_kmers) | |
nanopore_vs_paternal_hapmers = intersect(paternal_hapmers, nanopore_kmers) | |
nanopore_vs_maternal_hapmers = intersect(maternal_hapmers, nanopore_kmers) | |
lock(switches_lock) | |
switches[i, :] = [pafs.query_id[i], length(nanopore_vs_maternal_hapmers), length(nanopore_vs_paternal_hapmers)] | |
unlock(switches_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rProcessed $(counter[]) records for hapmer analysis.") | |
flush(stdout) | |
end | |
end | |
return switches | |
end | |
function main() | |
parsed_args = parse_commandline() | |
maternal_paf_file = parsed_args["maternal_paf_file"] | |
paternal_paf_file = parsed_args["paternal_paf_file"] | |
nanopore_fasta = parsed_args["nanopore_fasta"] | |
maternal_fasta = parsed_args["maternal_fasta"] | |
paternal_fasta = parsed_args["paternal_fasta"] | |
kmer_length = parsed_args["kmer_length"] | |
output_file = parsed_args["output_file"] | |
switches = phase_switches(maternal_paf_file, | |
paternal_paf_file, | |
nanopore_fasta, | |
maternal_fasta, | |
paternal_fasta, | |
kmer_length) | |
CSV.write(output_file, switches; delim='\t', newline='\n') | |
end | |
main() |
You can then use julia
again in the REPL if you want, to make histograms.
using CSV
using DataFrames
using Plots
using KernelDensity
# SUP data
# read in the hapmer matches
test = CSV.read("WGS_HG002_EZ1_10kb_SUP.kmer_matches.txt", DataFrame; delim="\t",header=true, types=[String,Int64,Int64])
# make a copy of the test DataFrame
test2 = test
# get the percent out of the total matching to mat haplotype
test2.maternal_hapmers_percent = test2.maternal_hapmers_matching ./ (test2.paternal_hapmers_matching .+ test2.maternal_hapmers_matching) .* 100
# get the percent out of the total matching to pat haplotype
test2.paternal_hapmers_percent = test2.paternal_hapmers_matching ./ (test2.paternal_hapmers_matching .+ test2.maternal_hapmers_matching) .* 100
# filter out NaN (0 / 0 ) values
test2 = filter(row -> all(x -> !(x isa Number && isnan(x)), row), test2)
# make a histogram
hist = histogram(abs.(test2.maternal_hapmers_percent - test2.paternal_hapmers_percent), bins=100, title="SUP", xlabel="Abs val of differ in percent matching HAPMERs between haplotypes", ylabel="Num Reads", legend=false)
# save as SVG image
savefig(hist, "SUP_abs_val_diff_percent_HAPMERs_between_haplotypes_hist.svg")
Below is such an image if you compare SUP to Herro reads in the same image.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can call the script , (ideally using multiple threads) like this:
This shows that read
0000020f-1fd7-473a-ab04-4c6b2ebbc5d1
has 220 matching maternal hapmersThis also shows that read
00000ba1-86dd-4169-b344-a075ba139113
has 45 matching paternal hapmersRead
00001537-a3be-49e7-9a60-d1b0f769204a
has no matching hapmers to either paternal or maternal haplotypesRead
00001ebf-9a15-4e4d-9cc7-bd3512db01c4
has 2 maternal and 163 paternal matching hapmers, its value below in the plot would be(|2-163|) / ((163 + 2) *100) = 97.57575757575758
every other read in the above output would have values of 100.0 except this read,00001ebf-9a15-4e4d-9cc7-bd3512db01c4