Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jelber2/45225fa0937b59f72bbdddf8d694eb70 to your computer and use it in GitHub Desktop.
Save jelber2/45225fa0937b59f72bbdddf8d694eb70 to your computer and use it in GitHub Desktop.
Using Nanopore Reads from HG002, estimate the phase switching with hapmers and readmers
# Get HG002
# wget https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/hg002v1.0.1.fasta.gz
# gunzip hg002v1.0.1.fasta.gz
#
# Use seqtk to get maternal and paternal sequences
# https://github.com/lh3/seqtk
# seqtk seq -l0 hg002v1.0.1.fasta|paste - - |fgrep "MATERNAL" |tr '\t' '\n'|seqtk seq -l60 > hg002v1.0.1.maternal.fasta &
# seqtk seq -l0 hg002v1.0.1.fasta|paste - - |fgrep "PATERNAL" |tr '\t' '\n'|seqtk seq -l60 > hg002v1.0.1.paternal.fasta &
#
# use samtools to index haplotypes
# samtools index hg002v1.0.1.maternal.fasta
# samtools index hg002v1.0.1.paternal.fasta
#
# Use minimap2 and samtools to map un-corrected SUP reads to each haplotype separately
# minimap2 --eqx --secondary=no -t 248 -Y -c -ax lr:hq hg002v1.0.1.maternal.fasta WGS_HG002_EZ1_10kb_SUP_NEW.fastq.gz |samtools sort -@248 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.maternal.bam
# minimap2 --eqx --secondary=no -t 248 -Y -c -ax lr:hq hg002v1.0.1.paternal.fasta WGS_HG002_EZ1_10kb_SUP_NEW.fastq.gz |samtools sort -@248 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.bam
#
# Use samtools and minimap2's paftools.js to get sam2paf and retain
# only mapping quality Q60 and primary alignments of the following PAF columns in that order
# [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop]
# these are columns 1,3,4,5,6,8, and 9 in PAF file
# https://github.com/lh3/miniasm/blob/master/PAF.md
# samtools view -h -q60 -F 0x100 -@4 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.bam|paftools.js sam2paf - |cut -f 1,3-6,8-9 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.paf
# samtools view -h -q60 -F 0x100 -@4 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.bam|paftools.js sam2paf - |cut -f 1,3-6,8-9 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.paf
#
# get fai index for SUP reads
# samtools fasta -@16 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.bam > WGS_HG002_EZ1_10kb_SUP.fasta && \
# samtools faidx WGS_HG002_EZ1_10kb_SUP.fasta
#
# make sure there is only occurrence of each read_id
# sort -uk1,1 --parallel=48 -S 300G WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.paf > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.unique.paf
# sort -uk1,1 --parallel=48 -S 300G WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.paf > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.unique.paf
#
# output is a tab-separated file with columns:
# read_id number_matching_hapmers_from_maternal_haplotype number_matching_hapmers_from_paternal_haplotype
#
# I theorized that one should have mostly matching hapmers for a single haplotype if no phase_switching, but "the proof
# is in the pudding"
# script works, just not sure if the actual concept works
using BioSequences
using FASTX
using Kmers
using CSV
using DataFrames
using Base.Threads
using ArgParse
function parse_commandline()
s = ArgParseSettings()
@add_arg_table s begin
"--maternal_paf_file", "-m"
help = "path to maternal paf file"
required = true
"--paternal_paf_file", "-p"
help = "path to paternal paf file"
required = true
"--nanopore_fasta", "-n"
help = "path to nanopore fasta file (SUP.fasta)"
required = true
"--maternal_fasta", "-i"
help = "path to maternal fasta file (hg002v1.0.1.maternal.fasta)"
required = true
"--paternal_fasta", "-j"
help = "path to paternal fasta file (hg002v1.0.1.paternal.fasta)"
required = true
"--kmer_length", "-k"
arg_type = Int64
help = "for example 15"
required = true
"--output_file", "-o"
help = "Output file name (WGS_HG002_EZ1_10kb_SUP.kmer-matches.txt)"
required = true
end
return parse_args(s)
end
function phase_switches(maternal_paf_file::String,
paternal_paf_file::String,
nanopore_fasta::String,
maternal_fasta::String,
paternal_fasta::String,
kmer_length::Int64)
println()
println("Reading maternal paf file...")
maternal_paf = CSV.read(maternal_paf_file,
DataFrame;
delim='\t',
header=false,
types=[String, UInt32, UInt32, String, String, UInt32, UInt32])
rename!(maternal_paf, [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop])
println("Done reading maternal paf file.")
println()
println("Reading paternal paf file...")
paternal_paf = CSV.read(paternal_paf_file,
DataFrame;
delim='\t',
header=false,
types=[String, UInt32, UInt32, String, String, UInt32, UInt32])
rename!(paternal_paf, [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop])
println("Done reading paternal paf file.")
# Function to extract the chromosome part using regular expressions
function extract_chr_id(id::String)
return match.(r"chr\w+_", id)
end
println()
println("Extracting maternal chr ids...")
# Apply the function to create a new column for matching
maternal_paf[!, :chr_id] = extract_chr_id.(maternal_paf.ref_id)
maternal_paf[!, :chr_id] = [m.match for m in maternal_paf.chr_id]
println("Done extracting maternal chr ids.")
println()
println("Extracting paternal chr ids...")
paternal_paf[!, :chr_id] = extract_chr_id.(paternal_paf.ref_id)
paternal_paf[!, :chr_id] = [m.match for m in paternal_paf.chr_id]
println("Done extracting paternal chr ids.")
println()
println("Performing leftjoin on maternal and paternal paf files...")
pafs = leftjoin(maternal_paf, paternal_paf, on = [:query_id, :chr_id, :query_start, :query_stop], makeunique=true)
dropmissing!(pafs)
println("Done performing leftjoin on maternal and paternal paf files.")
paternal_paf=nothing
maternal_paf=nothing
println()
println("Reading read FASTA Index for preallocating dictionary size.")
nanopore_fasta_index = string(nanopore_fasta, ".fai")
total_records = length(FASTX.FASTA.Index(nanopore_fasta_index).lengths)
println("There are $total_records read FASTA records")
nanopore_dict = Dict{String, String}()
reader = open(FASTA.Reader, nanopore_fasta)
record = FASTA.Record()
counter = Atomic{Int}(0)
reader_lock = ReentrantLock()
nanopore_lock = ReentrantLock()
println()
println("Reading read FASTA records for adding to dictionary.")
Threads.@threads for i in 1:total_records
local_record = FASTA.Record()
lock(reader_lock)
read!(reader, local_record)
unlock(reader_lock)
lock(nanopore_lock)
nanopore_dict[identifier(local_record)] = sequence(local_record)
unlock(nanopore_lock)
atomic_add!(counter, 1)
if threadid() == 1
print("\rRead $(counter[]) FASTA records.")
flush(stdout)
end
end
println()
println("Done reading read FASTA records for adding to dictionary.")
close(reader)
println()
println("Reading maternal FASTA Index for preallocating dictionary size.")
maternal_fasta_index = string(maternal_fasta, ".fai")
total_records_maternal = length(FASTX.FASTA.Index(maternal_fasta_index).lengths)
println("There are $total_records_maternal FASTA records")
maternal_dict = Dict{String, String}()
reader = open(FASTA.Reader, maternal_fasta)
record = FASTA.Record()
counter = Atomic{Int}(0)
reader_lock = ReentrantLock()
maternal_lock = ReentrantLock()
println()
println("Reading maternal FASTA records for adding to dictionary.")
Threads.@threads for i in 1:total_records_maternal
local_record = FASTA.Record()
lock(reader_lock)
read!(reader, local_record)
unlock(reader_lock)
lock(maternal_lock)
maternal_dict[identifier(local_record)] = sequence(local_record)
unlock(maternal_lock)
atomic_add!(counter, 1)
if threadid() == 1
print("\rRead $(counter[]) FASTA records.")
flush(stdout)
end
end
println()
println("Done reading FASTA records for adding to dictionary.")
close(reader)
println()
println("Reading paternal FASTA Index for preallocating dictionary size.")
paternal_fasta_index = string(paternal_fasta, ".fai")
total_records_paternal = length(FASTX.FASTA.Index(paternal_fasta_index).lengths)
println("There are $total_records_paternal FASTA records")
paternal_dict = Dict{String, String}()
reader = open(FASTA.Reader, paternal_fasta)
record = FASTA.Record()
counter = Atomic{Int}(0)
reader_lock = ReentrantLock()
paternal_lock = ReentrantLock()
println()
println("Reading paternal FASTA records for adding to dictionary.")
Threads.@threads for i in 1:total_records_paternal
local_record = FASTA.Record()
lock(reader_lock)
read!(reader, local_record)
unlock(reader_lock)
lock(paternal_lock)
paternal_dict[identifier(local_record)] = sequence(local_record)
unlock(paternal_lock)
atomic_add!(counter, 1)
if threadid() == 1
print("\rRead $(counter[]) FASTA records.")
flush(stdout)
end
end
println()
println("Done reading paternal FASTA records for adding to dictionary.")
close(reader)
switches = DataFrame(read_id=Vector{String}(undef, nrow(pafs)),
maternal_hapmers_matching=Vector{UInt64}(undef, nrow(pafs)),
paternal_hapmers_matching=Vector{UInt64}(undef, nrow(pafs)))
counter = Atomic{Int}(0)
Threads.@threads for i in 1:nrow(pafs)
switches_lock = ReentrantLock()
# nanopore SUP sequences
nanopore_start = Int(pafs.query_start[i]) + 1
nanopore_stop = Int(pafs.query_stop[i])
nanopore_seq = LongSequence{DNAAlphabet{2}}(nanopore_dict[pafs.query_id[i]][nanopore_start:nanopore_stop])
if pafs.strand[i] == "-"
reverse_complement!(nanopore_seq)
end
# maternal sequences
maternal_start = Int(pafs.ref_start[i]) + 1
maternal_stop = Int(pafs.ref_stop[i])
maternal_seq = LongSequence{DNAAlphabet{2}}(maternal_dict[pafs.ref_id[i]][maternal_start:maternal_stop])
# maternal sequences
paternal_start = Int(pafs.ref_start_1[i]) + 1
paternal_stop = Int(pafs.ref_stop_1[i])
paternal_seq = LongSequence{DNAAlphabet{2}}(paternal_dict[pafs.ref_id_1[i]][paternal_start:paternal_stop])
# make an empty vector of kmer_length with dna alphabet 2
nanopore_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}()
# add the k-mers to the vector
for i in EveryKmer(nanopore_seq, Val(kmer_length))
push!(nanopore_kmers, i[2])
end
# make an empty vector of kmer_length with dna alphabet 2
maternal_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}()
# add the k-mers to the vector
for i in EveryKmer(maternal_seq, Val(kmer_length))
push!(maternal_kmers, i[2])
end
# make an empty vector of kmer_length with dna alphabet 2
paternal_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}()
# add the k-mers to the vector
for i in EveryKmer(paternal_seq, Val(kmer_length))
push!(paternal_kmers, i[2])
end
# Find unique elements in each vector
maternal_hapmers = setdiff(maternal_kmers, paternal_kmers)
paternal_hapmers = setdiff(paternal_kmers, maternal_kmers)
nanopore_vs_paternal_hapmers = intersect(paternal_hapmers, nanopore_kmers)
nanopore_vs_maternal_hapmers = intersect(maternal_hapmers, nanopore_kmers)
lock(switches_lock)
switches[i, :] = [pafs.query_id[i], length(nanopore_vs_maternal_hapmers), length(nanopore_vs_paternal_hapmers)]
unlock(switches_lock)
atomic_add!(counter, 1)
if threadid() == 1
print("\rProcessed $(counter[]) records for hapmer analysis.")
flush(stdout)
end
end
return switches
end
function main()
parsed_args = parse_commandline()
maternal_paf_file = parsed_args["maternal_paf_file"]
paternal_paf_file = parsed_args["paternal_paf_file"]
nanopore_fasta = parsed_args["nanopore_fasta"]
maternal_fasta = parsed_args["maternal_fasta"]
paternal_fasta = parsed_args["paternal_fasta"]
kmer_length = parsed_args["kmer_length"]
output_file = parsed_args["output_file"]
switches = phase_switches(maternal_paf_file,
paternal_paf_file,
nanopore_fasta,
maternal_fasta,
paternal_fasta,
kmer_length)
CSV.write(output_file, switches; delim='\t', newline='\n')
end
main()
@jelber2
Copy link
Author

jelber2 commented May 28, 2024

abs_val_diff_percent_HAPMERs_between_haplotypes
Full histograms

abs_val_diff_percent_HAPMERs_between_haplotypes2
Cut-off at 100,000 reads

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment