Last active
May 28, 2024 10:24
-
-
Save jelber2/45225fa0937b59f72bbdddf8d694eb70 to your computer and use it in GitHub Desktop.
Using Nanopore Reads from HG002, estimate the phase switching with hapmers and readmers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get HG002 | |
# wget https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/hg002v1.0.1.fasta.gz | |
# gunzip hg002v1.0.1.fasta.gz | |
# | |
# Use seqtk to get maternal and paternal sequences | |
# https://github.com/lh3/seqtk | |
# seqtk seq -l0 hg002v1.0.1.fasta|paste - - |fgrep "MATERNAL" |tr '\t' '\n'|seqtk seq -l60 > hg002v1.0.1.maternal.fasta & | |
# seqtk seq -l0 hg002v1.0.1.fasta|paste - - |fgrep "PATERNAL" |tr '\t' '\n'|seqtk seq -l60 > hg002v1.0.1.paternal.fasta & | |
# | |
# use samtools to index haplotypes | |
# samtools index hg002v1.0.1.maternal.fasta | |
# samtools index hg002v1.0.1.paternal.fasta | |
# | |
# Use minimap2 and samtools to map un-corrected SUP reads to each haplotype separately | |
# minimap2 --eqx --secondary=no -t 248 -Y -c -ax lr:hq hg002v1.0.1.maternal.fasta WGS_HG002_EZ1_10kb_SUP_NEW.fastq.gz |samtools sort -@248 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.maternal.bam | |
# minimap2 --eqx --secondary=no -t 248 -Y -c -ax lr:hq hg002v1.0.1.paternal.fasta WGS_HG002_EZ1_10kb_SUP_NEW.fastq.gz |samtools sort -@248 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.bam | |
# | |
# Use samtools and minimap2's paftools.js to get sam2paf and retain | |
# only mapping quality Q60 and primary alignments of the following PAF columns in that order | |
# [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop] | |
# these are columns 1,3,4,5,6,8, and 9 in PAF file | |
# https://github.com/lh3/miniasm/blob/master/PAF.md | |
# samtools view -h -q60 -F 0x100 -@4 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.bam|paftools.js sam2paf - |cut -f 1,3-6,8-9 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.paf | |
# samtools view -h -q60 -F 0x100 -@4 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.bam|paftools.js sam2paf - |cut -f 1,3-6,8-9 > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.paf | |
# | |
# get fai index for SUP reads | |
# samtools fasta -@16 WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.bam > WGS_HG002_EZ1_10kb_SUP.fasta && \ | |
# samtools faidx WGS_HG002_EZ1_10kb_SUP.fasta | |
# | |
# make sure there is only occurrence of each read_id | |
# sort -uk1,1 --parallel=48 -S 300G WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.paf > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.paternal.fasta.cols1345689.unique.paf | |
# sort -uk1,1 --parallel=48 -S 300G WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.paf > WGS_HG002_EZ1_10kb_SUP_sam1.4_SoftClip_against_hg002v1.0.1.maternal.fasta.cols1345689.unique.paf | |
# | |
# output is a tab-separated file with columns: | |
# read_id number_matching_hapmers_from_maternal_haplotype number_matching_hapmers_from_paternal_haplotype | |
# | |
# I theorized that one should have mostly matching hapmers for a single haplotype if no phase_switching, but "the proof | |
# is in the pudding" | |
# script works, just not sure if the actual concept works | |
using BioSequences | |
using FASTX | |
using Kmers | |
using CSV | |
using DataFrames | |
using Base.Threads | |
using ArgParse | |
function parse_commandline() | |
s = ArgParseSettings() | |
@add_arg_table s begin | |
"--maternal_paf_file", "-m" | |
help = "path to maternal paf file" | |
required = true | |
"--paternal_paf_file", "-p" | |
help = "path to paternal paf file" | |
required = true | |
"--nanopore_fasta", "-n" | |
help = "path to nanopore fasta file (SUP.fasta)" | |
required = true | |
"--maternal_fasta", "-i" | |
help = "path to maternal fasta file (hg002v1.0.1.maternal.fasta)" | |
required = true | |
"--paternal_fasta", "-j" | |
help = "path to paternal fasta file (hg002v1.0.1.paternal.fasta)" | |
required = true | |
"--kmer_length", "-k" | |
arg_type = Int64 | |
help = "for example 15" | |
required = true | |
"--output_file", "-o" | |
help = "Output file name (WGS_HG002_EZ1_10kb_SUP.kmer-matches.txt)" | |
required = true | |
end | |
return parse_args(s) | |
end | |
function phase_switches(maternal_paf_file::String, | |
paternal_paf_file::String, | |
nanopore_fasta::String, | |
maternal_fasta::String, | |
paternal_fasta::String, | |
kmer_length::Int64) | |
println() | |
println("Reading maternal paf file...") | |
maternal_paf = CSV.read(maternal_paf_file, | |
DataFrame; | |
delim='\t', | |
header=false, | |
types=[String, UInt32, UInt32, String, String, UInt32, UInt32]) | |
rename!(maternal_paf, [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop]) | |
println("Done reading maternal paf file.") | |
println() | |
println("Reading paternal paf file...") | |
paternal_paf = CSV.read(paternal_paf_file, | |
DataFrame; | |
delim='\t', | |
header=false, | |
types=[String, UInt32, UInt32, String, String, UInt32, UInt32]) | |
rename!(paternal_paf, [:query_id, :query_start, :query_stop, :strand, :ref_id, :ref_start, :ref_stop]) | |
println("Done reading paternal paf file.") | |
# Function to extract the chromosome part using regular expressions | |
function extract_chr_id(id::String) | |
return match.(r"chr\w+_", id) | |
end | |
println() | |
println("Extracting maternal chr ids...") | |
# Apply the function to create a new column for matching | |
maternal_paf[!, :chr_id] = extract_chr_id.(maternal_paf.ref_id) | |
maternal_paf[!, :chr_id] = [m.match for m in maternal_paf.chr_id] | |
println("Done extracting maternal chr ids.") | |
println() | |
println("Extracting paternal chr ids...") | |
paternal_paf[!, :chr_id] = extract_chr_id.(paternal_paf.ref_id) | |
paternal_paf[!, :chr_id] = [m.match for m in paternal_paf.chr_id] | |
println("Done extracting paternal chr ids.") | |
println() | |
println("Performing leftjoin on maternal and paternal paf files...") | |
pafs = leftjoin(maternal_paf, paternal_paf, on = [:query_id, :chr_id, :query_start, :query_stop], makeunique=true) | |
dropmissing!(pafs) | |
println("Done performing leftjoin on maternal and paternal paf files.") | |
paternal_paf=nothing | |
maternal_paf=nothing | |
println() | |
println("Reading read FASTA Index for preallocating dictionary size.") | |
nanopore_fasta_index = string(nanopore_fasta, ".fai") | |
total_records = length(FASTX.FASTA.Index(nanopore_fasta_index).lengths) | |
println("There are $total_records read FASTA records") | |
nanopore_dict = Dict{String, String}() | |
reader = open(FASTA.Reader, nanopore_fasta) | |
record = FASTA.Record() | |
counter = Atomic{Int}(0) | |
reader_lock = ReentrantLock() | |
nanopore_lock = ReentrantLock() | |
println() | |
println("Reading read FASTA records for adding to dictionary.") | |
Threads.@threads for i in 1:total_records | |
local_record = FASTA.Record() | |
lock(reader_lock) | |
read!(reader, local_record) | |
unlock(reader_lock) | |
lock(nanopore_lock) | |
nanopore_dict[identifier(local_record)] = sequence(local_record) | |
unlock(nanopore_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rRead $(counter[]) FASTA records.") | |
flush(stdout) | |
end | |
end | |
println() | |
println("Done reading read FASTA records for adding to dictionary.") | |
close(reader) | |
println() | |
println("Reading maternal FASTA Index for preallocating dictionary size.") | |
maternal_fasta_index = string(maternal_fasta, ".fai") | |
total_records_maternal = length(FASTX.FASTA.Index(maternal_fasta_index).lengths) | |
println("There are $total_records_maternal FASTA records") | |
maternal_dict = Dict{String, String}() | |
reader = open(FASTA.Reader, maternal_fasta) | |
record = FASTA.Record() | |
counter = Atomic{Int}(0) | |
reader_lock = ReentrantLock() | |
maternal_lock = ReentrantLock() | |
println() | |
println("Reading maternal FASTA records for adding to dictionary.") | |
Threads.@threads for i in 1:total_records_maternal | |
local_record = FASTA.Record() | |
lock(reader_lock) | |
read!(reader, local_record) | |
unlock(reader_lock) | |
lock(maternal_lock) | |
maternal_dict[identifier(local_record)] = sequence(local_record) | |
unlock(maternal_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rRead $(counter[]) FASTA records.") | |
flush(stdout) | |
end | |
end | |
println() | |
println("Done reading FASTA records for adding to dictionary.") | |
close(reader) | |
println() | |
println("Reading paternal FASTA Index for preallocating dictionary size.") | |
paternal_fasta_index = string(paternal_fasta, ".fai") | |
total_records_paternal = length(FASTX.FASTA.Index(paternal_fasta_index).lengths) | |
println("There are $total_records_paternal FASTA records") | |
paternal_dict = Dict{String, String}() | |
reader = open(FASTA.Reader, paternal_fasta) | |
record = FASTA.Record() | |
counter = Atomic{Int}(0) | |
reader_lock = ReentrantLock() | |
paternal_lock = ReentrantLock() | |
println() | |
println("Reading paternal FASTA records for adding to dictionary.") | |
Threads.@threads for i in 1:total_records_paternal | |
local_record = FASTA.Record() | |
lock(reader_lock) | |
read!(reader, local_record) | |
unlock(reader_lock) | |
lock(paternal_lock) | |
paternal_dict[identifier(local_record)] = sequence(local_record) | |
unlock(paternal_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rRead $(counter[]) FASTA records.") | |
flush(stdout) | |
end | |
end | |
println() | |
println("Done reading paternal FASTA records for adding to dictionary.") | |
close(reader) | |
switches = DataFrame(read_id=Vector{String}(undef, nrow(pafs)), | |
maternal_hapmers_matching=Vector{UInt64}(undef, nrow(pafs)), | |
paternal_hapmers_matching=Vector{UInt64}(undef, nrow(pafs))) | |
counter = Atomic{Int}(0) | |
Threads.@threads for i in 1:nrow(pafs) | |
switches_lock = ReentrantLock() | |
# nanopore SUP sequences | |
nanopore_start = Int(pafs.query_start[i]) + 1 | |
nanopore_stop = Int(pafs.query_stop[i]) | |
nanopore_seq = LongSequence{DNAAlphabet{2}}(nanopore_dict[pafs.query_id[i]][nanopore_start:nanopore_stop]) | |
if pafs.strand[i] == "-" | |
reverse_complement!(nanopore_seq) | |
end | |
# maternal sequences | |
maternal_start = Int(pafs.ref_start[i]) + 1 | |
maternal_stop = Int(pafs.ref_stop[i]) | |
maternal_seq = LongSequence{DNAAlphabet{2}}(maternal_dict[pafs.ref_id[i]][maternal_start:maternal_stop]) | |
# maternal sequences | |
paternal_start = Int(pafs.ref_start_1[i]) + 1 | |
paternal_stop = Int(pafs.ref_stop_1[i]) | |
paternal_seq = LongSequence{DNAAlphabet{2}}(paternal_dict[pafs.ref_id_1[i]][paternal_start:paternal_stop]) | |
# make an empty vector of kmer_length with dna alphabet 2 | |
nanopore_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}() | |
# add the k-mers to the vector | |
for i in EveryKmer(nanopore_seq, Val(kmer_length)) | |
push!(nanopore_kmers, i[2]) | |
end | |
# make an empty vector of kmer_length with dna alphabet 2 | |
maternal_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}() | |
# add the k-mers to the vector | |
for i in EveryKmer(maternal_seq, Val(kmer_length)) | |
push!(maternal_kmers, i[2]) | |
end | |
# make an empty vector of kmer_length with dna alphabet 2 | |
paternal_kmers = Vector{Kmer{DNAAlphabet{2}, kmer_length, 1}}() | |
# add the k-mers to the vector | |
for i in EveryKmer(paternal_seq, Val(kmer_length)) | |
push!(paternal_kmers, i[2]) | |
end | |
# Find unique elements in each vector | |
maternal_hapmers = setdiff(maternal_kmers, paternal_kmers) | |
paternal_hapmers = setdiff(paternal_kmers, maternal_kmers) | |
nanopore_vs_paternal_hapmers = intersect(paternal_hapmers, nanopore_kmers) | |
nanopore_vs_maternal_hapmers = intersect(maternal_hapmers, nanopore_kmers) | |
lock(switches_lock) | |
switches[i, :] = [pafs.query_id[i], length(nanopore_vs_maternal_hapmers), length(nanopore_vs_paternal_hapmers)] | |
unlock(switches_lock) | |
atomic_add!(counter, 1) | |
if threadid() == 1 | |
print("\rProcessed $(counter[]) records for hapmer analysis.") | |
flush(stdout) | |
end | |
end | |
return switches | |
end | |
function main() | |
parsed_args = parse_commandline() | |
maternal_paf_file = parsed_args["maternal_paf_file"] | |
paternal_paf_file = parsed_args["paternal_paf_file"] | |
nanopore_fasta = parsed_args["nanopore_fasta"] | |
maternal_fasta = parsed_args["maternal_fasta"] | |
paternal_fasta = parsed_args["paternal_fasta"] | |
kmer_length = parsed_args["kmer_length"] | |
output_file = parsed_args["output_file"] | |
switches = phase_switches(maternal_paf_file, | |
paternal_paf_file, | |
nanopore_fasta, | |
maternal_fasta, | |
paternal_fasta, | |
kmer_length) | |
CSV.write(output_file, switches; delim='\t', newline='\n') | |
end | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Full histograms
Cut-off at 100,000 reads