Skip to content

Instantly share code, notes, and snippets.

@TransGirlCodes
Last active May 30, 2020 15:30
Show Gist options
  • Save TransGirlCodes/4e06c5c4f4648c594fdb1a886cf5042d to your computer and use it in GitHub Desktop.
Save TransGirlCodes/4e06c5c4f4648c594fdb1a886cf5042d to your computer and use it in GitHub Desktop.
Benchmarking a 128 primitive type kmer vs an NTuple based kmer
using BioSequences, BenchmarkTools
# In BioSequences.jl kmers are called Mer, not Kmer so there is no name clash.
struct Kmer{A<:NucleicAcidAlphabet{2},K,N}
data::NTuple{N,UInt64}
end
const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N}
###
### Base Functions
###
@inline capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = div(64N, 2)
@inline n_unused(::Type{Kmer{A,K,N}}) where {A,K,N} = capacity(Kmer{A,K,N}) - K
@inline function checkmer(::Type{Kmer{A,K,N}}) where {A,K,N}
c = capacity(Kmer{A,K,N})
if !(1 ≤ K ≤ c)
throw(ArgumentError("K must be within 1..$c"))
end
end
@inline Base.eltype(::Type{Kmer{A,K,N}}) where {A,K,N} = eltype(A)
@inline Base.length(x::Kmer{A,K,N}) where {A,K,N} = K
@inline Base.summary(x::Kmer{DNAAlphabet{2},K,N}) where {K,N} = string("DNA ", K, "-mer")
@inline Base.summary(x::Kmer{RNAAlphabet{2},K,N}) where {K,N} = string("RNA ", K, "-mer")
function Base.typemin(::Type{Kmer{A,K,N}}) where {A,K,N}
checkmer(T)
return T(ntuple(i -> zero(UInt64), N))
end
function Base.typemax(::Type{Kmer{A,K,N}}) where {A,K,N}
checkmer(Kmer{A,K,N})
return Kmer{A,K,N}((typemax(UInt64) >> (64N - 2K), ntuple(i -> typemax(UInt64),N - 1)...))
end
function Base.rand(::Type{Kmer{A,K,N}}) where {A,K,N}
return Kmer{A,K,N}(ntuple(i -> rand(UInt64), N))
end
# Create a Mer from a sequence.
function (::Type{Kmer{A,K,N}})(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet{2},K,N}
seqlen = length(seq)
if seqlen != K
throw(ArgumentError("seq does not contain the correct number of nucleotides ($seqlen ≠ $K)"))
end
if seqlen > capacity(Kmer{A,K,N})
throw(ArgumentError("Cannot build a mer longer than $(capacity(Kmer{A,K,N}))bp long"))
end
# Construct the head.
bases_in_head = div(64 - (64N - 2K), 2)
head = zero(UInt64)
@inbounds for i in 1:bases_in_head
nt = convert(eltype(typeof(seq)), seq[i])
head = (head << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
end
# And the rest of the sequence
idx = Ref(bases_in_head + 1)
tail = ntuple(Val{N - 1}()) do i
Base.@_inline_meta
body = zero(UInt64)
@inbounds for i in 1:32
nt = convert(eltype(typeof(seq)), seq[idx[]])
body = (body << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
idx[] += 1
end
return body
end
return Kmer{A,K,N}((head, tail...))
end
function (::Type{BigMer{A,K}})(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet{2},K}
seqlen = length(seq)
if seqlen != K
throw(ArgumentError("seq does not contain the correct number of nucleotides ($seqlen ≠ $K)"))
end
if seqlen > BioSequences.capacity(BigMer{A,K})
throw(ArgumentError("Cannot build a mer longer than $(BioSequences.capacity(BigMer{A,K}))bp long"))
end
x = zero(BioSequences.encoded_data_type(BigMer{A,K}))
for c in seq
nt = convert(eltype(BigMer{A,K}), c)
x = (x << 2) | BioSequences.encoded_data_type(BigMer{A,K})(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
end
return BigMer{A,K}(x)
end
@inline function choptail(x::Kmer{A,K,N}) where {A,K,N}
ntuple(Val{N - 1}()) do i
Base.@_inline_meta
return @inbounds x.data[i]
end
end
@inline function gettail(x::Kmer{A,K,N}) where {A,K,N}
return @inbounds x.data[N]
end
@inline function tailput(x::Kmer{A,K,N}, nt::DNA) where {A,K,N}
bits = UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01])
tail = (gettail(x) & (typemax(UInt64) - UInt64(3))) | bits
return Kmer{A,K,N}((choptail(x)..., tail))
end
@inline function shiftleft(x::Kmer{A,K,N}) where {A,K,N}
carry = Ref(UInt64(0))
ntuple(Val{N}()) do i
Base.@_inline_meta
elem = x.data[(N - i) + 1]
oldcarry = carry[]
carry[] = elem >> 62
return ((elem << 2) | oldcarry)
end
end
@inline shiftright(x::BigDNAMer{K}) where {K} = BigDNAMer{K}(reinterpret(UInt128, x) >> 2)
@inline function shiftright(x::Kmer{A,K,N}) where {A,K,N}
head = @inbounds x.data[1] >> 2
tail = ntuple(Val{N - 1}()) do i
Base.@_inline_meta
j = i + 1
@inbounds begin
return (x.data[j] >> 2) | ((x.data[i] & UInt64(3)) << 62)
end
end
return Kmer{A,K,N}((head, tail...))
end
dnaseq = LongSequence{DNAAlphabet{2}}(randdnaseq(63))
@benchmark BigDNAMer{63}($dnaseq)
@benchmark DNAKmer{63,2}($dnaseq)
m = DNAKmer{63,2}(dnaseq)
oldm = BigDNAMer{63}(dnaseq)
@benchmark shiftright($m)
@benchmark shiftright($oldm)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment