Last active
May 30, 2020 15:30
-
-
Save TransGirlCodes/4e06c5c4f4648c594fdb1a886cf5042d to your computer and use it in GitHub Desktop.
Benchmarking a 128 primitive type kmer vs an NTuple based kmer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BioSequences, BenchmarkTools | |
# In BioSequences.jl kmers are called Mer, not Kmer so there is no name clash. | |
struct Kmer{A<:NucleicAcidAlphabet{2},K,N} | |
data::NTuple{N,UInt64} | |
end | |
const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N} | |
### | |
### Base Functions | |
### | |
@inline capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = div(64N, 2) | |
@inline n_unused(::Type{Kmer{A,K,N}}) where {A,K,N} = capacity(Kmer{A,K,N}) - K | |
@inline function checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} | |
c = capacity(Kmer{A,K,N}) | |
if !(1 ≤ K ≤ c) | |
throw(ArgumentError("K must be within 1..$c")) | |
end | |
end | |
@inline Base.eltype(::Type{Kmer{A,K,N}}) where {A,K,N} = eltype(A) | |
@inline Base.length(x::Kmer{A,K,N}) where {A,K,N} = K | |
@inline Base.summary(x::Kmer{DNAAlphabet{2},K,N}) where {K,N} = string("DNA ", K, "-mer") | |
@inline Base.summary(x::Kmer{RNAAlphabet{2},K,N}) where {K,N} = string("RNA ", K, "-mer") | |
function Base.typemin(::Type{Kmer{A,K,N}}) where {A,K,N} | |
checkmer(T) | |
return T(ntuple(i -> zero(UInt64), N)) | |
end | |
function Base.typemax(::Type{Kmer{A,K,N}}) where {A,K,N} | |
checkmer(Kmer{A,K,N}) | |
return Kmer{A,K,N}((typemax(UInt64) >> (64N - 2K), ntuple(i -> typemax(UInt64),N - 1)...)) | |
end | |
function Base.rand(::Type{Kmer{A,K,N}}) where {A,K,N} | |
return Kmer{A,K,N}(ntuple(i -> rand(UInt64), N)) | |
end | |
# Create a Mer from a sequence. | |
function (::Type{Kmer{A,K,N}})(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet{2},K,N} | |
seqlen = length(seq) | |
if seqlen != K | |
throw(ArgumentError("seq does not contain the correct number of nucleotides ($seqlen ≠ $K)")) | |
end | |
if seqlen > capacity(Kmer{A,K,N}) | |
throw(ArgumentError("Cannot build a mer longer than $(capacity(Kmer{A,K,N}))bp long")) | |
end | |
# Construct the head. | |
bases_in_head = div(64 - (64N - 2K), 2) | |
head = zero(UInt64) | |
@inbounds for i in 1:bases_in_head | |
nt = convert(eltype(typeof(seq)), seq[i]) | |
head = (head << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) | |
end | |
# And the rest of the sequence | |
idx = Ref(bases_in_head + 1) | |
tail = ntuple(Val{N - 1}()) do i | |
Base.@_inline_meta | |
body = zero(UInt64) | |
@inbounds for i in 1:32 | |
nt = convert(eltype(typeof(seq)), seq[idx[]]) | |
body = (body << 2) | UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) | |
idx[] += 1 | |
end | |
return body | |
end | |
return Kmer{A,K,N}((head, tail...)) | |
end | |
function (::Type{BigMer{A,K}})(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet{2},K} | |
seqlen = length(seq) | |
if seqlen != K | |
throw(ArgumentError("seq does not contain the correct number of nucleotides ($seqlen ≠ $K)")) | |
end | |
if seqlen > BioSequences.capacity(BigMer{A,K}) | |
throw(ArgumentError("Cannot build a mer longer than $(BioSequences.capacity(BigMer{A,K}))bp long")) | |
end | |
x = zero(BioSequences.encoded_data_type(BigMer{A,K})) | |
for c in seq | |
nt = convert(eltype(BigMer{A,K}), c) | |
x = (x << 2) | BioSequences.encoded_data_type(BigMer{A,K})(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) | |
end | |
return BigMer{A,K}(x) | |
end | |
@inline function choptail(x::Kmer{A,K,N}) where {A,K,N} | |
ntuple(Val{N - 1}()) do i | |
Base.@_inline_meta | |
return @inbounds x.data[i] | |
end | |
end | |
@inline function gettail(x::Kmer{A,K,N}) where {A,K,N} | |
return @inbounds x.data[N] | |
end | |
@inline function tailput(x::Kmer{A,K,N}, nt::DNA) where {A,K,N} | |
bits = UInt64(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) | |
tail = (gettail(x) & (typemax(UInt64) - UInt64(3))) | bits | |
return Kmer{A,K,N}((choptail(x)..., tail)) | |
end | |
@inline function shiftleft(x::Kmer{A,K,N}) where {A,K,N} | |
carry = Ref(UInt64(0)) | |
ntuple(Val{N}()) do i | |
Base.@_inline_meta | |
elem = x.data[(N - i) + 1] | |
oldcarry = carry[] | |
carry[] = elem >> 62 | |
return ((elem << 2) | oldcarry) | |
end | |
end | |
@inline shiftright(x::BigDNAMer{K}) where {K} = BigDNAMer{K}(reinterpret(UInt128, x) >> 2) | |
@inline function shiftright(x::Kmer{A,K,N}) where {A,K,N} | |
head = @inbounds x.data[1] >> 2 | |
tail = ntuple(Val{N - 1}()) do i | |
Base.@_inline_meta | |
j = i + 1 | |
@inbounds begin | |
return (x.data[j] >> 2) | ((x.data[i] & UInt64(3)) << 62) | |
end | |
end | |
return Kmer{A,K,N}((head, tail...)) | |
end | |
dnaseq = LongSequence{DNAAlphabet{2}}(randdnaseq(63)) | |
@benchmark BigDNAMer{63}($dnaseq) | |
@benchmark DNAKmer{63,2}($dnaseq) | |
m = DNAKmer{63,2}(dnaseq) | |
oldm = BigDNAMer{63}(dnaseq) | |
@benchmark shiftright($m) | |
@benchmark shiftright($oldm) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment