Skip to content

Instantly share code, notes, and snippets.

@bicycle1885
Created March 16, 2017 06:21
Show Gist options
  • Save bicycle1885/23f15ac022cff46cb454c9be1a585987 to your computer and use it in GitHub Desktop.
Save bicycle1885/23f15ac022cff46cb454c9be1a585987 to your computer and use it in GitHub Desktop.
K-mer MinHash
using Bio.Seq
import DataStructures: SortedSet
immutable MinHashSketch
sketch::Vector{UInt64}
kmersize::Int
function MinHashSketch(sketch::Vector, kmersize::Int)
length(sketch) > 0 || error("Sketch cannot be empty")
kmersize > 0 || error("Kmersize must be greater than 0")
new(sketch, kmersize)
end
end
function revcomphash(kmer::Kmer)
k = min(kmer, reverse_complement(kmer))
return hash(k)
end
function kmerminhash(seq::BioSequence, kmerset, kmerhashes::Vector{UInt64}, k::Int, s::Int)
typeof(kmerset) <: Set || typeof(kmerset) <: SortedSet || error("Kmerset must be a `Set` or `SortedSet`")
length(kmerhashes) <= s || error("Kmerhashes cannot be larger than the set size")
for kmer in each(DNAKmer{k}, seq)
if length(kmerhashes) == 0
if length(kmerset) < s
push!(kmerset, revcomphash(kmer[2]))
elseif length(kmerset) == s
kmerset = SortedSet(kmerset)
for i in kmerset
push!(kmerhashes, pop!(kmerset))
end
end
end
if length(kmerhashes) == s
h = revcomphash(kmer[2])
if h < kmerhashes[end]
i = searchsortedlast(kmerhashes, h)
if i == 0 && h != kmerhashes[1]
pop!(kmerhashes)
unshift!(kmerhashes, h)
elseif h != kmerhashes[i]
pop!(kmerhashes)
insert!(kmerhashes, i+1, h)
end
end
end
end
return (kmerset, kmerhashes)
end
function minhash(seq::BioSequence, k::Int, s::Int)
kmerset = Set{UInt64}()
kmerhashes = Vector{UInt64}()
kmerset, kmerhashes = kmerminhash(seq, kmerset, kmerhashes, k, s)
length(kmerhashes) == s || error("Sketch size is too large for the given kmer size")
return MinHashSketch(kmerhashes, k)
end
function mykmerminhash{k}(::Type{DNAKmer{k}}, seq::BioSequence, s::Int)
# generate first `s` kmers
kmerhashes = UInt64[]
iter = each(DNAKmer{k}, seq)
state = start(iter)
while length(kmerhashes) < s && !done(iter, state)
(_, kmer), state = next(iter, state)
h = revcomphash(kmer)
if h kmerhashes
push!(kmerhashes, h)
end
end
if length(kmerhashes) < s
error("failed to generate enough hashes")
end
sort!(kmerhashes)
# scan `seq` to make a minhash
while !done(iter, state)
(_, kmer), state = next(iter, state)
h = revcomphash(kmer)
if h < kmerhashes[end]
i = searchsortedlast(kmerhashes, h)
if i == 0 && h != kmerhashes[1]
pop!(kmerhashes)
unshift!(kmerhashes, h)
elseif h != kmerhashes[i]
pop!(kmerhashes)
insert!(kmerhashes, i+1, h)
end
end
end
return kmerhashes
end
function myminhash(seq::BioSequence, k::Int, s::Int)
kmerhashes = mykmerminhash(DNAKmer{k}, seq, s)
return MinHashSketch(kmerhashes, k)
end
seq = dna"""
GTACGACGGAGTGTTATAAGATGGGAAATCGGATACCAGATGAAATTGTGGATCAGGTGCAAAAGTCGGC
AGATATCGTTGAAGTCATAGGTGATTATGTTCAATTAAAGAAGCAAGGCCGAAACTACTTTGGACTCTGT
CCTTTTCATGGAGAAAGCACACCTTCGTTTTCCGTATCGCCCGACAAACAGATTTTTCATTGCTTTGGCT
GCGGAGCGGGCGGCAATGTTTTCTCTTTTTTAAGGCAGATGGAAGGCTATTCTTTTGCCGAGTCGGTTTC
TCACCTTGCTGACAAATACCAAATTGATTTTCCAGATGATATAACAGTCCATTCCGGAGCCCGGCCAGAG
TCTTCTGGAGAACAAAAAATGGCTGAGGCACATGAGCTCCTGAAGAAATTTTACCATCATTTGTTAATAA
ATACAAAAGAAGGTCAAGAGGCACTGGATTATCTGCTTTCTAGGGGCTTTACGAAAGAGCTGATTAATGA
ATTTCAGATTGGCTATGCTCTTGATTCTTGGGACTTTATCACGAAATTCCTTGTAAAGAGGGGATTTAGT
GAGGCGCAAATGGAAAAAGCGGGTCTCCTGATCAGACGCGAAGACGGAAGCGGATATTTCGACCGCTTCA
GAAACCGTGTCATGTTTCCGATCCATGATCATCACGGGGCTGTTGTTGCTTTCTCAGGCAGGGCTCTTGG
CAGCCAGCAGCCTAAGTATATGAACAGTCCTGAAACCCCGCTCTTTCATAAAAGCAAACTGCTTTACAAT
TTTTATAAGGCCCGCCTTCATATCAGAAAGCAGGAAAGAGCAGTCTTATTTGAAGGGTTTGCTGATGTCT
ATACGGCCGTAAGCTCGGATGTAAAGGAAAGCATAGCCACGATGGGAACGTCTCTTACAGATGATCATGT
CAAGATCCTGAGAAGAAACGTCGAAGAAATCATTCTTTGCTATGACTCTGATAAAGCCGGTTATGAAGCC
ACCTTAAAAGCTTCGGAGCTTCTGCAAAAAAAAGGCTGCAAAGTCAGAGTTGCAATGATTCCTGACGGAT
TGGACCCTGATGATTACATCAAAAAATTCGGCGGGGAAAAATTTAAAAACGACATTATTGACGCAAGTGT
CACCGTAATGGCGTTCAAAATGCAATATTTCCGAAAAGGAAAGAACCTGTCCGATGAAGGCGACCGCCTA
GCTTACATTAAAGACGTACTGAAAGAAATCAGCACGCTTTCAGGGTCTCTAGAGCAGGAAGTCTATGTAA
AGCAGCTTGCTTCAGAGTTTTCGCTTTCACAGGAGTCTTTAACTGAGCAGCTGTCTGTTTTCAGCAAGCA
AAACAAACCTGCTGACAATAGCGGTGAAACTAAAACGCGGCGAGCGCATCTGACGACAAAAGCAAGGCAA
AAACGTTTGCGTCCGGCGTATGAAAATGCAGAAAGGCTGTTACTCGCTCACATGCTTCGAGATCGGAGCG
TCATCAAAAAAGTGATTGACCGGGTAGGGTTTCAATTTAATATTGATGAGCACCGGGCATTAGCCGCTTA
TCTTTATGCTTTTTATGAAGAGGGAGCCGAGCTGACGCCTCAGCATCTGATGGCCAGGGTGACGGATGAT
CATATAAGCCAGCTCTTGTCCGATATATTAATGCTTCAGGTTAATCAAGAGCTTAGCGAAGCCGAGTTAT
CAGATTATGTAAAAAAAGTGTTGAATCAAAGAAATTGGTCAATGATAAAAGAAAAAGAGGCGGAAAGAGC
CGAAGCAGAAAGGCAAAAAGATTTTTTAAGAGCTGCTTCTTTGGCTCAAGAAATCGTTACATTGAACCGA
TCTTTAAAATAACTGGAGAACTGATGAGGAGCATTTATTGGCAATGATTCCTTGCGGAGGAGCAAATAGA
TCGCTTAACCTCATCATGAATTGTCATTTCATTATTCGCACATTGTTAAAGGCAGTTCACATAGAAAACG
CCTGAATGGACCGAATAAGAATCATACCGCTTATAGAATTC"""
@show minhash(seq, 2, 10).sketch == myminhash(seq, 2, 10).sketch
@show minhash(seq, 3, 10).sketch == myminhash(seq, 3, 10).sketch
@show minhash(seq, 4, 10).sketch == myminhash(seq, 4, 10).sketch
@show minhash(seq, 5, 10).sketch == myminhash(seq, 5, 10).sketch
@show minhash(seq, 6, 10).sketch == myminhash(seq, 6, 10).sketch
#=
julia> using BenchmarkTools
julia> @benchmark minhash($seq, 5, 10)
BenchmarkTools.Trial:
memory estimate: 379.33 KiB
allocs estimate: 12104
--------------
minimum time: 356.333 μs (0.00% GC)
median time: 363.300 μs (0.00% GC)
mean time: 423.777 μs (10.56% GC)
maximum time: 6.631 ms (91.68% GC)
--------------
samples: 10000
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
julia> @benchmark myminhash($seq, 5, 10)
BenchmarkTools.Trial:
memory estimate: 448 bytes
allocs estimate: 8
--------------
minimum time: 32.142 μs (0.00% GC)
median time: 32.417 μs (0.00% GC)
mean time: 34.247 μs (0.00% GC)
maximum time: 668.418 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
julia> @benchmark minhash($seq, 8, 30)
BenchmarkTools.Trial:
memory estimate: 384.31 KiB
allocs estimate: 12138
--------------
minimum time: 316.221 μs (0.00% GC)
median time: 325.547 μs (0.00% GC)
mean time: 394.006 μs (11.94% GC)
maximum time: 5.893 ms (92.72% GC)
--------------
samples: 10000
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
julia> @benchmark myminhash($seq, 8, 30)
BenchmarkTools.Trial:
memory estimate: 784 bytes
allocs estimate: 9
--------------
minimum time: 37.042 μs (0.00% GC)
median time: 37.376 μs (0.00% GC)
mean time: 39.983 μs (0.00% GC)
maximum time: 431.924 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
time tolerance: 5.00%
memory tolerance: 1.00%
=#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment