Created
March 16, 2017 06:21
-
-
Save bicycle1885/23f15ac022cff46cb454c9be1a585987 to your computer and use it in GitHub Desktop.
K-mer MinHash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Bio.Seq | |
import DataStructures: SortedSet | |
immutable MinHashSketch | |
sketch::Vector{UInt64} | |
kmersize::Int | |
function MinHashSketch(sketch::Vector, kmersize::Int) | |
length(sketch) > 0 || error("Sketch cannot be empty") | |
kmersize > 0 || error("Kmersize must be greater than 0") | |
new(sketch, kmersize) | |
end | |
end | |
function revcomphash(kmer::Kmer) | |
k = min(kmer, reverse_complement(kmer)) | |
return hash(k) | |
end | |
function kmerminhash(seq::BioSequence, kmerset, kmerhashes::Vector{UInt64}, k::Int, s::Int) | |
typeof(kmerset) <: Set || typeof(kmerset) <: SortedSet || error("Kmerset must be a `Set` or `SortedSet`") | |
length(kmerhashes) <= s || error("Kmerhashes cannot be larger than the set size") | |
for kmer in each(DNAKmer{k}, seq) | |
if length(kmerhashes) == 0 | |
if length(kmerset) < s | |
push!(kmerset, revcomphash(kmer[2])) | |
elseif length(kmerset) == s | |
kmerset = SortedSet(kmerset) | |
for i in kmerset | |
push!(kmerhashes, pop!(kmerset)) | |
end | |
end | |
end | |
if length(kmerhashes) == s | |
h = revcomphash(kmer[2]) | |
if h < kmerhashes[end] | |
i = searchsortedlast(kmerhashes, h) | |
if i == 0 && h != kmerhashes[1] | |
pop!(kmerhashes) | |
unshift!(kmerhashes, h) | |
elseif h != kmerhashes[i] | |
pop!(kmerhashes) | |
insert!(kmerhashes, i+1, h) | |
end | |
end | |
end | |
end | |
return (kmerset, kmerhashes) | |
end | |
function minhash(seq::BioSequence, k::Int, s::Int) | |
kmerset = Set{UInt64}() | |
kmerhashes = Vector{UInt64}() | |
kmerset, kmerhashes = kmerminhash(seq, kmerset, kmerhashes, k, s) | |
length(kmerhashes) == s || error("Sketch size is too large for the given kmer size") | |
return MinHashSketch(kmerhashes, k) | |
end | |
function mykmerminhash{k}(::Type{DNAKmer{k}}, seq::BioSequence, s::Int) | |
# generate first `s` kmers | |
kmerhashes = UInt64[] | |
iter = each(DNAKmer{k}, seq) | |
state = start(iter) | |
while length(kmerhashes) < s && !done(iter, state) | |
(_, kmer), state = next(iter, state) | |
h = revcomphash(kmer) | |
if h ∉ kmerhashes | |
push!(kmerhashes, h) | |
end | |
end | |
if length(kmerhashes) < s | |
error("failed to generate enough hashes") | |
end | |
sort!(kmerhashes) | |
# scan `seq` to make a minhash | |
while !done(iter, state) | |
(_, kmer), state = next(iter, state) | |
h = revcomphash(kmer) | |
if h < kmerhashes[end] | |
i = searchsortedlast(kmerhashes, h) | |
if i == 0 && h != kmerhashes[1] | |
pop!(kmerhashes) | |
unshift!(kmerhashes, h) | |
elseif h != kmerhashes[i] | |
pop!(kmerhashes) | |
insert!(kmerhashes, i+1, h) | |
end | |
end | |
end | |
return kmerhashes | |
end | |
function myminhash(seq::BioSequence, k::Int, s::Int) | |
kmerhashes = mykmerminhash(DNAKmer{k}, seq, s) | |
return MinHashSketch(kmerhashes, k) | |
end | |
seq = dna""" | |
GTACGACGGAGTGTTATAAGATGGGAAATCGGATACCAGATGAAATTGTGGATCAGGTGCAAAAGTCGGC | |
AGATATCGTTGAAGTCATAGGTGATTATGTTCAATTAAAGAAGCAAGGCCGAAACTACTTTGGACTCTGT | |
CCTTTTCATGGAGAAAGCACACCTTCGTTTTCCGTATCGCCCGACAAACAGATTTTTCATTGCTTTGGCT | |
GCGGAGCGGGCGGCAATGTTTTCTCTTTTTTAAGGCAGATGGAAGGCTATTCTTTTGCCGAGTCGGTTTC | |
TCACCTTGCTGACAAATACCAAATTGATTTTCCAGATGATATAACAGTCCATTCCGGAGCCCGGCCAGAG | |
TCTTCTGGAGAACAAAAAATGGCTGAGGCACATGAGCTCCTGAAGAAATTTTACCATCATTTGTTAATAA | |
ATACAAAAGAAGGTCAAGAGGCACTGGATTATCTGCTTTCTAGGGGCTTTACGAAAGAGCTGATTAATGA | |
ATTTCAGATTGGCTATGCTCTTGATTCTTGGGACTTTATCACGAAATTCCTTGTAAAGAGGGGATTTAGT | |
GAGGCGCAAATGGAAAAAGCGGGTCTCCTGATCAGACGCGAAGACGGAAGCGGATATTTCGACCGCTTCA | |
GAAACCGTGTCATGTTTCCGATCCATGATCATCACGGGGCTGTTGTTGCTTTCTCAGGCAGGGCTCTTGG | |
CAGCCAGCAGCCTAAGTATATGAACAGTCCTGAAACCCCGCTCTTTCATAAAAGCAAACTGCTTTACAAT | |
TTTTATAAGGCCCGCCTTCATATCAGAAAGCAGGAAAGAGCAGTCTTATTTGAAGGGTTTGCTGATGTCT | |
ATACGGCCGTAAGCTCGGATGTAAAGGAAAGCATAGCCACGATGGGAACGTCTCTTACAGATGATCATGT | |
CAAGATCCTGAGAAGAAACGTCGAAGAAATCATTCTTTGCTATGACTCTGATAAAGCCGGTTATGAAGCC | |
ACCTTAAAAGCTTCGGAGCTTCTGCAAAAAAAAGGCTGCAAAGTCAGAGTTGCAATGATTCCTGACGGAT | |
TGGACCCTGATGATTACATCAAAAAATTCGGCGGGGAAAAATTTAAAAACGACATTATTGACGCAAGTGT | |
CACCGTAATGGCGTTCAAAATGCAATATTTCCGAAAAGGAAAGAACCTGTCCGATGAAGGCGACCGCCTA | |
GCTTACATTAAAGACGTACTGAAAGAAATCAGCACGCTTTCAGGGTCTCTAGAGCAGGAAGTCTATGTAA | |
AGCAGCTTGCTTCAGAGTTTTCGCTTTCACAGGAGTCTTTAACTGAGCAGCTGTCTGTTTTCAGCAAGCA | |
AAACAAACCTGCTGACAATAGCGGTGAAACTAAAACGCGGCGAGCGCATCTGACGACAAAAGCAAGGCAA | |
AAACGTTTGCGTCCGGCGTATGAAAATGCAGAAAGGCTGTTACTCGCTCACATGCTTCGAGATCGGAGCG | |
TCATCAAAAAAGTGATTGACCGGGTAGGGTTTCAATTTAATATTGATGAGCACCGGGCATTAGCCGCTTA | |
TCTTTATGCTTTTTATGAAGAGGGAGCCGAGCTGACGCCTCAGCATCTGATGGCCAGGGTGACGGATGAT | |
CATATAAGCCAGCTCTTGTCCGATATATTAATGCTTCAGGTTAATCAAGAGCTTAGCGAAGCCGAGTTAT | |
CAGATTATGTAAAAAAAGTGTTGAATCAAAGAAATTGGTCAATGATAAAAGAAAAAGAGGCGGAAAGAGC | |
CGAAGCAGAAAGGCAAAAAGATTTTTTAAGAGCTGCTTCTTTGGCTCAAGAAATCGTTACATTGAACCGA | |
TCTTTAAAATAACTGGAGAACTGATGAGGAGCATTTATTGGCAATGATTCCTTGCGGAGGAGCAAATAGA | |
TCGCTTAACCTCATCATGAATTGTCATTTCATTATTCGCACATTGTTAAAGGCAGTTCACATAGAAAACG | |
CCTGAATGGACCGAATAAGAATCATACCGCTTATAGAATTC""" | |
@show minhash(seq, 2, 10).sketch == myminhash(seq, 2, 10).sketch | |
@show minhash(seq, 3, 10).sketch == myminhash(seq, 3, 10).sketch | |
@show minhash(seq, 4, 10).sketch == myminhash(seq, 4, 10).sketch | |
@show minhash(seq, 5, 10).sketch == myminhash(seq, 5, 10).sketch | |
@show minhash(seq, 6, 10).sketch == myminhash(seq, 6, 10).sketch | |
#= | |
julia> using BenchmarkTools | |
julia> @benchmark minhash($seq, 5, 10) | |
BenchmarkTools.Trial: | |
memory estimate: 379.33 KiB | |
allocs estimate: 12104 | |
-------------- | |
minimum time: 356.333 μs (0.00% GC) | |
median time: 363.300 μs (0.00% GC) | |
mean time: 423.777 μs (10.56% GC) | |
maximum time: 6.631 ms (91.68% GC) | |
-------------- | |
samples: 10000 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
julia> @benchmark myminhash($seq, 5, 10) | |
BenchmarkTools.Trial: | |
memory estimate: 448 bytes | |
allocs estimate: 8 | |
-------------- | |
minimum time: 32.142 μs (0.00% GC) | |
median time: 32.417 μs (0.00% GC) | |
mean time: 34.247 μs (0.00% GC) | |
maximum time: 668.418 μs (0.00% GC) | |
-------------- | |
samples: 10000 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
julia> @benchmark minhash($seq, 8, 30) | |
BenchmarkTools.Trial: | |
memory estimate: 384.31 KiB | |
allocs estimate: 12138 | |
-------------- | |
minimum time: 316.221 μs (0.00% GC) | |
median time: 325.547 μs (0.00% GC) | |
mean time: 394.006 μs (11.94% GC) | |
maximum time: 5.893 ms (92.72% GC) | |
-------------- | |
samples: 10000 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
julia> @benchmark myminhash($seq, 8, 30) | |
BenchmarkTools.Trial: | |
memory estimate: 784 bytes | |
allocs estimate: 9 | |
-------------- | |
minimum time: 37.042 μs (0.00% GC) | |
median time: 37.376 μs (0.00% GC) | |
mean time: 39.983 μs (0.00% GC) | |
maximum time: 431.924 μs (0.00% GC) | |
-------------- | |
samples: 10000 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
=# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment