Last active
March 16, 2017 16:36
-
-
Save kescobo/7341eed840b56ba90d5661da2d745cac to your computer and use it in GitHub Desktop.
Better Jaccard Distance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function jaccarddistance(sketch1::MinHashSketch, sketch2::MinHashSketch) | |
d = length(setdiff(sketch1.sketch, sketch2.sketch)) | |
l = length(sketch1) | |
return (l-d) / (l+d) | |
end | |
function newjd(sketch1::MinHashSketch, sketch2::MinHashSketch) | |
matches = 0 | |
sketchlen = length(sketch1) | |
i = 1 | |
j = 1 | |
while i <= sketchlen && j <= sketchlen | |
if sketch1.sketch[i] == sketch2.sketch[j] | |
matches += 1 | |
i += 1 | |
j += 1 | |
elseif sketch1.sketch[i] < sketch2.sketch[j] | |
while sketch1.sketch[i] < sketch2.sketch[j] | |
i += 1 | |
end | |
elseif sketch2.sketch[j] < sketch1.sketch[i] | |
while sketch2.sketch[j] < sketch1.sketch[i] | |
j += 1 | |
end | |
end | |
end | |
matches == sketchlen ? return 1.0 : return matches / (2 * sketchlen - matches) | |
end | |
newjd(sk1, sk2) == jaccarddistance(sk1, sk2) # true | |
# Old JD on sketches with k = 12, s = 500 for two ~ 4MB genomes | |
# BenchmarkTools.Trial: | |
# memory estimate: 26.02 KiB | |
# allocs estimate: 26 | |
# -------------- | |
# minimum time: 37.162 μs (0.00% GC) | |
# median time: 40.514 μs (0.00% GC) | |
# mean time: 49.695 μs (7.43% GC) | |
# maximum time: 5.304 ms (98.24% GC) | |
# -------------- | |
# samples: 10000 | |
# evals/sample: 1 | |
# time tolerance: 5.00% | |
# memory tolerance: 1.00% | |
# New JD on sketches with k = 12, s = 500 for two ~ 4MB genomes | |
# BenchmarkTools.Trial: | |
# memory estimate: 16 bytes | |
# allocs estimate: 1 | |
# -------------- | |
# minimum time: 1.089 μs (0.00% GC) | |
# median time: 1.140 μs (0.00% GC) | |
# mean time: 1.246 μs (0.00% GC) | |
# maximum time: 7.864 μs (0.00% GC) | |
# -------------- | |
# samples: 10000 | |
# evals/sample: 10 | |
# time tolerance: 5.00% | |
# memory tolerance: 1.00% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment