Created
November 25, 2020 01:43
-
-
Save clintval/51984d5048c278146c83fd4ed3747471 to your computer and use it in GitHub Desktop.
Use a Bloom filter to count all the unique elements in an iterator, approximately
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package com.twinstrandbio.math | |
| import breeze.util.BloomFilter | |
| import com.fulcrumgenomics.commons.util.SimpleCounter | |
| import scala.reflect.runtime.{universe => ru} | |
| /** Methods for counting. */ | |
| object Counter { | |
| /** Count all non-unique items in an iterator using a Bloom filter as a staging area. */ | |
| def countNonUniqueApproximate[T]( | |
| coll: IterableOnce[T], | |
| expectedNumItems: Int = 1000000, | |
| falsePositiveRate: Double = 0.001 | |
| )(implicit tag: ru.TypeTag[T]): SimpleCounter[T] = { | |
| val counter = new SimpleCounter[T]() | |
| val bloom = BloomFilter.optimallySized[T]( | |
| expectedNumItems = expectedNumItems, | |
| falsePositiveRate = falsePositiveRate | |
| ) | |
| coll.iterator.foreach { item => | |
| if (bloom.contains(item)) counter.count(item) | |
| else bloom += item | |
| } | |
| counter.toMap.keysIterator.foreach(counter.count) | |
| counter | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment