Created
October 3, 2018 14:38
-
-
Save justinfay/96d349cd606f477ca82fe8263f6c6210 to your computer and use it in GitHub Desktop.
Count-Min sketch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Count-Min Sketch\n", | |
"\n", | |
"Thanks to [A practical introduction to the Count-Min Sketch](https://hkorte.github.io/slides/cmsketch/#/)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"bar 1\n", | |
"billy 1\n", | |
"foo 2\n", | |
"bing 1\n" | |
] | |
} | |
], | |
"source": [ | |
"L = 100\n", | |
"counter = [0] * L\n", | |
"\n", | |
"def counter_index(word):\n", | |
" return sum(map(ord, word)) % L\n", | |
"\n", | |
"def get_count(word):\n", | |
" return counter[counter_index(word)]\n", | |
"\n", | |
"def count(word):\n", | |
" counter[counter_index(word)] += 1\n", | |
"\n", | |
"words = ['foo', 'bar', 'bing', 'foo', 'billy']\n", | |
"\n", | |
"for word in words:\n", | |
" count(word)\n", | |
" \n", | |
"for word in set(words):\n", | |
" print(word, get_count(word))\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**There will be collisions!**\n", | |
"We can minimize these by:\n", | |
"- Use multiple arrays with different hash functions\n", | |
"- When queried return the minimum of the numbers of the arrays (error correction)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"bar 1\n", | |
"billy 1\n", | |
"foo 2\n", | |
"bing 1\n" | |
] | |
} | |
], | |
"source": [ | |
"import hashlib\n", | |
"\n", | |
"L = 100\n", | |
"N = 5\n", | |
"counter = [[0] * L] * N\n", | |
"\n", | |
"_hash_algos = [\n", | |
" hashlib.md5,\n", | |
" hashlib.sha1,\n", | |
" hashlib.sha224,\n", | |
" hashlib.sha256,\n", | |
" hashlib.sha384,\n", | |
"]\n", | |
"\n", | |
"def counter_index(word, n):\n", | |
" return int(_hash_algos[n](word.encode()).hexdigest(), 16) % L\n", | |
"\n", | |
"\n", | |
"def get_count(word):\n", | |
" return min(\n", | |
" counter[n][counter_index(word, n)]\n", | |
" for n in range(N))\n", | |
"\n", | |
"\n", | |
"def count(word):\n", | |
" # Error correction, only update lowest counters.\n", | |
" counts = [counter[n][counter_index(word, n)] for n in range(N)]\n", | |
" min_ = min(counts)\n", | |
" to_update = [i for i, e in enumerate(counts) if e == min_]\n", | |
" for i in to_update:\n", | |
" counter[i][counter_index(word, i)] += 1\n", | |
" \n", | |
" \n", | |
"words = ['foo', 'bar', 'bing', 'foo', 'billy']\n", | |
"\n", | |
"for word in words:\n", | |
" count(word)\n", | |
" \n", | |
"for word in set(words):\n", | |
" print(word, get_count(word))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment