Skip to content

Instantly share code, notes, and snippets.

@marcusmueller
Last active September 1, 2017 20:20
Show Gist options
  • Select an option

  • Save marcusmueller/8e10d0da74258a3b0c6d7000f813fbd1 to your computer and use it in GitHub Desktop.

Select an option

Save marcusmueller/8e10d0da74258a3b0c6d7000f813fbd1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Author: Marcus Müller
# License: WTFPL.
from matplotlib import pyplot
import numpy
import typing
N_HASH = 701
def P_collision(n_entries: int, n_candidates: int) -> float:
return (1 - #calculating the no collisions prob
numpy.prod(
numpy.arange(n_entries-n_candidates, n_entries + 1) / # n_candidates factors
n_entries
)
)
n_cands = numpy.arange(1, N_HASH//5)
prob_collision = [P_collision(N_HASH, n) for n in n_cands]
max_below_half = 0
while(prob_collision[max_below_half] < 0.5):
max_below_half += 1
max_below_ninetynine = max_below_half
while(prob_collision[max_below_ninetynine] < 0.99 and max_below_ninetynine < len(prob_collision)):
max_below_ninetynine += 1
f= pyplot.figure(figsize=(6,5), dpi=150)
pyplot.axhline(y=0.5, alpha=0.5)
pyplot.axvline(x=n_cands[max_below_half], alpha=0.5, color='orange')
pyplot.axvline(x=n_cands[max_below_ninetynine], alpha=0.5, color='r')
pyplot.semilogy(n_cands, prob_collision)
pyplot.xlabel("Number of PMTs in existence")
pyplot.xlabel("Probability of having at least one collision")
pyplot.title("Probability has no mercy for hashes that have {:d} values".format(N_HASH))
pyplot.yticks(
sorted([10**n for n in range(int( numpy.log10(prob_collision[0]))-1, 0)] + [1.0, 0.5])
)
ticks = numpy.append(( n_cands[max_below_half], n_cands[max_below_ninetynine]),
numpy.arange(start=0, stop=N_HASH//2, step=25))
ticks_la = ["P>0.5 @ {:d}".format(ticks[0]), "P>99% @ {:d}".format(ticks[1])] + [str(t) for t in ticks[2:]]
pyplot.xticks(ticks, ticks_la, rotation=45)
pyplot.xlim((n_cands[0], n_cands[-1]))
print ("maximum below 50%: {mbh:d}, below 99%: {mbnn:d}".format(mbnn=max_below_ninetynine, mbh=max_below_half))
pyplot.tight_layout()
f.savefig("why a {:d} hashtable is bad.png".format(N_HASH))
#pyplot.show()
### Check the histogram
occ = numpy.fromfile("histogram.csv", sep="\n", dtype=int)
f= pyplot.figure(figsize=(6,5), dpi=150)
total = occ.sum()
rel = occ.astype(float)/occ.sum()*701
pyplot.plot(rel)
pyplot.title("distribution of hashes")
pyplot.xlabel("hash")
pyplot.ylabel("relative occurence (ideal: 1/701 constant), N = {:d}".format((total)))
pyplot.axhline(rel.mean(),color="b")
pyplot.axhline(rel.mean()+rel.std())
pyplot.axhline(rel.mean()-rel.std())
pyplot.legend(["std: {:f}, mean: {:f}".format(rel.std(), rel.mean())])
f.savefig("distribution of hashes for English words.png")
WHY := why\ a\ 701\ hashtable\ is\ bad.png
OCC := distribution\ of\ hashes\ for\ English\ words.png
all: $(WHY) $(OCC)
.PHONY: all clean
$(WHY) $(OCC): histogram.csv
./birthday.py
histogram.csv: words.txt test_hashing
./test_hashing > histogram.csv
test_hashing: test_hashing.cc
$(CXX) -o test_hashing test_hashing.cc
words.txt:
wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
clean:
rm $(WHY) $(OCC) test_hashing words.txt histogram.csv
#include <iostream>
#include <string>
#include <fstream>
#include <vector>
/*
* By fortune of hash_string being from GNU Radio, this file's GPL'ed.
*/
static unsigned int
hash_string(const std::string &s)
{
unsigned int h = 0;
unsigned int g = 0;
for (std::string::const_iterator p = s.begin(); p != s.end(); ++p){
h = (h << 4) + (*p & 0xff);
g = h & 0xf0000000;
if (g){
h = h ^ (g >> 24);
h = h ^ g;
}
}
return h;
}
int main() {
std::ifstream wordlist ("words.txt");
if (!wordlist.is_open()) {
std::string info("Download https://raw.githubusercontent.com/dwyl/english-words/master/words.txt as data basis.\n");
std::cerr << info;
return -1;
}
std::vector<unsigned> histogram(701);
std::string line;
while ( std::getline (wordlist,line) ) {
unsigned int val = hash_string(line) % 701;
// std::cout << line << ": " << val << "\n";
histogram[val]++;
}
wordlist.close();
for(unsigned i = 0; i < 701; i++)
std::cout << histogram[i] << "\n";
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment