Created
January 9, 2018 22:53
-
-
Save matthewdowney/1ca9c691706ef2f5d17f63ee346a4ccf to your computer and use it in GitHub Desktop.
Pipe some data to this script to plot a data set against Benford's law.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pipe some data to stdin. Each line should contain a number; only its first non-zero digit is used.
import csv | |
from collections import defaultdict | |
from math import log10 | |
import matplotlib | |
import matplotlib.pyplot as plt | |
import functools | |
import fileinput | |
# Leading-digit candidates: the characters '1' through '9'. '0' is left out
# on purpose so that leading zeros (as in "0.05") are skipped over.
digits = set(map(str, range(1, 10)))


def log_frequency(frequencies, line):
    """Count the first non-zero digit of ``line`` in ``frequencies``.

    ``line`` is scanned left to right for the first character found in
    ``digits``. Anything not in that set (signs, zeros, decimal points,
    whitespace, letters) is skipped. A line with no such character leaves
    ``frequencies`` untouched.

    Args:
        frequencies: dict mapping an int digit (1-9) to its count so far.
            It is updated in place.
        line: one line of input text, expected to contain a number.

    Returns:
        The same ``frequencies`` dict, so the function can be used as the
        step function of ``functools.reduce``.
    """
    # next() stops at the first match instead of collecting every digit.
    first = next((ch for ch in line if ch in digits), None)
    if first is not None:
        digit = int(first)
        frequencies[digit] = frequencies.get(digit, 0) + 1
    return frequencies
# Tally the first non-zero digit of every input line. fileinput.input()
# reads the files named on the command line, or stdin when none are given.
counts = functools.reduce(log_frequency, fileinput.input(), {})
total_numbers = sum(counts.values())

# Scaling: turn the counts into relative frequencies, one entry per digit
# 1-9 in digit order. The entries must line up with the Benford series and
# the x-axis tick labels below, so digits that never occurred get 0.0 rather
# than being dropped, and the list is NOT sorted by frequency.
# Empty input yields all zeros instead of dividing by zero.
data = [
    (counts.get(d, 0) / float(total_numbers) if total_numbers else 0.0, d)
    for d in range(1, 10)
]

# Comparison: Benford's law gives P(d) = log10(1 + 1/d) for leading digit d.
benford = [(log10(1 + 1.0 / i), str(i)) for i in range(1, 10)]

# Plot first digit distribution vs. Benford
plt.plot([freq for freq, _ in data], label='Data Set')
plt.plot([prob for prob, _ in benford], label="Benford's Law", linewidth=10, alpha=0.23)
plt.ylabel("Distribution probability", fontsize=14)
plt.xlabel("First digit for %s numbers" % total_numbers, fontsize=14)
plt.title("Check it out yo\n", fontsize=12)
# Tick positions are the 0-based indices of the series; labels are the digits.
plt.xticks(list(range(len(benford))), [int(label) for _, label in benford])
plt.legend()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment