Skip to content

Instantly share code, notes, and snippets.

@fuglede
Last active February 18, 2016 17:30
Show Gist options
  • Save fuglede/462596c2384db59cf266 to your computer and use it in GitHub Desktop.
Save fuglede/462596c2384db59cf266 to your computer and use it in GitHub Desktop.
HTTPS Everywhere ruleset count

HTTPS Everywhere ruleset count

Creating

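Put both scripts in a clone of the HTTPS Everywhere git repository. Run get_count.sh first to generate the data file ruleset_count_data, then run plot_count.py to plot that data to curve_plot.png.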

#!/bin/bash
# Record how many ruleset files exist at every commit on master.
#
# Walks the history of master from oldest to newest and appends one line per
# commit to ruleset_count_data, in the form "<count> <committer date>".
# That file is the input plot_count.py reads.
#
# Exits with status 1 (message on stderr) if the data file already exists or
# if the current directory is not inside a git repository.

# Does the data already exist? Let's not overwrite stuff.
if [ -f ruleset_count_data ]; then
    echo "Error: Ruleset data file (ruleset_count_data) exists. Remove the file to create from scratch." >&2
    exit 1
fi

# Naive check: are we actually in a git repo?
if ! git rev-parse --git-dir > /dev/null 2>&1; then
    echo "Error: Https-e git repository not found. Put these files in the https-e git repo." >&2
    exit 1
fi

echo "Creating list of rulesets by date. This might take a while."
for commit in $(git rev-list --reverse master); do
    # Count the tracked paths at this commit that contain the rules directory.
    number_of_rules=$(git ls-tree --name-only -r "$commit" | grep -c "src/chrome/content/rules")
    # %ci prints the committer date followed by its timezone offset.
    date_of_commit=$(git show -s --format=%ci "$commit")
    # Append per commit so an interrupted run still leaves the data gathered so far.
    echo "$number_of_rules $date_of_commit" >> ruleset_count_data
done
echo "Done fetching data. Execute plot_count.py to plot it."
#!/usr/bin/env python2
from datetime import date, datetime
import re
import os.path
from matplotlib.dates import drange, num2date, date2num
from matplotlib.pyplot import *
# One line of ruleset_count_data: "<count> <date and time> <+/-timezone>".
# Group 1 is the count, group 2 the date and time without the timezone.
_DATA_LINE_RE = re.compile(r"(\d+)\s(.*)\s[\+-]")


def _parse_samples(lines):
    """Yield a (commit time, ruleset count) pair for every usable line.

    Lines that do not match the expected "<count> <date> <timezone>" layout
    are skipped, as are the hard-coded outliers described below.
    """
    for line in lines:
        match = _DATA_LINE_RE.match(line.replace('\n', ''))
        if match is None:
            continue
        count = int(match.group(1))
        # We ignore timezones since their effect is largely negligible
        # (and because dealing with timezones in Python2 sucks).
        commit_time = datetime.strptime(match.group(2), "%Y-%m-%d %H:%M:%S")
        year = commit_time.year
        # Manually remove some counts since someone
        # appears to have messed up their system times.
        if (count > 16000 and year < 2015) or (count > 19000 and year < 2016):
            continue
        yield commit_time, count


def main():
    """Plot the record ruleset counts over time and save them to curve_plot.png.

    Reads ruleset_count_data from the current directory, sorts the samples by
    commit time, and plots only the samples whose count exceeds every earlier
    count. Prints an error and returns without plotting when the output file
    already exists, when the data file cannot be opened, or when the data
    file contains no usable lines.
    """
    output = 'curve_plot.png'
    if os.path.isfile(output):
        print("Error: Output file (%s) exists. Delete and rerun to produce a new one." % output)
        return

    try:
        data_file = open('ruleset_count_data', 'r')
    except IOError:
        print("Error: Run get_count.sh to get the data first.")
        return
    # Close the file as soon as parsing is done instead of leaking the handle.
    with data_file:
        samples = sorted(_parse_samples(data_file))

    if not samples:
        # Without this guard an empty data set would crash further down.
        print("Error: No usable data in ruleset_count_data. Delete it and rerun get_count.sh.")
        return

    # Let's just focus on the highest ruleset counts we've seen: this
    # makes sense since at no point a (very) large number of rulesets
    # were deleted. There's probably a more clever way to do this.
    max_count = 0
    date_plot = []
    count_plot = []
    for commit_time, count in samples:
        if count > max_count:
            max_count = count
            date_plot.append(date2num(commit_time))
            count_plot.append(count)

    # Finally it's plotting time!
    rc('font', family='serif')
    title('Number of rulesets in browser plugin HTTPS Everywhere')
    ylabel('Ruleset count')
    xlabel('Time of commit')
    xticks(rotation=30)  # Avoid overlap in tick labels
    plot_date(date_plot, count_plot, '.')
    gcf().subplots_adjust(bottom=0.15)  # Avoid cropping
    try:
        rc('text', usetex=True)  # Might fail depending on setup
        savefig(output)
    except Exception:
        # Fall back to the built-in text rendering if the LaTeX attempt failed.
        rc('text', usetex=False)
        savefig(output)
    print("Plotting complete. See %s for the result." % output)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment