vmonaco · February 21, 2020 14:19
diff --git a/cmu_powerlaw.py b/cmu_powerlaw.py
 '''
 Created on May 26, 2015

 @author: vinnie, [email protected]

 Power-law results from:
 "DATA FORENSIC TECHNIQUES USING BENFORD’S LAW AND ZIPF’S LAW FOR KEYSTROKE
 DYNAMICS", Aamo Iorliam, Anthony T.S. Ho, Norman Poh, Santosh Tirunagari, 
 and Patrick Bours. IWBF 2015.

 Uses the data from:
 "Comparing Anomaly-Detection Algorithms for Keystroke Dynamics," 
 Kevin Killourhy and Roy Maxion. DSN 2009. http://www.cs.cmu.edu/~keystroke/

 Requires numpy, pandas, powerlaw, and matplotlib. Run the script as:
 $ python cmu_powerlaw.py
 '''

 import numpy as np
 import pandas as pd
 import powerlaw as pl
 import matplotlib.pyplot as plt

 # Fitting method handed to powerlaw.Fit; 'Likelihood' is the alternative
 FIT_METHOD = 'KS'
 # Location of the CMU strong-password keystroke data set (CSV)
 DATA_URL = 'http://www.cs.cmu.edu/~keystroke/DSL-StrongPasswordData.csv'

 # An x_min left as None is estimated during fitting instead of being fixed
 UD_XMIN = DD_XMIN = DUR_XMIN = None

 # To fit with fixed x_min values instead, uncomment the assignment below.
 # Note: this script's own estimate for the duration x_min comes out as
 # 0.1852, not the 0.1818 listed here.
 # UD_XMIN, DD_XMIN, DUR_XMIN = 0.9801, 0.9880, 0.1818

 print('Downloading data from', DATA_URL)
 # The three leading columns (subject, session, repetition) form the row index
 df = pd.read_csv(DATA_URL, index_col=list(range(3)))

 # Column labels follow the pattern: feature_type.keyname[.secondkeyname]
 def get_feature_cols(feat):
     '''Return the names of the columns of ``df`` that start with ``feat``.'''
     matching = []
     for column in df.columns:
         if column.startswith(feat):
             matching.append(column)
     return matching

 # Pool every column of a feature type into one flat 1-D sample.
 # Up-down latencies can be negative, so only their magnitudes are kept.
 ud = np.abs(df[get_feature_cols('UD')].values.flatten())  # up-down latency
 dd = df[get_feature_cols('DD')].values.flatten()  # down-down latency
 dur = df[get_feature_cols('H')].values.flatten()  # hold time

 print('Fitting models, may take a while...')
 # One fit per feature type; an x_min of None is estimated by the fit
 fit_ud = pl.Fit(ud, xmin=UD_XMIN, fit_method=FIT_METHOD)
 fit_dd = pl.Fit(dd, xmin=DD_XMIN, fit_method=FIT_METHOD)
 fit_dur = pl.Fit(dur, xmin=DUR_XMIN, fit_method=FIT_METHOD)

 def summarize(fit):
     '''Return a three-line text summary of a fit's power-law component.

     The lines report ``fit.power_law.xmin``, ``fit.power_law.alpha`` and the
     sum of ``fit.power_law.loglikelihoods(fit.data)`` (labelled ``L``), each
     formatted to four decimal places.
     '''
     law = fit.power_law
     total_loglik = law.loglikelihoods(fit.data).sum()
     return 'x_min = %.4f\nalpha = %.4f\nL = %.4f' % (law.xmin, law.alpha, total_loglik)

 # Report each fit as a heading line followed by its three-line summary
 print('Up-down\n' + summarize(fit_ud))
 print('Down-down\n' + summarize(fit_dd))
 print('Duration\n' + summarize(fit_dur))

 # Helper to make a nice plot
 def make_subplot(name, fit, ax, visiblex=False):
     '''Draw one feature's empirical and fitted power-law CCDFs on ``ax``.

     name     -- label placed on the first line of the in-plot annotation
     fit      -- fit object providing ``plot_ccdf`` and ``power_law.plot_ccdf``
                 (the ``pl.Fit`` results in this script)
     ax       -- matplotlib axes that receives the curves and the annotation
     visiblex -- whether the x tick labels of ``ax`` are shown
     '''
     # Act on the axes that was passed in, not on the global first panel, so
     # every subplot controls the visibility of its own x tick labels.
     plt.setp(ax.get_xticklabels(), visible=visiblex)
     # The plot_ccdf calls below are not handed an axes; make ``ax`` the current
     # pyplot axes so the curves land on it regardless of what was drawn last.
     plt.sca(ax)
     fit.plot_ccdf(color='k')  # empirical data, solid black
     fit.power_law.plot_ccdf(color='r', linestyle='--')  # fitted power law, dashed red
     # Annotation is anchored in axes-fraction coordinates (upper-right corner)
     ax.text(0.9, 0.9, '%s\n%s' % (name, summarize(fit)),
             ha='right', va='top', transform=ax.transAxes)

 # One tall figure holding three stacked panels that share a common x axis
 plt.figure(figsize=(6, 9))

 ax1 = plt.subplot(3, 1, 1)
 ax1.set_title('CMU keystroke power laws')
 make_subplot('Up-down', fit_ud, ax1)

 ax2 = plt.subplot(3, 1, 2, sharex=ax1)
 make_subplot('Down-down', fit_dd, ax2)

 # Only the bottom panel requests visible x tick labels
 ax3 = plt.subplot(3, 1, 3, sharex=ax1)
 make_subplot('Duration', fit_dur, ax3, visiblex=True)

 plt.tight_layout()
 plt.show()
	'''
	Created on May 26, 2015

	@author: vinnie, [email protected]

	Power-law results from:
	"DATA FORENSIC TECHNIQUES USING BENFORD’S LAW AND ZIPF’S LAW FOR KEYSTROKE
	DYNAMICS", Aamo Iorliam, Anthony T.S. Ho, Norman Poh, Santosh Tirunagari,
	and Patrick Bours. IWBF 2015.

	Uses the data from:
	"Comparing Anomaly-Detection Algorithms for Keystroke Dynamics,"
	Kevin Killourhy and Roy Maxion. DSN 2009. http://www.cs.cmu.edu/~keystroke/

	Requires numpy, pandas, powerlaw, and matplotlib. Run the script as:
	$ python cmu_powerlaw.py
	'''

	import numpy as np
	import pandas as pd
	import powerlaw as pl
	import matplotlib.pyplot as plt

	# Fitting method handed to powerlaw.Fit; 'Likelihood' is the alternative
	FIT_METHOD = 'KS'
	# Location of the CMU strong-password keystroke data set (CSV)
	DATA_URL = 'http://www.cs.cmu.edu/~keystroke/DSL-StrongPasswordData.csv'

	# An x_min left as None is estimated during fitting instead of being fixed
	UD_XMIN = DD_XMIN = DUR_XMIN = None

	# To fit with fixed x_min values instead, uncomment the assignment below.
	# Note: this script's own estimate for the duration x_min comes out as
	# 0.1852, not the 0.1818 listed here.
	# UD_XMIN, DD_XMIN, DUR_XMIN = 0.9801, 0.9880, 0.1818

	print('Downloading data from', DATA_URL)
	# The three leading columns (subject, session, repetition) form the row index
	df = pd.read_csv(DATA_URL, index_col=list(range(3)))

	# Column labels follow the pattern: feature_type.keyname[.secondkeyname]
	def get_feature_cols(feat):
	    '''Return the names of the columns of ``df`` that start with ``feat``.'''
	    matching = []
	    for column in df.columns:
	        if column.startswith(feat):
	            matching.append(column)
	    return matching

	# Pool every column of a feature type into one flat 1-D sample.
	# Up-down latencies can be negative, so only their magnitudes are kept.
	ud = np.abs(df[get_feature_cols('UD')].values.flatten())  # up-down latency
	dd = df[get_feature_cols('DD')].values.flatten()  # down-down latency
	dur = df[get_feature_cols('H')].values.flatten()  # hold time

	print('Fitting models, may take a while...')
	# One fit per feature type; an x_min of None is estimated by the fit
	fit_ud = pl.Fit(ud, xmin=UD_XMIN, fit_method=FIT_METHOD)
	fit_dd = pl.Fit(dd, xmin=DD_XMIN, fit_method=FIT_METHOD)
	fit_dur = pl.Fit(dur, xmin=DUR_XMIN, fit_method=FIT_METHOD)

	def summarize(fit):
	    '''Return a three-line text summary of a fit's power-law component.

	    The lines report ``fit.power_law.xmin``, ``fit.power_law.alpha`` and the
	    sum of ``fit.power_law.loglikelihoods(fit.data)`` (labelled ``L``), each
	    formatted to four decimal places.
	    '''
	    law = fit.power_law
	    total_loglik = law.loglikelihoods(fit.data).sum()
	    return 'x_min = %.4f\nalpha = %.4f\nL = %.4f' % (law.xmin, law.alpha, total_loglik)

	# Report each fit as a heading line followed by its three-line summary
	print('Up-down\n' + summarize(fit_ud))
	print('Down-down\n' + summarize(fit_dd))
	print('Duration\n' + summarize(fit_dur))

	# Helper to make a nice plot
	def make_subplot(name, fit, ax, visiblex=False):
	    '''Draw one feature's empirical and fitted power-law CCDFs on ``ax``.

	    name     -- label placed on the first line of the in-plot annotation
	    fit      -- fit object providing ``plot_ccdf`` and ``power_law.plot_ccdf``
	                (the ``pl.Fit`` results in this script)
	    ax       -- matplotlib axes that receives the curves and the annotation
	    visiblex -- whether the x tick labels of ``ax`` are shown
	    '''
	    # Act on the axes that was passed in, not on the global first panel, so
	    # every subplot controls the visibility of its own x tick labels.
	    plt.setp(ax.get_xticklabels(), visible=visiblex)
	    # The plot_ccdf calls below are not handed an axes; make ``ax`` the current
	    # pyplot axes so the curves land on it regardless of what was drawn last.
	    plt.sca(ax)
	    fit.plot_ccdf(color='k')  # empirical data, solid black
	    fit.power_law.plot_ccdf(color='r', linestyle='--')  # fitted power law, dashed red
	    # Annotation is anchored in axes-fraction coordinates (upper-right corner)
	    ax.text(0.9, 0.9, '%s\n%s' % (name, summarize(fit)),
	            ha='right', va='top', transform=ax.transAxes)

	# One tall figure holding three stacked panels that share a common x axis
	plt.figure(figsize=(6, 9))

	ax1 = plt.subplot(3, 1, 1)
	ax1.set_title('CMU keystroke power laws')
	make_subplot('Up-down', fit_ud, ax1)

	ax2 = plt.subplot(3, 1, 2, sharex=ax1)
	make_subplot('Down-down', fit_dd, ax2)

	# Only the bottom panel requests visible x tick labels
	ax3 = plt.subplot(3, 1, 3, sharex=ax1)
	make_subplot('Duration', fit_dur, ax3, visiblex=True)

	plt.tight_layout()
	plt.show()