Created
September 1, 2017 18:08
-
-
Save shahradj/7870806b606d6f281336d6f69242431c to your computer and use it in GitHub Desktop.
Perform clustering on mouse tracking data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from scipy.cluster.hierarchy import fclusterdata | |
from pymongo import MongoClient | |
import numpy as np | |
from datetime import * | |
import json | |
def saveToRelational(jsonPacket):
    """
    save the received json packet to a relational database

    jsonPacket is a dict with a 'data' entry (the list of tracked events)
    and an 'ip' entry, both of which are forwarded to packetToDF.
    Every resulting row is handed to insertIntoDb together with the
    column names; the actual database write is still a TODO.
    """
    def insertIntoDb(values, columns):
        # TODO: persist `values` (aligned with `columns`) to the database
        return

    df = packetToDF(jsonPacket['data'], jsonPacket['ip'])
    # datetime objects expose isoformat(); build a real list so the column
    # is populated on Python 3 as well, where map() is lazy.
    df['timestamp'] = [stamp.isoformat() for stamp in df.index]
    # iterrows() yields (index, row) pairs; pass only the row values so
    # they line up one-to-one with df.columns.
    for _, row in df.iterrows():
        insertIntoDb(row.tolist(), df.columns)
def packetToDF(jsonPacket, ip):
    """
    convert all mouse tracking data of a json packet into a pandas dataframe

    jsonPacket is an iterable of event dicts. Each event must carry a 'ts'
    entry (epoch time in milliseconds, as int or numeric string); the
    other tracked fields are optional and default to None.
    ip is stored unchanged on every row.

    Returns a dataframe indexed by the (local time) event timestamp and
    sorted chronologically, with the tracked fields plus:
      moves - running event number, starting at 0
      time  - whole milliseconds elapsed since the first event of the packet
      ip    - the ip passed in
    Missing 'text' values become "" and all other text is lower-cased.
    An empty packet yields an empty dataframe with the same columns.
    """
    columns = ['x', 'y', 'w', 'h', 'isClick', 'target', 'url', 'type', 'text']
    events = list(jsonPacket or [])
    if not events:
        # Nothing to index on, so there is no "first event" to measure from.
        return pd.DataFrame(columns=columns + ['moves', 'time', 'ip'])

    rows = [[event.get(col) for col in columns] for event in events]
    # Float division keeps the millisecond part of the timestamp.
    stamps = [datetime.fromtimestamp(int(event['ts']) / 1000.0) for event in events]
    # mergesort is stable: events sharing a timestamp keep their arrival order.
    data = pd.DataFrame(rows, index=stamps, columns=columns).sort_index(kind='mergesort')

    data['moves'] = np.arange(len(data))
    # total_seconds() does not depend on the resolution the index is stored in.
    elapsed = data.index - data.index[0]
    data['time'] = np.rint(np.asarray(elapsed.total_seconds()) * 1000).astype(int)
    data['ip'] = [ip] * len(data)
    # pd.isnull covers both None and NaN markers for a missing text value.
    data['text'] = ["" if pd.isnull(text) else text.lower() for text in data['text']]
    return data
class Clusterer():
    def __init__(self, cluster=False, path='examplePackets.txt'):
        """
        clustering the raw data sent from tracker.js

        cluster - when True, also compute self.sessionClusterMap
        path    - text file holding one json packet per line
        """
        with open(path) as f:
            # A real list (not a lazy map) so the packets can be iterated
            # more than once; blank lines are skipped.
            self.jsonPackets = [json.loads(line) for line in f if line.strip()]
        self.data = self.userSessions()
        if cluster:
            self.sessionClusterMap = self.clusteringOfSessions(self.data)

    def userSessions(self):
        """
        combines all packets of mouse tracking data for a single ip into a sorted dataframe
        MongoDB saves the json packets as sent by the js directly into the database

        Returns a dataframe indexed by 'timestamp' holding every event with
        the per-session columns added by separateSessions and a 'Date'
        column ('YYYY-MM-DD'). Returns an empty dataframe when there are
        no events at all.
        """
        frames = [
            packetToDF(jsonPacket['data'], jsonPacket['ip'])
            for jsonPacket in self.jsonPackets
        ]
        frames = [frame for frame in frames if len(frame)]
        if not frames:
            return pd.DataFrame([])
        data = pd.concat(frames)
        # Iterate the groups explicitly (instead of groupby.apply) so each
        # group keeps its 'ip' column; a single packet takes the same path,
        # so the result always carries the session columns.
        sessions = pd.concat(
            [self.separateSessions(group) for _, group in data.groupby('ip')]
        )
        sessions['Date'] = [stamp.strftime('%Y-%m-%d') for stamp in sessions['timestamp']]
        return sessions.set_index('timestamp')

    def separateSessions(self, df, max_gap_ms=600000):
        """
        separates sessions of the same IP and identify them as IP(0), IP(1), IP(2) etc,
        where a new session starts at the first event that follows a period of
        at least max_gap_ms milliseconds without any event
        (default 600000 ms, i.e. 10 minutes)

        df must be non-empty, indexed by event timestamp (datetime-like) and
        contain the columns 'ip', 'x', 'y', 'w', 'h' and 'isClick'.

        Returns the rows in chronological order with these columns set:
          time      - whole milliseconds since the first event of the session
          x_norm    - x / (w + 1)
          y_norm    - y / (h + 1)
          nClicks   - sum of isClick over the earlier events of the session
          timestamp - copy of the index
          session   - 'IP(n)' label
        """
        ip = df['ip'].values[0]
        # Packets may arrive out of order; gaps only make sense on a
        # chronologically sorted frame (stable sort keeps ties in place).
        df = df.sort_index(kind='mergesort')
        # Measure inactivity on the timestamps themselves: the 'time'
        # column restarts at 0 for every packet.
        gaps_ms = pd.Series(df.index).diff().dt.total_seconds() * 1000
        # Row i opens a new session when the pause before it is too long.
        starts = [0] + [i for i, gap in enumerate(gaps_ms) if gap >= max_gap_ms]
        bounds = starts + [len(df)]

        sessions = []
        for number, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):
            # Work on a copy so the column assignments never write into df.
            sess = df.iloc[start:stop].copy()
            elapsed = sess.index - sess.index[0]
            sess['time'] = np.rint(np.asarray(elapsed.total_seconds()) * 1000).astype(int)
            sess['x_norm'] = sess['x'] / (sess['w'] + 1)
            sess['y_norm'] = sess['y'] / (sess['h'] + 1)
            # Running total of the clicks seen before each row; missing
            # isClick values count as no click.
            seen = 0
            clicks_before = []
            for click in sess['isClick']:
                clicks_before.append(seen)
                if not pd.isnull(click):
                    seen += click
            sess['nClicks'] = clicks_before
            sess['timestamp'] = sess.index
            sess['session'] = '{}({})'.format(ip, number)
            sessions.append(sess)
        return pd.concat(sessions)

    def clusteringOfSessions(self, data, clusteringVariables=('x_norm', 'y_norm'), n_movements=50):
        """
        performs clustering across all the different sessions on the first
        n_movements mouse movements (50 by default)

        data must contain a 'session' column and the clusteringVariables
        columns. Sessions with fewer than n_movements rows are ignored.

        Returns a dict mapping session label -> 'behavioural group <n>'.
        The dict is empty when no session is long enough; a single eligible
        session is placed in group 1 without running the clustering.
        Raises ValueError if non-empty data has no 'session' column.
        """
        if len(data) == 0:
            return {}
        if 'session' not in data.columns:
            raise ValueError("data must contain a 'session' column")

        columns = list(clusteringVariables)
        sessions = []
        movements = []
        for session, group in data.groupby('session'):
            if len(group) < n_movements:
                continue
            sessions.append(session)
            # One flat feature vector per session: the first n_movements
            # rows of the chosen columns, row after row.
            movements.append(group[columns].head(n_movements).values.flatten().tolist())

        if not sessions:
            return {}
        if len(sessions) == 1:
            # Hierarchical clustering needs at least two observations.
            groups = [1]
        else:
            groups = fclusterdata(np.array(movements), 1)
        return {
            session: 'behavioural group ' + str(group)
            for session, group in zip(sessions, groups)
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment