Last active
June 18, 2019 19:24
-
-
Save corajr/c9289e1a38b04614e6fdbc2bf820be0c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Usage: ./learning_time_sample.py SESSION_GAP_IN_MINUTES

Requires: pip install intervaltree tqdm

This script sums up sessions from a BigQuery export, generated using the
following query:

    SELECT
      * EXCEPT(row_number)
    FROM (
      SELECT
        kaid,
        content.content_id,
        activity,
        start_time,
        end_time,
        learning_time_ms,
        ROW_NUMBER() OVER (PARTITION BY info.request_id) row_number
      FROM
        `khanacademy.org:deductive-jet-827.log_streams.learning_time_20190608`
      WHERE
        activity IN ("PRACTICING",
          "WATCHING",
          "READING")
        AND MOD(FARM_FINGERPRINT(kaid), 10) = 1)
    WHERE
      row_number = 1

Export this table from BigQuery as `learning_time_sample.json.gz` and store it
alongside this script.
"""
import collections | |
import datetime | |
import gzip | |
import intervaltree | |
import json | |
import sys | |
import tqdm | |
def parse_datetime(s):
    """Parse a timestamp string from the export into a ``datetime``.

    Two layouts are accepted and tried in this order:

    * ``%Y-%m-%d %H:%M:%S.%f %Z`` (with fractional seconds)
    * ``%Y-%m-%d %H:%M:%S %Z`` (whole seconds only)

    Args:
        s: Timestamp text such as ``"2019-06-08 12:34:56.123456 UTC"``.

    Returns:
        The parsed ``datetime.datetime``. The zone name is matched by ``%Z``
        but the result carries no ``tzinfo``.

    Raises:
        ValueError: If ``s`` matches neither layout. The message names the
            offending input and both accepted layouts.
    """
    formats = ('%Y-%m-%d %H:%M:%S.%f %Z', '%Y-%m-%d %H:%M:%S %Z')
    for fmt in formats:
        try:
            return datetime.datetime.strptime(s, fmt)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
    # Raised outside the ``except`` so the error is a single, unchained
    # exception that reports every layout that was attempted.
    raise ValueError(
        "time data {!r} does not match any expected format: {}".format(
            s, ", ".join(repr(fmt) for fmt in formats)))
def parse_sample(input_fname="learning_time_sample.json.gz", session_gap=15,
                 total_rows=580559):
    """Load the gzipped JSON-lines export into per-user interval trees.

    Every event that has both a ``learning_time_ms`` and a ``content_id``
    field becomes one interval running from its ``start_time`` to its
    ``end_time`` plus ``session_gap`` minutes, tagged with the event's
    ``activity``. Events missing either field are skipped.

    Args:
        input_fname: Path to the gzip-compressed file with one JSON object
            per line.
        session_gap: Minutes added to each event's end time before it is
            stored.
        total_rows: Expected number of lines, passed to ``tqdm`` as the
            progress-bar total. Defaults to the row count that was previously
            hard-coded; pass ``None`` when the size is unknown.

    Returns:
        A nested ``defaultdict``: kaid -> content_id -> ``IntervalTree``.

    Raises:
        KeyError: If a kept event lacks ``start_time``, ``end_time``,
            ``kaid`` or ``activity``.
        ValueError: If a timestamp cannot be parsed by ``parse_datetime``.
    """
    # NOTE(review): intervaltree presumably rejects intervals whose end is
    # not after their start, so a zero-length event with session_gap=0 may
    # raise ValueError in addi() -- confirm against the installed version.
    padding = datetime.timedelta(minutes=session_gap)  # loop-invariant
    data = collections.defaultdict(
        lambda: collections.defaultdict(intervaltree.IntervalTree))
    with gzip.open(input_fname, 'rb') as f:
        for line in tqdm.tqdm(f, total=total_rows):
            if not line.strip():
                # Tolerate blank lines rather than failing in json.loads.
                continue
            event = json.loads(line)
            if 'learning_time_ms' not in event or 'content_id' not in event:
                continue
            start_time = parse_datetime(event[u'start_time'])
            end_time = parse_datetime(event[u'end_time'])
            tree = data[event[u'kaid']][event[u'content_id']]
            tree.addi(start_time, end_time + padding, event[u'activity'])
    return data
if __name__ == '__main__':
    # Command-line entry point: read the sample, merge each user's
    # overlapping intervals per content item into sessions, and print
    # per-user session averages.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} SESSION_GAP_IN_MINUTES".format(sys.argv[0]))
    try:
        session_gap = int(sys.argv[1])
    except ValueError:
        sys.exit("SESSION_GAP_IN_MINUTES must be an integer, got {!r}".format(
            sys.argv[1]))

    data = parse_sample(session_gap=session_gap)
    if not data:
        # Without this guard every average below would divide by zero.
        sys.exit("No usable events found in the sample; nothing to report.")

    activities = ("READING", "PRACTICING", "WATCHING")
    user_n = len(data)
    session_lengths = []
    sessions_by_activity = collections.Counter()

    for intervals_dict in data.values():
        for tree in intervals_dict.values():
            # The reducer returns its first argument, so a merged session
            # keeps one activity label instead of combining them.
            tree.merge_overlaps(data_reducer=lambda kept, _: kept)
            for interval in tree:
                # NOTE(review): intervals still include the session_gap
                # padding added in parse_sample, so these lengths do too --
                # confirm that is the intended definition of session length.
                session_lengths.append(interval.length().total_seconds())
                sessions_by_activity[interval.data] += 1

    sessions_n = len(session_lengths)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Average session count per user: {}".format(
        float(sessions_n) / user_n))
    print("Average session length: {}".format(
        float(sum(session_lengths)) / sessions_n))
    for activity_type in activities:
        print("Average sessions of type {} per user: {}".format(
            activity_type, float(sessions_by_activity[activity_type]) / user_n))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment