Created
August 20, 2020 13:44
-
-
Save ivanistheone/ccc3de4f8b115984565370ec74039b53 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Helpers for downloading Kolibri databases and printing topic trees:
./kolibridb.py --channel_id=95a52b386f2c485cb97dd60901674a98 | |
or to get the same result as HTML (assuming you have `pandoc` installed): | |
./kolibridb.py --channel_id=95a52b386f2c485cb97dd60901674a98 --htmlexport | |
""" | |
import argparse | |
from collections import defaultdict | |
from contextlib import redirect_stdout | |
from itertools import groupby | |
from operator import itemgetter | |
import os | |
import io | |
import json | |
import requests | |
import sqlite3 | |
import subprocess | |
import uuid | |
# DATABASE
################################################################################
# NOTE(review): "DATABSES" is a historical typo; renaming the constant would
# change the on-disk cache path and every reference below, so it is kept as-is.
DATABSES_DIR = 'reports/databases'
if not os.path.exists(DATABSES_DIR):
    os.makedirs(DATABSES_DIR)
# Known Studio deployments, keyed by the short names accepted by download_db_file.
STUDIO_SERVER_LOOKUP = {
    'production': 'https://studio.learningequality.org',
    'develop': 'https://develop.studio.learningequality.org',
    'local': 'http://localhost:8080',
}
def download_db_file(channel_id, server='production', update=False):
    """
    Download the Kolibri sqlite3 DB file for channel `channel_id` from a
    Studio server and cache it under DATABSES_DIR.

    Args:
        channel_id (str): hex channel ID.
        server (str): a key of STUDIO_SERVER_LOOKUP or a full http(s) base URL.
        update (bool): when True, re-download even if a cached copy exists.

    Returns:
        str: local path of the .sqlite3 file.

    Raises:
        ValueError: if `server` is neither a known shortcut nor an http(s) URL.
        ConnectionError: if the HTTP download fails.
    """
    db_file_path = os.path.join(DATABSES_DIR, channel_id + '.sqlite3')
    if os.path.exists(db_file_path) and not update:
        return db_file_path  # serve the cached copy
    if server in STUDIO_SERVER_LOOKUP:
        base_url = STUDIO_SERVER_LOOKUP[server]
    elif server.startswith(('http://', 'https://')):
        base_url = server.rstrip('/')
    else:
        raise ValueError('Unrecognized arg', server)
    db_file_url = base_url + '/content/databases/' + channel_id + '.sqlite3'
    # stream=True avoids loading the entire (possibly large) DB into memory
    response = requests.get(db_file_url, stream=True)
    if response.ok:
        with open(db_file_path, 'wb') as db_file:
            for chunk in response.iter_content(chunk_size=8192):
                db_file.write(chunk)
        return db_file_path
    else:
        print(response.status_code, response.content)
        raise ConnectionError('Failed to download DB file from', db_file_url)
def dbconnect(db_file_path):
    """Open and return a sqlite3 connection to the DB file at `db_file_path`."""
    return sqlite3.connect(db_file_path)
def dbex(conn, query):
    """
    Execute a DB query and return results as a list of dicts,
    one dict per row, keyed by column name.
    """
    cursor = conn.cursor()
    print('Running DB query', query)
    cursor.execute(query)
    # hoist column names out of the per-row loop
    colnames = [col[0] for col in cursor.description]
    results = [dict(zip(colnames, row)) for row in cursor.fetchall()]
    return results
# BASIC ORM | |
################################################################################ | |
def dbfilter(rows, **kwargs):
    """
    Return all the `rows` that satisfy every `key=value` condition, where keys
    are DB column names and value is a row's value.
    """
    def matches(row):
        return all(k in row and row[k] == v for k, v in kwargs.items())
    return [row for row in rows if matches(row)]
def filter_key_in_values(rows, key, values):
    """
    Return all the `rows` whose value for `key` is in `values`.
    A single string is treated as a one-element list.
    """
    if isinstance(values, str):
        values = [values]
    return [row for row in rows if row[key] in values]
def dbget(rows, **kwargs):
    """
    Return the single row that matches the `key=value` conditions, where keys
    are DB column names and value is a row's value. Returns None when nothing
    matches; asserts that at most one row matches.
    """
    selected = dbfilter(rows, **kwargs)
    # BUGFIX: assert message typo ("mulitple")
    assert len(selected) < 2, 'multiple results found'
    return selected[0] if selected else None
def dbvalues_list(rows, *args, flat=False):
    """
    Return a list of per-row value lists for the columns named in `args`.
    When `flat=True`, return just the first requested column as a flat list.
    """
    results = [[row[arg] for arg in args] for row in rows]
    if flat:
        return [values[0] for values in results]
    return results
# UTILS | |
################################################################################ | |
def sane_group_by(items, key):
    """
    Wrapper for itertools.groupby to make it easier to use.
    Returns a dict mapping each distinct value of `key` found in `items`
    to the list of items carrying that value.
    """
    keyfn = itemgetter(key)
    grouped = {}
    for k, group in groupby(sorted(items, key=keyfn), key=keyfn):
        grouped[k] = list(group)
    return grouped
def count_values_for_attr(rows, *attrs):
    """
    Tally how often each value occurs for each attribute in `attrs`.
    Returns {attr: {value: count}} with defaultdict(int) inner counters.
    """
    counts = {attr: defaultdict(int) for attr in attrs}
    for attr in attrs:
        for row in rows:
            counts[attr][row[attr]] += 1
    return counts
# KOLIBRI CHANNEL | |
################################################################################ | |
def get_channel(channel_id):
    """Download the channel DB (if not cached) and return its metadata row."""
    conn = sqlite3.connect(download_db_file(channel_id))
    rows = dbex(conn, "SELECT * FROM content_channelmetadata;")
    return rows[0]
def get_nodes_by_id(conn, attach_files=True, attach_assessments=True):
    """
    Load all content nodes from the DB and return them as {node_id: node}.
    Optionally attaches each node's files (under 'files') and assessment
    metadata (under 'assessment_item_ids' / 'assessmentmetadata').
    """
    # TODO: load tags from content_contentnode_tags and content_contenttag
    # TODO: load content_contentnode_has_prerequisite, content_contentnode_related
    nodes_by_id = {node['id']: node for node in dbex(conn, "SELECT * FROM content_contentnode;")}
    if attach_files:
        # attach all the files associated with each node under the key "files"
        for file in get_files(conn):
            node = nodes_by_id[file['contentnode_id']]
            node.setdefault('files', []).append(file)
    if attach_assessments:
        for aim in get_assessmentmetadata(conn):
            node = nodes_by_id[aim['contentnode_id']]
            # attach assessment_item_ids directly to node to imitate ricecooker/studio
            node['assessment_item_ids'] = json.loads(aim['assessment_item_ids'])
            node['assessmentmetadata'] = {
                'number_of_assessments': aim['number_of_assessments'],
                'mastery_model': aim['mastery_model'],
                'randomize': aim['randomize'],
                'is_manipulable': aim['is_manipulable'],
            }
    return nodes_by_id
def get_files(conn):
    """Return all rows of the content_file table."""
    return dbex(conn, "SELECT * FROM content_file;")
def get_assessmentmetadata(conn):
    """Return all rows of the content_assessmentmetadata table."""
    return dbex(conn, "SELECT * FROM content_assessmentmetadata;")
def get_tree(conn):
    """
    Return a complete JSON tree of the entire channel.
    """
    nodes_by_id = get_nodes_by_id(conn)

    # Sort so the root comes first (its NULL parent_id maps to the all-zeros
    # string) and siblings appear in their sort_order.
    def order_key(node):
        return (node['parent_id'] or '0' * 32, node['sort_order'])

    root, *rest = sorted(nodes_by_id.values(), key=order_key)
    for node in rest:
        nodes_by_id[node['parent_id']].setdefault('children', []).append(node)
    return root
# NODE_ID UTILS | |
################################################################################ | |
def node_id_from_source_ids(source_domain, channel_source_id, source_ids):
    """
    Compute the node_id (str) for the node whose path is `source_ids` (list)
    in a channel identified by `source_domain` and `channel_source_id`.
    """
    domain_namespace = uuid.uuid5(uuid.NAMESPACE_DNS, source_domain)
    content_ids = [uuid.uuid5(domain_namespace, sid).hex for sid in source_ids]
    print('computed content_ids =', content_ids)
    channel_id = uuid.uuid5(domain_namespace, channel_source_id)
    print('Computed channel_id =', channel_id.hex)
    # Fold each content_id into a running uuid5, seeded with the channel_id
    current = channel_id
    for content_id in content_ids:
        current = uuid.uuid5(current, content_id)
    return current.hex
# TREE PRINTING | |
################################################################################ | |
# Content kinds recognized by the stats helpers (also the display order).
CONTENT_KINDS = ['topic', 'video', 'audio', 'exercise', 'document', 'slideshow', 'h5p', 'html5']

def get_stats(subtree):
    """
    Recursively compute kind-counts and total file_size (non-deduplicated)
    for `subtree`. Returns a dict like {'topic': 2, 'video': 5, ..., 'size': N}.
    """
    if subtree.get('children'):
        stats = dict((kind, 0) for kind in CONTENT_KINDS)
        stats['topic'] = 1  # count self
        stats['size'] = 0
        for child in subtree['children']:
            child_stats = get_stats(child)
            for k, v in child_stats.items():
                # .get handles content kinds not pre-listed in CONTENT_KINDS
                stats[k] = stats.get(k, 0) + v
        return stats
    else:
        # leaf node; nodes loaded with attach_files=False carry no 'files' key
        size = sum(f['file_size'] for f in subtree.get('files', []))
        return {subtree['kind']: 1, 'size': size}
def stats_to_str(stats):
    """Render a `get_stats` dict as a human-readable one-line summary."""
    parts = []
    for kind in CONTENT_KINDS:
        count = stats.get(kind, 0)
        if count:
            suffix = 's' if count > 1 else ''
            parts.append(str(count) + ' ' + kind + suffix + ', ')
    size_mb_str = "%.2f" % (float(stats['size']) / 1024 / 1024) + 'MB'
    return ' ' + ''.join(parts) + size_mb_str
def print_subtree(subtree, level=0, extrakeys=None, maxlevel=2, printstats=True):
    """
    Print an indented outline of `subtree` down to depth `maxlevel`.
    `extrakeys` lists extra node attributes to display; `printstats` appends
    per-subtree kind counts and total size via get_stats/stats_to_str.
    """
    if level > maxlevel:
        return
    extra = ''
    if extrakeys:
        for key in extrakeys:
            extra += ' ' + key + '=' + subtree[key]
    if printstats:
        extra += stats_to_str(get_stats(subtree))
    title = subtree['title'].replace('\n', ' ')
    print(' '*2*level + ' -', title + ' (' + subtree['id'] + ')', extra)
    for child in subtree.get('children', []):
        print_subtree(child, level=level+1, extrakeys=extrakeys, maxlevel=maxlevel, printstats=printstats)
# TREE EXPORT | |
################################################################################ | |
def export_kolibri_json_tree(channel_id=None, db_file_path=None, suffix='', server='production', update=False):
    """
    Convert a channel from a Kolibri database file to a JSON tree, saved in
    the current directory as <db filename><suffix>.json.

    Specify either `channel_id` (DB is downloaded from `server`, re-fetched
    when `update` is True) or a local `db_file_path`.

    Raises:
        ValueError: if neither channel_id nor db_file_path is given.
    """
    if channel_id is None and db_file_path is None:
        raise ValueError("Need to specify either channel_id or db_file_path")
    if not db_file_path:
        db_file_path = download_db_file(channel_id, server=server, update=update)
    conn = dbconnect(db_file_path)
    kolibri_tree = get_tree(conn)
    conn.close()
    # BUGFIX: was db_file_path.split(os.pathsep) — os.pathsep is the PATH-list
    # separator (':' or ';'), which never splits a single file path; use
    # basename to strip the directory. (The old `else` branch was unreachable
    # since db_file_path is always set by this point.)
    pre_filename = os.path.basename(db_file_path).replace('.sqlite3', '')
    json_filename = pre_filename + suffix + '.json'
    with open(json_filename, 'w') as jsonf:
        json.dump(kolibri_tree, jsonf, indent=2, ensure_ascii=False, sort_keys=True)
    print('Channel exported as Kolibri JSON Tree in ' + json_filename)
# HTML EXPORTS | |
################################################################################ | |
# Directory where HTML topic-tree exports are written.
KOLIBRI_TREE_HTMLEXPORT_DIR = 'reports/kolibrihtmltrees'
def export_kolibritree_as_html(kolibritree, maxlevel=7):
    """
    Export `kolibritree` as HTML for inspection of contents.
    Writes a markdown outline, converts it with pandoc, then removes the
    intermediate .md file.
    """
    basedir = KOLIBRI_TREE_HTMLEXPORT_DIR
    if not os.path.exists(basedir):
        os.makedirs(basedir, exist_ok=True)
    channel_id = kolibritree['id']
    path_md = os.path.join(basedir, 'channel_{}_tree.md'.format(channel_id))
    path_html = os.path.join(basedir, 'channel_{}_tree.html'.format(channel_id))
    # Capture print_subtree's stdout into a markdown document
    with io.StringIO() as buf, redirect_stdout(buf):
        print('# Kolibri Topic Tree for channel', channel_id)
        print('')
        print_subtree(kolibritree, maxlevel=maxlevel)
        output_md = buf.getvalue()
    with open(path_md, 'w') as mdfile:
        mdfile.write(output_md)
    subprocess.call(['pandoc', '--from', 'gfm', path_md, '-o', path_html])
    print('Saved', path_html)
    os.remove(path_md)
# CLI | |
################################################################################ | |
if __name__ == '__main__':
    # CLI entry point: download the channel DB, dump its JSON tree to the
    # current directory, print the topic tree, and optionally export HTML.
    parser = argparse.ArgumentParser(description='Kolibri channel topic tree viewer')
    parser.add_argument('--channel_id', required=True, help="Channel ID")
    parser.add_argument('--printmaxlevel', type=int, default=2, help='print tree depth')
    parser.add_argument('--htmlexport', action='store_true', help='save topic tree as html')
    parser.add_argument('--htmlmaxlevel', type=int, default=7, help='html tree depth')
    parser.add_argument('--update', action='store_true', help='Force re-download of DB file')
    args = parser.parse_args()
    # Download (or reuse cached) DB and build the in-memory tree once
    db_file_path = download_db_file(args.channel_id, update=args.update)
    conn = dbconnect(db_file_path)
    kolibritree = get_tree(conn)
    # DUMP CHANNEL JSON
    export_kolibri_json_tree(db_file_path=db_file_path)
    # PRINT IN TERMINAL
    print_subtree(kolibritree, maxlevel=args.printmaxlevel)
    # HTML TREE EXPORT
    if args.htmlexport:
        export_kolibritree_as_html(kolibritree, maxlevel=args.htmlmaxlevel)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment