Last active
January 5, 2017 17:07
-
-
Save ekingery/1bb6865e24fccb40c1cd1368c04f4d31 to your computer and use it in GitHub Desktop.
Parse RCV1 topics into a tree structure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script parses the RCV1 topics into a tree structure.
# It can then be exported to json or dotfile format.
# For more info on RCV1, see
# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/lewis04a.pdf
import re

from treelib import Tree
from treelib.plugins import export_to_dot

# Rows read from the flat file; each row is [parent, child, description, ...]
topics = []
# The topic hierarchy, keyed by child topic code
ttree = Tree()

# The input file can be fetched with:
# curl http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a03-expanded-topics-hierarchy/rcv1.topics.hier.expanded > rcv1.topics.hier.expanded.txt # noqa
with open('rcv1.topics.hier.expanded.txt', 'r') as f:
    for lineno, line in enumerate(f, 1):
        # A blank line (e.g. a trailing newline at end of file) carries no
        # topic; skipping it avoids an IndexError when the tree is built.
        if not line.strip():
            continue
        # Fields are separated by runs of two or more whitespace characters.
        cols = re.split(r'\s{2,}', line.rstrip('\n'))
        # Strip the field labels so only the values remain.
        cols = [c.replace('parent: ', '').
                replace('child: ', '').
                replace('child-description: ', '') for c in cols]
        if len(cols) < 3:
            # Fail here, with the offending line, rather than later with a
            # bare IndexError while inserting nodes.
            raise ValueError('line %d: expected parent, child and '
                             'child-description fields, got %r'
                             % (lineno, line))
        topics.append(cols)

# Loop once to set the root node and its children
# (the source data is in level-order after the 2nd level).
for t in topics:
    if 'None' == t[0]:
        # The entry whose parent is 'None' becomes the tree root.
        ttree.create_node(t[1] + " - " + t[2], t[1])
    elif 'Root' == t[0]:
        # Direct children of the root; this assumes the root entry's own
        # code is 'Root' and that it appears before these lines in the file.
        ttree.create_node(t[1] + " - " + t[2], t[1], parent=t[0])

# Loop again to pull all other nodes; each parent must already be in the
# tree by the time its child is reached.
for t in topics:
    if ('None' != t[0] and 'Root' != t[0]):
        ttree.create_node(t[1] + " - " + t[2], t[1], parent=t[0])

# print(ttree)
# export_to_dot(ttree, 'topics-tree.dot')

# Export dotfiles for each main category subtree
for cat in ['CCAT', 'ECAT', 'GCAT', 'MCAT']:
    export_to_dot(ttree.subtree(cat), cat + '-topics-tree.dot')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment