Created
February 4, 2015 00:24
-
-
Save peteoleary/45cbbf94746567523ac8 to your computer and use it in GitHub Desktop.
Parse Penn Treebank strings into nodes and edges, easy to render with d3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Parse a Penn Treebank bracketed string into flat node and edge lists.
 *
 * Grammar handled:
 *   - "(" opens an interior node; the run of non-whitespace, non-paren
 *     characters immediately following it is the node's label (the label is
 *     empty when "(" is followed by whitespace or another paren, as in the
 *     common "( (S ...))" wrapper).
 *   - ")" closes the innermost open interior node.
 *   - Any other run of non-whitespace, non-paren characters is a leaf.
 *
 * Nodes are numbered in the order they are encountered, so `nodes[i].id === i`.
 * Every node except those at the top level gets one edge from its parent.
 *
 * A ")" with no matching "(" stops parsing; anything after it is ignored.
 * Missing closing parens are tolerated (parsing just ends at end of input).
 *
 * @param {string} penn_string - Bracketed tree text, e.g. "(S (NP I) (VP ran))".
 *   `null`/`undefined` are treated as an empty string.
 * @returns {{nodes: Array<{id: number, label: string, reflexive: boolean}>,
 *            edges: Array<{source: Object, target: Object, left: boolean, right: boolean}>}}
 *   `source`/`target` are references to the objects in `nodes`, not copies.
 */
function pennTreeParse(penn_string) {
  var text = penn_string == null ? '' : String(penn_string);
  var length = text.length;
  var nodes = [];
  var edges = [];
  // Ids of the interior nodes that are currently open, innermost last.
  // An explicit stack (rather than recursion) keeps deep trees from
  // exhausting the call stack.
  var openParents = [];
  var pos = 0;

  function skipWhiteSpace() {
    while (pos < length && /\s/.test(text[pos])) {
      pos += 1;
    }
  }

  // A token is delimited structurally (whitespace or parens) instead of by a
  // character whitelist, so no input character can yield an empty leaf token
  // and stall the scanner.
  function readToken() {
    var start = pos;
    while (pos < length && !/[\s()]/.test(text[pos])) {
      pos += 1;
    }
    return text.slice(start, pos);
  }

  while (true) {
    skipWhiteSpace();
    if (pos >= length) {
      break;
    }

    if (text[pos] === ')') {
      pos += 1;
      if (openParents.length === 0) {
        // Unmatched ")" at the top level: stop and ignore the remainder.
        break;
      }
      openParents.pop();
      continue;
    }

    var isInterior = text[pos] === '(';
    if (isInterior) {
      pos += 1;
    }

    // For an interior node the label must directly follow "(", so whitespace
    // is deliberately not skipped here; the label may therefore be ''.
    var label = readToken();
    var id = nodes.length;
    var node = {id: id, label: label, reflexive: false};
    nodes.push(node);

    if (openParents.length > 0) {
      var parentId = openParents[openParents.length - 1];
      edges.push({source: nodes[parentId], target: node, left: false, right: true});
    }

    if (isInterior) {
      openParents.push(id);
    }
  }

  return {nodes: nodes, edges: edges};
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment