Skip to content

Instantly share code, notes, and snippets.

@peteoleary
Created February 4, 2015 00:24
Show Gist options
  • Save peteoleary/45cbbf94746567523ac8 to your computer and use it in GitHub Desktop.
Save peteoleary/45cbbf94746567523ac8 to your computer and use it in GitHub Desktop.
Parse Penn Treebank strings into nodes and edges, easy to render with d3
/**
 * Parse a Penn Treebank style bracketed string, e.g.
 * "(S (NP I) (VP (V saw) (NP him)))", into flat node and edge lists.
 *
 * Grammar handled:
 *   - "(" immediately followed by a label opens an interior node; the label
 *     may be empty (as in "( (S ...))").
 *   - ")" closes the innermost open interior node. A ")" with nothing open
 *     stops parsing and the rest of the input is ignored.
 *   - Any other run of non-whitespace, non-parenthesis characters is a leaf.
 *   - Missing closing parentheses at end of input are tolerated.
 *   - Several top-level trees/leaves are allowed; roots get no incoming edge.
 *
 * A label is ANY run of characters other than whitespace and parentheses.
 * The previous whitelist-based tokenizer looped forever on characters it did
 * not recognise (for example "+", "@" or a double quote).
 *
 * @param {string} penn_string - bracketed tree text; a non-string value
 *     (including null/undefined) is treated as empty input.
 * @returns {{nodes: Array, edges: Array}} `nodes` holds
 *     `{id, label, reflexive: false}` objects where `id` equals the node's
 *     index in the array (pre-order). `edges` holds
 *     `{source, target, left: false, right: true}` objects whose
 *     `source`/`target` are references to the parent/child node objects.
 */
function pennTreeParse(penn_string) {
  var WHITESPACE = /\s/;
  var text = typeof penn_string === 'string' ? penn_string : '';
  var length = text.length;
  var pos = 0;
  var nodes = [];
  var edges = [];
  // Indexes (into `nodes`) of the interior nodes that are currently open.
  var ancestors = [];

  // True for characters that may appear inside a node label.
  function isLabelChar(c) {
    return c !== '(' && c !== ')' && !WHITESPACE.test(c);
  }

  // Consume and return the (possibly empty) label starting at `pos`.
  function readLabel() {
    var start = pos;
    while (pos < length && isLabelChar(text.charAt(pos))) {
      pos += 1;
    }
    return text.slice(start, pos);
  }

  // Append a node, link it to the innermost open ancestor, return its id.
  function addNode(label) {
    var node = {id: nodes.length, label: label, reflexive: false};
    nodes.push(node);
    if (ancestors.length > 0) {
      edges.push({
        source: nodes[ancestors[ancestors.length - 1]],
        target: node,
        left: false,
        right: true
      });
    }
    return node.id;
  }

  while (pos < length) {
    var c = text.charAt(pos);
    if (WHITESPACE.test(c)) {
      pos += 1;
    } else if (c === ')') {
      pos += 1;
      if (ancestors.length === 0) {
        // Unbalanced ")" at top level: stop, ignoring any trailing input.
        break;
      }
      ancestors.pop();
    } else if (c === '(') {
      pos += 1;
      // The label must directly follow "("; no whitespace is skipped here,
      // so "( S" yields an empty-labelled interior node with leaf "S".
      ancestors.push(addNode(readLabel()));
    } else {
      // readLabel always consumes at least this character, so the scan
      // is guaranteed to make progress.
      addNode(readLabel());
    }
  }

  return {nodes: nodes, edges: edges};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment