Last active
September 23, 2016 11:53
-
-
Save hallvors/c8803a2bf533b5f21f85b20647b86fcb to your computer and use it in GitHub Desktop.
Helper script for better Japanese line wrapping, using the Kuromoji tokenizer library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
kuromoji.builder({ dicPath: "node_modules/kuromoji/dict/" }).build(function (err, tokenizer) { | |
if(err)console.log(err); | |
/**
 * Collects every non-blank text node beneath `node`, in document order.
 * Adapted from http://stackoverflow.com/questions/10730309/find-all-text-nodes-in-html-page
 *
 * @param {Node} node - Root of the subtree to search. Only its descendants are
 *     visited; the root itself is never added to the result.
 * @returns {Array} The descendant text nodes whose data contains at least one
 *     non-whitespace character (empty and whitespace-only nodes are skipped).
 */
var textNodesUnder = function (node) {
  var TEXT_NODE = 3; // numeric value of Node.TEXT_NODE
  var found = [];

  // Depth-first walk that appends to one shared array, instead of building and
  // concatenating a fresh array at every level of recursion.
  var collect = function (parent) {
    for (var child = parent.firstChild; child; child = child.nextSibling) {
      if (child.nodeType === TEXT_NODE) {
        // /\S/ also rejects the empty string, which /^\s+$/ let through.
        if (/\S/.test(child.data)) {
          found.push(child);
        }
      } else {
        collect(child);
      }
    }
  };

  collect(node);
  return found;
};
/**
 * Wraps tokenized words in the page body in <span class="avoid-wrap"> elements
 * so that a stylesheet can keep each word from being broken across lines
 * (e.g. with `word-break: keep-all`).
 *
 * The plan:
 *   1. take a static list of all text nodes under document.body,
 *   2. tokenize the text of each node,
 *   3. wrap every sufficiently long token in its own span.
 *
 * Uses `tokenizer` and `textNodesUnder` from the enclosing scope. Takes no
 * arguments and returns nothing; its effect is the DOM mutation.
 */
function addSpanTags() {
  // Tokens of this many characters or fewer are left unwrapped.
  // NOTE(review): the original comment said "do not bother with 1 char words",
  // but the check has always skipped 2-character words as well. The threshold
  // is kept unchanged here - confirm which one was intended.
  var MAX_UNWRAPPED_LENGTH = 2;
  // Text inside these elements is code, not prose; touching it would cause
  // really funky effects.
  var SKIPPED_PARENT_TAGS = { SCRIPT: true, STYLE: true };
  var hasOwn = Object.prototype.hasOwnProperty;

  // textNodesUnder returns a plain array, so the list does not change while
  // the loop below splits nodes and inserts spans.
  textNodesUnder(document.body).forEach(function (textnode) {
    // tagName is upper-case for HTML elements but keeps its source case in
    // other namespaces (e.g. <style> inside inline SVG), so normalize it.
    var parentTag = String(textnode.parentNode.tagName).toUpperCase();
    if (hasOwn.call(SKIPPED_PARENT_TAGS, parentTag)) {
      return;
    }

    var text = textnode.data;
    var ranges = []; // [start, end) offsets into `text`, ascending, non-overlapping
    var cursor = 0;  // end offset of the last token located so far

    tokenizer.tokenize(text).forEach(function (wordData) {
      var surface = wordData.surface_form;
      // word_position is used as a 1-based offset, as in the original code.
      var start = wordData.word_position - 1;
      // Only trust the reported offset if the token text really sits there;
      // otherwise look for the token after the previous one. This keeps a
      // misaligned offset from splitting a word in the middle.
      // NOTE(review): assumes tokens are returned in text order - confirm
      // against the kuromoji documentation.
      if (!(start >= cursor) || text.substr(start, surface.length) !== surface) {
        start = text.indexOf(surface, cursor);
        if (start === -1) {
          return;
        }
      }
      cursor = start + surface.length;
      if (surface.length > MAX_UNWRAPPED_LENGTH) {
        ranges.push([start, cursor]);
      }
    });

    // Work from the last range to the first so that the offsets of the ranges
    // still to be processed stay valid: `textnode` always keeps the text that
    // precedes the most recent split.
    for (var i = ranges.length - 1; i >= 0; i--) {
      var rangeStart = ranges[i][0];
      var rangeEnd = ranges[i][1];
      if (rangeEnd !== textnode.data.length) {
        textnode.splitText(rangeEnd); // chops off the remainder
      }
      var wordNode = textnode.splitText(rangeStart); // chops off the string to wrap
      var span = document.createElement('span');
      span.className = 'avoid-wrap';
      wordNode.parentNode.insertBefore(span, wordNode);
      span.appendChild(wordNode);
    }
  });
}
// The tokenizer is ready: process the page right away if it has already
// finished loading, otherwise wait for the window's load event.
if (document.readyState === 'complete') {
  addSpanTags();
} else {
  window.addEventListener('load', addSpanTags, false);
}
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment