Skip to content

Instantly share code, notes, and snippets.

@loretoparisi
Created March 8, 2019 17:57
Show Gist options
  • Save loretoparisi/2164537d7149daaa61ca2a2be99b036e to your computer and use it in GitHub Desktop.
Save loretoparisi/2164537d7149daaa61ca2a2be99b036e to your computer and use it in GitHub Desktop.
JavaScript - calculate Start and End of a Word in a Sentence of a Document
// Sample document: sentences are separated by a single "\n".
const text = "Lorem ipsum dolor sit amet,\nconsectetur adipiscing elit,\nsed do eiusmod tempor incididunt ut labore et dolore magna aliqua";

/**
 * Splits one sentence into whitespace-delimited tokens and records where each
 * token sits inside the enclosing document.
 *
 * Offsets come from the scan position itself rather than from searching for
 * the word's text again, so punctuation, regex metacharacters and repeated
 * words are all located correctly, and every token gets its own object.
 *
 * @param {string} sentence - Sentence text (no line breaks).
 * @param {number} sentenceOffset - Index of the sentence's first character
 *   within the full document.
 * @returns {Array<{word: string, characterOffsetBegin: number, characterOffsetEnd: number}>}
 *   Tokens in reading order; `characterOffsetEnd` is inclusive.
 */
function tokenizeSentence(sentence, sentenceOffset) {
  const tokens = [];
  // \S+ always consumes at least one character, so exec() is guaranteed to
  // advance lastIndex and the loop terminates even for blank sentences.
  const wordPattern = /\S+/g;
  let match;
  while ((match = wordPattern.exec(sentence)) !== null) {
    const word = match[0];
    const characterOffsetBegin = sentenceOffset + match.index;
    tokens.push({
      word,
      characterOffsetBegin,
      characterOffsetEnd: characterOffsetBegin + word.length - 1,
    });
  }
  return tokens;
}

const sentences = text.split(/\n/);

// Document model: { sentences: [{ tokens: [...], text: "..." }, ...] }.
const doc = { sentences: [] };

// Start index of the current sentence within `text`.
let sentenceOffset = 0;
for (const sentence of sentences) {
  doc.sentences.push({
    tokens: tokenizeSentence(sentence, sentenceOffset),
    text: sentence,
  });
  // +1 accounts for the single "\n" separator removed by split().
  sentenceOffset += sentence.length + 1;
}

// Report: print each sentence, then each token with its offsets and the
// substring recovered from those offsets as a round-trip check.
for (const sentence of doc.sentences) {
  console.log("", sentence.text);
  for (const token of sentence.tokens) {
    // characterOffsetEnd is inclusive; substring() takes an exclusive end.
    const reconstructed = text.substring(
      token.characterOffsetBegin,
      token.characterOffsetEnd + 1
    );
    console.log(
      `\t[${token.word}] begin:${token.characterOffsetBegin} end:${token.characterOffsetEnd} ---> [${reconstructed}]`
    );
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment