Created
March 8, 2019 17:57
-
-
Save loretoparisi/2164537d7149daaa61ca2a2be99b036e to your computer and use it in GitHub Desktop.
JavaScript - calculate Start and End of a Word in a Sentence of a Document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Splits `source` into sentences (one per line) and each sentence into
 * whitespace-delimited tokens, recording every token's inclusive character
 * offsets relative to the whole of `source`.
 *
 * Offsets are derived from the token's position inside its own sentence plus
 * the sentence's position inside `source`, so repeated words, tokens that
 * carry punctuation ("amet,") and tokens containing regex metacharacters are
 * all located correctly.
 *
 * @param {string} source - Full document text; sentences are separated by "\n".
 * @returns {{sentences: Array<{tokens: Array<{word: string, characterOffsetBegin: number, characterOffsetEnd: number}>, text: string}>}}
 *   One entry per line of `source`; a blank line yields a sentence with no tokens.
 */
function buildDocument(source) {
  var result = { sentences: [] };
  var lines = source.split(/\n/);
  // Index in `source` at which the current line starts.
  var sentenceOffset = 0;
  for (var i = 0; i < lines.length; i++) {
    var line = lines[i];
    var sent = { tokens: [], text: line };
    // \S+ can never produce a zero-length match, so exec() always advances
    // lastIndex and the loop terminates even for blank lines.
    var tokenPattern = /\S+/g;
    var match;
    while ((match = tokenPattern.exec(line)) !== null) {
      var begin = sentenceOffset + match.index;
      // A fresh object per occurrence: tokens must not share state.
      sent.tokens.push({
        word: match[0],
        characterOffsetBegin: begin,
        characterOffsetEnd: begin + match[0].length - 1
      });
    }
    result.sentences.push(sent);
    // +1 accounts for the "\n" separator consumed by split().
    sentenceOffset += line.length + 1;
  }
  return result;
}

/**
 * Logs every sentence of `parsed` followed by its tokens, and shows the
 * substring of `source` recovered from each token's offsets so the offsets
 * can be checked by eye.
 *
 * @param {{sentences: Array}} parsed - Document as returned by buildDocument.
 * @param {string} source - The text the offsets refer to.
 */
function printDocument(parsed, source) {
  for (var s = 0; s < parsed.sentences.length; s++) {
    var sentence = parsed.sentences[s];
    console.log("", sentence.text);
    for (var i = 0; sentence.tokens && i < sentence.tokens.length; i++) {
      var token = sentence.tokens[i];
      var begin = token.characterOffsetBegin;
      // characterOffsetEnd is inclusive; substring() takes an exclusive end.
      var end = token.characterOffsetEnd + 1;
      var reconstructed = source.substring(begin, end);
      console.log("\t[" + token.word + "] begin:" + token.characterOffsetBegin + " end:" + token.characterOffsetEnd + " ---> [" + reconstructed + "]");
    }
  }
}

var text = "Lorem ipsum dolor sit amet,\nconsectetur adipiscing elit,\nsed do eiusmod tempor incididunt ut labore et dolore magna aliqua";
// Kept as a top-level name for compatibility; now explicitly declared
// instead of leaking as an implicit global.
var sentences = text.split(/\n/);
var doc = buildDocument(text);
printDocument(doc, text);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment