Last active
October 12, 2021 14:01
-
-
Save kindziora/f641c4f9a4958fea91ed9c7ee81a9e0e to your computer and use it in GitHub Desktop.
Chunk a string into segments of a maximum length while keeping words intact.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Split a text into chunks of roughly `chunkSize` characters without ever
 * breaking a word.
 *
 * The length limit is not strict: a single word that is longer than
 * `chunkSize` is kept whole and becomes a chunk of its own. Every other chunk
 * is at most `chunkSize` characters long.
 *
 * @param {string} text - Text to split; words are delimited by a single space.
 * @param {number} chunkSize - Target maximum chunk length (string `length` units).
 * @returns {string[]} The chunks in reading order. For text whose words are
 *   separated by single spaces, joining the chunks with " " gives the text back.
 */
function chunk(text, chunkSize) {
  const result = [];
  let segment = "";
  for (const word of text.split(" ")) {
    const candidate = segment.length > 0 ? `${segment} ${word}` : word;
    if (candidate.length > chunkSize && segment.length > 0) {
      // The word does not fit any more: close the current chunk and carry the
      // word over so it opens the next one.
      result.push(segment);
      segment = word;
    } else {
      // Either the word fits, or the chunk is still empty and the (possibly
      // over-long) word has to go somewhere.
      segment = candidate;
    }
  }
  // Flush whatever is left; without this the tail of the text would be lost.
  if (segment.length > 0) {
    result.push(segment);
  }
  return result;
}
// Demo: split a sample sentence into chunks of about 15 characters and print them.
const anakin = "Anakin Skywalker, später unter dem Namen Darth Vader bekannt, war der Sohn von Shmi Skywalker, der Ehemann von Padmé Amidala und der Vater von Luke Skywalker";
const sampleChunks = chunk(anakin, 15);
console.log(sampleChunks);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Chunk in strictly framed segments but keep words intact.

/**
 * Index every space-separated word of `text` and assign it to a block.
 *
 * Each word is charged `word.length + 1` characters (the word plus one
 * separator) against its block. When a word would push a block past
 * `chunkSize`, all following block indices are shifted by one more
 * `chunkSize` (tracked in `padding`) and the word opens the shifted block.
 *
 * @param {string} text - Text to index; words are delimited by a single space.
 * @param {number} chunkSize - Block capacity in characters.
 * @returns {Array<{start: number, end: number, segment: string, blockIndex: number, blockLength: number}>}
 *   One entry per word, in order:
 *   - `start`: length of all preceding words joined by single spaces. For
 *     every word after the first this is the position of the space in front
 *     of the word, not of its first character.
 *   - `end`: `start + segment.length`.
 *   - `segment`: the word itself.
 *   - `blockIndex`: `end` rounded down to a multiple of `chunkSize`, plus the
 *     accumulated padding.
 *   - `blockLength`: characters charged to the block so far, this word included.
 */
function createWordOffsets(text, chunkSize) {
  const indexed = [];
  const blockLength = new Map();
  const chargedTo = (index) => (blockLength.has(index) ? blockLength.get(index) : 0);
  let padding = 0;
  // Running length of the words seen so far joined by " "; replaces the
  // quadratic slice/join that recomputed it for every word.
  let joinedLength = 0;

  for (const [i, word] of text.split(" ").entries()) {
    const start = joinedLength;
    const end = start + word.length;
    const cost = word.length + 1;

    let blockIndex = end + padding - (end % chunkSize);
    if (chargedTo(blockIndex) + cost > chunkSize) {
      // Block is full: shift this and all later words one block further.
      padding += chunkSize;
      blockIndex = end + padding - (end % chunkSize);
    }

    // chargedTo() yields 0 for the freshly opened block, so the counter never
    // becomes NaN after a shift.
    const used = chargedTo(blockIndex) + cost;
    blockLength.set(blockIndex, used);
    indexed.push({ start, end, segment: word, blockIndex, blockLength: used });

    // No separator precedes the second word's offset, matching join(" ") of a
    // single-element array.
    joinedLength = i === 0 ? end : end + 1;
  }
  return indexed;
}
/**
 * Assemble the chunk strings from indexed words.
 *
 * Words sharing a `blockIndex` are joined with a single space. The result
 * holds one entry for every multiple of `chunkSize` from 0 up to and
 * including the highest `blockIndex` present; a multiple that no word was
 * assigned to yields an empty string.
 *
 * The upper bound is the highest `blockIndex`, not the last word's `end`:
 * a block index can equal or exceed `end`, and bounding by `end` dropped the
 * trailing words.
 *
 * @param {Array<{segment: string, blockIndex: number}>} words - Indexed words;
 *   only `segment` and `blockIndex` are read.
 * @param {number} chunkSize - Distance between consecutive block indices.
 * @returns {string[]} The chunk texts in block order; `[]` for no words.
 */
function chunk(words, chunkSize) {
  // Group once instead of filtering the whole list for every block.
  const segmentsByBlock = new Map();
  let lastBlockIndex = -Infinity;
  for (const { blockIndex, segment } of words) {
    if (!segmentsByBlock.has(blockIndex)) {
      segmentsByBlock.set(blockIndex, []);
    }
    segmentsByBlock.get(blockIndex).push(segment);
    if (blockIndex > lastBlockIndex) {
      lastBlockIndex = blockIndex;
    }
  }

  const result = [];
  for (let i = 0; i <= lastBlockIndex; i += chunkSize) {
    const segments = segmentsByBlock.get(i);
    result.push(segments === undefined ? "" : segments.join(" "));
  }
  return result;
}
// Demo: index the words of a sample sentence for 16-character blocks, print
// the index, then print the assembled chunks next to their lengths.
const anakin = "Anakin Skywalker, später unter dem Namen Darth Vader bekannt, war der Sohn von Shmi Skywalker, der Ehemann von Padmé Amidala und der Vater von Luke Skywalker";
const chunksize = 16;

const w = createWordOffsets(anakin, chunksize);
console.log(w);

const allchunks = chunk(w, chunksize);
const chunkLengths = allchunks.map((piece) => piece.length);
console.log(allchunks, chunkLengths);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def createWordOffsets(text, chunkSize):
    """Index every space-separated word of ``text`` and assign it to a block.

    Each word is charged ``len(word) + 1`` characters (the word plus one
    separator) against its block. When a word would push a block past
    ``chunkSize``, all following block indices are shifted by one more
    ``chunkSize`` (tracked in ``padding``) and the word opens the shifted block.

    Args:
        text: Text to index; words are delimited by a single space.
        chunkSize: Block capacity in characters.

    Returns:
        A list with one dict per word, in order, with the keys:
        ``start`` (length of all preceding words joined by single spaces; for
        every word after the first this is the position of the space in front
        of the word), ``end`` (``start + len(word)``), ``segment`` (the word),
        ``blockIndex`` (``end`` rounded down to a multiple of ``chunkSize``
        plus the accumulated padding) and ``blockLength`` (characters charged
        to the block so far, this word included).
    """
    indexed = []
    blockLength = {}
    padding = 0
    # Running length of the words seen so far joined by " "; replaces the
    # quadratic slice-and-join that recomputed it for every word.
    joinedLength = 0

    for i, word in enumerate(text.split(" ")):
        start = joinedLength
        end = start + len(word)
        cost = len(word) + 1

        blockIndex = (end + padding) - (end % chunkSize)
        if blockLength.get(blockIndex, 0) + cost > chunkSize:
            # Block is full: shift this and all later words one block further.
            padding += chunkSize
            blockIndex = (end + padding) - (end % chunkSize)

        blockLength[blockIndex] = blockLength.get(blockIndex, 0) + cost
        indexed.append({
            "start": start,
            "end": end,
            "segment": word,
            "blockIndex": blockIndex,
            "blockLength": blockLength[blockIndex],
        })

        # No separator precedes the second word's offset, matching
        # " ".join() of a single-element list.
        joinedLength = end if i == 0 else end + 1

    return indexed
def chunk(words, chunkSize):
    """Assemble the chunk strings from indexed words.

    Words sharing a ``blockIndex`` are joined with a single space. The result
    holds one entry for every multiple of ``chunkSize`` from 0 up to and
    including the highest ``blockIndex`` present; a multiple that no word was
    assigned to yields an empty string.

    The upper bound is the highest ``blockIndex``, not the last word's
    ``end``: a block index can equal or exceed ``end``, and bounding by
    ``end`` dropped the trailing words.

    Args:
        words: Indexed words as dicts; only ``"segment"`` and
            ``"blockIndex"`` are read.
        chunkSize: Distance between consecutive block indices.

    Returns:
        The chunk texts in block order; an empty list when ``words`` is empty.
    """
    # Group once instead of filtering the whole list for every block.
    segmentsByBlock = {}
    for word in words:
        segmentsByBlock.setdefault(word["blockIndex"], []).append(word["segment"])

    if not segmentsByBlock:
        return []

    lastBlockIndex = max(segmentsByBlock)
    return [
        " ".join(segmentsByBlock.get(i, []))
        for i in range(0, lastBlockIndex + 1, chunkSize)
    ]
# Demo: index the words of a sample sentence for 20-character blocks, print
# the index, then print the assembled chunks.
anakin = "Anakin Skywalker, später unter dem Namen Darth Vader bekannt, war der Sohn von Shmi Skywalker, der Ehemann von Padmé Amidala und der Vater von Luke Skywalker"
chunksize = 20

w = createWordOffsets(anakin, chunksize)
print(w)

allchunks = chunk(w, chunksize)
print(allchunks)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment