Last active
February 10, 2020 12:13
-
-
Save Stevearzh/cb469d8d1af83cc81c22b644d2e924c5 to your computer and use it in GitHub Desktop.
// JavaScript version of https://www.onlinegdb.com/fork/S19tQ06GI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Report whether a string consists solely of 7-bit ASCII characters.
 *
 * @param {string} str - Text to inspect; the empty string counts as ASCII.
 * @returns {boolean} True when every UTF-16 code unit lies in U+0000..U+007F.
 */
function is_ascii(str) {
  // The pattern is anchored at both ends, so a single character outside
  // the class anywhere in the input makes the whole test fail.
  return /^[\x00-\x7F]*$/.test(str);
}
/**
 * Report whether a single character is a sentence separator.
 *
 * @param {string} c - The character to classify.
 * @returns {boolean} True when `c` is exactly one of the separator characters.
 */
function is_seperator(c) {
  // The previous list repeated the ASCII ",", ";" and "?" several times.
  // Each separator is now listed once, and the full-width comma, semicolon
  // and question mark are accepted alongside the CJK punctuation that was
  // already present. They are written as escapes so that text normalization
  // cannot silently fold them back into their ASCII look-alikes.
  const separators = [
    " ",
    ",",
    ".",
    ";",
    "?",
    "。",
    "《",
    "》",
    "\uFF0C", // full-width comma
    "\uFF1B", // full-width semicolon
    "\uFF1F", // full-width question mark
  ];
  return separators.includes(c);
}
/**
 * Pick one element of an array at random using `Math.random()`.
 *
 * @param {Array} arr - Candidate values.
 * @returns {*} One element of `arr`, or `undefined` when `arr` is empty.
 */
function random_choice(arr) {
  // Math.random() is in [0, 1), so the floored product is a valid index
  // for any non-empty array.
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
/**
 * Shuffle an array in place by repeatedly swapping the last unshuffled
 * element with a randomly chosen element at or before it.
 *
 * @param {Array} array - The array to shuffle; it is modified in place.
 * @returns {Array} The same array instance, now shuffled.
 */
function random_shuffle(array) {
  // Walk from the end towards the front. Index 0 needs no iteration of its
  // own because it could only be swapped with itself.
  for (let last = array.length - 1; last > 0; last -= 1) {
    // Pick among the elements that have not been fixed yet (0..last).
    const pick = Math.floor(Math.random() * (last + 1));
    [array[last], array[pick]] = [array[pick], array[last]];
  }
  return array;
}
/**
 * Split text into tokens: each run of consecutive ASCII characters becomes
 * one token, and every non-ASCII character becomes a token of its own.
 *
 * @param {string} src_txt - Text to split.
 * @returns {string[]} Tokens in their original order; empty for empty input.
 */
function tokenize(src_txt) {
  const token_list = [];
  let ascii_run = "";

  // for...of walks the string by code point, so a character outside the
  // Basic Multilingual Plane is kept together as a single token.
  for (const c of src_txt) {
    if (is_ascii(c)) {
      ascii_run += c;
      continue;
    }
    // A non-ASCII character ends the pending ASCII run, if there is one.
    if (ascii_run !== "") {
      token_list.push(ascii_run);
      ascii_run = "";
    }
    token_list.push(c);
  }

  // Flush an ASCII run that reaches the end of the input.
  if (ascii_run !== "") {
    token_list.push(ascii_run);
  }
  return token_list;
}
/**
 * Shuffle a token list locally: the list is cut into consecutive chunks of
 * two or three tokens (size chosen at random per chunk; the final chunk may
 * be shorter) and the tokens inside each chunk are shuffled.
 *
 * @param {Array} token_list - Tokens to reorder; this array is not modified.
 * @returns {Array} A new array with the same tokens, shuffled chunk by chunk.
 */
function reorder(token_list) {
  const chunk_sizes = [2, 3];
  const token_list_reordered = [];

  let start = 0;
  while (start < token_list.length) {
    const size = random_choice(chunk_sizes);
    // Clamp so the final chunk stops at the end of the list.
    const end = Math.min(start + size, token_list.length);
    // slice() copies, so the in-place shuffle never touches the caller's array.
    const chunk = token_list.slice(start, end);
    random_shuffle(chunk);
    token_list_reordered.push(...chunk);
    start = end;
  }
  return token_list_reordered;
}
/**
 * Scramble text sentence by sentence. Characters that act as boundaries are
 * copied to the output unchanged and in place; the text between two
 * boundaries is tokenized, reordered and written back.
 *
 * @param {string} src_txt - Text to scramble.
 * @returns {string} The scrambled text.
 */
function sentencize(src_txt) {
  const pieces = [];
  let sentence = "";

  // Joining each sentence on its own keeps the number of arguments passed
  // to any single call small, however long a separator-free stretch is.
  const flush_sentence = () => {
    pieces.push(reorder(tokenize(sentence)).join(""));
    sentence = "";
  };

  for (const c of src_txt) {
    // Besides the explicit separators, any character that converts to a
    // number is a boundary. That covers the digits, and also whitespace
    // characters such as "\n" and "\t", because they convert to 0.
    if (is_seperator(c) || !Number.isNaN(Number(c))) {
      flush_sentence();
      pieces.push(c);
    } else {
      sentence += c;
    }
  }

  // Text after the last boundary still has to be emitted.
  if (sentence !== "") {
    flush_sentence();
  }
  return pieces.join("");
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment