Skip to content

Instantly share code, notes, and snippets.

@Stevearzh
Last active February 10, 2020 12:13
Show Gist options
  • Save Stevearzh/cb469d8d1af83cc81c22b644d2e924c5 to your computer and use it in GitHub Desktop.
Save Stevearzh/cb469d8d1af83cc81c22b644d2e924c5 to your computer and use it in GitHub Desktop.
/**
 * Report whether a string consists solely of ASCII characters.
 *
 * @param {string} str - Text to inspect (coerced to a string by the regex test).
 * @returns {boolean} true when no UTF-16 code unit falls outside \x00-\x7F;
 *   the empty string therefore counts as ASCII.
 */
function is_ascii(str) {
  // One code unit outside the 7-bit range is enough to disqualify the input.
  const has_non_ascii = /[^\x00-\x7F]/.test(str);
  return !has_non_ascii;
}
/**
 * Report whether a value is one of the recognised separator strings.
 *
 * Matching uses strict equality, so only an exact single-character string
 * from the list below is accepted.
 *
 * @param {string} c - Candidate character.
 * @returns {boolean} true when `c` is a separator, false otherwise.
 */
function is_seperator(c) {
  switch (c) {
    case " ":
    case ",":
    case "。":
    case ";":
    case "?":
    case ".":
    case "《":
    case "》":
      return true;
    default:
      return false;
  }
}
/**
 * Pick one element of an array at a position drawn from Math.random().
 *
 * @param {Array} arr - Candidates to choose from.
 * @returns {*} The element at the drawn index; undefined for an empty array.
 */
function random_choice(arr) {
  const count = arr.length;
  // Scale the [0, 1) draw to the index range, then truncate to an integer.
  const index = Math.floor(count * Math.random());
  return arr[index];
}
/**
 * Shuffle an array in place, walking from the last slot down to the first
 * and swapping each slot with a randomly drawn slot at or before it.
 *
 * @param {Array} array - Array to shuffle; it is mutated.
 * @returns {Array} The same array instance, now shuffled.
 */
function random_shuffle(array) {
  let remaining = array.length;
  while (remaining !== 0) {
    // Draw from the still-unshuffled prefix [0, remaining).
    const pick = Math.floor(Math.random() * remaining);
    remaining -= 1;
    // Move the drawn element into the slot that has just been finalised.
    [array[remaining], array[pick]] = [array[pick], array[remaining]];
  }
  return array;
}
/**
 * Split text into tokens: each maximal run of ASCII characters becomes one
 * token, and every non-ASCII character becomes a token of its own.
 *
 * @param {string} src_txt - Text to split (iterated with for...of).
 * @returns {string[]} Tokens in their original order; empty for empty input.
 */
function tokenize(src_txt) {
  const tokens = [];
  let ascii_run = "";

  // Emit the ASCII run collected so far, if any, and start a new one.
  const flush_run = () => {
    if (ascii_run !== "") {
      tokens.push(ascii_run);
      ascii_run = "";
    }
  };

  for (const ch of src_txt) {
    if (is_ascii(ch)) {
      ascii_run += ch;
      continue;
    }
    // A non-ASCII character ends the current run and stands alone.
    flush_run();
    tokens.push(ch);
  }
  flush_run();

  return tokens;
}
/**
 * Scramble a token list locally: cut it into consecutive chunks whose size
 * (2 or 3) is chosen per chunk by random_choice, shuffle each chunk with
 * random_shuffle, and concatenate the chunks in their original order.
 * The final chunk may be shorter when the tokens run out.
 *
 * @param {Array} token_list - Tokens to scramble; the input is not mutated
 *   because each chunk is a slice copy.
 * @returns {Array} A new array holding the same tokens, reordered within chunks.
 */
function reorder(token_list) {
  const chunk_sizes = [2, 3];
  const scrambled = [];
  let start = 0;

  while (start < token_list.length) {
    const size = random_choice(chunk_sizes);
    // Clamp so the last chunk never reads past the end of the list.
    const end = Math.min(start + size, token_list.length);
    const chunk = token_list.slice(start, end);
    random_shuffle(chunk); // shuffles the copy in place
    scrambled.push(...chunk);
    start = end;
  }

  return scrambled;
}
/**
 * Scramble text segment by segment while leaving boundary characters in place.
 *
 * The text is walked character by character. A character is a boundary when
 * is_seperator accepts it or when unary plus converts it to a number other
 * than NaN (digits, and also whitespace characters, which coerce to 0).
 * The text collected since the previous boundary is passed through tokenize
 * and reorder, and the boundary character itself is emitted unchanged.
 *
 * @param {string} src_txt - Text to process.
 * @returns {string} The concatenation of reordered segments and boundaries.
 */
function sentencize(src_txt) {
  const pieces = [];
  let pending = "";

  for (const ch of src_txt) {
    const is_boundary = is_seperator(ch) || !Number.isNaN(+ch);
    if (!is_boundary) {
      pending += ch;
      continue;
    }
    // Close the current segment (possibly empty), then keep the boundary.
    pieces.push(...reorder(tokenize(pending)));
    pieces.push(ch);
    pending = "";
  }

  // Trailing text with no boundary after it still needs reordering.
  if (pending !== "") {
    pieces.push(...reorder(tokenize(pending)));
  }

  return pieces.join("");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment