Last active
February 9, 2024 22:03
-
-
Save zawhtutwin/de145f6c5942d1083491fe6404ae4270 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Single-letter names of the character categories used by the syllable tables.
const _CATEGORY_NAMES = ['C', 'M', 'V', 'S', 'A', 'F', 'I', 'E', 'G', 'D', 'P', 'W'];

// Unicode code points belonging to each category, as [name, codePoints] pairs.
// NOTE: range() takes (end, start) -- upper bound first -- and both are inclusive.
const _CATEGORY_RANGE = [
  ['C', range(0x1021, 0x1000)], // Consonants
  ['M', range(0x103E, 0x103B)], // Medials
  ['V', range(0x1032, 0x102B)], // Dependent Vowel Signs
  ['S', [0x1039]], // Myanmar Sign Virama
  ['A', [0x103A]], // Myanmar Sign Asat
  ['F', range(0x1038, 0x1036)], // Dependent Various Signs
  ['I', [0x1024, 0x1027, 0x102A, 0x104C, 0x104D, 0x104F]], // Independent Vowels, Independent Various Signs
  ['E', [0x1023, 0x1025, 0x1026, 0x1029, 0x104E]], // Independent Vowels, Myanmar Symbol Aforementioned
  ['G', [0x103F]], // Myanmar Letter Great Sa
  ['D', range(0x1049, 0x1040)], // Myanmar Digits
  ['P', range(0x104B, 0x104A)], // Punctuation Marks
  ['W', [0x0020]], // White space
];
/**
 * Build the list of consecutive integers from `start` up to `end`, inclusive.
 *
 * Note the argument order: the upper bound comes FIRST.
 *
 * @param {number} end - Last value of the sequence (inclusive).
 * @param {number} start - First value of the sequence (inclusive).
 * @returns {number[]} `[start, start + 1, ..., end]`; empty when `end < start`.
 */
function range(end, start) {
  return Array.from({ length: end - start + 1 }, (_, offset) => start + offset);
}
//console.log(_CATEGORY_NAMES);
//console.log(_CATEGORY_RANGE);
/**
 * Classify every character of `sentence` using `_CATEGORY_RANGE`.
 *
 * The sentence is walked by code point (string spread), and each character is
 * matched against the first category whose code-point list contains it.
 *
 * @param {string} sentence - Text to classify.
 * @returns {Array<{cat: string, val: string}|undefined>} One entry per
 *   character, in order. A character that belongs to no category yields
 *   `undefined`, so callers must be prepared for holes.
 */
function getCateGory(sentence) {
  return [...sentence].map((v) => {
    // codePointAt matches the per-code-point iteration above.
    const codePoint = v.codePointAt(0);
    const match = _CATEGORY_RANGE.find((cat) => cat[1].includes(codePoint));
    return match ? { cat: match[0], val: v } : undefined;
  });
}
// Column position of each category inside the rows of the sequence tables below.
const _LETTER_SEQUENCE_TABLE_INDEX = {
  'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'I': 6, 'M': 7, 'P': 8, 'S': 9, 'V': 10, 'W': 11
};

// Break-status tables. A row is selected by the categories already seen
// (1, 2 or 3 characters) and indexed by the column of the NEXT character's
// category (see _LETTER_SEQUENCE_TABLE_INDEX).
// As consumed by cut(): 1 = break after the first character, 0 and -1 = no
// break, -2 = undecided, consult the next-longer table.
// NOTE(review): 2 / 3 / 4 presumably mean "break after the 2nd / 3rd / 4th
// character" and -1 an illegal sequence -- confirm against the algorithm's
// reference before relying on that reading.
const _LETTER_SEQUENCE_TABLE_2ND_CHARACTER = {
  'A': [-1, -2, 1, 1, 0, -1, 1, 0, 1, 0, 0, 1],
  'C': [0, -2, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1],
  'D': [-1, 1, 0, 1, -1, -1, 1, -1, 1, -1, -1, 1],
  'E': [-1, -2, 1, 1, 2, 0, 1, -1, 1, -1, 0, 1],
  'F': [-1, -2, 1, 1, 2, -1, 1, -1, 1, -1, -1, 1],
  'G': [-1, 1, 1, 1, 0, -1, 1, -1, 1, -1, 0, 1],
  'I': [-1, 1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1],
  'M': [2, -2, 1, 1, 0, 0, 1, 0, 1, -1, 0, 1],
  'P': [-1, 1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1],
  'S': [-1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
  'V': [2, -2, 1, 1, 0, 0, 1, -1, 1, -1, 0, 1],
  'W': [-1, 1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 0],
};

// Rows keyed by the first two categories; consulted when the 2-character
// lookup returned -2.
const _LETTER_SEQUENCE_TABLE_3RD_CHARACTER = {
  'AC': [3, 1, 1, 1, 1, 1, 1, -2, 1, 1, 1, 1],
  'CC': [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
  'EC': [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
  'FC': [3, 1, 1, 1, 1, 1, 1, -2, 1, 1, 1, 1],
  'MC': [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
  'VC': [0, 1, 1, 1, 1, 1, 1, -2, 1, 0, 1, 1],
};

// Rows keyed by the first three categories; consulted when the 3-character
// lookup returned -2.
const _LETTER_SEQUENCE_TABLE_4TH_CHARACTER = {
  'ACM': [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'FCM': [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'VCM': [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
};
/**
 * Look up the break status for a short run of category letters.
 *
 * The first `categorysLen - 1` letters select a row in the table for that
 * length (2, 3 or 4); the last letter selects the column through
 * `_LETTER_SEQUENCE_TABLE_INDEX`.
 *
 * @param {string} categorys - Category letters, e.g. "CV" or "ACM".
 * @param {number} categorysLen - Lookup length to use: 2, 3 or 4.
 * @returns {number} The table entry, or -1 when the length is unsupported,
 *   the row does not exist, or the last letter is missing or unknown (which
 *   is what happens when `categorys` is shorter than `categorysLen`).
 */
function _getSyllableBreakStatus(categorys, categorysLen) {
  let letterSequenceTable;
  switch (categorysLen) {
    case 2:
      letterSequenceTable = _LETTER_SEQUENCE_TABLE_2ND_CHARACTER;
      break;
    case 3:
      letterSequenceTable = _LETTER_SEQUENCE_TABLE_3RD_CHARACTER;
      break;
    case 4:
      letterSequenceTable = _LETTER_SEQUENCE_TABLE_4TH_CHARACTER;
      break;
    default:
      return -1;
  }

  // Own-property checks keep inherited names such as "constructor" from
  // being mistaken for table rows or columns.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const prefix = categorys.slice(0, categorysLen - 1);
  if (!hasOwn(letterSequenceTable, prefix)) {
    return -1;
  }

  const lastCategory = categorys[categorysLen - 1];
  if (lastCategory === undefined || !hasOwn(_LETTER_SEQUENCE_TABLE_INDEX, lastCategory)) {
    return -1;
  }

  return letterSequenceTable[prefix][_LETTER_SEQUENCE_TABLE_INDEX[lastCategory]];
}
/**
 * Segment Myanmar text into syllables and print the result.
 *
 * Each character is classified with getCateGory(); for every position the
 * break status is looked up with _getSyllableBreakStatus() using the next
 * one, two or three characters. A "|" is emitted after the character when the
 * status is 1, or when it is still -2 (undecided) after all the lookahead the
 * remaining input allows. Every other status keeps the character attached to
 * what follows.
 *
 * NOTE(review): statuses 2 / 3 / 4 are treated as "no break after this
 * character" (the break presumably belongs after a later character) --
 * confirm against the algorithm's reference.
 *
 * Two lines are logged: the category letters with separators, then the text
 * with the same separators.
 *
 * @param {string} [sen] - Text to segment; defaults to a built-in sample.
 * @returns {string} The text with "|" inserted at syllable boundaries.
 */
function cut(sen = 'စာကြည့်တိုက်အတွင်းရှိကလေးငယ်များဆူဆူညံညံမပြုရန်သတိပြုပါ။') {
  // Work per code point so indexes line up with getCateGory()'s result.
  const chars = [...sen];
  const entries = getCateGory(sen);
  // Characters outside every category come back as undefined: keep the raw
  // character and use "?" as a placeholder category (it matches no table row
  // or column, so it never forces a break).
  const text = chars.map((ch, i) => (entries[i] ? entries[i].val : ch));
  const categories = chars.map((ch, i) => (entries[i] ? entries[i].cat : '?')).join('');
  const len = chars.length;

  let finalStr = '';
  let finalText = '';
  for (let start = 0; start < len; start++) {
    const remaining = len - start;
    let status = _getSyllableBreakStatus(categories.slice(start, start + 2), 2);
    // -2 means "undecided": widen the window, but only while enough
    // characters are actually left to fill it.
    if (status === -2 && remaining >= 3) {
      status = _getSyllableBreakStatus(categories.slice(start, start + 3), 3);
    }
    if (status === -2 && remaining >= 4) {
      status = _getSyllableBreakStatus(categories.slice(start, start + 4), 4);
    }

    const separator = status === 1 || status === -2 ? '|' : '';
    finalStr += categories[start] + separator;
    finalText += text[start] + separator;
  }

  console.log(finalStr);
  // Default sample: "CV|CMCAF|CVVCA|C|CMCAF|CMV|C|CVF|CCA|CMVF|CV|CV|CF|CF|C|CMV|CCA|C|CV|CMV|CV|P"
  console.log(finalText);
  // Default sample: "စာ|ကြည့်|တိုက်|အ|တွင်း|ရှိ|က|လေး|ငယ်|များ|ဆူ|ဆူ|ညံ|ညံ|မ|ပြု|ရန်|သ|တိ|ပြု|ပါ|။"
  return finalText;
}
// Run the segmenter on the built-in sample sentence when the script is executed.
cut();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment