Last active
June 14, 2021 18:05
-
-
Save teyfix/2527387233f16049285ac49d98c94153 to your computer and use it in GitHub Desktop.
Separating Turkish words into syllables
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/^[aeıioöuüAEIİOÖUÜ]$|^[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ]$|[aeıioöuüAEIİOÖUÜ](?=[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ][bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ]?)|[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ](?=$|[^aeıioöuüAEIİOÖUÜbcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ]|[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ])|[aeıioöuüAEIİOÖUÜ][bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ]|(?<=^|[^aeıioöuüAEIİOÖUÜbcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ])[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ][bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ](?=[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ][bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ])|[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ]([bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ](?=[^aeıioöuüAEIİOÖUÜbcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ]|$|[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ]))?|[bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ][aeıioöuüAEIİOÖUÜ][bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ]/g |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Turkish vowels and consonants (lower- and upper-case) used to build the
// character classes of the syllable regex.
const vowel = 'aeıioöuüAEIİOÖUÜ';
const consonant = 'bcçdfgğhjklmnprsştvyzBCÇDFGĞHJKLMNPRSŞTVYZ';
const alphabet = vowel + consonant;

// Character classes substituted for the placeholder digits in `raw`,
// indexed by digit: 0 -> any vowel, 1 -> any consonant,
// 2 -> any character that is neither.
const replace = [`[${vowel}]`, `[${consonant}]`, `[^${alphabet}]`];

// One syllable rule per line, written with the placeholder digits above.
// Lines are joined with `|` in the order given, so earlier rules are tried
// first at each position. Whitespace and `//` comments are stripped before
// the regex is compiled.
const raw = `
  ^0$                  // the whole input is a single vowel
  ^1$                  // the whole input is a single consonant
  0(?=101?)            // V, when CV (optionally plus C) follows
  10(?=$|2|10)         // CV at end of input, before a non-letter, or before CV
  01                   // VC
  (?<=^|2)1011(?=101)  // CVCC at start of input or after a non-letter, when CVC follows
  110(1(?=2|$|10))?    // CCV, plus a closing C when a non-letter, the end, or CV follows
  101                  // CVC
`;

// Build the alternation line by line so that a trailing comment on ANY line
// is removed (a single whole-string `$` would only see the last line) and so
// that blank lines or CRLF endings cannot produce empty alternatives.
const pattern = raw
  .split(/\r?\n/)
  .map((line) => line.replace(/\/\/.*$/, '').replace(/\s+/g, ''))
  .filter((line) => line !== '')
  .join('|')
  // Only 0, 1 and 2 are placeholders; any other digit is left untouched.
  .replace(/[012]/g, (digit) => replace[digit]);

const regex = new RegExp(pattern, 'g');

/**
 * Splits the input into syllables using the compiled rule set.
 *
 * @param {string} input - Text to split.
 * @returns {string[] | null} The matched syllables in order, or `null` when
 *   nothing in the input matches any rule (the `String.prototype.match`
 *   contract for a global regex).
 */
const spell = (input) => input.match(regex);
// Self-check cases: each word paired with its expected syllables joined
// by ', '.
const test = [
  { word: 'araba', expected: 'a, ra, ba' },
  { word: 'elektriklenme', expected: 'e, lek, trik, len, me' },
  { word: 'endüstriyel', expected: 'en, düs, tri, yel' },
  { word: 'kalabalık', expected: 'ka, la, ba, lık' },
  {
    word: 'muvaffakiyetsizleştiricileştiriveremeyebileceklerimizdenmişsinizcesine',
    expected:
      'mu, vaf, fa, ki, yet, siz, leş, ti, ri, ci, leş, ti, ri, ve, re, me, ye, bi, le, cek, le, ri, miz, den, miş, si, niz, ce, si, ne',
  },
];

// Run every case and log only the failures; passing cases are silent.
for (const { word, expected } of test) {
  const result = spell(word);

  if (result == null) {
    console.log(word, 'gives no match');
    // Without this, `result.join` below would throw a TypeError on null and
    // abort the remaining cases.
    continue;
  }

  const joined = result.join(', ');

  if (expected !== joined) {
    console.log('expected:', expected, 'but got:', joined);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment