Last active
June 6, 2022 11:13
-
-
Save rhamses/584861ba94b2fbe72f66078efe6df31e to your computer and use it in GitHub Desktop.
Sorting string by Unicode Block (Language Identity)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
  All Unicode blocks are listed at
  https://en.wikipedia.org/wiki/Unicode_block
  This example is explained at
  https://rhams.es/content/usando-unicode-para-identificar-idiomas
*/
// Character classes for the Unicode ranges this script recognises.
// Every pattern is global so String.prototype.match returns all hits.

// U+0621–U+064A: a subset of the Arabic block (U+0600–U+06FF). Extended
// letters outside this range (e.g. U+06A9) are not matched.
const ArabicBlock = /[\u0621-\u064A]/g;
// U+0E00–U+0E7F: Thai block.
const ThaiBlock = /[\u0E00-\u0E7F]/g;
// U+AC00–U+D7AF: Hangul Syllables block.
const HangulBlock = /[\uAC00-\uD7AF]/g;
// U+3000–U+303F: CJK Symbols and Punctuation block (not the ideographs).
const CJKBlock = /[\u3000-\u303F]/g;
// U+3040–U+309F: Hiragana block. No space in the class: a literal space
// here would make any text containing a space count as Japanese.
const HiraganaBlock = /[\u3040-\u309F]/g;
// U+30A0–U+30FF: Katakana block (identifier spelling kept for compatibility).
const KatanaBlock = /[\u30A0-\u30FF]/g;

/**
 * Reports the first known Unicode range that occurs in `text`.
 *
 * Ranges are tried in a fixed order: Arabic, Thai, Hangul, Katakana,
 * Hiragana, CJK symbols/punctuation. The first range with at least one
 * matching character wins, even if later ranges also occur in the text.
 *
 * @param {string} text - Text to inspect.
 * @returns {[string, string[]] | string} `[label, matchedCharacters]` for the
 *   first range found, or the string "Unicode not found" when no range
 *   matches or `text` is not a string.
 */
function identifyLanguage(text) {
  const notFound = "Unicode not found";
  if (typeof text !== 'string') {
    return notFound;
  }

  // Order matters: it decides which label mixed-script text receives.
  const candidates = [
    ['ArabicBlock', ArabicBlock],
    ['ThaiBlock', ThaiBlock],
    ['HangulBlock', HangulBlock],
    ['KatanaBlock', KatanaBlock],
    ['HiraganaBlock', HiraganaBlock],
    ['CJKBlock', CJKBlock],
  ];

  for (const [label, pattern] of candidates) {
    // Match once and reuse the result instead of scanning the text twice.
    const matches = text.match(pattern);
    if (matches) {
      return [label, matches];
    }
  }
  return notFound;
}
// Sample inputs, one per script. Exactly one `text` line is active at a time;
// swap the commented lines to try another script.
// const text = "کمالہ_خان"; // Arabic-script sample
const text = "ยิ่งปัดยิ่งพุ่ง"; // Thai sample
// const text = "끝없이서로의가능성을믿다"; // Hangul sample
// const text = "オリエント・アルカディア"; // Katakana sample

// Print the detected range label together with the matched characters.
console.log(identifyLanguage(text));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment