Last active
December 17, 2024 02:57
-
-
Save guersam/ce77eac6ef0c393983f53ab3d3f92f25 to your computer and use it in GitHub Desktop.
한글 유니코드 자소 분리
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Splits precomposed Hangul syllables (U+AC00 '가' .. U+D7A3 '힣') into their
 * constituent jamo, reported as Hangul compatibility jamo characters.
 */
object KoreanJasoDecomposer {

  /**
   * A Hangul syllable broken into its jamo.
   *
   * @param onset   initial consonant (choseong)
   * @param nucleus medial vowel (jungseong)
   * @param coda    final consonant (jongseong); `None` when the syllable has no final consonant
   */
  final case class Syllable(
    onset: Option[Char],
    nucleus: Option[Char],
    coda: Option[Char]
  )

  /**
   * Decomposes a single precomposed Hangul syllable into its jamo.
   *
   * Characters outside the precomposed syllable block (Latin letters, digits,
   * standalone jamo, surrogate halves, ...) are not syllables, so every
   * component of the result is `None` for them.
   *
   * @param syllable a Unicode Hangul syllable
   * @return the separated jamo; all fields are `None` when `syllable` is not a
   *         precomposed Hangul syllable
   */
  def decompose(syllable: Char): Syllable =
    if (syllable < BaseOffset || syllable > LastSyllable) {
      // Without this guard the index arithmetic below truncates toward zero for
      // code points just below the block (yielding onset 'ㄱ') and wraps into
      // valid nucleus/coda indices for code points just above it.
      Syllable(onset = None, nucleus = None, coda = None)
    } else {
      // Syllables are laid out as onset-major, then nucleus, then coda.
      val offset      = syllable - BaseOffset
      val onsetIdx    = offset / OnsetOffset
      val withinOnset = offset % OnsetOffset
      val nucleusIdx  = withinOnset / NucleusOffset
      val codaIdx     = withinOnset % NucleusOffset

      Syllable(
        onset = safeGet(OnsetChars, onsetIdx),
        nucleus = safeGet(NucleusChars, nucleusIdx),
        // Coda index 0 is the "no final consonant" slot.
        coda = if (codaIdx == 0) None else safeGet(CodaChars, codaIdx)
      )
    }

  /**
   * Extracts the initial consonant of a Hangul syllable.
   *
   * @param syllable a Unicode Hangul syllable
   * @return the onset, or `None` when `syllable` is not a precomposed Hangul syllable
   */
  def onset(syllable: Char): Option[Char] = decompose(syllable).onset

  /**
   * Extracts the medial vowel of a Hangul syllable.
   *
   * @param syllable a Unicode Hangul syllable
   * @return the nucleus, or `None` when `syllable` is not a precomposed Hangul syllable
   */
  def nucleus(syllable: Char): Option[Char] = decompose(syllable).nucleus

  /**
   * Extracts the final consonant of a Hangul syllable.
   *
   * @param syllable a Unicode Hangul syllable
   * @return the coda, or `None` when the syllable has no final consonant or
   *         `syllable` is not a precomposed Hangul syllable
   */
  def coda(syllable: Char): Option[Char] = decompose(syllable).coda

  // Code point of the first precomposed syllable, '가' (U+AC00).
  private val BaseOffset = 44032
  // Code point of the last precomposed syllable, '힣' (U+D7A3).
  private val LastSyllable = 55203
  // Number of syllables sharing one onset: 21 nuclei * 28 coda slots.
  private val OnsetOffset = 588
  // Number of syllables sharing one onset+nucleus: 28 coda slots (including "none").
  private val NucleusOffset = 28

  private val OnsetChars = Array('ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ',
    'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')

  private val NucleusChars = Array('ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ',
    'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ')

  // Index 0 is a placeholder for "no coda"; decompose never exposes it.
  private val CodaChars =
    Array(' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ',
      'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ')

  // Bounds-checked array lookup; returns None instead of throwing on a bad index.
  private def safeGet(arr: Array[Char], idx: Int): Option[Char] =
    if (idx >= 0 && idx < arr.length) Some(arr(idx))
    else None
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cats.implicits._ | |
/** Helpers for attaching Korean particles whose form depends on the preceding word. */
object KoreanPostfixer {

  /**
   * Appends the particle 로 or 으로 to a word, chosen from the coda that
   * `KoreanJasoDecomposer.coda` reports for the word's last character.
   *
   * 로 is used when no coda is reported or the coda is 'ㄹ'; 으로 is used for
   * every other coda. An empty `body` produces an empty string.
   *
   * {{{
   * import KoreanPostfixer.ops._
   *
   * assert("고양이".`-(으)로` === "고양이로")
   * assert("고먐".`-(으)로` === "고먐으로")
   * }}}
   *
   * @param body the word the particle attaches to
   * @return `body` followed by 로 or 으로
   */
  def `-(으)로`(body: String): String =
    body.lastOption.foldMap { last =>
      val particle = KoreanJasoDecomposer.coda(last) match {
        case Some(finalConsonant) if finalConsonant != 'ㄹ' => "으로"
        case _                                              => "로"
      }
      body + particle
    }

  /** Extension syntax: import `KoreanPostfixer.ops._` to call the particle helpers on a `String`. */
  object ops {
    implicit class KoreanPostfixerOps(val str: String) extends AnyVal {

      /** Same as `KoreanPostfixer.-(으)로` applied to this string. */
      def `-(으)로`: String = KoreanPostfixer.`-(으)로`(str)
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.scalatest.FunSuite | |
/** Checks the 로/으로 selection for words ending in a 'ㄹ' coda, no coda, and another coda. */
class KoreanPostfixerTest extends FunSuite {
  import KoreanPostfixer.ops._

  test("-(으)로") {
    // word -> expected word with particle attached
    val expectations = Seq(
      "고양일" -> "고양일로",   // coda 'ㄹ' takes 로
      "고양이" -> "고양이로",   // no coda takes 로
      "고양삼" -> "고양삼으로"  // any other coda takes 으로
    )

    expectations.foreach { case (word, expected) =>
      assert(word.`-(으)로` === expected)
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
자모? 자소?