Last active
November 4, 2020 16:20
-
-
Save MultiversalNomad/1d3acd4a9d30b0af9aa2 to your computer and use it in GitHub Desktop.
This function takes a Japanese kana word and outputs an HTML pitch accent notation. See code comments for details.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
This function takes a Japanese kana word and outputs an HTML pitch accent notation. | |
The notation is explained further in the code comments. | |
More info on Japanese pitch accent notation can be found at the following web pages: | |
- http://en.wikipedia.org/wiki/Japanese_pitch_accent | |
- http://japanese.stackexchange.com/questions/11194/how-does-pitch-accent-work-in-japanese | |
- http://www.sanseido-publ.co.jp/publ/dicts/daijirin_ac.html | |
*/ | |
// This file contains the newest implementation. The other file is not needed to run this code. | |
using System; | |
using System.Collections.Generic; | |
public static class Program
{
    // Format style codes understood by FormatWrappers.
    private const int StylePlain = 0;   // high pitch mora: emitted without any tag
    private const int StyleLow = 1;     // low pitch mora: wrapped in <u>...</u>
    private const int StyleAccent = 2;  // last high mora before the downstep: wrapped in <b>...</b>

    // Small kana that are paired with the kana in front of them to form a single mora.
    // Sokuon (っ / ッ) is deliberately absent: it counts as a mora of its own.
    private static readonly string[] SmallKana =
    {
        "ぁ", "ぃ", "ぅ", "ぇ", "ぉ", "ゃ", "ゅ", "ょ", "ゎ",
        "ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ヮ"
    };

    public static void Main()
    {
        Console.WriteLine(JPANotationHTML("ありがとう", 2));
    }

    // Developed by Joseph Cassano (jplc.ca).
    //
    // Builds an HTML pitch accent notation for a Japanese kana word.
    //
    // This function is based on the Japanese pitch accent notation explained here:
    // http://www.sanseido-publ.co.jp/publ/dicts/daijirin_ac.html
    //
    // English explanations can be found at the following:
    // http://en.wikipedia.org/wiki/Japanese_pitch_accent
    // http://japanese.stackexchange.com/questions/11194/how-does-pitch-accent-work-in-japanese
    //
    // Parameters:
    //   word - the word in full kana form, so any kanji or choonpu must be converted
    //          to kana (either hiragana or katakana) beforehand.
    //   n    - the accent notation number (0 up to the number of mora in the word).
    //
    // Returns the HTML notation, or an empty string when word is null or empty or when
    // n is outside the range 0..(number of mora).
    //
    // In the HTML notation that this function generates, the rules are the following:
    // - Underlined mora are low pitch accented mora.
    // - Unedited mora are high pitch accented mora.
    // - Bolded mora are the last high pitch accented mora just before a downstep.
    // - Small kana (like the ones used in youon) are not considered their own mora;
    //   they are paired with their preceding kana to make one mora. However, sokuon is exempt
    //   from this rule and is considered its own mora.
    // - At most one small kana is paired with a preceding kana. Any further small kana
    //   starts a mora of its own, so no character of the word is ever dropped.
    //
    // Example: If the word was ありがとう with a notation number of 2, the output would be:
    // <u>あ</u><b>り</b><u>がとう</u>
    // Example (youon): If the word was シャーベット with a notation number of 1, you'd first
    // convert the choonpu to kana and thus enter the word as シャアベット. The output would
    // then be:
    // <b>シャ</b><u>アベット</u>
    //
    // Permission is given to use this code however you please with absolutely no restrictions.
    public static string JPANotationHTML(string word, int n)
    {
        if (string.IsNullOrEmpty(word))
        {
            return "";
        }

        int[] smallKanaIndexes;
        Dictionary<string, int[]> kanaDict = IndexesOf(word, SmallKana, out smallKanaIndexes);

        // Count mora with the same segmentation rule that AppendMora uses, so the count
        // can never disagree with what actually gets emitted.
        int moraCount = 0;
        for (int i = 0; i < word.Length; i += MoraLength(word, kanaDict, i))
        {
            moraCount++;
        }

        if (n < 0 || n > moraCount)
        {
            return "";
        }

        System.Text.StringBuilder outString = new System.Text.StringBuilder();
        int tempIndex = 0;

        // First mora: it carries the accent itself when n == 1, otherwise it is low.
        AppendMora(outString, word, kanaDict, n == 1 ? StyleAccent : StyleLow, ref tempIndex);

        if (n > 2)
        {
            // Mora 2 .. n-1 are high and are emitted as one unformatted run.
            int runStart = tempIndex;
            for (int mora = 2; mora < n; mora++)
            {
                tempIndex += MoraLength(word, kanaDict, tempIndex);
            }
            outString.Append(word, runStart, tempIndex - runStart);
        }

        if (n > 1)
        {
            // The n-th mora is the last high mora before the downstep.
            AppendMora(outString, word, kanaDict, StyleAccent, ref tempIndex);
        }

        // Everything after the accent is low, except for n == 0 (no downstep), where the
        // remainder of the word stays high and therefore unformatted.
        int tailStyle = n == 0 ? StylePlain : StyleLow;
        int lengthLimit = n < 2 ? 1 : n;
        AppendMoraChunk(outString, word, moraCount, lengthLimit, tailStyle, tempIndex);

        return outString.ToString();
    }

    // Finds every occurrence of each phrase of phraseArray inside word.
    //
    // Returns a dictionary that maps each phrase occurring at least once to the indexes
    // of its occurrences. indexArray receives all of those indexes in ascending order.
    // A phrase that is listed more than once in phraseArray is only recorded once.
    public static Dictionary<string, int[]> IndexesOf(string word, string[] phraseArray, out int[] indexArray)
    {
        Dictionary<string, int[]> found = new Dictionary<string, int[]>();
        List<int> allIndexes = new List<int>();
        foreach (string phrase in phraseArray)
        {
            int[] hits = IndexesOf(word, phrase);
            // ContainsKey guards against duplicate phrases, which would make Add throw.
            if (hits.Length > 0 && !found.ContainsKey(phrase))
            {
                found.Add(phrase, hits);
                allIndexes.AddRange(hits);
            }
        }
        indexArray = allIndexes.ToArray();
        Array.Sort(indexArray);
        return found;
    }

    // Returns the start indexes (ascending, overlapping matches included) of every
    // ordinal occurrence of phrase inside word.
    public static int[] IndexesOf(string word, string phrase)
    {
        List<int> hits = new List<int>();
        int lastStart = word.Length - phrase.Length;
        for (int i = 0; i <= lastStart; i++)
        {
            // Ordinal in-place comparison; avoids allocating a substring per position.
            if (string.CompareOrdinal(word, i, phrase, 0, phrase.Length) == 0)
            {
                hits.Add(i);
            }
        }
        return hits.ToArray();
    }

    // Appends the mora that starts at tempIndex to outString, wrapped according to
    // formatStyle, and advances tempIndex past it. The mora is two characters long when
    // the character after tempIndex is a key of kanaDict (a small kana), otherwise one.
    //
    // Returns the raw (unwrapped) mora, or an empty string when tempIndex is at or past
    // the end of word, in which case nothing is appended.
    public static string AppendMora(System.Text.StringBuilder outString, string word, Dictionary<string, int[]> kanaDict, int formatStyle, ref int tempIndex)
    {
        if (tempIndex >= word.Length)
        {
            return "";
        }

        int moraLength = MoraLength(word, kanaDict, tempIndex);
        string mora = word.Substring(tempIndex, moraLength);

        string formatOpener;
        string formatCloser;
        FormatWrappers(formatStyle, out formatOpener, out formatCloser);
        outString.Append(formatOpener).Append(mora).Append(formatCloser);

        tempIndex += moraLength;
        return mora;
    }

    // Appends the rest of word, from tempIndex to the end, as one chunk wrapped according
    // to formatStyle. Nothing is appended unless trueLength is greater than lengthLimit.
    //
    // Returns the raw (unwrapped) chunk, or an empty string when nothing was appended.
    public static string AppendMoraChunk(System.Text.StringBuilder outString, string word, int trueLength, int lengthLimit, int formatStyle, int tempIndex)
    {
        if (trueLength <= lengthLimit)
        {
            return "";
        }

        string moraChunk = word.Substring(tempIndex, word.Length - tempIndex);

        string formatOpener;
        string formatCloser;
        FormatWrappers(formatStyle, out formatOpener, out formatCloser);
        outString.Append(formatOpener).Append(moraChunk).Append(formatCloser);

        return moraChunk;
    }

    // Produces the opening and closing text for a format style:
    //   1 -> <u> / </u>, 2 -> <b> / </b>, 0 or negative -> empty strings,
    //   any other positive value -> the markers ERROR_0 / ERROR_1.
    public static void FormatWrappers(int formatStyle, out string formatOpener, out string formatCloser)
    {
        formatOpener = "";
        formatCloser = "";
        if (formatStyle == StyleLow)
        {
            formatOpener = "<u>";
            formatCloser = "</u>";
        }
        else if (formatStyle == StyleAccent)
        {
            formatOpener = "<b>";
            formatCloser = "</b>";
        }
        else if (formatStyle > 0)
        {
            // Unknown style: make the problem visible in the output.
            formatOpener = "ERROR_0";
            formatCloser = "ERROR_1";
        }
    }

    // Length in characters (1 or 2) of the mora starting at index: 2 when the following
    // character is a key of kanaDict (a small kana), otherwise 1.
    private static int MoraLength(string word, Dictionary<string, int[]> kanaDict, int index)
    {
        bool followedBySmallKana = index + 1 < word.Length
            && kanaDict.ContainsKey(word.Substring(index + 1, 1));
        return followedBySmallKana ? 2 : 1;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This file contains the original implementation. It is significantly simpler than the | |
// newer implementation, but it could not take small kana (like those found in youon) into | |
// account while the new implementation can. This original implementation is kept here just so one | |
// is able to compare the two. | |
using System; | |
public static class Program
{
    public static void Main()
    {
        string html = JPANotationHTML("ありがとう", 2);
        Console.WriteLine(html);
    }

    // Developed by Joseph Cassano (jplc.ca).
    //
    // Builds an HTML pitch accent notation for a Japanese kana word, treating every
    // character of the word as one mora.
    //
    // This function is based on the Japanese pitch accent notation explained here:
    // http://www.sanseido-publ.co.jp/publ/dicts/daijirin_ac.html
    //
    // English explanations can be found at the following:
    // http://en.wikipedia.org/wiki/Japanese_pitch_accent
    // http://japanese.stackexchange.com/questions/11194/how-does-pitch-accent-work-in-japanese
    //
    // n is the accent notation number. An empty string is returned when the word is
    // empty or when n is negative or larger than the length of the word.
    //
    // In the HTML notation that this function generates, the rules are the following:
    // - Underlined mora are low pitch accented mora.
    // - Unedited mora are high pitch accented mora.
    // - Bolded mora are the last high pitch accented mora just before a downstep.
    //
    // Example: If the word was ありがとう with a notation number of 2, the output would be:
    // <u>あ</u><b>り</b><u>がとう</u>
    //
    // Permission is given to use this code however you please with absolutely no restrictions.
    public static string JPANotationHTML(string word, int n)
    {
        int moraCount = word.Length;
        if (moraCount == 0 || n < 0 || n > moraCount)
        {
            return string.Empty;
        }

        System.Text.StringBuilder html = new System.Text.StringBuilder();

        if (n == 0)
        {
            // No downstep: low first mora, the rest of the word stays high.
            html.Append("<u>").Append(word[0]).Append("</u>");
            if (moraCount > 1)
            {
                html.Append(word, 1, moraCount - 1);
            }
            return html.ToString();
        }

        if (n >= 2)
        {
            // Low first mora, followed by the high mora that precede the accent.
            html.Append("<u>").Append(word[0]).Append("</u>");
            if (n >= 3)
            {
                html.Append(word, 1, n - 2);
            }
        }

        // The accented mora itself.
        html.Append("<b>").Append(word[n - 1]).Append("</b>");

        // Whatever follows the accent is low.
        if (n < moraCount)
        {
            html.Append("<u>").Append(word, n, moraCount - n).Append("</u>");
        }

        return html.ToString();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment