Last active
February 3, 2026 22:45
-
-
Save anytizer/f92ac3ebe69cae3a1d5e7772d0e58814 to your computer and use it in GitHub Desktop.
AI generated Nepali syllables splitter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // e.g. diggaj => dig gaj | |
| // e.g. kathmandu => kath man du | |
| #include <QCoreApplication> | |
| #include <QString> | |
| #include <QStringList> | |
| #include <QDebug> | |
| #include <QFile> | |
| #include <QTextStream> | |
| #include <QSet> | |
| #include <QRegularExpression> | |
| // Helper to determine the length of special Nepali consonant units | |
| int getConsonantUnitSize(const QString &word, int index) { | |
| if (index >= word.length()) return 0; | |
| // Check 3-letter units | |
| QString tri = word.mid(index, 3).toLower(); | |
| if (tri == "chh" || tri == "yan") return 3; | |
| // Check 2-letter units | |
| QString bi = word.mid(index, 2).toLower(); | |
| static const QSet<QString> digraphs = { | |
| "bh", "ch", "dh", "gh", "jh", "kh", "ng", "ph", "th", "sh" | |
| }; | |
| if (digraphs.contains(bi)) return 2; | |
| return 1; | |
| } | |
| // Logic to split a single word into segments | |
| QStringList splitWordToSegments(QString word) { | |
| if (word.isEmpty()) return QStringList(); | |
| QStringList segments; | |
| QString currentSegment; | |
| const QString vowels = "aeiouAEIOU"; | |
| for (int i = 0; i < word.length(); ++i) { | |
| currentSegment.append(word.at(i)); | |
| if (vowels.contains(word.at(i))) { | |
| // Keep double/triple vowels together | |
| if (i + 1 < word.length() && vowels.contains(word.at(i + 1))) { | |
| continue; | |
| } | |
| // Pull Rule: check next consonant unit | |
| int nextUnitSize = getConsonantUnitSize(word, i + 1); | |
| if (nextUnitSize > 0) { | |
| int posAfterUnit = i + 1 + nextUnitSize; | |
| // If followed by another consonant OR if it's a double consonant (e.g., pp in chappa) | |
| bool isDouble = (word.at(i+1).toLower() == word.mid(posAfterUnit, 1).toLower()); | |
| bool nextIsConsonant = (posAfterUnit >= word.length() || !vowels.contains(word.at(posAfterUnit))); | |
| if (nextIsConsonant || isDouble) { | |
| currentSegment.append(word.mid(i + 1, nextUnitSize)); | |
| i += nextUnitSize; | |
| } | |
| } | |
| segments.append(currentSegment); | |
| currentSegment.clear(); | |
| } | |
| } | |
| // Attach leftovers to the last segment | |
| if (!currentSegment.isEmpty()) { | |
| if (!segments.isEmpty()) segments.append(segments.takeLast() + currentSegment); | |
| else segments.append(currentSegment); | |
| } | |
| return segments; | |
| } | |
| int main(int argc, char *argv[]) { | |
| QCoreApplication a(argc, argv); | |
| QFile file("words.txt"); | |
| if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { | |
| qCritical() << "Could not open words.txt"; | |
| return 1; | |
| } | |
| QTextStream in(&file); | |
| // Regex to separate words from everything else (spaces, punctuation) | |
| QRegularExpression re("([a-zA-Z]+|[^a-zA-Z]+)"); | |
| while (!in.atEnd()) { | |
| QString line = in.readLine(); | |
| QString processedLine; | |
| QRegularExpressionMatchIterator it = re.globalMatch(line); | |
| while (it.hasNext()) { | |
| QRegularExpressionMatch match = it.next(); | |
| QString part = match.captured(1); | |
| if (QRegularExpression("^[a-zA-Z]+$").match(part).hasMatch()) { | |
| // Split the word and join with a space as requested | |
| processedLine += splitWordToSegments(part).join(" "); // 3 spaces | |
| } else { | |
| // Keep spaces and punctuation exactly as they are | |
| processedLine += part; | |
| } | |
| } | |
| qDebug().noquote() << processedLine << "\n"; | |
| } | |
| file.close(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment