Skip to content

Instantly share code, notes, and snippets.

@anytizer
Last active February 3, 2026 22:45
Show Gist options
  • Select an option

  • Save anytizer/f92ac3ebe69cae3a1d5e7772d0e58814 to your computer and use it in GitHub Desktop.

Select an option

Save anytizer/f92ac3ebe69cae3a1d5e7772d0e58814 to your computer and use it in GitHub Desktop.
AI generated Nepali syllables splitter
// e.g. diggaj => dig gaj
// e.g. kathmandu => kath man du
#include <QCoreApplication>
#include <QString>
#include <QStringList>
#include <QDebug>
#include <QFile>
#include <QTextStream>
#include <QSet>
#include <QRegularExpression>
// Helper to determine the length of special Nepali consonant units
int getConsonantUnitSize(const QString &word, int index) {
if (index >= word.length()) return 0;
// Check 3-letter units
QString tri = word.mid(index, 3).toLower();
if (tri == "chh" || tri == "yan") return 3;
// Check 2-letter units
QString bi = word.mid(index, 2).toLower();
static const QSet<QString> digraphs = {
"bh", "ch", "dh", "gh", "jh", "kh", "ng", "ph", "th", "sh"
};
if (digraphs.contains(bi)) return 2;
return 1;
}
// Splits one word into segments, closing a segment after each vowel run.
//
// Walk-through of the rules applied while scanning left to right:
//   * Every character is added to the segment being built.
//   * Consecutive vowels ("aeiou", either case) stay in the same segment.
//   * After the last vowel of a run, the consonant unit that follows (as
//     sized by getConsonantUnitSize) is pulled into the current segment when
//     that unit ends the word, is followed by a non-vowel, or is followed by
//     a repeat of its own first letter.
//   * Characters left over after the final vowel are glued onto the last
//     segment; a word without any vowel comes back as a single segment.
//
// Returns an empty list for an empty word.
QStringList splitWordToSegments(QString word) {
    QStringList pieces;
    if (word.isEmpty()) {
        return pieces;
    }

    const QString vowels = "aeiouAEIOU";
    const int total = word.length();
    const auto isVowelAt = [&word, &vowels](int pos) {
        return vowels.contains(word.at(pos));
    };

    QString pending;
    for (int idx = 0; idx < total; ++idx) {
        pending += word.at(idx);

        // Consonants just accumulate until a vowel shows up.
        if (!isVowelAt(idx)) {
            continue;
        }

        // Still inside a vowel run: keep collecting.
        const int following = idx + 1;
        if (following < total && isVowelAt(following)) {
            continue;
        }

        // End of the vowel run: decide whether the next consonant unit
        // belongs to this segment.
        const int unitLen = getConsonantUnitSize(word, following);
        if (unitLen > 0) {
            const int afterUnit = following + unitLen;
            const bool repeatsFirstLetter =
                (word.at(following).toLower() == word.mid(afterUnit, 1).toLower());
            const bool unitNotFollowedByVowel =
                (afterUnit >= total || !isVowelAt(afterUnit));
            if (unitNotFollowedByVowel || repeatsFirstLetter) {
                pending += word.mid(following, unitLen);
                idx += unitLen;  // skip the characters that were just consumed
            }
        }

        pieces << pending;
        pending.clear();
    }

    // Trailing characters that never reached a vowel join the last segment.
    if (!pending.isEmpty()) {
        if (pieces.isEmpty()) {
            pieces << pending;
        } else {
            pieces.last() += pending;
        }
    }
    return pieces;
}
int main(int argc, char *argv[]) {
QCoreApplication a(argc, argv);
QFile file("words.txt");
if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
qCritical() << "Could not open words.txt";
return 1;
}
QTextStream in(&file);
// Regex to separate words from everything else (spaces, punctuation)
QRegularExpression re("([a-zA-Z]+|[^a-zA-Z]+)");
while (!in.atEnd()) {
QString line = in.readLine();
QString processedLine;
QRegularExpressionMatchIterator it = re.globalMatch(line);
while (it.hasNext()) {
QRegularExpressionMatch match = it.next();
QString part = match.captured(1);
if (QRegularExpression("^[a-zA-Z]+$").match(part).hasMatch()) {
// Split the word and join with a space as requested
processedLine += splitWordToSegments(part).join(" "); // 3 spaces
} else {
// Keep spaces and punctuation exactly as they are
processedLine += part;
}
}
qDebug().noquote() << processedLine << "\n";
}
file.close();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment