anytizer · February 3, 2026 22:45
diff --git a/splitter.cpp b/splitter.cpp
 // e.g. diggaj => dig gaj
 // e.g. kathmandu => kath man du

 #include <QCoreApplication>
 #include <QString>
 #include <QStringList>
 #include <QDebug>
 #include <QFile>
 #include <QTextStream>
 #include <QSet>
 #include <QRegularExpression>

 // Helper to determine the length of special Nepali consonant units
 int getConsonantUnitSize(const QString &word, int index) {
    if (index >= word.length()) return 0;

    // Check 3-letter units
    QString tri = word.mid(index, 3).toLower();
    if (tri == "chh" || tri == "yan") return 3;

    // Check 2-letter units
    QString bi = word.mid(index, 2).toLower();
    static const QSet<QString> digraphs = {
        "bh", "ch", "dh", "gh", "jh", "kh", "ng", "ph", "th", "sh"
    };
    if (digraphs.contains(bi)) return 2;

    return 1;
 }

 // Logic to split a single word into segments
 // Breaks `word` into segments. A segment is closed at a vowel that is not
 // immediately followed by another vowel; before closing, the consonant unit
 // right after that vowel is pulled into the segment when the unit ends the
 // word, is followed by a non-vowel, or is followed by a repeat of its first
 // letter. Characters left over after the last vowel are glued onto the final
 // segment. An empty word yields an empty list.
 QStringList splitWordToSegments(QString word) {
    QStringList pieces;
    if (word.isEmpty()) {
        return pieces;
    }

    const QString vowels = "aeiouAEIOU";
    const int total = word.length();
    QString pending;

    int cursor = 0;
    while (cursor < total) {
        const QChar letter = word.at(cursor);
        pending.append(letter);
        ++cursor;  // from here on, cursor points at the character after `letter`

        // Consonants just accumulate; only a vowel can close a segment.
        if (!vowels.contains(letter)) {
            continue;
        }

        // A run of vowels stays together: wait for the last vowel of the run.
        if (cursor < total && vowels.contains(word.at(cursor))) {
            continue;
        }

        // Pull rule: look at the unit that follows the vowel.
        const int unitLen = getConsonantUnitSize(word, cursor);
        if (unitLen > 0) {
            const int afterUnit = cursor + unitLen;
            const bool consonantFollows =
                (afterUnit >= total) || !vowels.contains(word.at(afterUnit));
            // mid() yields an empty string past the end, so this is false there.
            const bool doubled =
                (word.at(cursor).toLower() == word.mid(afterUnit, 1).toLower());

            if (consonantFollows || doubled) {
                pending.append(word.mid(cursor, unitLen));
                cursor = afterUnit;  // skip the characters that were just consumed
            }
        }

        pieces.append(pending);
        pending.clear();
    }

    // Whatever trails the last closed segment is attached to it rather than
    // emitted as a vowel-less segment of its own.
    if (!pending.isEmpty()) {
        if (pieces.isEmpty()) {
            pieces.append(pending);
        } else {
            pieces.last().append(pending);
        }
    }
    return pieces;
 }

 int main(int argc, char *argv[]) {
    QCoreApplication a(argc, argv);

    QFile file("words.txt");
    if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
        qCritical() << "Could not open words.txt";
        return 1;
    }

    QTextStream in(&file);
    // Regex to separate words from everything else (spaces, punctuation)
    QRegularExpression re("([a-zA-Z]+|[^a-zA-Z]+)");

    while (!in.atEnd()) {
        QString line = in.readLine();
        QString processedLine;

        QRegularExpressionMatchIterator it = re.globalMatch(line);
        while (it.hasNext()) {
            QRegularExpressionMatch match = it.next();
            QString part = match.captured(1);

            if (QRegularExpression("^[a-zA-Z]+$").match(part).hasMatch()) {
                // Split the word and join with a space as requested
                processedLine += splitWordToSegments(part).join("   "); // 3 spaces
            } else {
                // Keep spaces and punctuation exactly as they are
                processedLine += part;
            }
        }
        qDebug().noquote() << processedLine << "\n";
    }

    file.close();
    return 0;
 }
	// e.g. diggaj => dig gaj
	// e.g. kathmandu => kath man du

	#include <QCoreApplication>
	#include <QString>
	#include <QStringList>
	#include <QDebug>
	#include <QFile>
	#include <QTextStream>
	#include <QSet>
	#include <QRegularExpression>

	// Helper to determine the length of special Nepali consonant units
	// Returns how many characters the unit starting at `index` in `word` spans:
	// 0 when `index` is at or past the end, 3 for "chh"/"yan", 2 for one of the
	// listed digraphs, and 1 for any other single character. Comparisons are
	// made on the lower-cased text.
	int getConsonantUnitSize(const QString &word, int index) {
	    if (index >= word.length()) return 0;

	    // Check 3-letter units first so "chh" is not cut short as "ch"
	    QString tri = word.mid(index, 3).toLower();
	    if (tri == "chh" || tri == "yan") return 3;

	    // Check 2-letter units
	    QString bi = word.mid(index, 2).toLower();
	    static const QSet<QString> digraphs = {
	        "bh", "ch", "dh", "gh", "jh", "kh", "ng", "ph", "th", "sh"
	    };
	    if (digraphs.contains(bi)) return 2;

	    return 1;
	}

	// Logic to split a single word into segments
	// Breaks `word` into segments. A segment is closed at a vowel that is not
	// immediately followed by another vowel; the consonant unit after that vowel
	// is pulled into the segment when it ends the word, is followed by a
	// non-vowel, or is followed by a repeat of its first letter. Characters left
	// after the last vowel are appended to the final segment. An empty word
	// yields an empty list.
	QStringList splitWordToSegments(QString word) {
	    if (word.isEmpty()) return QStringList();

	    QStringList segments;
	    QString currentSegment;
	    const QString vowels = "aeiouAEIOU";

	    for (int i = 0; i < word.length(); ++i) {
	        currentSegment.append(word.at(i));

	        if (vowels.contains(word.at(i))) {
	            // Keep double/triple vowels together
	            if (i + 1 < word.length() && vowels.contains(word.at(i + 1))) {
	                continue;
	            }

	            // Pull Rule: check next consonant unit
	            int nextUnitSize = getConsonantUnitSize(word, i + 1);
	            if (nextUnitSize > 0) {
	                int posAfterUnit = i + 1 + nextUnitSize;

	                // If followed by another consonant OR if it's a double consonant (e.g., pp in chappa)
	                bool isDouble = (word.at(i + 1).toLower() == word.mid(posAfterUnit, 1).toLower());
	                bool nextIsConsonant = (posAfterUnit >= word.length() || !vowels.contains(word.at(posAfterUnit)));

	                if (nextIsConsonant || isDouble) {
	                    currentSegment.append(word.mid(i + 1, nextUnitSize));
	                    // Skip the pulled characters; the loop's ++i moves past the last one
	                    i += nextUnitSize;
	                }
	            }
	            segments.append(currentSegment);
	            currentSegment.clear();
	        }
	    }

	    // Attach leftovers to the last segment
	    if (!currentSegment.isEmpty()) {
	        if (!segments.isEmpty()) segments.append(segments.takeLast() + currentSegment);
	        else segments.append(currentSegment);
	    }
	    return segments;
	}

	int main(int argc, char *argv[]) {
	QCoreApplication a(argc, argv);

	QFile file("words.txt");
	if (!file.open(QIODevice::ReadOnly \| QIODevice::Text)) {
	qCritical() << "Could not open words.txt";
	return 1;
	}

	QTextStream in(&file);
	// Regex to separate words from everything else (spaces, punctuation)
	QRegularExpression re("([a-zA-Z]+\|[^a-zA-Z]+)");

	while (!in.atEnd()) {
	QString line = in.readLine();
	QString processedLine;

	QRegularExpressionMatchIterator it = re.globalMatch(line);
	while (it.hasNext()) {
	QRegularExpressionMatch match = it.next();
	QString part = match.captured(1);

	if (QRegularExpression("^[a-zA-Z]+$").match(part).hasMatch()) {
	// Split the word and join with a space as requested
	processedLine += splitWordToSegments(part).join(" "); // 3 spaces
	} else {
	// Keep spaces and punctuation exactly as they are
	processedLine += part;
	}
	}
	qDebug().noquote() << processedLine << "\n";
	}

	file.close();
	return 0;
	}
No results found