Created
August 19, 2025 19:35
-
-
Save sparr/91969ebc0ce1cffcf356f7fe15312ae0 to your computer and use it in GitHub Desktop.
Python script to find message IDs (msgid) in a .pot/.po file that differ by just a few characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
from collections import defaultdict

import polib
import Levenshtein as L
# Largest edit distance (on lower-cased msgids) still reported as "nearly
# identical".
MAX_DELTA = 2

# Messages shorter than this are ignored: for very short strings a couple of
# edits is enough to turn one unrelated word into another.
MIN_LENGTH = 20

# Template that is scanned when the file is run as a script.
POT_PATH = 'po/base.pot'


def _label(entry):
    """Return the display form of *entry*: ``"<msgctxt> : <msgid>"`` or just the msgid."""
    if entry.msgctxt:
        return entry.msgctxt + " : " + entry.msgid
    return entry.msgid


def find_similar_pairs(entries, max_delta=MAX_DELTA, min_length=MIN_LENGTH,
                       distance=None):
    """Yield label pairs for entries whose msgids differ by only a few edits.

    Two entries are paired when the edit distance between their lower-cased
    msgids is between 1 and *max_delta* inclusive.  Entries whose msgids are
    equal ignoring case (distance 0) are not reported.

    Args:
        entries: iterable of objects exposing ``msgid`` (str) and ``msgctxt``
            (str or None), e.g. the entries of a ``polib`` PO/POT file.
        max_delta: maximum edit distance that still counts as similar.
        min_length: minimum lower-cased msgid length for an entry to be
            considered at all.
        distance: callable invoked as
            ``distance(s1=..., s2=..., score_cutoff=max_delta)`` returning an
            int; defaults to ``Levenshtein.distance`` (imported as ``L``).

    Yields:
        ``(label1, label2)`` tuples, where each label is the msgid prefixed
        with ``"<msgctxt> : "`` when the entry has a context.  Every
        unordered pair is yielded at most once, shorter msgid first.
    """
    if distance is None:
        distance = L.distance

    # Group by the length of the exact string that is compared.  An edit
    # distance of at most max_delta implies a length difference of at most
    # max_delta, so only buckets length .. length + max_delta can match.
    # Bucketing on anything else (e.g. context + msgid) would prune pairs
    # that the distance check itself would have accepted.
    buckets = defaultdict(list)
    for entry in entries:
        key = entry.msgid.lower()  # lower-case once, not once per comparison
        if len(key) >= min_length:
            buckets[len(key)].append((key, entry))

    for length in sorted(buckets):
        # Every entry of the bucket must act as the left-hand side: even the
        # last one still has partners in the longer buckets.
        for index, (key1, entry1) in enumerate(buckets[length]):
            for other_length in range(length, length + max_delta + 1):
                # .get() so that probing a missing length does not create it.
                candidates = buckets.get(other_length, ())
                # Inside the same bucket only look forward, so that a pair is
                # neither compared with itself nor reported twice.
                start = index + 1 if other_length == length else 0
                for key2, entry2 in candidates[start:]:
                    # score_cutoff follows max_delta (see the Levenshtein
                    # docs); the range check below rejects anything above
                    # the threshold regardless of what the callable returns.
                    dist = distance(s1=key1, s2=key2, score_cutoff=max_delta)
                    if 0 < dist <= max_delta:
                        yield _label(entry1), _label(entry2)


def main():
    """Print every near-duplicate msgid pair found in ``POT_PATH``."""
    for s1, s2 in find_similar_pairs(polib.pofile(POT_PATH)):
        print("'", s1, "' | '", s2, "'", sep="")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment