Skip to content

Instantly share code, notes, and snippets.

@sparr
Created August 19, 2025 19:35
Show Gist options
  • Save sparr/91969ebc0ce1cffcf356f7fe15312ae0 to your computer and use it in GitHub Desktop.
Save sparr/91969ebc0ce1cffcf356f7fe15312ae0 to your computer and use it in GitHub Desktop.
Python script to find po file translations that differ by just a few characters
#!/usr/bin/env python
import polib
import Levenshtein as L
# Messages shorter than this are skipped: a 1-2 character edit on a very
# short string is usually a genuinely different message, not a near-duplicate.
MIN_LENGTH = 20
# Maximum edit distance at which two msgids are reported as near-duplicates.
MAX_DELTA = 2


def entry_label(entry):
    """Return the display form of *entry*: ``"msgctxt : msgid"`` or just msgid."""
    if entry.msgctxt:
        return entry.msgctxt + " : " + entry.msgid
    return entry.msgid


def find_similar_pairs(entries, max_delta=MAX_DELTA, min_length=MIN_LENGTH,
                       distance=None):
    """Yield label pairs for entries whose msgids differ by 1..max_delta edits.

    Comparison is case-insensitive and looks at ``msgid`` only; ``msgctxt``
    is used purely for the labels that are yielded.

    Args:
        entries: iterable of objects exposing ``msgid`` and ``msgctxt``
            attributes (e.g. the entries of a ``polib`` PO file).
        max_delta: largest edit distance still reported.
        min_length: entries whose lower-cased msgid is shorter than this are
            never used as the first element of a pair.
        distance: callable invoked as
            ``distance(s1=..., s2=..., score_cutoff=...)``; defaults to
            ``Levenshtein.distance``.

    Yields:
        ``(label1, label2)`` tuples, shorter msgid first, in ascending
        msgid-length order and file order within one length.
    """
    if distance is None:
        distance = L.distance

    # Bucket by the length of the string that is actually compared. Two
    # strings within edit distance d differ in length by at most d, so only
    # buckets in [length, length + max_delta] can hold a match. (Bucketing by
    # msgctxt + msgid length, as before, hid pairs whose contexts differed
    # in length.)
    buckets = {}
    for entry in entries:
        key = entry.msgid.lower()
        buckets.setdefault(len(key), []).append((key, entry))

    # Walk only the lengths that exist; no arbitrary upper bound.
    for length in sorted(buckets):
        if length < min_length:
            continue
        # Every entry, including the last one in the bucket, must still be
        # checked against the longer buckets.
        for index, (key, entry) in enumerate(buckets[length]):
            for other_length in range(length, length + max_delta + 1):
                candidates = buckets.get(other_length)
                if not candidates:
                    continue
                # Within the same bucket only look forward, so each pair is
                # visited once and an entry is never compared with itself.
                start = index + 1 if other_length == length else 0
                for other_key, other in candidates[start:]:
                    dist = distance(s1=key, s2=other_key,
                                    score_cutoff=max_delta)
                    # dist == 0 means identical apart from case: not reported.
                    if 0 < dist <= max_delta:
                        yield entry_label(entry), entry_label(other)


def main():
    """Print every near-duplicate msgid pair found in ``po/base.pot``."""
    pofile = polib.pofile('po/base.pot')
    for s1, s2 in find_similar_pairs(pofile):
        print("'", s1, "' | '", s2, "'", sep="")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment