Skip to content

Instantly share code, notes, and snippets.

@osoleve
Last active August 30, 2025 03:10
Show Gist options
  • Save osoleve/12b781724b93688b65375b6870aad084 to your computer and use it in GitHub Desktop.
Save osoleve/12b781724b93688b65375b6870aad084 to your computer and use it in GitHub Desktop.
import random
import verifiers as vf
from confusable_homoglyphs import confusables
from datasets import Dataset
def corrupt_text(text: str, max_replacement_rate: float = 0.02) -> str:
    """Corrupt the text by replacing characters with their homoglyphs.

    Up to ``int(len(text) * max_replacement_rate)`` distinct non-whitespace
    characters that have an entry in ``confusables.confusables_data`` are
    chosen at random and each is swapped for a randomly chosen look-alike.

    Args:
        text: The string to corrupt.
        max_replacement_rate: Upper bound on the fraction of ``len(text)``
            that may be replaced. The product is truncated toward zero, so
            short texts with a small rate are returned unchanged.

    Returns:
        The corrupted string, or ``text`` itself when the replacement budget
        is zero/negative or no character has a known homoglyph. The
        replacement value is inserted as-is, so the result may differ in
        length if a look-alike is longer than one character.
    """
    budget = int(len(text) * max_replacement_rate)
    if budget <= 0:
        return text

    table = confusables.confusables_data
    candidates = [
        index
        for index, char in enumerate(text)
        if not char.isspace() and char in table
    ]
    if not candidates:
        return text

    # Clamp to the number of replaceable positions: the budget is derived from
    # the full text length and may exceed the count of confusable characters.
    chosen = random.sample(candidates, min(budget, len(candidates)))

    # Work on original positions only, so an inserted homoglyph is never
    # picked again and the text is rebuilt once instead of once per swap.
    chars = list(text)
    for index in chosen:
        chars[index] = random.choice(table[chars[index]])["c"]
    return "".join(chars)
def corrupt_example(example, column, max_replacement_rate):
    """Corrupt one field of a single example in place and return the example.

    Args:
        example: A mapping-like record supporting item get/set by ``column``.
        column: Key of the text field to corrupt.
        max_replacement_rate: Forwarded to ``corrupt_text``.

    Returns:
        The same ``example`` object, with ``example[column]`` replaced by its
        corrupted version.
    """
    clean_value = example[column]
    noisy_value = corrupt_text(clean_value, max_replacement_rate)
    example[column] = noisy_value
    return example
def corrupt_column(dataset: Dataset, column: str, max_replacement_rate=0.02):
    """Apply homoglyph corruption to one column of every example in a dataset.

    Args:
        dataset: The dataset whose ``map`` method is used to visit each example.
        column: Name of the text column to corrupt.
        max_replacement_rate: Forwarded to ``corrupt_example`` for each row.

    Returns:
        Whatever ``dataset.map`` returns for the per-example corruption
        function.
    """

    def _corrupt_row(example):
        # Bind the column and rate so ``map`` only has to supply the example.
        return corrupt_example(example, column, max_replacement_rate)

    return dataset.map(_corrupt_row)
def corrupt_environment(
    env: vf.Environment, input_column: str = "text", max_replacement_rate: float = 0.02
) -> vf.Environment:
    """Corrupt the text column of an environment's dataset with homoglyphs.

    The environment is modified in place: ``env.dataset`` is reassigned to
    the result of ``corrupt_column``.

    Args:
        env: Environment whose ``dataset`` attribute will be replaced.
        input_column: Name of the dataset column to corrupt.
        max_replacement_rate: Forwarded to ``corrupt_column``.

    Returns:
        The same ``env`` object that was passed in.
    """
    noisy_dataset = corrupt_column(
        env.dataset,
        input_column,
        max_replacement_rate=max_replacement_rate,
    )
    env.dataset = noisy_dataset
    return env
if __name__ == "__main__":
    # Example usage.
    # The default rate (0.02) gives int(13 * 0.02) == 0 replacements for this
    # 13-character sample, so pass a higher rate to make the effect visible.
    text = "Hello, world!"
    corrupted = corrupt_text(text, max_replacement_rate=0.25)
    print(corrupted)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment