Last active
August 30, 2025 03:10
-
-
Save osoleve/12b781724b93688b65375b6870aad084 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import verifiers as vf | |
from confusable_homoglyphs import confusables | |
from datasets import Dataset | |
def corrupt_text(text: str, max_replacement_rate: float = 0.02) -> str:
    """Corrupt ``text`` by swapping some characters for homoglyphs.

    At most ``int(len(text) * max_replacement_rate)`` characters are replaced.
    Only non-whitespace characters that have an entry in
    ``confusables.confusables_data`` are eligible, and each eligible position
    is replaced at most once. The budget is capped at the number of eligible
    positions, so a high rate on a short or mostly non-replaceable text does
    not raise ``IndexError``.

    Args:
        text: The text to corrupt.
        max_replacement_rate: Upper bound on the number of replacements,
            expressed as a fraction of ``len(text)`` (whitespace counts
            towards the length).

    Returns:
        The corrupted text, or ``text`` unchanged when the budget rounds down
        to zero (or is negative) or when no character is eligible.
    """
    budget = int(len(text) * max_replacement_rate)
    if budget <= 0:
        # Nothing to replace; skip scanning the text altogether.
        return text

    candidates = [
        index
        for index, char in enumerate(text)
        if not char.isspace() and char in confusables.confusables_data
    ]
    if not candidates:
        return text

    table = confusables.confusables_data
    # Work on a list of slots so positions stay valid even if a replacement
    # is longer than one character (presumably possible for some entries).
    pieces = list(text)
    for index in random.sample(candidates, min(budget, len(candidates))):
        pieces[index] = random.choice(table[pieces[index]])["c"]
    return "".join(pieces)
def corrupt_example(example, column, max_replacement_rate):
    """Overwrite ``example[column]`` with its homoglyph-corrupted version.

    The value stored under ``column`` is passed through ``corrupt_text`` with
    the given ``max_replacement_rate`` and written back to the same key. The
    same ``example`` object is returned after being modified in place.
    """
    original_value = example[column]
    example[column] = corrupt_text(original_value, max_replacement_rate)
    return example
def corrupt_column(dataset: Dataset, column: str, max_replacement_rate=0.02):
    """Return ``dataset.map(...)`` with ``column`` corrupted by homoglyphs.

    Every example is passed through ``corrupt_example`` for the given
    ``column`` and ``max_replacement_rate``.
    """

    def _corrupt(example):
        # Bind the column and rate so ``map`` only has to supply the example.
        return corrupt_example(example, column, max_replacement_rate)

    return dataset.map(_corrupt)
def corrupt_environment(
    env: vf.Environment, input_column: str = "text", max_replacement_rate: float = 0.02
) -> vf.Environment:
    """Corrupt one column of ``env.dataset`` with homoglyphs.

    ``env.dataset`` is replaced by the result of ``corrupt_column`` applied
    to ``input_column`` with the given ``max_replacement_rate``. The same
    ``env`` object is returned after its ``dataset`` attribute is reassigned.
    """
    corrupted_dataset = corrupt_column(
        env.dataset, input_column, max_replacement_rate=max_replacement_rate
    )
    env.dataset = corrupted_dataset
    return env
if __name__ == "__main__":
    # Example usage. With the default 2% rate this 13-character string gets a
    # budget of int(13 * 0.02) == 0 replacements and would be printed
    # unchanged, so pass a larger rate to make the corruption visible.
    text = "Hello, world!"
    corrupted = corrupt_text(text, max_replacement_rate=0.25)
    print(corrupted)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment