Created
May 25, 2025 21:10
-
-
Save capttwinky/c804423b0dfb76da108f34e5b4c17e6e to your computer and use it in GitHub Desktop.
banned word variant generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# /// script | |
# requires-python = ">=3.11" | |
# dependencies = [ | |
# "pyleetspeak", | |
# "zalgo_text", | |
# "requests", | |
# "pyphen", | |
# "tqdm" | |
# ] | |
# /// | |
""" | |
username_mangler.py - A tool for generating strings that contain obfuscated banned words.
Variants are produced with homoglyph substitution, leetspeak (via pyleetspeak), symbol injection,
Unicode decoration (e.g. Zalgo via zalgo_text), emoji insertion, and case swapping, and can
optionally be stored in an SQLite database.
Intended for moderation and filter testing only.
""" | |
import argparse
import itertools
import random
import sqlite3
import time
from contextlib import closing
from os import path
from typing import List, Set, Dict, Optional, Tuple

import requests
from pyleetspeak.LeetSpeaker import LeetSpeaker
from zalgo_text.zalgo import zalgo
# Look-alike replacements for each lowercase ASCII letter (Cyrillic, Greek,
# accented Latin, digits, symbols and multi-character ASCII art such as '|-|').
# Each list must not contain duplicates: apply_homoglyphs() enumerates the
# cartesian product of these lists, so a repeated entry only yields repeated
# variants and wastes the variant budget.
HOMOGLYPHS: Dict[str, List[str]] = {
    'a': ['@', '4', 'à', 'á', 'â', 'ä', 'æ', 'ã', 'å', 'α', 'а'],
    'b': ['8', 'ß', 'ь', 'в', 'Ъ', 'β', '฿'],
    'c': ['(', '{', '[', '<', '¢', 'с', 'ċ', 'ć', 'ç'],
    'd': ['đ', 'cl', 'ԁ', 'ժ', 'ď'],
    'e': ['3', '€', 'ë', 'ē', 'è', 'é', 'ê', '∑', 'е'],
    'f': ['ƒ', 'ғ'],
    'g': ['9', 'ɢ', 'ğ', 'ġ', 'ģ', 'ɡ', 'ɠ'],
    'h': ['#', 'ħ', 'н', '|-|'],
    'i': ['1', '!', '|', 'í', 'ï', 'î', 'ī', '¡', 'ι', 'і', 'ӏ', 'ɪ'],
    'j': ['¿', 'ј', 'ʝ'],
    'k': ['κ', 'к', '|<', '|{'],
    'l': ['1', '|', '£', '∣', 'ℓ', 'ł', 'ι', 'ӏ'],
    'm': ['м', 'ɱ', '₥'],
    # The original list repeated 'п'; the duplicate has been removed.
    'n': ['η', 'и', 'п', 'ñ', 'ŋ'],
    'o': ['0', '()', 'oh', 'ø', 'ö', 'ó', 'ò', 'ô', 'õ', 'œ', 'σ', 'о'],
    'p': ['ρ', 'р', 'þ', '|*'],
    'q': ['9', 'ԛ', 'φ'],
    'r': ['®', 'я', 'ř', 'г'],
    's': ['$', '5', '§', 'ś', 'š', 'ѕ'],
    't': ['7', '+', '†', 'т', 'ţ', 'ŧ'],
    'u': ['υ', 'ц', 'µ', 'ü', 'ú', 'ù', 'û', 'ū'],
    'v': ['ν', 'ѵ', 'v̇'],
    'w': ['ω', 'ш', 'щ', 'ŵ'],
    'x': ['×', '%', '*', 'х', 'ж'],
    'y': ['¥', 'γ', 'у', 'ý', 'ÿ'],
    'z': ['2', 'ž', 'ż', 'ź', 'ʐ', 'ᴢ'],
}
# Characters that insert_injections() may splice into a word. The visible
# separators come first, followed by the invisible (zero-width) code points;
# the order is kept stable because random.choice() indexes into this list.
_VISIBLE_SEPARATORS: List[str] = [
    '_', '.', '-', '~', '*', '^', '!', '+',
    '=', '|', '/', '\\', ':', ';', "'", '"',
    '·', '•', '¿', '?', '`', ',', '—', '–', '‒', '―',
    '…', '¡', '¤', '₋', '₌', '₍', '₎',
]
_INVISIBLE_SEPARATORS: List[str] = [
    '\u200B',  # Zero-width space
    '\u200C',  # Zero-width non-joiner
    '\u200D',  # Zero-width joiner
    '\u2060',  # Word joiner
]
INJECTION_CHARS: List[str] = _VISIBLE_SEPARATORS + _INVISIBLE_SEPARATORS

# Emojis available to insert_emojis() (sample set, extend as needed).
EMOJIS: List[str] = ['😈', '💀', '🤖', '👾', '🔥', '✨', '🥷']

# Default SQLite database file used by the storage helpers.
DB_PATH = 'usernames.db'
def fetch_banned_words() -> List[str]:
    """
    Fetches a list of banned words from a GitHub repository or a local cache.

    The cache file path is read from the ``fetch_banned_words.word_file_path``
    function attribute. When that file exists it is used and no network
    request is made; otherwise the list is downloaded and written to it.

    Returns:
        List[str]: The non-empty banned words, or an empty list when the
        download fails (the error is printed).
    """
    url = "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
    cache_path = fetch_banned_words.word_file_path

    if path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as cache_file:
            # Drop blank lines: an empty "word" cannot be obfuscated and would
            # break the insertion helpers downstream.
            return [line.strip() for line in cache_file.read().splitlines() if line.strip()]

    try:
        # Without a timeout requests waits indefinitely on a stalled connection.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching word list: {e}")
        return []

    # Strip first so that indented comment lines are recognised as comments too.
    stripped_lines = (line.strip() for line in response.text.splitlines())
    words = [line for line in stripped_lines if line and not line.startswith("#")]

    with open(cache_path, 'w', encoding='utf-8') as cache_file:
        cache_file.write('\n'.join(words))
    return words


# Location of the on-disk cache; callers may rebind this attribute.
fetch_banned_words.word_file_path = path.join('bannedlist.txt')
def insert_injections(word: str, max_insertions: int = 2) -> Set[str]:
    """
    Inserts random injection characters into a word to obfuscate it.

    Five attempts are made; each attempt inserts between 1 and
    ``max_insertions`` characters drawn from ``INJECTION_CHARS``. Duplicate
    results collapse, so fewer than five variants may be returned.

    Args:
        word (str): The base word.
        max_insertions (int): Maximum number of injection characters per variant.
    Returns:
        Set[str]: A set of obfuscated word variants. Empty when ``word`` is
        empty or ``max_insertions`` is less than 1.
    """
    # Guard the degenerate cases: random.randint(1, 0) raises ValueError.
    if not word or max_insertions < 1:
        return set()

    variants: Set[str] = set()
    for _attempt in range(5):
        chars = list(word)
        insertion_count = random.randint(1, max_insertions)
        for _insertion in range(insertion_count):
            # Positions start at 1, so nothing is ever placed before the first
            # character; position == len(chars) appends at the end.
            position = random.randint(1, len(chars))
            chars.insert(position, random.choice(INJECTION_CHARS))
        variants.add(''.join(chars))
    return variants
def apply_homoglyphs(word: str, max_variants: int = 100) -> Set[str]:
    """
    Builds homoglyph-substituted variants of a word.

    Every character may stay as it is or be replaced by any entry that
    ``HOMOGLYPHS`` lists for its lowercase form. The combinations are taken
    from ``itertools.product`` in its native order and only the first
    ``max_variants`` of them are kept.

    Args:
        word (str): The base word.
        max_variants (int): Maximum number of combinations to take.
    Returns:
        Set[str]: Set of homoglyph-substituted variants.
    """
    # One list of candidates per position: the original character first,
    # followed by its look-alikes (if any).
    choices_per_position = [
        [original, *HOMOGLYPHS.get(original.lower(), [])]
        for original in word
    ]
    # NOTE: itertools.product advances the rightmost position fastest, so for
    # long words the kept prefix of combinations mostly varies trailing characters.
    budget = max(max_variants, 0)
    first_combinations = itertools.islice(
        itertools.product(*choices_per_position), budget
    )
    return {''.join(combination) for combination in first_combinations}
def apply_leetspeak(word: str) -> str:
    """
    Produces a leetspeak rendering of a word via pyleetspeak.

    A fresh ``LeetSpeaker`` is built on every call with the fixed settings
    shown below.

    Args:
        word (str): The base word.
    Returns:
        str: The leetspeak variant.
    """
    return LeetSpeaker(
        change_prb=0.8,
        change_frq=0.6,
        mode="basic",
        seed=None,
        verbose=False,
    ).text2leet(word)
def apply_zalgo(word: str) -> str:
    """
    Decorates a word with zalgo text using the zalgo_text package.

    Args:
        word (str): The base word.
    Returns:
        str: The zalgo-ified variant.
    """
    zalgo_generator = zalgo()
    return zalgo_generator.zalgofy(word)
def insert_emojis(word: str, emoji_mode: int = 1) -> Set[str]:
    """
    Optionally inserts emojis into a word to obfuscate it.

    Whether anything is produced at all is decided by a single random draw
    against the probability of the selected mode (10%, 50% or 100%). When the
    draw succeeds, one or two variants are built, each with one or two emojis
    from ``EMOJIS`` inserted after the first character or later.

    Args:
        word (str): The base word.
        emoji_mode (int): 0 for no emoji, 1 for rare, 2 for moderate, 3 for frequent.
            Unknown modes fall back to the "rare" probability.
    Returns:
        Set[str]: Set of emoji-obfuscated variants, or empty set if not applied
        (including when ``word`` is empty).
    """
    # An empty word has no valid insertion position: random.randint(1, 0)
    # would raise ValueError further down.
    if emoji_mode == 0 or not word:
        return set()

    probabilities = {1: 0.10, 2: 0.5, 3: 1.0}
    prob = probabilities.get(emoji_mode, 0.10)
    if random.random() > prob:
        return set()

    variants: Set[str] = set()
    for _variant in range(random.randint(1, 2)):
        chars = list(word)
        for _emoji in range(random.randint(1, 2)):
            # Positions start at 1, so the first character always stays in front.
            position = random.randint(1, len(chars))
            chars.insert(position, random.choice(EMOJIS))
        variants.add(''.join(chars))
    return variants
def swap_cases(word: str, max_swaps: int = 2) -> str:
    """
    Flips the case of randomly chosen characters in a word.

    ``min(max_swaps, len(word))`` distinct positions are picked at random.
    A picked lowercase character is upper-cased, a picked uppercase character
    is lower-cased, and a picked character without case is left unchanged.

    Args:
        word (str): The base word.
        max_swaps (int): Maximum number of characters to swap case.
    Returns:
        str: Variant with swapped case.
    """
    swap_count = min(max_swaps, len(word))
    if word and swap_count > 0:
        chosen_positions = random.sample(list(range(len(word))), swap_count)
    else:
        # Empty word or non-positive budget: nothing is selected.
        chosen_positions = []

    swapped = []
    for position, char in enumerate(word):
        if position in chosen_positions:
            if char.islower():
                char = char.upper()
            elif char.isupper():
                char = char.lower()
        swapped.append(char)
    return ''.join(swapped)
# --- SQLite Integration --- | |
def init_db(db_path: str = DB_PATH) -> None:
    """
    Initializes the SQLite database for storing usernames.

    Creates the ``usernames`` table if it does not exist yet.

    Args:
        db_path (str): Path to the SQLite database file.
    """
    # The sqlite3 connection context manager only commits or rolls back the
    # transaction; closing() is what actually closes the connection afterwards.
    with closing(sqlite3.connect(db_path)) as conn, conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS usernames (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                base_word TEXT,
                obfuscated TEXT,
                method TEXT,
                timestamp INTEGER
            )
        ''')
def save_username(base_word: str, obfuscated: str, method: str, db_path: str = DB_PATH) -> None:
    """
    Saves an obfuscated username variant to the SQLite database.

    The row is stamped with the current Unix time in whole seconds.

    Args:
        base_word (str): The original word.
        obfuscated (str): The obfuscated variant.
        method (str): The obfuscation method used.
        db_path (str): Path to the SQLite database file.
    """
    # This function is called once per variant, so the connection must be
    # closed explicitly: the sqlite3 context manager alone only ends the
    # transaction and would leave one open handle per call.
    with closing(sqlite3.connect(db_path)) as conn, conn:
        conn.execute('''
            INSERT INTO usernames (base_word, obfuscated, method, timestamp)
            VALUES (?, ?, ?, ?)
        ''', (base_word, obfuscated, method, int(time.time())))
def query_usernames(db_path: str = DB_PATH, filter_word: Optional[str] = None, limit: int = 20) -> List[Tuple[str, str, str, str]]:
    """
    Queries obfuscated usernames from the SQLite database, newest first.

    Args:
        db_path (str): Path to the SQLite database file.
        filter_word (Optional[str]): Substring to look for in the base word or
            the obfuscated value (matched with SQL ``LIKE``). When empty or
            None, no filter is applied.
        limit (int): Maximum number of results to return.
    Returns:
        List[Tuple[str, str, str, str]]: List of tuples with
        (base_word, obfuscated, method, timestamp), where the timestamp is
        formatted by SQLite's ``datetime(..., 'unixepoch')``.
    """
    if filter_word:
        pattern = f'%{filter_word}%'
        query = '''
            SELECT base_word, obfuscated, method, datetime(timestamp, 'unixepoch')
            FROM usernames
            WHERE base_word LIKE ? OR obfuscated LIKE ?
            ORDER BY timestamp DESC
            LIMIT ?
        '''
        params: Tuple = (pattern, pattern, limit)
    else:
        query = '''
            SELECT base_word, obfuscated, method, datetime(timestamp, 'unixepoch')
            FROM usernames
            ORDER BY timestamp DESC
            LIMIT ?
        '''
        params = (limit,)

    # closing() releases the connection; the sqlite3 context manager by itself
    # would only finish the transaction and leave the handle open.
    with closing(sqlite3.connect(db_path)) as conn:
        return conn.execute(query, params).fetchall()
def generate_usernames(
    base_words: List[str],
    limit_per_word: int = 5,
    seed: Optional[int] = None,
    save_to_db: bool = False,
    emoji_mode: int = 1
) -> Dict[str, Set[str]]:
    """
    Generates obfuscated username variants for each base word.

    For every base word, up to ``limit_per_word`` homoglyph variants are
    sampled, and each of those is then run through injection, leetspeak,
    zalgo, emoji insertion and case swapping. The result for a word is the
    union of everything those steps produced, so it can contain more than
    ``limit_per_word`` entries.

    Args:
        base_words (List[str]): List of words to obfuscate.
        limit_per_word (int): Maximum number of homoglyph variants sampled per word.
        seed (Optional[int]): Random seed for reproducibility.
        save_to_db (bool): Whether to save variants to SQLite.
        emoji_mode (int): Emoji insertion mode (0-3).
    Returns:
        Dict[str, Set[str]]: Dictionary mapping base words to sets of obfuscated variants.
    """
    if seed is not None:
        random.seed(seed)

    usernames: Dict[str, Set[str]] = {}
    for word in base_words:
        word_variants: Set[str] = set()

        # Sort before sampling: the iteration order of a set of strings changes
        # between interpreter runs (hash randomisation), which would make the
        # sample differ even with a fixed seed.
        homoglyph_pool = sorted(apply_homoglyphs(word, max_variants=100))
        # Clamp to [0, len(pool)] so random.sample never raises ValueError.
        sample_size = max(0, min(limit_per_word, len(homoglyph_pool)))

        for variant in random.sample(homoglyph_pool, sample_size):
            # The list elements are evaluated top to bottom, which keeps the
            # order in which the steps consume random numbers fixed.
            generated = [
                ('injection', insert_injections(variant)),
                ('leetspeak', {apply_leetspeak(variant)}),
                ('zalgo', {apply_zalgo(variant)}),
                ('emoji', insert_emojis(variant, emoji_mode=emoji_mode)),
                ('case', {swap_cases(variant)}),
            ]
            for method, results in generated:
                if save_to_db:
                    # Sorted so rows are inserted in a stable order.
                    for obfuscated in sorted(results):
                        save_username(word, obfuscated, method)
                word_variants.update(results)

        usernames[word] = word_variants
    return usernames
def main() -> None:
    """
    Parses command-line arguments and runs the username mangler tool.

    With ``--sqlite --query`` the stored rows are printed and nothing is
    generated. Otherwise banned words are loaded, a random subset of them is
    obfuscated, and the result is written to ``--output`` or to stdout.
    """
    parser = argparse.ArgumentParser(description="Generate obfuscated usernames for filter testing.")
    parser.add_argument('--count', type=int, default=10, help='Number of banned words to use')
    parser.add_argument('--variants', type=int, default=5, help='Variants per word')
    parser.add_argument('--seed', type=int, default=None, help='Random seed for reproducibility')
    parser.add_argument('--output', type=str, help='Output file (default: stdout)')
    parser.add_argument('--sqlite', action='store_true', help='Store results in SQLite database')
    parser.add_argument('--query', type=str, help='Query the SQLite database for usernames (use with --sqlite)')
    parser.add_argument('--limit', type=int, default=20, help='Limit results for --query')
    parser.add_argument('--emoji', type=int, default=1, choices=[0, 1, 2, 3],
                        help='Emoji variant rarity: 0 (never), 1 (rare - default), 2 (moderate), 3 (common)')
    args = parser.parse_args()

    # --query reads from the database; without --sqlite it used to be ignored
    # silently and a full generation run started instead.
    if args.query and not args.sqlite:
        parser.error('--query requires --sqlite')

    if args.sqlite:
        init_db()
    if args.query:
        results = query_usernames(filter_word=args.query, limit=args.limit)
        for base_word, obfuscated, method, timestamp in results:
            print(f'{timestamp}\t{base_word}\t{obfuscated}\t{method}')
        return

    if args.seed is not None:
        random.seed(args.seed)
    bad_words = fetch_banned_words()
    if not bad_words:
        print("No banned words loaded. Exiting.")
        return

    # random.sample raises ValueError for a negative size or one larger than
    # the population, so clamp --count to what is actually available.
    word_count = max(0, min(args.count, len(bad_words)))
    usernames = generate_usernames(
        random.sample(bad_words, word_count),
        limit_per_word=args.variants,
        seed=args.seed,
        save_to_db=args.sqlite,
        emoji_mode=args.emoji
    )

    # One base word per line, followed by its variants indented with a tab.
    out: List[str] = []
    for word, names in sorted(usernames.items()):
        out.append(word)
        out.extend(f'\t{name}' for name in sorted(names))

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write('\n'.join(out))
    else:
        print('\n'.join(out))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment