Created
December 15, 2025 11:24
-
-
Save skorotkiewicz/4f1b4dbbde61312423d5710e1314c825 to your computer and use it in GitHub Desktop.
Generate a training dataset for a password-generator RNN/LLM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import random | |
| import string | |
| from faker import Faker | |
| fake = Faker() | |
# --- Output settings ---
# Path of the JSON Lines file the generation loop writes to.
OUTPUT_FILE = "json_password_train.jsonl"
# Number of records to emit; a higher count helps the model learn the JSON structure.
NUM_SAMPLES = 10000

# --- Word and symbol pools sampled by the password generators ---
NOUNS = [
    'apple', 'tiger', 'ocean', 'mountain', 'falcon',
    'pilot', 'river', 'stone', 'sky', 'lion',
]
VERBS = [
    'jump', 'fly', 'swim', 'run',
    'walk', 'sleep', 'code', 'hack',
]
ADJECTIVES = [
    'red', 'fast', 'blue', 'silent', 'brave',
    'calm', 'dark', 'green', 'happy', 'cool',
]
# One list entry per symbol character.
SYMBOLS = list("!@#$%^&*")
def generate_pattern_password(include_nums, include_syms, include_caps, defined_pattern=None):
    """Build an adjective+noun password together with the template describing it.

    Args:
        include_nums: When truthy, append a random two-digit number (10-99).
        include_syms: When truthy, append one random entry from SYMBOLS.
        include_caps: When truthy, capitalize the adjective and the noun.
        defined_pattern: Accepted for interface compatibility; not read here.

    Returns:
        tuple[str, str]: ``(password, pattern)`` where ``pattern`` is a string
        such as ``"[adjective][noun][number]"`` listing the components used.
    """
    # The adjective is drawn before the noun; the list literal evaluates left to right.
    words = [random.choice(ADJECTIVES), random.choice(NOUNS)]
    if include_caps:
        words = [word.capitalize() for word in words]

    # Grow the password pieces and the descriptive template side by side so
    # the two can never disagree about which components are present.
    pieces = list(words)
    template = "[adjective][noun]"
    if include_nums:
        pieces.append(str(random.randint(10, 99)))
        template += "[number]"
    if include_syms:
        pieces.append(random.choice(SYMBOLS))
        template += "[symbol]"

    return "".join(pieces), template
def generate_phrase_password(length, caps):
    """Build a password by concatenating the words of a Faker sentence.

    Args:
        length: Maximum number of characters to keep.
        caps: When truthy each word is capitalized; otherwise each is lowercased.

    Returns:
        str: The words joined without separators, cut to at most ``length``
        characters. The result may be shorter than ``length``; nothing is
        padded.
    """
    sentence = fake.sentence(nb_words=3)
    # Remove periods before splitting the sentence on whitespace.
    tokens = sentence.replace(".", "").split()

    recase = str.capitalize if caps else str.lower
    joined = "".join(recase(token) for token in tokens)

    # Slicing only enforces an upper bound on the length.
    return joined[:length]
def generate_dev_password(length):
    """Return ``length`` characters drawn at random from a fixed alphabet.

    The alphabet is ASCII letters (both cases), digits, and the symbols
    ``!@#$%``. Each character is picked independently with ``random.choice``.

    Args:
        length: Number of characters to generate.

    Returns:
        str: The generated password (empty when ``length`` is 0).
    """
    alphabet = string.ascii_letters + string.digits + "!@#$%"
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)
def create_sample():
    """Produce one training record: a JSON constraint object followed by a password.

    The constraints (type, length, and the three include-flags) are chosen at
    random, a password is generated by the generator matching the chosen type,
    and both are combined into a single text field.

    Returns:
        dict: ``{"text": ...}`` where the text is the serialized JSON object,
        a blank line, and then the password. The blank line is the separator
        that marks where the JSON ends and the password begins.
    """
    # Draw every constraint up front, in a fixed order.
    p_type = random.choice(["pattern", "phrase", "developer"])  # lowercase JSON values
    length = random.randint(8, 20)
    inc_num, inc_sym, inc_cap = [random.choice([True, False]) for _ in range(3)]

    # Only the "pattern" type fills in a template; the others leave it empty.
    pattern_template = ""
    if p_type == "pattern":
        password, pattern_template = generate_pattern_password(inc_num, inc_sym, inc_cap)
    elif p_type == "phrase":
        password = generate_phrase_password(length, inc_cap)
    else:
        # The only remaining choice is "developer".
        password = generate_dev_password(length)

    # JSON input object; insertion order fixes the key order in the output.
    request = {
        "passwordType": p_type,
        "length": length,
        "includeNumbers": inc_num,
        "includeSymbols": inc_sym,
        "includeCaps": inc_cap,
        "pattern": pattern_template,
    }

    # RWKV training format: the prompt JSON and the answer share one text field.
    return {"text": "\n\n".join((json.dumps(request), password))}
# Generate NUM_SAMPLES records and write them to OUTPUT_FILE as JSON Lines
# (one JSON-encoded record per line).
print(f"Generating {NUM_SAMPLES} samples...")
with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_file:
    # The generator is consumed lazily, so records are created one at a time.
    out_file.writelines(
        json.dumps(create_sample()) + "\n" for _ in range(NUM_SAMPLES)
    )
print(f"Done! File saved: {OUTPUT_FILE}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.