@skorotkiewicz
Created December 15, 2025 11:24
Generate a training dataset for a password-generator RNN/LLM
import json
import random
import string

from faker import Faker

fake = Faker()

# Configuration
OUTPUT_FILE = "json_password_train.jsonl"
NUM_SAMPLES = 10000  # A higher count helps the model learn the JSON structure reliably

# Vocabulary
NOUNS = ['apple', 'tiger', 'ocean', 'mountain', 'falcon', 'pilot', 'river', 'stone', 'sky', 'lion']
VERBS = ['jump', 'fly', 'swim', 'run', 'walk', 'sleep', 'code', 'hack']  # currently unused
ADJECTIVES = ['red', 'fast', 'blue', 'silent', 'brave', 'calm', 'dark', 'green', 'happy', 'cool']
SYMBOLS = ['!', '@', '#', '$', '%', '^', '&', '*']


def generate_pattern_password(include_nums, include_syms, include_caps, defined_pattern=None):
    # defined_pattern is accepted but unused; for this training data we
    # simulate the result of a pattern template directly.
    adj = random.choice(ADJECTIVES)
    noun = random.choice(NOUNS)
    if include_caps:
        adj = adj.capitalize()
        noun = noun.capitalize()

    # Construct the password from its components
    parts = [adj, noun]
    if include_nums:
        parts.append(str(random.randint(10, 99)))
    if include_syms:
        parts.append(random.choice(SYMBOLS))
    password = "".join(parts)

    # Generate the pattern string that describes this password for the JSON,
    # e.g. "[adjective][noun][number]"
    pattern_str = "[adjective][noun]"
    if include_nums:
        pattern_str += "[number]"
    if include_syms:
        pattern_str += "[symbol]"
    return password, pattern_str


def generate_phrase_password(length, caps):
    words = fake.sentence(nb_words=3).replace(".", "").split()
    if caps:
        words = [w.capitalize() for w in words]
    else:
        words = [w.lower() for w in words]
    password = "".join(words)
    # Enforce the length constraint (simplified for training data)
    return password[:length]


def generate_dev_password(length):
    chars = string.ascii_letters + string.digits + "!@#$%"
    return "".join(random.choice(chars) for _ in range(length))


def create_sample():
    # Randomize constraints
    p_type = random.choice(["pattern", "phrase", "developer"])  # lowercase for JSON values
    length = random.randint(8, 20)  # enforced for phrase/developer passwords only
    inc_num = random.choice([True, False])
    inc_sym = random.choice([True, False])
    inc_cap = random.choice([True, False])

    password = ""
    pattern_template = ""  # empty unless the sample is pattern-based
    if p_type == "pattern":
        password, pattern_template = generate_pattern_password(inc_num, inc_sym, inc_cap)
    elif p_type == "phrase":
        password = generate_phrase_password(length, inc_cap)
    elif p_type == "developer":
        password = generate_dev_password(length)

    # Build the exact JSON input object the user will provide
    input_json = {
        "passwordType": p_type,
        "length": length,
        "includeNumbers": inc_num,
        "includeSymbols": inc_sym,
        "includeCaps": inc_cap,
        "pattern": pattern_template,
    }
    json_str = json.dumps(input_json)

    # RWKV-style training format: a "\n\n" separator tells the model where the
    # JSON input ends and the password begins. The training text includes the answer.
    full_text = f"{json_str}\n\n{password}"
    return {"text": full_text}


print(f"Generating {NUM_SAMPLES} samples...")
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    for _ in range(NUM_SAMPLES):
        f.write(json.dumps(create_sample()) + "\n")
print(f"Done! File saved: {OUTPUT_FILE}")
@skorotkiewicz commented Dec 15, 2025

from transformers import (
    GPTNeoForCausalLM,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

# === Load the generated dataset ===
dataset = load_dataset("json", data_files="json_password_train.jsonl", split="train")

# === Load tokenizer and model ===
tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-125M")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# === Tokenize ===
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# The collator copies input_ids into labels (mlm=False = causal LM) so the
# Trainer can compute a loss; without it, training fails for lack of labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# === Training arguments ===
training_args = TrainingArguments(
    output_dir="./gpt-neo-password",
    overwrite_output_dir=True,
    num_train_epochs=1,             # Increase later if needed
    per_device_train_batch_size=1,  # GTX 970 -> tiny batch
    save_steps=500,
    save_total_limit=2,
    fp16=True,                      # Mixed precision; set False on GPUs without fast fp16 support (e.g. GTX 970)
    logging_steps=50,
    learning_rate=5e-5,             # Small LR for fine-tuning
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# === Train ===
trainer.train()
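
After training (and a final trainer.save_model(), which writes the model to output_dir), the fine-tune can be sanity-checked by prompting with the same JSON + "\n\n" layout used in the training text. A minimal sketch; the checkpoint path and sampling settings below are assumptions, not part of the original script:

from transformers import GPTNeoForCausalLM, GPT2TokenizerFast

# Path assumes a final trainer.save_model() wrote to ./gpt-neo-password;
# otherwise point at a checkpoint-* subdirectory inside it.
tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-125M")
model = GPTNeoForCausalLM.from_pretrained("./gpt-neo-password")
model.eval()

# Prompt uses the same JSON + "\n\n" layout as the training data
prompt = ('{"passwordType": "pattern", "length": 12, "includeNumbers": true, '
          '"includeSymbols": true, "includeCaps": true, '
          '"pattern": "[adjective][noun][number][symbol]"}\n\n')
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(
    **inputs,
    max_new_tokens=16,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)

# Drop the prompt tokens; what remains is the generated password
generated = output[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(generated, skip_special_tokens=True).strip())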
