@skorotkiewicz
Created December 15, 2025 11:24
Generate a training dataset for a password-generator RNN/LLM
import json
import random
import string

from faker import Faker

fake = Faker()

# Configuration
OUTPUT_FILE = "json_password_train.jsonl"
NUM_SAMPLES = 10000  # A higher count helps the model learn the JSON structure reliably

# Vocabulary
NOUNS = ['apple', 'tiger', 'ocean', 'mountain', 'falcon', 'pilot', 'river', 'stone', 'sky', 'lion']
VERBS = ['jump', 'fly', 'swim', 'run', 'walk', 'sleep', 'code', 'hack']  # currently unused
ADJECTIVES = ['red', 'fast', 'blue', 'silent', 'brave', 'calm', 'dark', 'green', 'happy', 'cool']
SYMBOLS = ['!', '@', '#', '$', '%', '^', '&', '*']


def generate_pattern_password(include_nums, include_syms, include_caps, defined_pattern=None):
    # defined_pattern is accepted but unused; for this training data we
    # simulate the result of a pattern template directly.
    adj = random.choice(ADJECTIVES)
    noun = random.choice(NOUNS)
    if include_caps:
        adj = adj.capitalize()
        noun = noun.capitalize()

    # Construct the password from its components
    parts = [adj, noun]
    if include_nums:
        parts.append(str(random.randint(10, 99)))
    if include_syms:
        parts.append(random.choice(SYMBOLS))
    password = "".join(parts)

    # Generate the pattern string that describes this password for the JSON,
    # e.g. "[adjective][noun][number]"
    pattern_str = "[adjective][noun]"
    if include_nums:
        pattern_str += "[number]"
    if include_syms:
        pattern_str += "[symbol]"
    return password, pattern_str


def generate_phrase_password(length, caps):
    words = fake.sentence(nb_words=3).replace(".", "").split()
    if caps:
        words = [w.capitalize() for w in words]
    else:
        words = [w.lower() for w in words]
    password = "".join(words)
    # Enforce the length constraint (simplified for training data)
    return password[:length]


def generate_dev_password(length):
    chars = string.ascii_letters + string.digits + "!@#$%"
    return "".join(random.choice(chars) for _ in range(length))


def create_sample():
    # Randomize constraints
    p_type = random.choice(["pattern", "phrase", "developer"])  # lowercase for JSON values
    length = random.randint(8, 20)  # enforced for phrase/developer passwords only
    inc_num = random.choice([True, False])
    inc_sym = random.choice([True, False])
    inc_cap = random.choice([True, False])

    password = ""
    pattern_template = ""  # empty unless the sample is pattern-based
    if p_type == "pattern":
        password, pattern_template = generate_pattern_password(inc_num, inc_sym, inc_cap)
    elif p_type == "phrase":
        password = generate_phrase_password(length, inc_cap)
    elif p_type == "developer":
        password = generate_dev_password(length)

    # Build the exact JSON input object the user will provide
    input_json = {
        "passwordType": p_type,
        "length": length,
        "includeNumbers": inc_num,
        "includeSymbols": inc_sym,
        "includeCaps": inc_cap,
        "pattern": pattern_template,
    }
    json_str = json.dumps(input_json)

    # RWKV-style training format: a "\n\n" separator tells the model where the
    # JSON input ends and the password begins. The training text includes the answer.
    full_text = f"{json_str}\n\n{password}"
    return {"text": full_text}


print(f"Generating {NUM_SAMPLES} samples...")
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    for _ in range(NUM_SAMPLES):
        f.write(json.dumps(create_sample()) + "\n")
print(f"Done! File saved: {OUTPUT_FILE}")
@skorotkiewicz commented Dec 15, 2025

from transformers import (
    GPTNeoForCausalLM,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

# === Load the generated dataset ===
dataset = load_dataset("json", data_files="json_password_train.jsonl", split="train")

# === Load tokenizer and model ===
tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-125M")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# === Tokenize ===
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# The collator copies input_ids into labels (mlm=False = causal LM) so the
# Trainer can compute a loss; without it, training fails for lack of labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# === Training arguments ===
training_args = TrainingArguments(
    output_dir="./gpt-neo-password",
    overwrite_output_dir=True,
    num_train_epochs=1,             # Increase later if needed
    per_device_train_batch_size=1,  # GTX 970 -> tiny batch
    save_steps=500,
    save_total_limit=2,
    fp16=True,                      # Mixed precision; set False on GPUs without fast fp16 support (e.g. GTX 970)
    logging_steps=50,
    learning_rate=5e-5,             # Small LR for fine-tuning
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# === Train ===
trainer.train()
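
After training (and a final trainer.save_model(), which writes the model to output_dir), the fine-tune can be sanity-checked by prompting with the same JSON + "\n\n" layout used in the training text. A minimal sketch; the checkpoint path and sampling settings below are assumptions, not part of the original script:

from transformers import GPTNeoForCausalLM, GPT2TokenizerFast

# Path assumes a final trainer.save_model() wrote to ./gpt-neo-password;
# otherwise point at a checkpoint-* subdirectory inside it.
tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-125M")
model = GPTNeoForCausalLM.from_pretrained("./gpt-neo-password")
model.eval()

# Prompt uses the same JSON + "\n\n" layout as the training data
prompt = ('{"passwordType": "pattern", "length": 12, "includeNumbers": true, '
          '"includeSymbols": true, "includeCaps": true, '
          '"pattern": "[adjective][noun][number][symbol]"}\n\n')
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(
    **inputs,
    max_new_tokens=16,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)

# Drop the prompt tokens; what remains is the generated password
generated = output[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(generated, skip_special_tokens=True).strip())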
