Created
June 30, 2025 08:52
-
-
Save ssghost/c9d696dab3fe4eccb624c9a989db3803 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import json
import logging
import re
from collections import OrderedDict
from time import sleep

# Third-party
import requests
def count_xml(text) -> float:
    """Score how closely *text* follows the <reasoning>/<answer> XML layout.

    Each correctly placed tag (appearing exactly once, on its own line) earns
    0.125, and any trailing characters after the closing </answer> tag incur a
    small penalty of 0.001 per character.
    """
    score = 0.0
    if text.count("<reasoning>\n") == 1:
        score += 0.125
    if text.count("\n</reasoning>\n") == 1:
        score += 0.125
    if text.count("\n<answer>\n") == 1:
        score += 0.125
        # Penalize characters that follow the terminal </answer> line
        score -= 0.001 * len(text.split("\n</answer>\n")[-1])
    if text.count("\n</answer>") == 1:
        score += 0.125
        # The "- 1" forgives the single trailing newline after </answer>
        score -= 0.001 * (len(text.split("\n</answer>")[-1]) - 1)
    return score
def get_score(text: str) -> float:
    """Query the ZeroGPT detector and return a "human-ness" score for *text*.

    Returns 100.0 minus ZeroGPT's reported fakePercentage, so higher means
    more human-sounding. Returns 0.0 on any API/network failure so callers
    always receive a float.
    """
    base_url = "https://api.zerogpt.com/api/detect/detectText"
    # OrderedDict preserves header order so the request resembles a real browser.
    headers = OrderedDict((
        ("Host", "api.zerogpt.com"),
        ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:135.0) Gecko/20100101 Firefox/135.0"),
        ("Accept", "application/json, text/plain, */*"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br, zstd"),
        ("Content-Type", "application/json"),
        ("Origin", "https://www.zerogpt.com"),
        ("Connection", "keep-alive"),
        ("Referer", "https://www.zerogpt.com/"),
        ("Sec-Fetch-Dest", "empty"),
        ("Sec-Fetch-Mode", "cors"),
        ("Sec-Fetch-Site", "same-site"),
        ("Priority", "u=0"),
        ("Pragma", "no-cache"),
        ("Cache-Control", "no-cache")
    ))
    data = {
        "input_text": text,
    }
    try:
        # Timeout prevents a hung request from stalling the whole training loop.
        r = requests.post(base_url, headers=headers, json=data, timeout=30)
        j = r.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure or non-JSON body: log, back off briefly, score as 0.
        logging.error(f"ZeroGPT request failed: {e}")
        sleep(1)
        return 0.0
    # .get() avoids a KeyError if the API returns an unexpected payload shape.
    if j.get('code') != 200:
        logging.error(f"Failed to get score. Response: {j}")
        sleep(1)  # Brief back-off before the caller issues the next request
        return 0.0  # Not ideal, but we need to return a float in all cases
    fake_percentage = j["data"]["fakePercentage"]
    return 100.0 - fake_percentage  # Penalize for highly fake content
def extract_answer_content(text: str) -> str | None: | |
"""Extract just the content between <answer> tags""" | |
try: | |
answer = text.split("<answer>")[-1] | |
answer = answer.split("</answer>")[0] | |
return answer.strip() | |
except: | |
return None | |
def extract_reasoning_content(text: str) -> str | None: | |
"""Extract just the content between <reasoning> tags""" | |
try: | |
reasoning = text.split("<reasoning>")[-1] | |
reasoning = reasoning.split("</reasoning>")[0] | |
return reasoning.strip() | |
except: | |
return None | |
def tiered_reward_func(completions, **kwargs) -> list[float]:
    """
    Tiered reward function for ZeroGPT. Rewards are broken down into multiple tiers

    Tier 0 rewards each expected tag appearing exactly once; Tier 1 rewards
    overall format compliance; Tier 2 (gated on a strict whole-response format
    match) adds the ZeroGPT human-ness score and a reasoning-length bonus.
    Returns one float reward per completion, in order.
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = []
    # Anchored: the entire response must be exactly the reasoning/answer layout
    strict_pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>$"
    for response in responses:
        reward = 0.0
        response = response.strip()
        # Tier 0: Individual tag rewards
        if response.count("<reasoning>") == 1:
            reward += 0.1
        if response.count("</reasoning>") == 1:
            reward += 0.1
        if response.count("<answer>") == 1:
            reward += 0.1
        if response.count("</answer>") == 1:
            reward += 0.1
        # Tier 1: Format checking
        # Unanchored search: tags merely in the right order, anywhere in the text
        soft_pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
        if re.search(soft_pattern, response, re.DOTALL):
            reward += 0.25
        xml_score = count_xml(response)
        reward += xml_score
        strict_match = re.match(strict_pattern, response)
        # Tier 2: Only add ZeroGPT score if strict format passes
        if strict_match:
            reward += 0.5
            # Extract just the answer content for check
            answer_content = extract_answer_content(response)
            if answer_content:
                # get_score returns 0-100, so this contributes 0-20
                reward += get_score(answer_content) / 5.0
            # Reward longer reasoning, capped at 500 characters
            reasoning_content = extract_reasoning_content(response)
            if reasoning_content:
                reasoning_len = len(reasoning_content)
                reasoning_len = min(reasoning_len, 500)
                reward += 0.005 * reasoning_len  # Max out at reward of +2.5
        rewards.append(reward)
    # Print the response with the highest reward
    best_idx = rewards.index(max(rewards))
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards
from transformers import pipeline

# Gibberish classifier used to down-weight rewards for nonsense completions.
# Labels (per the sample output below): clean / mild gibberish / word salad / noise.
selected_model = "madhurjindal/autonlp-Gibberish-Detector-492513457"
# Loaded once at import time and reused for every scored line.
# NOTE(review): pipeline() will fetch the model weights on first run — ensure
# this is acceptable in the training environment.
classifier = pipeline("text-classification", model=selected_model)
#classifier("I love Machine Learning!", top_k=100)
# Returns:
#[{'label': 'clean', 'score': 0.713941216468811}, {'label': 'mild gibberish', 'score': 0.1742146760225296}, {'label': 'word salad', 'score': 0.08730163425207138}, {'label': 'noise', 'score': 0.02454242669045925}]
def get_gibberish_score(text: str) -> float:
    """Return a multiplier in [0, 1] reflecting how "clean" (non-gibberish) text is.

    Each non-empty line is classified independently; the result is the square
    of the minimum per-line 'clean' probability, so a single gibberish line
    collapses the whole multiplier.

    Returns 0.0 when the text contains no non-empty lines (the previous
    version raised ValueError from min([]) in that case).
    """
    results = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:  # Skip empty lines
            continue
        clf_response = classifier(line, top_k=100)
        # Find the 'clean' score for this line (0.0 if the label is absent)
        clean_score = 0.0
        for r in clf_response:
            if r['label'] == 'clean':
                clean_score = r['score']
                break
        results.append(clean_score)
    if not results:
        # Nothing scorable: treat as fully gibberish so the reward zeroes out.
        return 0.0
    # Averaging (and even a plain minimum) was gamed by the model, so apply an
    # extra harsh penalty by squaring the minimum clean score.
    return min(results) ** 2
def only_zerogpt_reward_func(completions, **kwargs) -> list[float]:
    """
    Reward function for ZeroGPT. Only uses ZeroGPT API to score completions

    Each completion's ZeroGPT human-ness score is scaled by its gibberish
    multiplier; one float reward is returned per completion, in order.
    """
    responses = [entry[0]["content"] for entry in completions]
    # Score every response first, then compute the gibberish multipliers,
    # so the two API/model passes stay separate.
    zerogpt_scores = [get_score(text) for text in responses]
    gibberish_multipliers = [get_gibberish_score(text) for text in responses]
    # Multiply the rewards by the gibberish multipliers
    rewards = [score * mult for score, mult in zip(zerogpt_scores, gibberish_multipliers)]
    # Print the response with the highest reward
    best_idx = max(range(len(rewards)), key=rewards.__getitem__)
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards
SYSTEM_PROMPT = """
You are a human essay writer. You write like a human, not a language model.
"""

def load_dataset(file="prompts.json"):
    """Load user prompts from a JSON file and wrap them as chat conversations.

    Args:
        file: Path to a JSON file containing a list of prompt strings.

    Returns:
        A list of dicts, each with a "prompt" key holding a two-message chat
        (shared system prompt + one user prompt).
    """
    # Explicit encoding so prompts parse identically across platforms.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [
        {
            "prompt": [
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": d
                }
            ]
        }
        for d in data
    ]
dataset = load_dataset() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment