Skip to content

Instantly share code, notes, and snippets.

@ssghost
Created June 30, 2025 08:52
Show Gist options
  • Save ssghost/c9d696dab3fe4eccb624c9a989db3803 to your computer and use it in GitHub Desktop.
import requests
from collections import OrderedDict
import logging
import re
import json
from time import sleep
def count_xml(text) -> float:
    """Score how closely *text* follows the newline-delimited
    <reasoning>/<answer> XML layout.

    Each correctly-placed marker (appearing exactly once) earns 0.125,
    and any trailing text after the closing </answer> tag is penalized
    at 0.001 per character, so a perfectly-formatted response scores
    just under 0.5.
    """
    score = 0.0
    # Opening/closing reasoning markers: flat 0.125 each when unique.
    for marker in ("<reasoning>\n", "\n</reasoning>\n"):
        if text.count(marker) == 1:
            score += 0.125
    # Answer markers additionally penalize whatever follows </answer>.
    if text.count("\n<answer>\n") == 1:
        score += 0.125
        trailing = text.split("\n</answer>\n")[-1]
        score -= 0.001 * len(trailing)
    if text.count("\n</answer>") == 1:
        score += 0.125
        trailing = text.split("\n</answer>")[-1]
        score -= 0.001 * (len(trailing) - 1)
    return score
def get_score(text: str) -> float:
    """Score *text* with the ZeroGPT AI-content detector.

    Returns 100 minus the API's "fakePercentage", so higher means more
    human-looking. On any failure (network error, non-JSON body, or a
    non-200 API code) logs the problem, sleeps briefly to back off, and
    returns 0.0 so callers always receive a float.
    """
    base_url = "https://api.zerogpt.com/api/detect/detectText"
    # Browser-like headers; OrderedDict preserves the exact header order
    # this endpoint has been observed to accept.
    headers = OrderedDict((
        ("Host", "api.zerogpt.com"),
        ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:135.0) Gecko/20100101 Firefox/135.0"),
        ("Accept", "application/json, text/plain, */*"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br, zstd"),
        ("Content-Type", "application/json"),
        ("Origin", "https://www.zerogpt.com"),
        ("Connection", "keep-alive"),
        ("Referer", "https://www.zerogpt.com/"),
        ("Sec-Fetch-Dest", "empty"),
        ("Sec-Fetch-Mode", "cors"),
        ("Sec-Fetch-Site", "same-site"),
        ("Priority", "u=0"),
        ("Pragma", "no-cache"),
        ("Cache-Control", "no-cache")
    ))
    data = {
        "input_text": text,
    }
    try:
        # Bounded timeout so a hung connection cannot stall training.
        r = requests.post(base_url, headers=headers, json=data, timeout=30)
        j = r.json()  # ValueError if the body is not JSON
    except (requests.RequestException, ValueError) as e:
        logging.error(f"ZeroGPT request failed: {e}")
        sleep(1)
        return 0.0
    if j.get('code') != 200:
        logging.error(f"Failed to get score. Response: {j}")
        sleep(1)
        return 0.0 # Not ideal, but we need to return a float in all cases
    fake_percentage = j["data"]["fakePercentage"]
    return 100.0 - fake_percentage # Penalize for highly fake content
def extract_answer_content(text: str) -> str | None:
"""Extract just the content between <answer> tags"""
try:
answer = text.split("<answer>")[-1]
answer = answer.split("</answer>")[0]
return answer.strip()
except:
return None
def extract_reasoning_content(text: str) -> str | None:
"""Extract just the content between <reasoning> tags"""
try:
reasoning = text.split("<reasoning>")[-1]
reasoning = reasoning.split("</reasoning>")[0]
return reasoning.strip()
except:
return None
def tiered_reward_func(completions, **kwargs) -> list[float]:
    """
    Tiered reward function for ZeroGPT. Rewards are broken down into multiple tiers:

    - Tier 0: +0.1 per tag that appears exactly once (max +0.4).
    - Tier 1: +0.25 for a loose reasoning-then-answer match, plus the
      count_xml formatting score.
    - Tier 2 (only when the strict newline-delimited format matches): +0.5,
      plus the ZeroGPT score scaled from 0-100 down to 0-20, plus a length
      bonus on the reasoning section capped at +2.5.

    NOTE(review): the tier-2 nesting follows the inline comments; confirm the
    reasoning-length bonus was indeed meant to be gated on the strict match.
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = []
    strict_pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>$"
    # Hoisted out of the loop: the soft pattern never changes per response.
    soft_pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    for response in responses:
        reward = 0.0
        response = response.strip()
        # Tier 0: individual tag rewards
        for tag in ("<reasoning>", "</reasoning>", "<answer>", "</answer>"):
            if response.count(tag) == 1:
                reward += 0.1
        # Tier 1: format checking
        if re.search(soft_pattern, response, re.DOTALL):
            reward += 0.25
        reward += count_xml(response)
        # Tier 2: Only add ZeroGPT score if strict format passes
        if re.match(strict_pattern, response):
            reward += 0.5
            # Extract just the answer content for the detector check
            answer_content = extract_answer_content(response)
            if answer_content:
                reward += get_score(answer_content) / 5.0  # scale 0-100 -> 0-20
            reasoning_content = extract_reasoning_content(response)
            if reasoning_content:
                # 0.005/char, length capped at 500 -> max bonus of +2.5
                reward += 0.005 * min(len(reasoning_content), 500)
        rewards.append(reward)
    # Print the response with the highest reward; guard the empty batch,
    # where max()/index() would raise ValueError.
    if rewards:
        best_idx = rewards.index(max(rewards))
        print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards
from transformers import pipeline
# Gibberish-detection model used to sanity-check generated text.
selected_model = "madhurjindal/autonlp-Gibberish-Detector-492513457"
# Module-level pipeline shared by get_gibberish_score below; loading it once
# at import time avoids re-initializing the model on every call.
classifier = pipeline("text-classification", model=selected_model)
#classifier("I love Machine Learning!", top_k=100)
# Returns:
#[{'label': 'clean', 'score': 0.713941216468811}, {'label': 'mild gibberish', 'score': 0.1742146760225296}, {'label': 'word salad', 'score': 0.08730163425207138}, {'label': 'noise', 'score': 0.02454242669045925}]
def get_gibberish_score(text: str) -> float:
    """Return a multiplier in [0, 1] measuring how "clean" (non-gibberish)
    *text* is, judged line by line with the gibberish classifier.

    Empty lines are skipped. Text with no non-empty lines at all scores 0.0
    (the harshest multiplier) — previously this case crashed on min([]).
    """
    results = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:  # Skip empty lines
            continue
        clf_response = classifier(line, top_k=100)
        # Find the 'clean' score for this line (0.0 if the label is absent).
        clean_score = 0.0
        for r in clf_response:
            if r['label'] == 'clean':
                clean_score = r['score']
                break
        results.append(clean_score)
    if not results:
        return 0.0
    # The model learned to cheat the average of the clean scores, and the
    # plain minimum was still gameable, so return the squared minimum for an
    # extra-harsh gibberish penalty.
    return min(results) ** 2
def only_zerogpt_reward_func(completions, **kwargs) -> list[float]:
    """
    Reward function for ZeroGPT. Only uses the ZeroGPT API to score
    completions, scaled by a per-response gibberish multiplier so nonsense
    text cannot farm reward.
    """
    responses = [completion[0]["content"] for completion in completions]
    zerogpt_scores = [get_score(response) for response in responses]
    gibberish_multipliers = [get_gibberish_score(response) for response in responses]
    # Multiply the rewards by the gibberish multipliers
    rewards = [score * mult for score, mult in zip(zerogpt_scores, gibberish_multipliers)]
    # Print the response with the highest reward; guard the empty batch,
    # where max()/index() would raise ValueError.
    if rewards:
        best_idx = rewards.index(max(rewards))
        print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards
SYSTEM_PROMPT = """
You are a human essay writer. You write like a human, not a language model.
"""


def load_dataset(file="prompts.json"):
    """Load chat-formatted prompts from a JSON file.

    *file* must contain a JSON array of prompt strings. Each entry becomes a
    two-message conversation (shared system prompt + user prompt) in the
    structure the trainer expects.
    """
    # Explicit encoding so the prompt file reads identically on any platform
    # (the default encoding is locale-dependent).
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [
        {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": d},
            ]
        }
        for d in data
    ]
# Build the training dataset once at import time from the default prompts.json.
dataset = load_dataset()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment