Created
June 30, 2025 08:52
-
-
Save ssghost/c9d696dab3fe4eccb624c9a989db3803 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import json
import logging
import re
from collections import OrderedDict
from time import sleep

# Third-party
import requests
def count_xml(text) -> float:
    """Score how closely *text* follows the <reasoning>/<answer> XML layout.

    Each correctly placed tag (appearing exactly once, on its own line) earns
    0.125, and any trailing characters after the closing </answer> tag incur a
    small penalty of 0.001 per character.
    """
    score = 0.0
    if text.count("<reasoning>\n") == 1:
        score += 0.125
    if text.count("\n</reasoning>\n") == 1:
        score += 0.125
    if text.count("\n<answer>\n") == 1:
        score += 0.125
        # Penalize characters that follow the terminal </answer> line
        score -= 0.001 * len(text.split("\n</answer>\n")[-1])
    if text.count("\n</answer>") == 1:
        score += 0.125
        # The "- 1" forgives the single trailing newline after </answer>
        score -= 0.001 * (len(text.split("\n</answer>")[-1]) - 1)
    return score
def get_score(text: str) -> float:
    """Query the ZeroGPT detector and return a "human-ness" score for *text*.

    Returns 100.0 minus ZeroGPT's reported fakePercentage, so higher means
    more human-sounding. Returns 0.0 on any API/network failure so callers
    always receive a float.
    """
    base_url = "https://api.zerogpt.com/api/detect/detectText"
    # OrderedDict preserves header order so the request resembles a real browser.
    headers = OrderedDict((
        ("Host", "api.zerogpt.com"),
        ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:135.0) Gecko/20100101 Firefox/135.0"),
        ("Accept", "application/json, text/plain, */*"),
        ("Accept-Language", "en-US,en;q=0.5"),
        ("Accept-Encoding", "gzip, deflate, br, zstd"),
        ("Content-Type", "application/json"),
        ("Origin", "https://www.zerogpt.com"),
        ("Connection", "keep-alive"),
        ("Referer", "https://www.zerogpt.com/"),
        ("Sec-Fetch-Dest", "empty"),
        ("Sec-Fetch-Mode", "cors"),
        ("Sec-Fetch-Site", "same-site"),
        ("Priority", "u=0"),
        ("Pragma", "no-cache"),
        ("Cache-Control", "no-cache")
    ))
    data = {
        "input_text": text,
    }
    try:
        # Timeout prevents a hung request from stalling the whole training loop.
        r = requests.post(base_url, headers=headers, json=data, timeout=30)
        j = r.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure or non-JSON body: log, back off briefly, score as 0.
        logging.error(f"ZeroGPT request failed: {e}")
        sleep(1)
        return 0.0
    # .get() avoids a KeyError if the API returns an unexpected payload shape.
    if j.get('code') != 200:
        logging.error(f"Failed to get score. Response: {j}")
        sleep(1)  # Brief back-off before the caller issues the next request
        return 0.0  # Not ideal, but we need to return a float in all cases
    fake_percentage = j["data"]["fakePercentage"]
    return 100.0 - fake_percentage  # Penalize for highly fake content
def extract_answer_content(text: str) -> str | None: | |
"""Extract just the content between <answer> tags""" | |
try: | |
answer = text.split("<answer>")[-1] | |
answer = answer.split("</answer>")[0] | |
return answer.strip() | |
except: | |
return None | |
def extract_reasoning_content(text: str) -> str | None: | |
"""Extract just the content between <reasoning> tags""" | |
try: | |
reasoning = text.split("<reasoning>")[-1] | |
reasoning = reasoning.split("</reasoning>")[0] | |
return reasoning.strip() | |
except: | |
return None | |
def tiered_reward_func(completions, **kwargs) -> list[float]:
    """
    Tiered reward function for ZeroGPT. Rewards are broken down into multiple tiers

    Tier 0 rewards each expected tag appearing exactly once; Tier 1 rewards
    overall format compliance; Tier 2 (gated on a strict whole-response format
    match) adds the ZeroGPT human-ness score and a reasoning-length bonus.
    Returns one float reward per completion, in order.
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = []
    # Anchored: the entire response must be exactly the reasoning/answer layout
    strict_pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>$"
    for response in responses:
        reward = 0.0
        response = response.strip()
        # Tier 0: Individual tag rewards
        if response.count("<reasoning>") == 1:
            reward += 0.1
        if response.count("</reasoning>") == 1:
            reward += 0.1
        if response.count("<answer>") == 1:
            reward += 0.1
        if response.count("</answer>") == 1:
            reward += 0.1
        # Tier 1: Format checking
        # Unanchored search: tags merely in the right order, anywhere in the text
        soft_pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
        if re.search(soft_pattern, response, re.DOTALL):
            reward += 0.25
        xml_score = count_xml(response)
        reward += xml_score
        strict_match = re.match(strict_pattern, response)
        # Tier 2: Only add ZeroGPT score if strict format passes
        if strict_match:
            reward += 0.5
            # Extract just the answer content for check
            answer_content = extract_answer_content(response)
            if answer_content:
                # get_score returns 0-100, so this contributes 0-20
                reward += get_score(answer_content) / 5.0
            # Reward longer reasoning, capped at 500 characters
            reasoning_content = extract_reasoning_content(response)
            if reasoning_content:
                reasoning_len = len(reasoning_content)
                reasoning_len = min(reasoning_len, 500)
                reward += 0.005 * reasoning_len  # Max out at reward of +2.5
        rewards.append(reward)
    # Print the response with the highest reward
    best_idx = rewards.index(max(rewards))
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards
from transformers import pipeline

# Gibberish classifier used to down-weight rewards for nonsense completions.
# Labels (per the sample output below): clean / mild gibberish / word salad / noise.
selected_model = "madhurjindal/autonlp-Gibberish-Detector-492513457"
# Loaded once at import time and reused for every scored line.
# NOTE(review): pipeline() will fetch the model weights on first run — ensure
# this is acceptable in the training environment.
classifier = pipeline("text-classification", model=selected_model)
#classifier("I love Machine Learning!", top_k=100)
# Returns:
#[{'label': 'clean', 'score': 0.713941216468811}, {'label': 'mild gibberish', 'score': 0.1742146760225296}, {'label': 'word salad', 'score': 0.08730163425207138}, {'label': 'noise', 'score': 0.02454242669045925}]
def get_gibberish_score(text: str) -> float:
    """Return a multiplier in [0, 1] reflecting how "clean" (non-gibberish) text is.

    Each non-empty line is classified independently; the result is the square
    of the minimum per-line 'clean' probability, so a single gibberish line
    collapses the whole multiplier.

    Returns 0.0 when the text contains no non-empty lines (the previous
    version raised ValueError from min([]) in that case).
    """
    results = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:  # Skip empty lines
            continue
        clf_response = classifier(line, top_k=100)
        # Find the 'clean' score for this line (0.0 if the label is absent)
        clean_score = 0.0
        for r in clf_response:
            if r['label'] == 'clean':
                clean_score = r['score']
                break
        results.append(clean_score)
    if not results:
        # Nothing scorable: treat as fully gibberish so the reward zeroes out.
        return 0.0
    # Averaging (and even a plain minimum) was gamed by the model, so apply an
    # extra harsh penalty by squaring the minimum clean score.
    return min(results) ** 2
def only_zerogpt_reward_func(completions, **kwargs) -> list[float]:
    """
    Reward function for ZeroGPT. Only uses ZeroGPT API to score completions

    Each completion's ZeroGPT human-ness score is scaled by its gibberish
    multiplier; one float reward is returned per completion, in order.
    """
    responses = [entry[0]["content"] for entry in completions]
    # Score every response first, then compute the gibberish multipliers,
    # so the two API/model passes stay separate.
    zerogpt_scores = [get_score(text) for text in responses]
    gibberish_multipliers = [get_gibberish_score(text) for text in responses]
    # Multiply the rewards by the gibberish multipliers
    rewards = [score * mult for score, mult in zip(zerogpt_scores, gibberish_multipliers)]
    # Print the response with the highest reward
    best_idx = max(range(len(rewards)), key=rewards.__getitem__)
    print(f"\nBest response (reward: {rewards[best_idx]:.3f}):\n{responses[best_idx]}\n{'-'*40}")
    return rewards
SYSTEM_PROMPT = """
You are a human essay writer. You write like a human, not a language model.
"""

def load_dataset(file="prompts.json"):
    """Load user prompts from a JSON file and wrap them as chat conversations.

    Args:
        file: Path to a JSON file containing a list of prompt strings.

    Returns:
        A list of dicts, each with a "prompt" key holding a two-message chat
        (shared system prompt + one user prompt).
    """
    # Explicit encoding so prompts parse identically across platforms.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [
        {
            "prompt": [
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": d
                }
            ]
        }
        for d in data
    ]
dataset = load_dataset() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment