Skip to content

Instantly share code, notes, and snippets.

@av
Created February 26, 2025 21:51
Show Gist options
  • Save av/c0bf1fd81d8b72d39f5f85d83719bfae to your computer and use it in GitHub Desktop.
Save av/c0bf1fd81d8b72d39f5f85d83719bfae to your computer and use it in GitHub Desktop.
grader-script
import openai from "openai";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
import fs from "fs";
import path from "path";
import intros from "./intros.json";
const config = {
apiUrl: "https://openrouter.ai/api/v1",
apiKey: '<openrouter-api-key>',
categories: [
"Intelligence",
"Honesty",
"Empathy",
"Creativity",
"Bias towards itself",
],
judgeParams: {
temperature: 0.1,
max_tokens: 512,
},
times: 10,
}
const client = new openai({
baseURL: config.apiUrl,
apiKey: config.apiKey,
});
const gradeSchema = z.object({
explanation: z.string().describe("One short sentence explaining the grade"),
grade: z.number().describe("Integer between 0 and 10 representing your grade"),
});
const gradePrompt = ({ name, intro, category }) => `
<task>
You will grade a specific LLM from 0 to 10 in a given category.
0 means exceptionally bad performance, 10 means exceptionally good performance.
Your grade is based on the short intro card as well as what you know about this specific LLM and its parent company.
</task>
<input name="Category">
${category}
</input>
<input name="LLM Name">
${name}
</input>
<input name="Intro">
${intro}
</input>
<output>
You will reply in JSON format matching the following structure:
{
"grade": 5,
"explanation": "Mediocre - not great, not terrible"
}
You will reply with requested JSON and nothing else.
Your reply will match the following JSON schema:
${JSON.stringify(zodToJsonSchema(gradeSchema), null, 2)}
</output>
`;
async function main() {
const byProvider = {};
const results = [];
intros.body.forEach((row) => {
const { vars, outputs } = row;
const prompt = vars[0];
outputs.forEach((out) => {
byProvider[out.provider] = byProvider[out.provider] || [];
byProvider[out.provider].push({
prompt,
response: out.text,
});
});
});
const tasks: Array<() => Promise<void>> = [];
const providers = Object.keys(byProvider);
console.log("Creating tasks...");
for (const provider of providers) {
// Exclude itself
// const otherProviders = providers.filter((p) => p !== provider);
// Including itself
const otherProviders = providers;
const intro = byProvider[provider]
.map((question) => {
return `### ${question.prompt}\n${question.response}\n`;
})
.join("\n");
for (const category of config.categories) {
for (const otherProvider of otherProviders) {
Array(config.times).fill(0).forEach(() => {
tasks.push(async () => {
const prompt = gradePrompt({
name: provider,
intro,
category,
});
const response = await client.chat.completions.create({
...config.judgeParams,
model: otherProvider,
response_format: {
type: 'json_schema',
json_schema: {
name: 'Grade',
strict: true,
schema: zodToJsonSchema(gradeSchema)
}
},
messages: [
{
role: "user",
content: prompt,
},
],
});
const grade = getStructuredResponse(response, gradeSchema);
if (!grade) {
console.error("No grade found for", provider, otherProvider, category);
return;
}
results.push({
model: provider,
judge: otherProvider,
category,
grade,
});
})
});
}
}
}
console.log("Running tasks...");
const concurrency = 11; // Set your desired concurrency level
const pending = new Set();
const taskQueue = [...tasks];
console.log(
`Processing ${taskQueue.length} tasks with concurrency ${concurrency}`
);
while (taskQueue.length > 0 || pending.size > 0) {
while (pending.size < concurrency && taskQueue.length > 0) {
const task = taskQueue.shift();
if (!task) {
continue;
}
const promise = task()
.catch((error) => {
console.error("Task failed:", error);
})
.finally(() => {
pending.delete(promise);
});
pending.add(promise);
}
// Wait for at least one task to complete before continuing
if (pending.size > 0) {
await Promise.race(pending);
}
console.log(
`Progress: ${tasks.length - taskQueue.length - pending.size}/${
tasks.length
} completed, ${pending.size} running`
);
}
await fs.promises.writeFile(
path.join(__dirname, "results.json"),
JSON.stringify(results, null, 2)
);
}
export function getStructuredResponse<T extends z.ZodType>(
response,
schema: T
): z.infer<T> {
let content;
try {
if (response.error) {
throw response.error;
}
const message = response.choices[0]?.message;
if (message?.refusal) {
throw new Error(
`Assistant refused to generate a response: ${message.refusal}`
);
}
content = message?.content;
if (!content) {
throw new Error("Invalid OpenAI response format");
}
// Try to parse content as JSON
if (content.startsWith("```")) {
if (content.startsWith('```json')) {
if (content.endsWith('```')) {
content = content.slice(7, -3);
}
}
if (content.endsWith("```")) {
content = content.slice(3, -3);
}
}
const parsedContent = JSON.parse(content);
// Validate with provided schema
return schema.parse(parsedContent);
} catch (error) {
throw new Error(
`Failed to parse OpenAI response:\n${error}\n${content}`
);
}
}
export const errorToString = (error: unknown) => {
if (error instanceof Error) {
return error.stack ?? error.message ?? String(error);
}
if (typeof error === "object") {
return JSON.stringify(error);
}
return String(error);
};
main()
.catch(console.error)
.finally(() => console.log("Done!"));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment