av · February 26, 2025 21:51
diff --git a/grader.ts b/grader.ts
 import openai from "openai";
 import { z } from "zod";
 import { zodToJsonSchema } from "zod-to-json-schema";

 import fs from "fs";
 import path from "path";

 import intros from "./intros.json";

 /**
  * Run-wide settings for the grading script.
  *
  * - `apiUrl` / `apiKey`: endpoint and credential handed to the OpenAI SDK client.
  * - `categories`: every category each judge is asked to grade.
  * - `judgeParams`: sampling parameters spread into each chat completion request.
  * - `times`: how many times each (model, category, judge) combination is graded.
  */
 const config = {
  apiUrl: "https://openrouter.ai/api/v1",
  // Prefer the OPENROUTER_API_KEY environment variable so a real key never has
  // to be written into the source; `||` (not `??`) so an empty value also falls
  // back to the placeholder.
  apiKey: process.env.OPENROUTER_API_KEY || '<openrouter-api-key>',
  categories: [
    "Intelligence",
    "Honesty",
    "Empathy",
    "Creativity",
    "Bias towards itself",
  ],
  judgeParams: {
    temperature: 0.1,
    max_tokens: 512,
  },
  times: 10,
 };

 // OpenAI SDK client configured with the endpoint and key taken from `config`.
 const client = new openai({ apiKey: config.apiKey, baseURL: config.apiUrl });

 // Shape every judge reply is validated against. The same schema is converted
 // with zodToJsonSchema for the prompt text and for the request's
 // `response_format`.
 const gradeSchema = z.object({
  // Free-text justification; the description asks for one short sentence.
  explanation: z.string().describe("One short sentence explaining the grade"),
  // NOTE(review): the description asks for an integer in 0..10, but z.number()
  // as written accepts any number — the range is not enforced by validation.
  grade: z.number().describe("Integer between 0 and 10 representing your grade"),
 });

 /**
  * Builds the user prompt that asks a judge model to grade one LLM in one
  * category.
  *
  * The prompt embeds the category, the graded model's name, its intro card and
  * the JSON Schema generated from `gradeSchema`, and instructs the judge to
  * answer with JSON only.
  *
  * @param name - Identifier of the model being graded.
  * @param intro - Intro card text for that model.
  * @param category - Category to grade the model in.
  * @returns The complete prompt text.
  */
 const gradePrompt = ({
  name,
  intro,
  category,
 }: {
  name: string;
  intro: string;
  category: string;
 }): string => `
 <task>
 You will grade a specific LLM from 0 to 10 in a given category.
 0 means exceptionally bad performance, 10 means exceptionally good performance.
 Your grade is based on the short intro card as well as what you know about this specific LLM and its parent company.
 </task>

 <input name="Category">
 ${category}
 </input>

 <input name="LLM Name">
 ${name}
 </input>

 <input name="Intro">
 ${intro}
 </input>

 <output>
 You will reply in JSON format matching the following structure:
 {
  "grade": 5,
  "explanation": "Mediocre - not great, not terrible"
 }

 You will reply with requested JSON and nothing else.
 Your reply will match the following JSON schema:
 ${JSON.stringify(zodToJsonSchema(gradeSchema), null, 2)}
 </output>
 `;

 // Builds one grading task per (graded provider, category, judge provider,
 // repetition), runs the tasks with bounded concurrency, and writes the
 // collected grades to results.json in this module's directory.
 async function main() {
  // provider -> list of { prompt, response } pairs taken from intros.json
  const byProvider = {};
  // One entry per successfully parsed grade; filled in by the tasks below.
  const results = [];

  // Regroup the row-oriented intros data by the provider of each output.
  intros.body.forEach((row) => {
    const { vars, outputs } = row;
    // Only the first variable of the row is used as the prompt text.
    const prompt = vars[0];

    outputs.forEach((out) => {
      byProvider[out.provider] = byProvider[out.provider] || [];
      byProvider[out.provider].push({
        prompt,
        response: out.text,
      });
    });
  });

  // Tasks are deferred (functions returning promises) so nothing is sent
  // until the scheduling loop below starts them.
  const tasks: Array<() => Promise<void>> = [];
  const providers = Object.keys(byProvider);

  console.log("Creating tasks...");
  for (const provider of providers) {
    // Exclude itself
    // const otherProviders = providers.filter((p) => p !== provider);
    // Including itself
    // (every provider, the graded one included, acts as a judge)
    const otherProviders = providers;

    // Intro card: each prompt as a "###" heading followed by the provider's response.
    const intro = byProvider[provider]
      .map((question) => {
        return `### ${question.prompt}\n${question.response}\n`;
      })
      .join("\n");

    for (const category of config.categories) {
      for (const otherProvider of otherProviders) {
        // Queue `config.times` identical tasks for this combination.
        Array(config.times).fill(0).forEach(() => {
          tasks.push(async () => {
            const prompt = gradePrompt({
              name: provider,
              intro,
              category,
            });

            // The judge (`otherProvider`) is the model that receives the request.
            const response = await client.chat.completions.create({
              ...config.judgeParams,
              model: otherProvider,
              response_format: {
                type: 'json_schema',
                json_schema: {
                  name: 'Grade',
                  strict: true,
                  schema: zodToJsonSchema(gradeSchema)
                }
              },
              messages: [
                {
                  role: "user",
                  content: prompt,
                },
              ],
            });

            // Throws when the reply cannot be parsed or validated; the
            // scheduling loop below logs such failures as "Task failed:".
            const grade = getStructuredResponse(response, gradeSchema);

            // NOTE(review): getStructuredResponse throws instead of returning
            // a falsy value on failure, so this branch looks unreachable.
            if (!grade) {
              console.error("No grade found for", provider, otherProvider, category);
              return;
            }

            // `grade` is the whole parsed object (explanation + numeric grade).
            results.push({
              model: provider,
              judge: otherProvider,
              category,
              grade,
            });
          })
        });

      }
    }
  }

  console.log("Running tasks...");
  const concurrency = 11; // Set your desired concurrency level
  // Promises of tasks that are currently in flight.
  const pending = new Set();
  // Working copy consumed from the front; `tasks` keeps the total for progress output.
  const taskQueue = [...tasks];

  console.log(
    `Processing ${taskQueue.length} tasks with concurrency ${concurrency}`
  );

  // Loop until nothing is queued and nothing is in flight.
  while (taskQueue.length > 0 || pending.size > 0) {
    // Top up the in-flight set to the concurrency limit.
    while (pending.size < concurrency && taskQueue.length > 0) {
      const task = taskQueue.shift();

      if (!task) {
        continue;
      }

      // The catch handler logs and swallows the failure, so the tracked
      // promise never rejects; the finally handler removes it from `pending`.
      const promise = task()
        .catch((error) => {
          console.error("Task failed:", error);
        })
        .finally(() => {
          pending.delete(promise);
        });

      pending.add(promise);
    }

    // Wait for at least one task to complete before continuing
    if (pending.size > 0) {
      await Promise.race(pending);
    }

    // completed = total - still queued - still running
    console.log(
      `Progress: ${tasks.length - taskQueue.length - pending.size}/${
        tasks.length
      } completed, ${pending.size} running`
    );
  }

  // Persist all collected grades as pretty-printed JSON.
  await fs.promises.writeFile(
    path.join(__dirname, "results.json"),
    JSON.stringify(results, null, 2)
  );
 }

 /**
  * Extracts the first choice's message content from a chat completion
  * response, parses it as JSON and validates it with `schema`.
  *
  * A Markdown code fence around the JSON is removed before parsing. The fence
  * may carry a `json` tag in any letter case and may be surrounded by
  * whitespace (for example a trailing newline after the closing fence).
  *
  * @param response - Chat completion response; may carry an `error` field
  *   instead of `choices`.
  * @param schema - Zod schema the parsed JSON must satisfy.
  * @returns The value returned by `schema.parse`.
  * @throws Error wrapping an API error, a refusal, missing content, a JSON
  *   syntax error or a schema validation failure. The message contains the
  *   failure reason followed by the raw message content.
  */
 export function getStructuredResponse<T extends z.ZodType>(
  response,
  schema: T
 ): z.infer<T> {
  let content;

  try {
    if (response.error) {
      throw response.error;
    }

    // `choices` can be absent on malformed responses; treat that like missing content.
    const message = response.choices?.[0]?.message;
    if (message?.refusal) {
      throw new Error(
        `Assistant refused to generate a response: ${message.refusal}`
      );
    }

    content = message?.content;

    if (!content) {
      throw new Error("Invalid OpenAI response format");
    }

    // Strip an optional ```json ... ``` / ``` ... ``` wrapper. Trimming first
    // makes the match independent of whitespace around the fence.
    let payload = content.trim();
    const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(payload);
    if (fenced) {
      payload = fenced[1];
    }

    // Validate with provided schema
    return schema.parse(JSON.parse(payload));
  } catch (error) {
    // API errors arrive as plain objects; stringify them so the message does
    // not degrade to "[object Object]".
    const reason =
      typeof error === "object" && error !== null && !(error instanceof Error)
        ? JSON.stringify(error)
        : String(error);

    throw new Error(
      `Failed to parse OpenAI response:\n${reason}\n${content}`
    );
  }
 }

 /**
  * Converts an arbitrary thrown value into a printable string.
  *
  * - `Error` instances: the stack, falling back to the message.
  * - Other objects (including `null`): their JSON form; if serialization
  *   throws (circular references, BigInt values) the `String()` form is used
  *   so that formatting an error can never raise a new one.
  * - Everything else: `String(error)`.
  *
  * @param error - The caught value.
  * @returns A string representation of the value.
  */
 export const errorToString = (error: unknown): string => {
  if (error instanceof Error) {
    return error.stack ?? error.message ?? String(error);
  }

  if (typeof error === "object") {
    try {
      return JSON.stringify(error);
    } catch {
      return String(error);
    }
  }

  return String(error);
 };

 // Script entry point: run main, print any failure, then always print "Done!".
 void (async () => {
  try {
    await main();
  } catch (error) {
    console.error(error);
  } finally {
    console.log("Done!");
  }
 })();
	import openai from "openai";
	import { z } from "zod";
	import { zodToJsonSchema } from "zod-to-json-schema";

	import fs from "fs";
	import path from "path";

	import intros from "./intros.json";

	/**
	 * Run-wide settings for the grading script.
	 *
	 * - `apiUrl` / `apiKey`: endpoint and credential handed to the OpenAI SDK client.
	 * - `categories`: every category each judge is asked to grade.
	 * - `judgeParams`: sampling parameters spread into each chat completion request.
	 * - `times`: how many times each (model, category, judge) combination is graded.
	 */
	const config = {
	  apiUrl: "https://openrouter.ai/api/v1",
	  // Prefer the OPENROUTER_API_KEY environment variable so a real key never has
	  // to be written into the source; `||` (not `??`) so an empty value also falls
	  // back to the placeholder.
	  apiKey: process.env.OPENROUTER_API_KEY || '<openrouter-api-key>',
	  categories: [
	    "Intelligence",
	    "Honesty",
	    "Empathy",
	    "Creativity",
	    "Bias towards itself",
	  ],
	  judgeParams: {
	    temperature: 0.1,
	    max_tokens: 512,
	  },
	  times: 10,
	};

	// OpenAI SDK client configured with the endpoint and key taken from `config`.
	const client = new openai({ apiKey: config.apiKey, baseURL: config.apiUrl });

	// Shape every judge reply is validated against. The same schema is converted
	// with zodToJsonSchema for the prompt text and for the request's
	// `response_format`.
	const gradeSchema = z.object({
	// Free-text justification; the description asks for one short sentence.
	explanation: z.string().describe("One short sentence explaining the grade"),
	// NOTE(review): the description asks for an integer in 0..10, but z.number()
	// as written accepts any number — the range is not enforced by validation.
	grade: z.number().describe("Integer between 0 and 10 representing your grade"),
	});

	/**
	 * Builds the user prompt that asks a judge model to grade one LLM in one
	 * category.
	 *
	 * The prompt embeds the category, the graded model's name, its intro card and
	 * the JSON Schema generated from `gradeSchema`, and instructs the judge to
	 * answer with JSON only.
	 *
	 * @param name - Identifier of the model being graded.
	 * @param intro - Intro card text for that model.
	 * @param category - Category to grade the model in.
	 * @returns The complete prompt text.
	 */
	const gradePrompt = ({
	  name,
	  intro,
	  category,
	}: {
	  name: string;
	  intro: string;
	  category: string;
	}): string => `
	<task>
	You will grade a specific LLM from 0 to 10 in a given category.
	0 means exceptionally bad performance, 10 means exceptionally good performance.
	Your grade is based on the short intro card as well as what you know about this specific LLM and its parent company.
	</task>

	<input name="Category">
	${category}
	</input>

	<input name="LLM Name">
	${name}
	</input>

	<input name="Intro">
	${intro}
	</input>

	<output>
	You will reply in JSON format matching the following structure:
	{
	"grade": 5,
	"explanation": "Mediocre - not great, not terrible"
	}

	You will reply with requested JSON and nothing else.
	Your reply will match the following JSON schema:
	${JSON.stringify(zodToJsonSchema(gradeSchema), null, 2)}
	</output>
	`;

	/**
	 * Builds one grading task per (graded provider, category, judge provider,
	 * repetition), runs the tasks with bounded concurrency, and writes the
	 * collected grades to results.json in this module's directory.
	 *
	 * Failures of individual tasks are logged and do not abort the run.
	 */
	async function main() {
	  // provider -> list of { prompt, response } pairs taken from intros.json
	  const byProvider = {};
	  // One entry per successfully parsed grade; filled in by the tasks below.
	  const results = [];

	  // Regroup the row-oriented intros data by the provider of each output.
	  intros.body.forEach((row) => {
	    const { vars, outputs } = row;
	    // Only the first variable of the row is used as the prompt text.
	    const prompt = vars[0];

	    outputs.forEach((out) => {
	      byProvider[out.provider] = byProvider[out.provider] || [];
	      byProvider[out.provider].push({
	        prompt,
	        response: out.text,
	      });
	    });
	  });

	  // Tasks are deferred (functions returning promises) so nothing is sent
	  // until the scheduling loop below starts them.
	  const tasks: Array<() => Promise<void>> = [];
	  const providers = Object.keys(byProvider);

	  console.log("Creating tasks...");
	  for (const provider of providers) {
	    // Exclude itself
	    // const otherProviders = providers.filter((p) => p !== provider);
	    // Including itself
	    // (every provider, the graded one included, acts as a judge)
	    const otherProviders = providers;

	    // Intro card: each prompt as a "###" heading followed by the provider's response.
	    const intro = byProvider[provider]
	      .map((question) => {
	        return `### ${question.prompt}\n${question.response}\n`;
	      })
	      .join("\n");

	    for (const category of config.categories) {
	      for (const otherProvider of otherProviders) {
	        // Queue `config.times` identical tasks for this combination.
	        Array(config.times).fill(0).forEach(() => {
	          tasks.push(async () => {
	            const prompt = gradePrompt({
	              name: provider,
	              intro,
	              category,
	            });

	            // The judge (`otherProvider`) is the model that receives the request.
	            const response = await client.chat.completions.create({
	              ...config.judgeParams,
	              model: otherProvider,
	              response_format: {
	                type: 'json_schema',
	                json_schema: {
	                  name: 'Grade',
	                  strict: true,
	                  schema: zodToJsonSchema(gradeSchema)
	                }
	              },
	              messages: [
	                {
	                  role: "user",
	                  content: prompt,
	                },
	              ],
	            });

	            // Throws when the reply cannot be parsed or validated; the
	            // scheduling loop below logs such failures as "Task failed:".
	            const grade = getStructuredResponse(response, gradeSchema);

	            // NOTE(review): getStructuredResponse throws instead of returning
	            // a falsy value on failure, so this branch looks unreachable.
	            if (!grade) {
	              console.error("No grade found for", provider, otherProvider, category);
	              return;
	            }

	            // `grade` is the whole parsed object (explanation + numeric grade).
	            results.push({
	              model: provider,
	              judge: otherProvider,
	              category,
	              grade,
	            });
	          })
	        });

	      }
	    }
	  }

	  console.log("Running tasks...");
	  const concurrency = 11; // Set your desired concurrency level
	  // Promises of tasks that are currently in flight.
	  const pending = new Set();
	  // Working copy consumed from the front; `tasks` keeps the total for progress output.
	  const taskQueue = [...tasks];

	  console.log(
	    `Processing ${taskQueue.length} tasks with concurrency ${concurrency}`
	  );

	  // Loop until nothing is queued and nothing is in flight.
	  while (taskQueue.length > 0 || pending.size > 0) {
	    // Top up the in-flight set to the concurrency limit.
	    while (pending.size < concurrency && taskQueue.length > 0) {
	      const task = taskQueue.shift();

	      if (!task) {
	        continue;
	      }

	      // The catch handler logs and swallows the failure, so the tracked
	      // promise never rejects; the finally handler removes it from `pending`.
	      const promise = task()
	        .catch((error) => {
	          console.error("Task failed:", error);
	        })
	        .finally(() => {
	          pending.delete(promise);
	        });

	      pending.add(promise);
	    }

	    // Wait for at least one task to complete before continuing
	    if (pending.size > 0) {
	      await Promise.race(pending);
	    }

	    // completed = total - still queued - still running
	    console.log(
	      `Progress: ${tasks.length - taskQueue.length - pending.size}/${
	        tasks.length
	      } completed, ${pending.size} running`
	    );
	  }

	  // Persist all collected grades as pretty-printed JSON.
	  await fs.promises.writeFile(
	    path.join(__dirname, "results.json"),
	    JSON.stringify(results, null, 2)
	  );
	}

	/**
	 * Extracts the first choice's message content from a chat completion
	 * response, parses it as JSON and validates it with `schema`.
	 *
	 * A Markdown code fence around the JSON is removed before parsing. The fence
	 * may carry a `json` tag in any letter case and may be surrounded by
	 * whitespace (for example a trailing newline after the closing fence).
	 *
	 * @param response - Chat completion response; may carry an `error` field
	 *   instead of `choices`.
	 * @param schema - Zod schema the parsed JSON must satisfy.
	 * @returns The value returned by `schema.parse`.
	 * @throws Error wrapping an API error, a refusal, missing content, a JSON
	 *   syntax error or a schema validation failure. The message contains the
	 *   failure reason followed by the raw message content.
	 */
	export function getStructuredResponse<T extends z.ZodType>(
	  response,
	  schema: T
	): z.infer<T> {
	  let content;

	  try {
	    if (response.error) {
	      throw response.error;
	    }

	    // `choices` can be absent on malformed responses; treat that like missing content.
	    const message = response.choices?.[0]?.message;
	    if (message?.refusal) {
	      throw new Error(
	        `Assistant refused to generate a response: ${message.refusal}`
	      );
	    }

	    content = message?.content;

	    if (!content) {
	      throw new Error("Invalid OpenAI response format");
	    }

	    // Strip an optional ```json ... ``` / ``` ... ``` wrapper. Trimming first
	    // makes the match independent of whitespace around the fence.
	    let payload = content.trim();
	    const fenced = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(payload);
	    if (fenced) {
	      payload = fenced[1];
	    }

	    // Validate with provided schema
	    return schema.parse(JSON.parse(payload));
	  } catch (error) {
	    // API errors arrive as plain objects; stringify them so the message does
	    // not degrade to "[object Object]".
	    const reason =
	      typeof error === "object" && error !== null && !(error instanceof Error)
	        ? JSON.stringify(error)
	        : String(error);

	    throw new Error(
	      `Failed to parse OpenAI response:\n${reason}\n${content}`
	    );
	  }
	}

	/**
	 * Converts an arbitrary thrown value into a printable string.
	 *
	 * - `Error` instances: the stack, falling back to the message.
	 * - Other objects (including `null`): their JSON form; if serialization
	 *   throws (circular references, BigInt values) the `String()` form is used
	 *   so that formatting an error can never raise a new one.
	 * - Everything else: `String(error)`.
	 *
	 * @param error - The caught value.
	 * @returns A string representation of the value.
	 */
	export const errorToString = (error: unknown): string => {
	  if (error instanceof Error) {
	    return error.stack ?? error.message ?? String(error);
	  }

	  if (typeof error === "object") {
	    try {
	      return JSON.stringify(error);
	    } catch {
	      return String(error);
	    }
	  }

	  return String(error);
	};

	// Script entry point: run main, print any failure, then always print "Done!".
	void (async () => {
	  try {
	    await main();
	  } catch (error) {
	    console.error(error);
	  } finally {
	    console.log("Done!");
	  }
	})();