vinnymac · January 15, 2025 07:52
diff --git a/capital.eval.ts b/capital.eval.ts
 import { createAzure } from '@ai-sdk/azure';
 import { streamText } from 'ai';
 import { Levenshtein, type Scorer } from 'autoevals';
 import { evalite } from 'evalite';
 import { traceAISDKModel } from 'evalite/ai-sdk';

 // Mirror AZURE_OPENAI_API_KEY under the name AZURE_API_KEY (presumably for code
 // that relies on the Azure provider's default environment lookup).
 // The copy is guarded because Node coerces every value assigned to process.env
 // to a string: copying an unset variable would store the literal text
 // "undefined" and clobber any AZURE_API_KEY that was already configured.
 if (process.env.AZURE_OPENAI_API_KEY !== undefined) {
  process.env.AZURE_API_KEY = process.env.AZURE_OPENAI_API_KEY;
 }
 // AZURE_OPENAI_RESOURCE_NAME is intentionally left untouched: assigning it to
 // itself was a no-op when set and turned it into the string "undefined" when unset.

 // Azure OpenAI provider instance used for every model call in this file.
 // Resource name, API key and API version are all taken from the environment;
 // any of them may be undefined if the corresponding variable is not set.
 const azure = createAzure({
  resourceName: process.env['AZURE_OPENAI_RESOURCE_NAME'],
  apiKey: process.env['AZURE_OPENAI_API_KEY'],
  apiVersion: process.env['OPENAI_API_VERSION'],
 });

 /**
  * Drains an async stream of text chunks into a single string.
  *
  * @param stream - Async iterable of string fragments; it is consumed to completion.
  * @returns All fragments concatenated in arrival order (`''` for an empty stream).
  */
 async function streamToText(stream: AsyncIterable<string>): Promise<string> {
  const pieces: string[] = [];
  for await (const piece of stream) {
    pieces.push(piece);
  }
  return pieces.join('');
 }

 /**
  * Custom LLM-as-judge factuality scorer backed by the Azure deployment.
  *
  * Sends `output` and `expected` to the model with instructions to reply with a
  * bare number between 0 and 1, collects the streamed reply and parses it.
  *
  * @param args.output - The answer produced by the task under evaluation.
  * @param args.expected - The reference answer to compare against.
  * @returns `{ name, score, reason }`. `score` is always a finite number in
  *   [0, 1]: out-of-range replies are clamped, and a reply that does not start
  *   with a number scores 0 with the raw reply quoted in `reason`.
  */
 const AzureFactuality: Scorer<string, string> = async ({ output, expected }) => {
  const result = await streamText({
    model: azure(deploymentId),
    system: `You are an AI assistant that evaluates the factual accuracy of responses.
      Compare the following output with the expected answer and determine if they convey the same factual information.
      Respond with a score between 0 and 1, where:
      1 = Completely factually accurate and equivalent
      0 = Completely factually inaccurate or different
      Only respond with the numerical score, nothing else.`,
    prompt: `Output: "${output}"
      Expected: "${expected}"`,
  });

  const text = await streamToText(result.textStream);
  const parsed = Number.parseFloat(text);

  // The model is only *asked* to answer with a number. A refusal or prose reply
  // parses to NaN, which would otherwise leak into every aggregate of this score.
  if (!Number.isFinite(parsed)) {
    return {
      name: 'AzureFactuality',
      score: 0,
      reason: `Could not parse a numeric score from judge reply: ${JSON.stringify(text)}`,
    };
  }

  // Keep the score inside the range the prompt promises.
  const score = Math.min(1, Math.max(0, parsed));
  return {
    name: 'AzureFactuality',
    score,
    reason: `Factual similarity score: ${score}`,
  };
 };

 // Azure deployment name used for both the task model and the judge model;
 // undefined when AZURE_OPENAI_DEPLOYMENT_ID is not set.
 const { AZURE_OPENAI_DEPLOYMENT_ID: deploymentId } = process.env;

 // Eval suite: asks the Azure deployment for capital cities and grades each
 // answer with the custom factuality judge plus Levenshtein distance.
 evalite('Test Capitals', {
  // Question / expected-answer fixtures.
  async data() {
    return [
      { input: "What's the capital of France?", expected: 'Paris' },
      { input: "What's the capital of Germany?", expected: 'Berlin' },
    ];
  },
  // Sends one question to the model (wrapped with traceAISDKModel) and hands
  // back the text stream itself rather than a collected string.
  async task(input) {
    const { textStream } = await streamText({
      model: traceAISDKModel(azure(deploymentId)),
      system: `
        Answer the question concisely. Answer in as few words as possible.
        Remove full stops from the end of the output.
        If the country has no capital, return '<country> has no capital'.
        If the country does not exist, return 'Unknown'.
      `,
      prompt: input,
    });

    return textStream;
  },
  scorers: [AzureFactuality, Levenshtein],
 });
	import { createAzure } from '@ai-sdk/azure';
	import { streamText } from 'ai';
	import { Levenshtein, type Scorer } from 'autoevals';
	import { evalite } from 'evalite';
	import { traceAISDKModel } from 'evalite/ai-sdk';

	// Mirror AZURE_OPENAI_API_KEY under the name AZURE_API_KEY (presumably for code
	// that relies on the Azure provider's default environment lookup).
	// The copy is guarded because Node coerces every value assigned to process.env
	// to a string: copying an unset variable would store the literal text
	// "undefined" and clobber any AZURE_API_KEY that was already configured.
	if (process.env.AZURE_OPENAI_API_KEY !== undefined) {
	  process.env.AZURE_API_KEY = process.env.AZURE_OPENAI_API_KEY;
	}
	// AZURE_OPENAI_RESOURCE_NAME is intentionally left untouched: assigning it to
	// itself was a no-op when set and turned it into the string "undefined" when unset.

	// Azure OpenAI provider instance used for every model call in this file.
	// Resource name, API key and API version are all taken from the environment;
	// any of them may be undefined if the corresponding variable is not set.
	const azure = createAzure({
	  resourceName: process.env['AZURE_OPENAI_RESOURCE_NAME'],
	  apiKey: process.env['AZURE_OPENAI_API_KEY'],
	  apiVersion: process.env['OPENAI_API_VERSION'],
	});

	/**
	 * Drains an async stream of text chunks into a single string.
	 *
	 * @param stream - Async iterable of string fragments; it is consumed to completion.
	 * @returns All fragments concatenated in arrival order (`''` for an empty stream).
	 */
	async function streamToText(stream: AsyncIterable<string>): Promise<string> {
	  const pieces: string[] = [];
	  for await (const piece of stream) {
	    pieces.push(piece);
	  }
	  return pieces.join('');
	}

	/**
	 * Custom LLM-as-judge factuality scorer backed by the Azure deployment.
	 *
	 * Sends `output` and `expected` to the model with instructions to reply with a
	 * bare number between 0 and 1, collects the streamed reply and parses it.
	 *
	 * @param args.output - The answer produced by the task under evaluation.
	 * @param args.expected - The reference answer to compare against.
	 * @returns `{ name, score, reason }`. `score` is always a finite number in
	 *   [0, 1]: out-of-range replies are clamped, and a reply that does not start
	 *   with a number scores 0 with the raw reply quoted in `reason`.
	 */
	const AzureFactuality: Scorer<string, string> = async ({ output, expected }) => {
	  const result = await streamText({
	    model: azure(deploymentId),
	    system: `You are an AI assistant that evaluates the factual accuracy of responses.
	Compare the following output with the expected answer and determine if they convey the same factual information.
	Respond with a score between 0 and 1, where:
	1 = Completely factually accurate and equivalent
	0 = Completely factually inaccurate or different
	Only respond with the numerical score, nothing else.`,
	    prompt: `Output: "${output}"
	Expected: "${expected}"`,
	  });

	  const text = await streamToText(result.textStream);
	  const parsed = Number.parseFloat(text);

	  // The model is only *asked* to answer with a number. A refusal or prose reply
	  // parses to NaN, which would otherwise leak into every aggregate of this score.
	  if (!Number.isFinite(parsed)) {
	    return {
	      name: 'AzureFactuality',
	      score: 0,
	      reason: `Could not parse a numeric score from judge reply: ${JSON.stringify(text)}`,
	    };
	  }

	  // Keep the score inside the range the prompt promises.
	  const score = Math.min(1, Math.max(0, parsed));
	  return {
	    name: 'AzureFactuality',
	    score,
	    reason: `Factual similarity score: ${score}`,
	  };
	};

	// Azure deployment name used for both the task model and the judge model;
	// undefined when AZURE_OPENAI_DEPLOYMENT_ID is not set.
	const { AZURE_OPENAI_DEPLOYMENT_ID: deploymentId } = process.env;

	// Eval suite: asks the Azure deployment for capital cities and grades each
	// answer with the custom factuality judge plus Levenshtein distance.
	evalite('Test Capitals', {
	  // Question / expected-answer fixtures.
	  async data() {
	    return [
	      { input: "What's the capital of France?", expected: 'Paris' },
	      { input: "What's the capital of Germany?", expected: 'Berlin' },
	    ];
	  },
	  // Sends one question to the model (wrapped with traceAISDKModel) and hands
	  // back the text stream itself rather than a collected string.
	  async task(input) {
	    const { textStream } = await streamText({
	      model: traceAISDKModel(azure(deploymentId)),
	      system: `
	Answer the question concisely. Answer in as few words as possible.
	Remove full stops from the end of the output.
	If the country has no capital, return '<country> has no capital'.
	If the country does not exist, return 'Unknown'.
	`,
	      prompt: input,
	    });

	    return textStream;
	  },
	  scorers: [AzureFactuality, Levenshtein],
	});