@ianmacartney
Last active June 5, 2025 13:18
Implementation of chat completions and embeddings for any OpenAI-compliant service, using browser fetch and no imports/dependencies
export const CONFIG = {
// Together AI:
url: "https://api.together.xyz",
chatModel: "meta-llama/Llama-3-8b-chat-hf",
embeddingModel: "togethercomputer/m2-bert-80M-8k-retrieval", // dim 768
// OpenAI:
// url: "https://api.openai.com",
// chatModel: "gpt-4o",
// embeddingModel: "text-embedding-ada-002", // dim 1536
};
/**
 * Easy-to-use API for OpenAI-compliant LLM servers
*/
export const completions = completionsViaFetch(CONFIG);
export const { chat, chatStream } = simpleCompletionsAPI(
completions,
CONFIG.chatModel,
);
export type SimpleCompletionsAPI = {
/**
* Simple non-streaming interface to LLM chat completions.
* @param messages The messages like you'd pass to OpenAI's .chat.completions.create
* @returns A string of the chat completion.
*/
chat: (messages: ChatCompletionMessageParam[]) => Promise<string>;
/**
* Simple streaming interface to LLM chat completions.
* @param messages The messages like you'd pass to OpenAI's .chat.completions.create
* @returns An async iterable of strings, each a part of the chat completion.
*/
chatStream: (
messages: ChatCompletionMessageParam[],
) => Promise<AsyncIterable<string>>;
};
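// Illustrative sketch (not part of the original gist): how the simple helpers
// above might be used. Assumes process.env.LLM_API_KEY is set and the server
// in CONFIG is reachable; the function name is hypothetical.
export async function exampleChat() {
  const answer = await chat([
    { role: "user", content: "What is an embedding?" },
  ]);
  console.log(answer);
  let streamed = "";
  for await (const part of await chatStream([
    { role: "user", content: "Write a haiku about fetch." },
  ])) {
    streamed += part; // each part is a content delta, arriving in order
  }
  console.log(streamed);
}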
export const embeddings = embeddingsViaFetch(CONFIG);
export const { embed, embedBatch } = simpleEmbeddingsAPI(
embeddings,
CONFIG.embeddingModel,
);
export type SimpleEmbeddingsAPI = {
/**
* Simple API to get an embedding for a single text.
* @param text The text to create an embedding for
* @returns An array of numbers representing the embedding
*/
embed: (text: string) => Promise<Array<number>>;
/**
* Simple API to get embeddings for multiple texts in batch.
* @param texts An array of texts to create embeddings for.
* @returns An array of embeddings (array of numbers), in the order of the input texts.
*/
embedBatch: (texts: string[]) => Promise<Array<Array<number>>>;
};
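// Illustrative sketch (not part of the original gist): embedding a single text
// and a batch. Vector length matches the configured model's dimension (768 for
// the Together model above, 1536 for text-embedding-ada-002).
export async function exampleEmbed() {
  const vector = await embed("hello world");
  console.log(vector.length); // e.g. 768
  const vectors = await embedBatch(["first text", "second text"]);
  console.log(vectors.length); // 2, in the same order as the inputs
}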
/**
* Completions API
*/
/**
* Makes a completions API using fetch, like OpenAI's .chat.completions.
* @param config Specifies the URL of the LLM server
 * @returns An object with a `create` method, equivalent to OpenAI's `.chat.completions`
*/
export function completionsViaFetch(config: { url: string }) {
return {
async create(body: ChatCompletionCreateParams) {
const response = await retryWithBackoff(async () => {
const response = await fetch(config.url + "/v1/chat/completions", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: "Bearer " + process.env.LLM_API_KEY,
},
body: JSON.stringify(body),
});
if (!response.ok) {
const error = await response.text();
console.error({ error });
throw {
retry: shouldRetry(response),
error: new Error(
`Chat completion failed with code ${response.status}: ${error}`,
),
};
}
return response;
});
if (!body.stream) {
const json = (await response.json()) as ChatCompletion;
if (json.choices[0].message?.content === undefined) {
throw new Error(
"Unexpected result from OpenAI: " + JSON.stringify(json),
);
}
return json;
}
const stream = response.body;
if (!stream) throw new Error("No body in response");
return {
[Symbol.asyncIterator]: async function* () {
for await (const data of splitStream(stream)) {
if (data.startsWith("data:")) {
const json = data.substring("data:".length).trimStart();
if (json.startsWith("[DONE]")) {
return;
}
yield JSON.parse(json);
} else {
console.debug("Unexpected data:", data);
}
}
},
};
},
} as CompletionsAPI;
}
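// Illustrative sketch (not part of the original gist): calling the low-level
// `create` directly. With stream: true it resolves to an async iterable of
// raw ChatCompletionChunk objects rather than plain strings.
export async function demoRawStream() {
  const stream = await completions.create({
    model: CONFIG.chatModel,
    messages: [{ role: "user", content: "Count to three." }],
    stream: true,
  });
  for await (const chunk of stream) {
    const delta = chunk.choices[0].delta?.content;
    if (delta) console.log(delta);
  }
}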
async function* splitStream(body: ReadableStream<Uint8Array>) {
const reader = body.getReader();
// Reuse a single decoder with { stream: true } so multi-byte UTF-8
// characters split across chunk boundaries decode correctly.
const decoder = new TextDecoder();
let lastFragment = "";
try {
while (true) {
const { value, done } = await reader.read();
if (done) {
// Flush the last fragment now that we're done
if (lastFragment !== "") {
yield lastFragment;
}
break;
}
const data = decoder.decode(value, { stream: true });
lastFragment += data;
const parts = lastFragment.split("\n\n");
// Yield all except for the last part
for (let i = 0; i < parts.length - 1; i += 1) {
yield parts[i];
}
// Save the last part as the new last fragment
lastFragment = parts[parts.length - 1];
}
} finally {
reader.releaseLock();
}
}
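// Illustrative sketch (not part of the original gist): how splitStream
// reassembles server-sent-event frames that arrive split across network
// chunks. Uses the standard web ReadableStream, so it runs wherever the
// gist does.
export async function demoSplitStream() {
  const encoder = new TextEncoder();
  const body = new ReadableStream<Uint8Array>({
    start(controller) {
      // One frame split across two chunks, then a complete second frame.
      controller.enqueue(encoder.encode('data: {"a":'));
      controller.enqueue(encoder.encode('1}\n\ndata: [DONE]\n\n'));
      controller.close();
    },
  });
  for await (const frame of splitStream(body)) {
    console.log(frame); // 'data: {"a":1}' then 'data: [DONE]'
  }
}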
/**
 * Wraps a completions API in the simple `chat`/`chatStream` interface.
* @param api Equivalent of OpenAI's .chat.completions or completionsViaFetch(CONFIG)
* @param model The model name, like "gpt-4" or "llama3"
* @returns Two functions: `chat` and `chatStream`, with simple interfaces.
*/
export function simpleCompletionsAPI(
api: CompletionsAPI, // completionsViaFetch(CONFIG) or (new OpenAI().chat.completions)
model: string,
): SimpleCompletionsAPI {
return {
chat: async (messages: ChatCompletionMessageParam[]): Promise<string> => {
const response = await api.create({
model,
messages,
stream: false,
});
if (!response.choices[0].message?.content) {
throw new Error(
"Unexpected result from OpenAI: " + JSON.stringify(response),
);
}
return response.choices[0].message.content;
},
chatStream: async (
messages: ChatCompletionMessageParam[],
): Promise<AsyncIterable<string>> => {
const response = await api.create({
model,
messages,
stream: true,
});
return {
async *[Symbol.asyncIterator]() {
for await (const chunk of response) {
if (chunk.choices[0].delta?.content) {
yield chunk.choices[0].delta.content;
}
}
},
};
},
};
}
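// Illustrative sketch (not part of the original gist, commented out to keep
// this file dependency-free): the official SDK's .chat.completions has the
// same shape, so it can be dropped in for completionsViaFetch. A structural
// cast may be needed depending on the SDK version.
// import OpenAI from "openai";
// const openai = new OpenAI();
// export const { chat: openaiChat, chatStream: openaiChatStream } =
//   simpleCompletionsAPI(openai.chat.completions as CompletionsAPI, "gpt-4o");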
/**
* Embeddings
*/
export function embeddingsViaFetch(config: { url: string }): EmbeddingsAPI {
return {
create: async (body) => {
const json = await retryWithBackoff(async () => {
const result = await fetch(config.url + "/v1/embeddings", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: "Bearer " + process.env.LLM_API_KEY,
},
body: JSON.stringify(body),
});
if (!result.ok) {
throw {
retry: shouldRetry(result),
error: new Error(
`Embedding failed with code ${result.status}: ${await result.text()}`,
),
};
}
return (await result.json()) as CreateEmbeddingResponse;
});
return json;
},
};
}
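// Illustrative sketch (not part of the original gist): calling the embeddings
// API directly, which also exposes the token usage that embed/embedBatch
// discard.
export async function demoRawEmbed() {
  const res = await embeddings.create({
    model: CONFIG.embeddingModel,
    input: ["alpha", "beta"],
  });
  console.log(res.usage.total_tokens, res.data.length); // token count, 2
}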
export function simpleEmbeddingsAPI(
// either embeddingsViaFetch or (new OpenAI().embeddings)
api: EmbeddingsAPI,
model: string,
): SimpleEmbeddingsAPI {
return {
embed: async (text: string): Promise<Array<number>> => {
const json = await api.create({
input: text,
model,
});
return json.data[0].embedding;
},
embedBatch: async (texts: string[]): Promise<Array<Array<number>>> => {
const json = await api.create({
input: texts,
model,
});
const allEmbeddings = json.data;
// Sort by index to guarantee the output matches the input order.
allEmbeddings.sort((a, b) => a.index - b.index);
return allEmbeddings.map(({ embedding }) => embedding);
},
};
}
/**
* Helpers
*/
function shouldRetry(response: Response) {
return (
response.headers.get("x-should-retry") !== "false" &&
(response.headers.get("x-should-retry") === "true" ||
response.status === 408 || // Timeout
response.status === 409 || // Lock timeout
response.status === 429 || // Rate limit
response.status >= 500) // Internal server error
);
}
// Retry after this much time, based on the retry number.
const RETRY_BACKOFF = [1000, 10000, 20000]; // In ms
const RETRY_JITTER = 100; // In ms
type RetryError = { retry: boolean; error: any };
// Retry a function, backing off per RETRY_BACKOFF with added jitter.
export async function retryWithBackoff<T>(fn: () => Promise<T>): Promise<T> {
for (let i = 0; i <= RETRY_BACKOFF.length; i++) {
try {
const start = Date.now();
const result = await fn();
const ms = Date.now() - start;
if (i > 0) console.log(`Attempt ${i + 1} succeeded in ${ms}ms`);
return result;
} catch (e) {
const retryError = e as RetryError;
if (i < RETRY_BACKOFF.length) {
if (retryError.retry) {
console.log(
`Attempt ${i + 1} failed, waiting ${RETRY_BACKOFF[i]}ms to retry...`,
Date.now(),
);
await new Promise((resolve) =>
setTimeout(
resolve,
RETRY_BACKOFF[i] + RETRY_JITTER * Math.random(),
),
);
continue;
}
}
if (retryError.error) throw retryError.error;
else throw e;
}
}
throw new Error("Unreachable");
}
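// Illustrative sketch (not part of the original gist): using retryWithBackoff
// directly. Throw { retry, error } to control whether a failure is retried;
// any other thrown value is rethrown without retrying. The URL is a
// placeholder.
export async function demoRetry() {
  return retryWithBackoff(async () => {
    const res = await fetch("https://example.com/health");
    if (!res.ok) {
      throw {
        retry: shouldRetry(res),
        error: new Error(`Health check failed with code ${res.status}`),
      };
    }
    return res;
  });
}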
/**
* Types to use as our API. Simplified from the OpenAI API.
*/
export interface CompletionsAPI {
/**
* Creates a model response for the given chat conversation.
*/
create(body: ChatCompletionCreateParamsNonStreaming): Promise<ChatCompletion>;
create(
body: ChatCompletionCreateParamsStreaming,
): Promise<AsyncIterable<ChatCompletionChunk>>;
}
export interface ChatCompletion {
id: string;
choices: Array<{
finish_reason:
| "stop"
| "length"
| "tool_calls"
| "content_filter"
| "function_call";
index: number;
logprobs: {
content: Array<ChatCompletionTokenLogprob> | null;
} | null;
message: {
content: string | null;
role: "assistant";
/** @deprecated Deprecated and replaced by `tool_calls` */
function_call?: {
arguments: string;
name: string;
};
tool_calls?: Array<ChatCompletionMessageToolCall>;
};
}>;
created: number;
model: string;
object: "chat.completion";
system_fingerprint?: string;
usage?: CompletionUsage;
}
export interface ChatCompletionChunk {
id: string;
choices: Array<{
delta: {
content?: string | null;
/**
 * @deprecated Deprecated and replaced by `tool_calls`. The name and arguments
 * of a function that should be called, as generated by the model.
 */
function_call?: { arguments?: string; name?: string };
role?: "system" | "user" | "assistant" | "tool";
tool_calls?: Array<{
index: number;
id?: string;
function?: {
arguments?: string;
name?: string;
};
type?: "function";
}>;
};
finish_reason:
| "stop"
| "length"
| "tool_calls"
| "content_filter"
| "function_call"
| null;
index: number;
logprobs?: { content: Array<ChatCompletionTokenLogprob> | null } | null;
}>;
created: number;
model: string;
object: "chat.completion.chunk";
system_fingerprint?: string;
usage?: CompletionUsage;
}
export interface CompletionUsage {
completion_tokens: number;
prompt_tokens: number;
total_tokens: number;
}
export interface ChatCompletionMessageToolCall {
id: string;
function: {
arguments: string;
name: string;
};
type: "function";
}
export interface ChatCompletionTokenLogprob {
token: string;
bytes: Array<number> | null;
logprob: number;
top_logprobs: Array<{
token: string;
bytes: Array<number> | null;
logprob: number;
}>;
}
export type ChatCompletionCreateParams =
| ChatCompletionCreateParamsNonStreaming
| ChatCompletionCreateParamsStreaming;
export interface ChatCompletionCreateParamsBase {
messages: Array<ChatCompletionMessageParam>;
model: string;
frequency_penalty?: number | null;
/** @deprecated in favor of `tools`. */
functions?: Array<{
name: string;
description?: string;
parameters?: Record<string, unknown>;
}>;
logit_bias?: Record<string, number> | null;
logprobs?: boolean | null;
max_tokens?: number | null;
n?: number | null;
presence_penalty?: number | null;
response_format?: { type?: "text" | "json_object" };
seed?: number | null;
stop?: string | null | Array<string>;
stream?: boolean | null;
stream_options?: {
include_usage?: boolean;
} | null;
temperature?: number | null;
tool_choice?:
| "none" // the model will not call any tool and instead generates a message.
| "auto" // the model can pick between generating a message or calling one or more tools.
| "required" // the model must call one or more tools.
| { function: { name: string }; type: "function" }; // forces the tool.
tools?: Array<{
function: {
name: string;
description?: string;
parameters?: Record<string, unknown>;
};
type: "function";
}>;
top_logprobs?: number | null;
top_p?: number | null;
user?: string;
}
export type ChatCompletionMessageParam =
| {
role: "system";
content: string;
name?: string;
}
| {
role: "user";
content:
| string
| Array<
| {
text: string;
type: "text";
}
| {
image_url: { url: string; detail?: "auto" | "low" | "high" };
type: "image_url";
}
>;
name?: string;
}
| {
role: "assistant";
content?: string | null;
name?: string;
tool_calls?: Array<ChatCompletionMessageToolCall>;
}
| {
content: string;
role: "tool";
tool_call_id: string;
};
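// Illustrative sketch (not part of the original gist): a message array that
// exercises the system and multimodal user variants above. The image URL is
// a placeholder.
export const exampleMessages: ChatCompletionMessageParam[] = [
  { role: "system", content: "You are terse." },
  {
    role: "user",
    content: [
      { type: "text", text: "What is in this image?" },
      { type: "image_url", image_url: { url: "https://example.com/cat.png" } },
    ],
  },
];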
export interface ChatCompletionCreateParamsNonStreaming
extends ChatCompletionCreateParamsBase {
stream?: false | null;
}
export interface ChatCompletionCreateParamsStreaming
extends ChatCompletionCreateParamsBase {
stream: true;
}
/** Embeddings */
export interface EmbeddingsAPI {
/**
* Creates an embedding vector representing the input text.
*/
create(body: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
}
export interface CreateEmbeddingResponse {
data: Array<Embedding>;
model: string;
object: "list";
usage: {
prompt_tokens: number;
total_tokens: number;
};
}
export interface Embedding {
embedding: Array<number>;
index: number;
object: "embedding";
}
export interface EmbeddingCreateParams {
input: string | Array<string> | Array<number> | Array<Array<number>>;
model: string;
dimensions?: number;
encoding_format?: "float" | "base64";
user?: string;
}