Last active
January 15, 2025 04:32
-
-
Save davidgilbertson/d03115a27c3d1f35d35609ebf558df53 to your computer and use it in GitHub Desktop.
A function to embed texts with OpenAI, with concurrency and allowance for max inputs and max token length
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Literal | |
import asyncio | |
import numpy as np | |
from openai import AsyncOpenAI | |
import tiktoken | |
def embed(
    texts: list[str],
    model: Literal[
        "text-embedding-3-small",
        "text-embedding-3-large",
    ] = "text-embedding-3-small",
    dimensions: int | None = None,
) -> np.ndarray:
    """Embed a list of texts with the OpenAI embeddings API.

    Texts are tokenized locally so over-long inputs can be truncated to the
    per-input token limit instead of being rejected by the API. The token
    lists are then split into batches of at most MAX_BATCH inputs and the
    batches are embedded with concurrent API requests.

    Args:
        texts: The strings to embed.
        model: Which OpenAI embedding model to use.
        dimensions: Optional output dimensionality (supported by the
            text-embedding-3 models); None uses the model's default size.

    Returns:
        A 2-D array of shape ``(len(texts), embedding_dim)``.
    """
    MAX_BATCH = 2048  # API limit: max number of inputs per request
    MAX_TOKENS = 8191  # API limit: max tokens per single input

    # Tokenize locally; we pass the (truncated) token lists directly to the
    # API rather than the raw strings.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = [tokenizer.encode(t)[:MAX_TOKENS] for t in texts]

    # Fetch embeddings for a single batch of at most MAX_BATCH inputs.
    async def embed_batch(client: AsyncOpenAI, **kwargs) -> list[list[float]]:
        response = await client.embeddings.create(**kwargs)
        return [r.embedding for r in response.data]

    # Split the inputs and process the batches concurrently.
    async def async_embed() -> np.ndarray:
        # Create the client inside the coroutine so it is bound to the event
        # loop asyncio.run creates, and close it so the underlying HTTP
        # session is not leaked.
        client = AsyncOpenAI()
        try:
            coros = []
            for start in range(0, len(tokens), MAX_BATCH):
                kwargs = dict(input=tokens[start : start + MAX_BATCH], model=model)
                if dimensions is not None:
                    kwargs["dimensions"] = dimensions
                coros.append(embed_batch(client, **kwargs))
            responses = await asyncio.gather(*coros)
        finally:
            await client.close()
        # Flatten the per-batch results back into one (n_texts, dim) array.
        return np.asarray([emb for batch in responses for emb in batch])

    return asyncio.run(async_embed())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment