# Source: GitHub gist FarisHijazi/e1e19078048b21c80c5ae761c59e974d
# Author: @FarisHijazi -- created November 10, 2024 06:37
# (GitHub page navigation text from the web export was removed; it is not part of the module.)
from __future__ import annotations
import logging
import warnings
from collections.abc import Callable, Sequence
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
import numpy as np
import requests
from langchain.pydantic_v1 import BaseModel, Extra, Field, root_validator
from langchain.schema.embeddings import Embeddings
from langchain.utils import get_from_dict_or_env, get_pydantic_field_names
from tenacity import (
AsyncRetrying,
before_sleep_log,
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
logger = logging.getLogger(__name__)
def _retryable_openai_errors() -> tuple[type[BaseException], ...]:
    """Return the openai exception classes that should trigger a retry.

    The lookup is tolerant of the installed openai version: the legacy
    ``openai.error`` namespace is consulted first when it exists, then the
    top-level package. Anything that is not an exception class is ignored --
    on openai>=1.0 ``openai.Timeout`` is a timeout *configuration* object, not
    an exception, so the real timeout error (``APITimeoutError``) is looked up
    as well.
    """
    import openai

    namespaces = [
        namespace
        for namespace in (getattr(openai, "error", None), openai)
        if namespace is not None
    ]
    wanted = (
        "Timeout",
        "APITimeoutError",
        "APIError",
        "APIConnectionError",
        "RateLimitError",
    )
    resolved: list[type[BaseException]] = []
    for name in wanted:
        for namespace in namespaces:
            candidate = getattr(namespace, name, None)
            if isinstance(candidate, type) and issubclass(candidate, BaseException):
                if candidate not in resolved:
                    resolved.append(candidate)
                break
    return tuple(resolved)


def _retry_options(embeddings: AverageEmbeddingsAPI) -> dict[str, Any]:
    """Build the tenacity keyword arguments shared by the sync and async retriers."""
    min_seconds = 4
    max_seconds = 10
    # Wait 2^x * 1 second between each retry starting with
    # 4 seconds, then up to 10 seconds, then 10 seconds afterwards
    return {
        "reraise": True,
        # NOTE: ``max_retries`` is used as the total number of attempts.
        "stop": stop_after_attempt(embeddings.max_retries),
        "wait": wait_exponential(multiplier=1, min=min_seconds, max=max_seconds),
        "retry": retry_if_exception_type(_retryable_openai_errors()),
        "before_sleep": before_sleep_log(logger, logging.WARNING),
    }


def _create_retry_decorator(embeddings: AverageEmbeddingsAPI) -> Callable[[Any], Any]:
    """Create a tenacity decorator that retries synchronous embedding calls.

    Args:
        embeddings: Instance whose ``max_retries`` bounds the number of attempts.

    Returns:
        A decorator that re-invokes the wrapped callable on retryable openai
        errors and re-raises the last error once the attempts are exhausted.
    """
    return retry(**_retry_options(embeddings))


def _async_retry_decorator(embeddings: AverageEmbeddingsAPI) -> Any:
    """Create a decorator that retries asynchronous embedding calls.

    Args:
        embeddings: Instance whose ``max_retries`` bounds the number of attempts.

    Returns:
        A decorator for coroutine functions with the same retry policy as
        ``_create_retry_decorator``.
    """
    async_retrying = AsyncRetrying(**_retry_options(embeddings))

    def wrap(func: Callable) -> Callable:
        async def wrapped_f(*args: Any, **kwargs: Any) -> Any:
            async for attempt in async_retrying:
                # The attempt context manager is what records a failure and
                # lets tenacity schedule the next try; without it the first
                # exception escapes the loop and nothing is ever retried.
                with attempt:
                    return await func(*args, **kwargs)
            raise AssertionError("this is unreachable")

        return wrapped_f

    return wrap
# https://stackoverflow.com/questions/76469415/getting-embeddings-of-length-1-from-langchain-openaiembeddings
def _check_response(response: dict, skip_empty: bool = False) -> dict:
    """Validate an embeddings response and hand it back unchanged.

    An entry whose ``"embedding"`` has exactly one element is treated as an
    empty embedding.

    Args:
        response: Mapping with a ``"data"`` list of ``{"embedding": [...]}`` items.
        skip_empty: When true, empty embeddings are tolerated.

    Returns:
        The very same ``response`` object.

    Raises:
        openai.APIError: If an empty embedding is present and ``skip_empty``
            is false.
    """
    # Scan the payload first so a malformed response still fails loudly,
    # regardless of ``skip_empty``.
    has_empty_embedding = False
    for item in response["data"]:
        if len(item["embedding"]) == 1:
            has_empty_embedding = True
            break

    if skip_empty or not has_empty_embedding:
        return response

    import openai

    raise openai.APIError("OpenAI API returned an empty embedding")
def embed_with_retry(embeddings: AverageEmbeddingsAPI, **kwargs: Any) -> Any:
    """Call the embeddings client under the tenacity retry policy.

    Args:
        embeddings: Instance providing ``client``, ``skip_empty`` and the
            retry settings.
        **kwargs: Forwarded verbatim to ``embeddings.client.create``.

    Returns:
        The response returned by the client once it passes ``_check_response``.
    """
    with_retries = _create_retry_decorator(embeddings)

    def _request_once(**params: Any) -> Any:
        raw = embeddings.client.create(**params)
        return _check_response(raw, skip_empty=embeddings.skip_empty)

    guarded_request = with_retries(_request_once)
    return guarded_request(**kwargs)
async def async_embed_with_retry(embeddings: AverageEmbeddingsAPI, **kwargs: Any) -> Any:
    """Await the embeddings client under the async tenacity retry policy.

    Args:
        embeddings: Instance providing ``client``, ``skip_empty`` and the
            retry settings.
        **kwargs: Forwarded verbatim to ``embeddings.client.acreate``.

    Returns:
        The response returned by the client once it passes ``_check_response``.
    """
    with_retries = _async_retry_decorator(embeddings)

    async def _request_once(**params: Any) -> Any:
        raw = await embeddings.client.acreate(**params)
        return _check_response(raw, skip_empty=embeddings.skip_empty)

    guarded_request = with_retries(_request_once)
    return await guarded_request(**kwargs)
class AverageEmbeddingsAPI(BaseModel, Embeddings):
    """Embeddings over an OpenAI-style API that averages long texts chunk by chunk.

    Every text is tokenized with a Hugging Face tokenizer, cut into windows of
    at most ``embedding_ctx_length`` tokens, decoded back to text and sent to
    the embeddings endpoint. The window embeddings belonging to one text are
    combined with a token-count-weighted average and scaled to unit length.

    The API key is taken from the ``openai_api_key`` argument or the
    ``OPENAI_API_KEY`` environment variable. ``OPENAI_API_BASE``,
    ``OPENAI_API_TYPE``, ``OPENAI_API_VERSION``, ``OPENAI_ORGANIZATION`` and
    ``OPENAI_PROXY`` are read the same way. When the API type is ``azure``,
    ``azure_ad`` or ``azuread`` the ``deployment`` name is sent as ``engine``
    and the default batch size drops to 16.

    Example:
        .. code-block:: python

            embeddings = AverageEmbeddingsAPI(
                openai_api_key="your-api-key",
                openai_api_base="http://127.0.0.1:5001",
                tiktoken_model_name="BAAI/bge-large-en-v1.5",
            )
            query_result = embeddings.embed_query("This is a test query.")
    """

    client: Any = None  #: :meta private:
    model: str = "text-embedding-ada-002"
    # Defaults to the default model name; to support Azure OpenAI Service
    # custom deployment names.
    deployment: str = model
    openai_api_version: str | None = None
    # to support Azure OpenAI Service custom endpoints
    openai_api_base: str | None = None
    # to support Azure OpenAI Service custom endpoints
    openai_api_type: str | None = None
    # to support explicit proxy for OpenAI
    openai_proxy: str | None = None
    embedding_ctx_length: int = 8191
    """The maximum number of tokens to embed at once."""
    openai_api_key: str | None = None
    openai_organization: str | None = None
    allowed_special: Literal["all"] | set[str] = set()
    disallowed_special: Literal["all"] | set[str] | Sequence[str] = "all"
    chunk_size: int = 1000
    """Maximum number of texts to embed in each batch"""
    max_retries: int = 6
    """Maximum number of attempts to make when generating."""
    request_timeout: float | tuple[float, float] | None = None
    """Timeout in seconds for the OpenAI request."""
    headers: Any = None
    tiktoken_model_name: str | None = None
    """Name of the tokenizer to load for counting and splitting tokens.

    When set to None, the embedding model name (``model``) is used. Set it
    when the served model name is not a valid tokenizer identifier, e.g. for
    Azure deployments or other providers exposing an OpenAI-like API."""
    show_progress_bar: bool = False
    """Whether to show a progress bar when embedding."""
    model_kwargs: dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for `create` call not explicitly specified."""
    skip_empty: bool = False
    """Whether to skip empty strings when embedding or raise an error.
    Defaults to not skipping."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator(pre=True)
    def build_extra(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Move constructor arguments that are not declared fields into ``model_kwargs``.

        Raises:
            ValueError: If a name is supplied both directly and inside
                ``model_kwargs``, or if ``model_kwargs`` contains a declared
                field name.
        """
        all_required_field_names = get_pydantic_field_names(cls)
        extra = values.get("model_kwargs", {})
        for field_name in list(values):
            if field_name in extra:
                raise ValueError(f"Found {field_name} supplied twice.")
            if field_name not in all_required_field_names:
                warnings.warn(
                    f"WARNING! {field_name} is not default parameter.\n"
                    f"{field_name} was transferred to model_kwargs.\n"
                    f"Please confirm that {field_name} is what you intended."
                )
                extra[field_name] = values.pop(field_name)

        invalid_model_kwargs = all_required_field_names.intersection(extra.keys())
        if invalid_model_kwargs:
            raise ValueError(
                f"Parameters {invalid_model_kwargs} should be specified explicitly. "
                f"Instead they were passed in as part of `model_kwargs` parameter."
            )

        values["model_kwargs"] = extra
        return values

    @root_validator(pre=True)
    def validate_environment(cls, values: dict) -> dict:
        """Fill connection settings from the environment and attach the openai client.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        values["openai_api_key"] = get_from_dict_or_env(values, "openai_api_key", "OPENAI_API_KEY")
        values["openai_api_base"] = get_from_dict_or_env(
            values,
            "openai_api_base",
            "OPENAI_API_BASE",
            default="",
        )
        values["openai_api_type"] = get_from_dict_or_env(
            values,
            "openai_api_type",
            "OPENAI_API_TYPE",
            default="",
        )
        values["openai_proxy"] = get_from_dict_or_env(
            values,
            "openai_proxy",
            "OPENAI_PROXY",
            default="",
        )
        if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
            default_api_version = "2022-12-01"
            # Azure OpenAI embedding models allow a maximum of 16 texts
            # at a time in each batch
            # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
            default_chunk_size = 16
        else:
            default_api_version = ""
            default_chunk_size = 1000
        values["openai_api_version"] = get_from_dict_or_env(
            values,
            "openai_api_version",
            "OPENAI_API_VERSION",
            default=default_api_version,
        )
        values["openai_organization"] = get_from_dict_or_env(
            values,
            "openai_organization",
            "OPENAI_ORGANIZATION",
            default="",
        )
        if "chunk_size" not in values:
            values["chunk_size"] = default_chunk_size
        try:
            import openai

            values["client"] = openai.Embedding
        except ImportError as err:
            raise ImportError(
                "Could not import openai python package. " "Please install it with `pip install openai`."
            ) from err
        return values

    @property
    def _invocation_params(self) -> dict:
        """Keyword arguments sent with every embeddings request."""
        openai_args = {
            "model": self.model,
            "request_timeout": self.request_timeout,
            "headers": self.headers,
            "api_key": self.openai_api_key,
            "organization": self.openai_organization,
            "api_base": self.openai_api_base,
            "api_type": self.openai_api_type,
            "api_version": self.openai_api_version,
            **self.model_kwargs,
        }
        if self.openai_api_type in ("azure", "azure_ad", "azuread"):
            openai_args["engine"] = self.deployment
        # TODO: ``openai_proxy`` is accepted but currently not applied to the
        # request. The module-level 'openai.proxy' option isn't read in the
        # client API; it has to be passed when the client is instantiated.
        return openai_args

    def _tokenize_into_windows(self, texts: list[str]) -> tuple[Any, list[list[int]], list[int]]:
        """Tokenize ``texts`` and cut them into context-sized windows.

        Returns:
            ``(tokenizer, windows, owners)`` where ``windows[k]`` is a list of
            token ids no longer than ``embedding_ctx_length`` and ``owners[k]``
            is the index in ``texts`` it was cut from.

        Raises:
            ImportError: If ``transformers`` is not installed.
        """
        try:
            from transformers import AutoTokenizer
        except ImportError as err:
            raise ImportError(
                "Could not import AutoTokenizer python package. "
                "This is needed in order to for AverageEmbeddingsAPI. "
                "Please install it with `pip install transformers tokenizers`."
            ) from err

        # NOTE: the tokenizer is loaded on every call.
        tokenizer = AutoTokenizer.from_pretrained(self.tiktoken_model_name or self.model)
        strip_newlines = self.model.endswith("001")
        windows: list[list[int]] = []
        owners: list[int] = []
        for text_index, text in enumerate(texts):
            if strip_newlines:
                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
                # replace newlines, which can negatively affect performance.
                text = text.replace("\n", " ")
            # NOTE(review): encode() is called with the tokenizer's defaults, so
            # any special tokens it adds presumably end up in the decoded text
            # sent to the server -- confirm this is intended.
            token_ids = tokenizer.encode(text)
            for start in range(0, len(token_ids), self.embedding_ctx_length):
                windows.append(token_ids[start : start + self.embedding_ctx_length])
                owners.append(text_index)
        return tokenizer, windows, owners

    def _batch_starts(self, total: int, batch_size: int) -> Any:
        """Iterate over batch start offsets, wrapped in tqdm when requested and available."""
        starts = range(0, total, batch_size)
        if not self.show_progress_bar:
            return starts
        try:
            from tqdm.auto import tqdm
        except ImportError:
            return starts
        return tqdm(starts)

    def _group_by_text(
        self,
        text_count: int,
        owners: list[int],
        windows: list[list[int]],
        window_embeddings: list[list[float]],
    ) -> tuple[list[list[list[float]]], list[list[int]]]:
        """Collect window embeddings and their token counts per source text.

        Length-1 ("empty") embeddings are dropped when ``skip_empty`` is set so
        that they cannot break the weighted average.
        """
        grouped: list[list[list[float]]] = [[] for _ in range(text_count)]
        weights: list[list[int]] = [[] for _ in range(text_count)]
        for position, owner in enumerate(owners):
            vector = window_embeddings[position]
            if self.skip_empty and len(vector) == 1:
                continue
            grouped[owner].append(vector)
            weights[owner].append(len(windows[position]))
        return grouped, weights

    @staticmethod
    def _unit_vector(vector: Any) -> list[float]:
        """Scale ``vector`` to unit L2 norm and return it as a plain list."""
        array = np.asarray(vector)
        return (array / np.linalg.norm(array)).tolist()

    # please refer to
    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
    def _get_len_safe_embeddings(
        self, texts: list[str], *, engine: str, chunk_size: int | None = None
    ) -> list[list[float]]:
        """Embed ``texts`` of any length by averaging context-sized windows.

        Args:
            texts: Texts to embed.
            engine: Accepted for interface compatibility; not used here.
            chunk_size: Number of windows per request. Falsy values fall back
                to ``self.chunk_size``.

        Returns:
            One unit-length embedding per input text, in input order.
        """
        tokenizer, windows, owners = self._tokenize_into_windows(texts)
        batch_size = chunk_size or self.chunk_size

        window_embeddings: list[list[float]] = []
        for start in self._batch_starts(len(windows), batch_size):
            response = embed_with_retry(
                self,
                # Windows are decoded back to text before being sent; this is
                # what differs from langchain's implementation, which sends
                # token ids.
                input=tokenizer.batch_decode(windows[start : start + batch_size]),
                **self._invocation_params,
            )
            window_embeddings.extend(r["embedding"] for r in response["data"])

        grouped, weights = self._group_by_text(len(texts), owners, windows, window_embeddings)
        embeddings: list[list[float]] = []
        for vectors, token_counts in zip(grouped, weights):
            if vectors:
                average = np.average(vectors, axis=0, weights=token_counts)
            else:
                # Nothing usable for this text: fall back to the embedding of
                # the empty string.
                average = embed_with_retry(
                    self,
                    input="",
                    **self._invocation_params,
                )["data"][0]["embedding"]
            embeddings.append(self._unit_vector(average))
        return embeddings

    # please refer to
    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
    async def _aget_len_safe_embeddings(
        self, texts: list[str], *, engine: str, chunk_size: int | None = None
    ) -> list[list[float]]:
        """Async counterpart of ``_get_len_safe_embeddings``.

        Uses the same tokenizer, decoded-text inputs and ``skip_empty``
        handling as the synchronous path, so both return the same kind of
        result for the same texts.

        Args:
            texts: Texts to embed.
            engine: Accepted for interface compatibility; not used here.
            chunk_size: Number of windows per request. Falsy values fall back
                to ``self.chunk_size``.

        Returns:
            One unit-length embedding per input text, in input order.
        """
        tokenizer, windows, owners = self._tokenize_into_windows(texts)
        batch_size = chunk_size or self.chunk_size

        window_embeddings: list[list[float]] = []
        for start in self._batch_starts(len(windows), batch_size):
            response = await async_embed_with_retry(
                self,
                input=tokenizer.batch_decode(windows[start : start + batch_size]),
                **self._invocation_params,
            )
            window_embeddings.extend(r["embedding"] for r in response["data"])

        grouped, weights = self._group_by_text(len(texts), owners, windows, window_embeddings)
        embeddings: list[list[float]] = []
        for vectors, token_counts in zip(grouped, weights):
            if vectors:
                average = np.average(vectors, axis=0, weights=token_counts)
            else:
                fallback = await async_embed_with_retry(
                    self,
                    input="",
                    **self._invocation_params,
                )
                average = fallback["data"][0]["embedding"]
            embeddings.append(self._unit_vector(average))
        return embeddings

    def embed_documents(self, texts: list[str], chunk_size: int | None = 0) -> list[list[float]]:
        """Call out to the embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None or 0, will use
                the chunk size specified by the class.

        Returns:
            List of embeddings, one for each text.
        """
        # NOTE: to keep things simple, we assume the list may contain texts longer
        # than the maximum context and use length-safe embedding function.
        return self._get_len_safe_embeddings(texts, engine=self.deployment, chunk_size=chunk_size)

    async def aembed_documents(self, texts: list[str], chunk_size: int | None = 0) -> list[list[float]]:
        """Call out to the embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None or 0, will use
                the chunk size specified by the class.

        Returns:
            List of embeddings, one for each text.
        """
        # NOTE: to keep things simple, we assume the list may contain texts longer
        # than the maximum context and use length-safe embedding function.
        return await self._aget_len_safe_embeddings(texts, engine=self.deployment, chunk_size=chunk_size)

    def embed_query(self, text: str) -> list[float]:
        """Call out to the embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        return self.embed_documents([text])[0]

    async def aembed_query(self, text: str) -> list[float]:
        """Call out to the embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        embeddings = await self.aembed_documents([text])
        return embeddings[0]
class HFInferenceServerEmbeddings(Embeddings):
    """Embeddings backed by an HTTP inference endpoint.

    Each text is POSTed as ``{"inputs": text}`` to ``endpoint`` and the server
    is expected to answer with a JSON list holding exactly one embedding.
    """

    def __init__(self, model_id="BAAI/bge-large-en-v1.5", endpoint=None, tokenizer=None, timeout=None):
        """Store the connection settings.

        Args:
            model_id: Model identifier; stored but not sent with requests.
            endpoint: URL the embedding requests are POSTed to.
            tokenizer: Optional tokenizer; stored but currently unused.
            timeout: Optional timeout in seconds handed to ``requests.post``.
                ``None`` (the default) waits indefinitely.
        """
        self.model_id = model_id
        self.endpoint = endpoint
        self.tokenizer = tokenizer
        self.timeout = timeout

    def embed_query(self, text):
        """Embed a single text.

        Returns:
            The single embedding contained in the server response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            AssertionError: If the response does not contain exactly one
                embedding.
        """
        # TODO: split long texts into context-sized pieces, embed each piece
        # and average the results.
        payload = {"inputs": text}
        headers = {"Content-Type": "application/json"}
        response = requests.post(self.endpoint, json=payload, headers=headers, timeout=self.timeout)
        response.raise_for_status()  # Ensure the request was successful
        vectors = response.json()
        if len(vectors) != 1:
            # Raised explicitly (rather than via ``assert``) so the check also
            # runs under ``python -O``; the exception type is kept for callers.
            raise AssertionError(
                "Critical error: something is wrong, there should only be one embedding here"
            )
        return vectors[0]

    def embed_documents(self, documents):
        """Embed every document with one request each, preserving input order."""
        return [self.embed_query(document) for document in documents]
from llama_index.embeddings import TextEmbeddingsInference
# from openai.datalib.numpy_helper import numpy as np
# TODO: don't write your own API client, instead use the OPENAI one and make your modifications
class TextEmbeddingsInferenceLangchain(Embeddings):
    """LangChain ``Embeddings`` adapter around llama_index's ``TextEmbeddingsInference`` client."""

    def __init__(self, model_id="BAAI/bge-large-en-v1.5", api_base="http://localhost:8080", retry_count=5):
        """Create the underlying client.

        Args:
            model_id: Model name passed to the client.
            api_base: Base URL of the embeddings server.
            retry_count: Number of attempts ``embed_query`` makes; must be
                positive.

        Raises:
            AssertionError: If ``retry_count`` is not positive.
        """
        self.model_id = model_id
        if not retry_count > 0:
            # Raised explicitly (rather than via ``assert``) so the check also
            # runs under ``python -O``; the exception type is kept for callers.
            raise AssertionError(f"retry_count must be positive, got {retry_count}")
        self.retry_count = retry_count
        self.client = TextEmbeddingsInference(
            base_url=api_base,
            model_name=model_id,  # required for formatting inference text,
            timeout=7,  # timeout in seconds
            embed_batch_size=64,  # batch size for embedding
        )

    def embed_query(self, text):
        """Embed one text, retrying on HTTP errors.

        Returns:
            The embedding returned by the client.

        Raises:
            requests.exceptions.HTTPError: The last HTTP error, with the
                failure message prepended to its ``args``, once all attempts
                are used up.
            RuntimeError: If no attempt was made at all (``retry_count`` was
                changed to a non-positive value after construction).
        """
        failure_message = f"Failed to embed query: {text} after {self.retry_count} retries"
        # NOTE(review): only ``requests`` HTTP errors are retried -- confirm the
        # client actually raises this exception type.
        for attempt in range(self.retry_count):
            try:
                return self.client.get_text_embedding(text)
            except requests.exceptions.HTTPError as err:
                if attempt == self.retry_count - 1:
                    err.args = (failure_message, *err.args)
                    raise
        # Only reachable when the loop body never ran. The original code
        # referenced the ``except`` variable here, which is unbound at this
        # point and would have raised NameError.
        raise RuntimeError(failure_message)

    def embed_documents(self, documents):
        """Embed every document with one ``embed_query`` call each.

        Returns:
            List of embeddings in input order.

        Raises:
            AssertionError: If an embedding contains a non-float value.
        """
        # TODO: make sure to split text into chunks (max length is probably 100000 through experiment)
        results = []
        # Documents are walked in slices of 32; the slice start is only used
        # in the error message.
        for batch_start in range(0, len(documents), 32):
            for document in documents[batch_start : batch_start + 32]:
                embedding = self.embed_query(document)
                if not all(isinstance(value, float) for value in embedding):
                    raise AssertionError(f"Failed to embed batch documents[{batch_start}]: {document}")
                results.append(embedding)
        return results

    async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
        """Asynchronous Embed search docs."""
        return await self.client.aget_text_embedding_batch(texts)

    async def aembed_query(self, text: str) -> list[float]:
        """Asynchronous Embed query text."""
        return await self.client.aget_text_embedding(text)
if __name__ == "__main__":
    # Manual smoke test against a locally served, OpenAI-compatible embeddings
    # endpoint. Swap in TextEmbeddingsInferenceLangchain(api_base=...) or
    # HFInferenceServerEmbeddings() to exercise the other clients.
    embeddings = AverageEmbeddingsAPI(
        openai_api_key="your-api-key",
        openai_api_base="http://127.0.0.1:5001",
        max_retries=1,
        chunk_size=32,
        tiktoken_model_name="BAAI/bge-large-en-v1.5",
    )
    embeddings.client.OBJECT_NAME = "openai"

    # A deliberately long input so the text has to be split into several windows.
    text = "What is deep learning?" * 10000

    query_result = embeddings.embed_query(text)
    print("query_result", query_result)

    print("embedding documents")
    documents = [text] * 10000
    doc_result = embeddings.embed_documents(documents)
    print("doc_result", len(doc_result))
# (GitHub page footer from the web export, not part of the module:
# "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")