rbiswasfc · October 14, 2024 15:47
diff --git a/05_jina_v3_lora_embed.ipynb b/05_jina_v3_lora_embed.ipynb
 {
  "cells": [
    {
      "metadata": {
        "trusted": false
      },
      "id": "b23c3628",
      "cell_type": "code",
      "source": "import os, re, numpy as np, torch, transformers\nfrom transformers import AutoModel, AutoTokenizer",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "c99431d9",
      "cell_type": "code",
      "source": "np.set_printoptions(precision=3, linewidth=200)\ntorch.set_printoptions(precision=3, linewidth=200)",
      "execution_count": 3,
      "outputs": []
    },
    {
      "metadata": {},
      "id": "b10d69ec",
      "cell_type": "markdown",
      "source": "### Model & Tokenizer"
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "5537a730",
      "cell_type": "code",
      "source": "%%capture\narch = \"jinaai/jina-embeddings-v3\"\ntok = AutoTokenizer.from_pretrained(arch)\nmodel = AutoModel.from_pretrained(arch, trust_remote_code=True).to(\"cuda\")",
      "execution_count": 5,
      "outputs": []
    },
    {
      "metadata": {},
      "id": "4e63f5bc",
      "cell_type": "markdown",
      "source": "### Examples & Task"
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "09705a0e",
      "cell_type": "code",
      "source": "texts = [\"Follow the white rabbit.\", \"Sigue al conejo blanco.\", \"This is a test.\"] # test samples\ntask = \"retrieval.passage\" # one of: \"retrieval.query\", \"retrieval.passage\", \"separation\", \"classification\", \"text-matching\"",
      "execution_count": 20,
      "outputs": []
    },
    {
      "metadata": {},
      "id": "169f1343",
      "cell_type": "markdown",
      "source": "### Get embeddings using model call"
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "e4c19ec3",
      "cell_type": "code",
      "source": "inp_texts = [model._task_instructions[task] + t for t in texts]\ninp_texts",
      "execution_count": 21,
      "outputs": [
        {
          "data": {
            "text/plain": "['Represent the document for retrieval: Follow the white rabbit.',\n 'Represent the document for retrieval: Sigue al conejo blanco.',\n 'Represent the document for retrieval: This is a test.']"
          },
          "execution_count": 21,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "32e7744a",
      "cell_type": "code",
      "source": "inputs = tok(inp_texts, padding=True, return_tensors=\"pt\")\ninputs = {k:v.to(\"cuda\") for k,v in inputs.items()}",
      "execution_count": 22,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "b1a57206",
      "cell_type": "code",
      "source": "task_id = model._adaptation_map[task]\nnum_examples = len(texts)\nadapter_mask = torch.full((num_examples,), task_id, dtype=torch.int32, device=model.device)\nlora_arguments = {\"adapter_mask\": adapter_mask}\nlora_arguments",
      "execution_count": 23,
      "outputs": [
        {
          "data": {
            "text/plain": "{'adapter_mask': tensor([1, 1, 1], device='cuda:0', dtype=torch.int32)}"
          },
          "execution_count": 23,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "c152abf4",
      "cell_type": "code",
      "source": "# get embeddings\nwith torch.inference_mode():\n    token_embs = model.roberta.forward(**inputs, **lora_arguments).last_hidden_state\n\ntoken_embs = token_embs.float() # Accumulate in fp32 to avoid overflow\nembeddings_a = model.roberta.mean_pooling(token_embs, inputs[\"attention_mask\"]) # mean_pooling\nembeddings_a = torch.nn.functional.normalize(embeddings_a, p=2, dim=1) # normalize_embeddings\nembeddings_a = embeddings_a.cpu().numpy() # convert to numpy\nembeddings_a[0][:10]",
      "execution_count": 24,
      "outputs": [
        {
          "data": {
            "text/plain": "array([-0.148, -0.016,  0.029, -0.026,  0.025,  0.028,  0.031,  0.019,  0.004, -0.036], dtype=float32)"
          },
          "execution_count": 24,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {},
      "id": "da421285",
      "cell_type": "markdown",
      "source": "### Get Embeddings using high level API"
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "46875b01",
      "cell_type": "code",
      "source": "embeddings_b = model.encode(texts, task=task)\nembeddings_b[0][:10]",
      "execution_count": 25,
      "outputs": [
        {
          "data": {
            "text/plain": "array([-0.148, -0.016,  0.029, -0.026,  0.025,  0.028,  0.031,  0.019,  0.004, -0.036], dtype=float32)"
          },
          "execution_count": 25,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {},
      "id": "5cea38ed",
      "cell_type": "markdown",
      "source": "### Test"
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "bb2f14a9",
      "cell_type": "code",
      "source": "np.testing.assert_allclose(embeddings_a, embeddings_b, rtol=1e-5, atol=1e-8)\nassert np.all(np.abs(embeddings_a - embeddings_b) < 1e-6), \"Embeddings are not exactly equal\"\nprint(\"Embeddings are equal within tolerance\")",
      "execution_count": 26,
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": "Embeddings are equal within tolerance\n"
        }
      ]
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "3928ebcd",
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "18323907",
      "cell_type": "code",
      "source": "# code_dir =\"~/.cache/huggingface/modules/transformers_modules/jinaai/xlm-roberta-flash-implementation/12700ba4972d9e900313a85ae855f5a76fb9500e\"\n# !cat $code_dir/modeling_xlm_roberta.py",
      "execution_count": 103,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "d5cc0762",
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": false
      },
      "id": "81c702b8",
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3 (ipykernel)",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.15",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "gist": {
      "id": "",
      "data": {
        "description": "05_jina_v3_lora_embed.ipynb",
        "public": true
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"metadata": {
	"trusted": false
	},
	"id": "b23c3628",
	"cell_type": "code",
	"source": "import os, re, numpy as np, torch, transformers\nfrom transformers import AutoModel, AutoTokenizer",
	"execution_count": 2,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "c99431d9",
	"cell_type": "code",
	"source": "np.set_printoptions(precision=3, linewidth=200)\ntorch.set_printoptions(precision=3, linewidth=200)",
	"execution_count": 3,
	"outputs": []
	},
	{
	"metadata": {},
	"id": "b10d69ec",
	"cell_type": "markdown",
	"source": "### Model & Tokenizer"
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "5537a730",
	"cell_type": "code",
	"source": "%%capture\narch = \"jinaai/jina-embeddings-v3\"\ntok = AutoTokenizer.from_pretrained(arch)\nmodel = AutoModel.from_pretrained(arch, trust_remote_code=True).to(\"cuda\")",
	"execution_count": 5,
	"outputs": []
	},
	{
	"metadata": {},
	"id": "4e63f5bc",
	"cell_type": "markdown",
	"source": "### Examples & Task"
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "09705a0e",
	"cell_type": "code",
	"source": "texts = [\"Follow the white rabbit.\", \"Sigue al conejo blanco.\", \"This is a test.\"] # test samples\ntask = \"retrieval.passage\" # one of: \"retrieval.query\", \"retrieval.passage\", \"separation\", \"classification\", \"text-matching\"",
	"execution_count": 20,
	"outputs": []
	},
	{
	"metadata": {},
	"id": "169f1343",
	"cell_type": "markdown",
	"source": "### Get embeddings using model call"
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "e4c19ec3",
	"cell_type": "code",
	"source": "inp_texts = [model._task_instructions[task] + t for t in texts]\ninp_texts",
	"execution_count": 21,
	"outputs": [
	{
	"data": {
	"text/plain": "['Represent the document for retrieval: Follow the white rabbit.',\n 'Represent the document for retrieval: Sigue al conejo blanco.',\n 'Represent the document for retrieval: This is a test.']"
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "32e7744a",
	"cell_type": "code",
	"source": "inputs = tok(inp_texts, padding=True, return_tensors=\"pt\")\ninputs = {k:v.to(\"cuda\") for k,v in inputs.items()}",
	"execution_count": 22,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "b1a57206",
	"cell_type": "code",
	"source": "task_id = model._adaptation_map[task]\nnum_examples = len(texts)\nadapter_mask = torch.full((num_examples,), task_id, dtype=torch.int32, device=model.device)\nlora_arguments = {\"adapter_mask\": adapter_mask}\nlora_arguments",
	"execution_count": 23,
	"outputs": [
	{
	"data": {
	"text/plain": "{'adapter_mask': tensor([1, 1, 1], device='cuda:0', dtype=torch.int32)}"
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "c152abf4",
	"cell_type": "code",
	"source": "# get embeddings\nwith torch.inference_mode():\n token_embs = model.roberta.forward(inputs, lora_arguments).last_hidden_state\n\ntoken_embs = token_embs.float() # Accumulate in fp32 to avoid overflow\nembeddings_a = model.roberta.mean_pooling(token_embs, inputs[\"attention_mask\"]) # mean_pooling\nembeddings_a = torch.nn.functional.normalize(embeddings_a, p=2, dim=1) # normalize_embeddings\nembeddings_a = embeddings_a.cpu().numpy() # convert to numpy\nembeddings_a[0][:10]",
	"execution_count": 24,
	"outputs": [
	{
	"data": {
	"text/plain": "array([-0.148, -0.016, 0.029, -0.026, 0.025, 0.028, 0.031, 0.019, 0.004, -0.036], dtype=float32)"
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {},
	"id": "da421285",
	"cell_type": "markdown",
	"source": "### Get Embeddings using high level API"
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "46875b01",
	"cell_type": "code",
	"source": "embeddings_b = model.encode(texts, task=task)\nembeddings_b[0][:10]",
	"execution_count": 25,
	"outputs": [
	{
	"data": {
	"text/plain": "array([-0.148, -0.016, 0.029, -0.026, 0.025, 0.028, 0.031, 0.019, 0.004, -0.036], dtype=float32)"
	},
	"execution_count": 25,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {},
	"id": "5cea38ed",
	"cell_type": "markdown",
	"source": "### Test"
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "bb2f14a9",
	"cell_type": "code",
	"source": "np.testing.assert_allclose(embeddings_a, embeddings_b, rtol=1e-5, atol=1e-8)\nassert np.all(np.abs(embeddings_a - embeddings_b) < 1e-6), \"Embeddings are not exactly equal\"\nprint(\"Embeddings are equal within tolerance\")",
	"execution_count": 26,
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": "Embeddings are equal within tolerance\n"
	}
	]
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "3928ebcd",
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "18323907",
	"cell_type": "code",
	"source": "# code_dir =\"~/.cache/huggingface/modules/transformers_modules/jinaai/xlm-roberta-flash-implementation/12700ba4972d9e900313a85ae855f5a76fb9500e\"\n# !cat $code_dir/modeling_xlm_roberta.py",
	"execution_count": 103,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "d5cc0762",
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": false
	},
	"id": "81c702b8",
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3 (ipykernel)",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.10.15",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"gist": {
	"id": "",
	"data": {
	"description": "05_jina_v3_lora_embed.ipynb",
	"public": true
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}