@alonsosilvaallende
Created June 18, 2024 17:50
Extractor-Qwen2-1_5B.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOMSJQiUqjtm63bPGe+Tute",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/alonsosilvaallende/f9eb7c4efb848fe8726c4046357ce92f/extractor-qwen2-1_5b.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "v7OisGnnYM_6"
},
"outputs": [],
"source": [
"%pip install --quiet --upgrade outlines"
]
},
{
"cell_type": "code",
"source": [
"from outlines.integrations.transformers import JSONPrefixAllowedTokens"
],
"metadata": {
"id": "-1J-GZrnYO4r"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from pydantic import BaseModel, Field\n",
"from typing import List\n",
"from transformers import pipeline"
],
"metadata": {
"id": "ABrzjt5JYY5n"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipe = pipeline(\"text-generation\", model=\"Qwen/Qwen2-1.5B-Instruct\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LM6E7FzwYd9g",
"outputId": "8b96a915-0da9-4872-ba04-d88ddd1e2933"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
"You will be able to reuse this secret in all of your notebooks.\n",
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
" warnings.warn(\n",
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"class Person(BaseModel):\n",
" first_name: str\n",
" surname: str\n",
"\n",
"class People(BaseModel):\n",
" people: List[Person] = Field(..., description=\"List of Person(s)\")\n",
"\n",
"prefix_allowed_tokens_fn = JSONPrefixAllowedTokens(\n",
" schema=People, tokenizer_or_pipe=pipe\n",
")\n",
"\n",
"results = pipe(\n",
" [\"He is Tom Jones and she is Linda Smith\"],\n",
" return_full_text=False,\n",
" do_sample=False,\n",
" max_new_tokens=50,\n",
" prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nBhDph2WYixh",
"outputId": "55981e4a-e83c-4d27-fc9d-aa56dd6eadbe"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Compiling FSM index for all state transitions: 100%|██████████| 70/70 [00:04<00:00, 14.56it/s]\n",
"/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:515: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.7` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:520: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:537: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.\n",
" warnings.warn(\n"
]
}
]
},
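{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because decoding was constrained to the `People` schema, the raw `generated_text` should already be valid JSON for that schema. The next cell is a minimal, optional round-trip check; it assumes Pydantic v2, where `model_validate_json` parses and validates a JSON string in one step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: parse the generated JSON back through the schema it was constrained to\n",
"validated = People.model_validate_json(results[0][0][\"generated_text\"])\n",
"validated.people"
]
},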
{
"cell_type": "code",
"source": [
"import json\n",
"\n",
"json.loads(results[0][0]['generated_text'])['people']"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0z37KdJYa1j-",
"outputId": "8716b4f6-9f2b-49fe-fbaa-444358e1b38a"
},
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'first_name': 'Tom', 'surname': 'Jones'},\n",
" {'first_name': 'Linda', 'surname': 'Smith'}]"
]
},
"metadata": {},
"execution_count": 6
}
]
}
]
}