Chapter 0 - Unofficial Multi-GPU version
{
"cells": [
{
"cell_type": "markdown",
"id": "8f1077b2",
"metadata": {},
"source": [
"## Chapter 0: TL;DR [Unofficial Multi-GPU version]"
]
},
{
"cell_type": "markdown",
"id": "b02138f6",
"metadata": {},
"source": [
"### 1. Setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0909e283-298f-4e59-8b68-f4785fb02bc9",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting transformers\n",
" Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting datasets\n",
" Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)\n",
"Collecting peft\n",
" Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)\n",
"Collecting trl\n",
" Downloading trl-0.14.0-py3-none-any.whl.metadata (12 kB)\n",
"Collecting bitsandbytes\n",
" Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)\n",
"Collecting accelerate\n",
" Downloading accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)\n",
"Requirement already satisfied: filelock in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from transformers) (3.13.1)\n",
"Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)\n",
" Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)\n",
"Requirement already satisfied: numpy>=1.17 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from transformers) (1.26.4)\n",
"Requirement already satisfied: packaging>=20.0 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from transformers) (24.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from transformers) (6.0.1)\n",
"Collecting regex!=2019.12.17 (from transformers)\n",
" Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.5/40.5 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: requests in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from transformers) (2.31.0)\n",
"Collecting tokenizers<0.22,>=0.21 (from transformers)\n",
" Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
"Collecting safetensors>=0.4.1 (from transformers)\n",
" Downloading safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
"Collecting tqdm>=4.27 (from transformers)\n",
" Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.7/57.7 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pyarrow>=15.0.0 (from datasets)\n",
" Downloading pyarrow-19.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)\n",
"Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
" Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
"Collecting pandas (from datasets)\n",
" Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.9/89.9 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting requests (from transformers)\n",
" Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)\n",
"Collecting xxhash (from datasets)\n",
" Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
"Collecting multiprocess<0.70.17 (from datasets)\n",
" Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
"Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
" Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
"Collecting aiohttp (from datasets)\n",
" Downloading aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n",
"Requirement already satisfied: psutil in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from peft) (5.9.8)\n",
"Requirement already satisfied: torch>=1.13.0 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from peft) (2.2.1)\n",
"Collecting rich (from trl)\n",
" Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)\n",
"Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)\n",
" Downloading aiohappyeyeballs-2.4.4-py3-none-any.whl.metadata (6.1 kB)\n",
"Collecting aiosignal>=1.1.2 (from aiohttp->datasets)\n",
" Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)\n",
"Collecting async-timeout<6.0,>=4.0 (from aiohttp->datasets)\n",
" Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)\n",
"Collecting attrs>=17.3.0 (from aiohttp->datasets)\n",
" Downloading attrs-25.1.0-py3-none-any.whl.metadata (10 kB)\n",
"Collecting frozenlist>=1.1.1 (from aiohttp->datasets)\n",
" Downloading frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)\n",
"Collecting multidict<7.0,>=4.5 (from aiohttp->datasets)\n",
" Downloading multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)\n",
"Collecting propcache>=0.2.0 (from aiohttp->datasets)\n",
" Downloading propcache-0.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)\n",
"Collecting yarl<2.0,>=1.17.0 (from aiohttp->datasets)\n",
" Downloading yarl-1.18.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (69 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.2/69.2 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (4.10.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from requests->transformers) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from requests->transformers) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from requests->transformers) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from requests->transformers) (2024.2.2)\n",
"Requirement already satisfied: sympy in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from torch>=1.13.0->peft) (1.12)\n",
"Requirement already satisfied: networkx in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.2.1)\n",
"Requirement already satisfied: jinja2 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.1.3)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from pandas->datasets) (2.9.0)\n",
"Collecting pytz>=2020.1 (from pandas->datasets)\n",
" Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)\n",
"Collecting tzdata>=2022.7 (from pandas->datasets)\n",
" Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
"Collecting markdown-it-py>=2.2.0 (from rich->trl)\n",
" Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from rich->trl) (2.17.2)\n",
"Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich->trl)\n",
" Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n",
"Requirement already satisfied: six>=1.5 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft) (2.1.5)\n",
"Requirement already satisfied: mpmath>=0.19 in /opt/micromamba/envs/python_310/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft) (1.3.0)\n",
"Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/9.7 MB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m:00:01\u001b[0m\n",
"\u001b[?25hDownloading peft-0.14.0-py3-none-any.whl (374 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m374.8/374.8 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading trl-0.14.0-py3-none-any.whl (313 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m313.9/313.9 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.7/69.7 MB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hDownloading accelerate-1.3.0-py3-none-any.whl (336 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m336.6/336.6 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
"\u001b[?25hDownloading huggingface_hub-0.28.1-py3-none-any.whl (464 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m464.1/464.1 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyarrow-19.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (42.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.1/42.1 MB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
"\u001b[?25hDownloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.7/781.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hDownloading requests-2.32.3-py3-none-any.whl (64 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.9/64.9 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (461 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.0/462.0 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading tqdm-4.67.1-py3-none-any.whl (78 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hDownloading rich-13.9.4-py3-none-any.whl (242 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m242.4/242.4 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading aiohappyeyeballs-2.4.4-py3-none-any.whl (14 kB)\n",
"Downloading aiosignal-1.3.2-py2.py3-none-any.whl (7.6 kB)\n",
"Downloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)\n",
"Downloading attrs-25.1.0-py3-none-any.whl (63 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.2/63.2 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (241 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.9/241.9 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (124 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.6/124.6 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading propcache-0.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (205 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m205.1/205.1 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pytz-2025.1-py2.py3-none-any.whl (507 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.9/507.9 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m346.8/346.8 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hDownloading yarl-1.18.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (319 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
"\u001b[?25hDownloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
"Installing collected packages: pytz, xxhash, tzdata, tqdm, safetensors, requests, regex, pyarrow, propcache, multidict, mdurl, fsspec, frozenlist, dill, attrs, async-timeout, aiohappyeyeballs, yarl, pandas, multiprocess, markdown-it-py, huggingface-hub, aiosignal, tokenizers, rich, bitsandbytes, aiohttp, accelerate, transformers, peft, datasets, trl\n",
" Attempting uninstall: requests\n",
" Found existing installation: requests 2.31.0\n",
" Uninstalling requests-2.31.0:\n",
" Successfully uninstalled requests-2.31.0\n",
"Successfully installed accelerate-1.3.0 aiohappyeyeballs-2.4.4 aiohttp-3.11.11 aiosignal-1.3.2 async-timeout-5.0.1 attrs-25.1.0 bitsandbytes-0.45.1 datasets-3.2.0 dill-0.3.8 frozenlist-1.5.0 fsspec-2024.9.0 huggingface-hub-0.28.1 markdown-it-py-3.0.0 mdurl-0.1.2 multidict-6.1.0 multiprocess-0.70.16 pandas-2.2.3 peft-0.14.0 propcache-0.2.1 pyarrow-19.0.0 pytz-2025.1 regex-2024.11.6 requests-2.32.3 rich-13.9.4 safetensors-0.5.2 tokenizers-0.21.0 tqdm-4.67.1 transformers-4.48.2 trl-0.14.0 tzdata-2025.1 xxhash-3.5.0 yarl-1.18.3\n"
]
}
],
"source": [
"!pip install transformers datasets peft trl bitsandbytes accelerate"
]
},
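{
"cell_type": "markdown",
"id": "pinned-versions-note",
"metadata": {},
"source": [
"These libraries evolve quickly (the `SFTTrainer` API in particular has changed across `trl` releases), so if the code below fails with newer releases, pinning the exact versions from the install log above should reproduce this run:\n",
"\n",
"```\n",
"pip install transformers==4.48.2 datasets==3.2.0 peft==0.14.0 trl==0.14.0 bitsandbytes==0.45.1 accelerate==1.3.0\n",
"```"
]
},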
{
"cell_type": "code",
"execution_count": 2,
"id": "c352416e",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from accelerate.utils import write_basic_config\n",
"\n",
"# Set the visible devices here\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = '0,1'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ca99302f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/user/.cache/huggingface/accelerate/default_config.yaml')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Removes any existing config [adjust the path for your username]\n",
"!rm -rf /home/user/.cache/huggingface/accelerate/default_config.yaml\n",
"\n",
"# Writes a basic Accelerate configuration for multi-GPU training\n",
"write_basic_config()"
]
},
{
"cell_type": "markdown",
"id": "4a5f5929",
"metadata": {},
"source": [
"Let's check the configuration: `num_processes` should match the number of visible devices you set. Notice the `main_training_function` name, which we'll have to implement later."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d3def2d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"compute_environment\": \"LOCAL_MACHINE\",\n",
" \"debug\": false,\n",
" \"distributed_type\": \"MULTI_GPU\",\n",
" \"downcast_bf16\": false,\n",
" \"enable_cpu_affinity\": false,\n",
" \"machine_rank\": 0,\n",
" \"main_training_function\": \"main\",\n",
" \"mixed_precision\": \"no\",\n",
" \"num_machines\": 1,\n",
" \"num_processes\": 2,\n",
" \"rdzv_backend\": \"static\",\n",
" \"same_network\": false,\n",
" \"tpu_use_cluster\": false,\n",
" \"tpu_use_sudo\": false,\n",
" \"use_cpu\": false\n",
"}\n"
]
}
],
"source": [
"!cat /home/user/.cache/huggingface/accelerate/default_config.yaml"
]
},
{
"cell_type": "markdown",
"id": "c2542ec5",
"metadata": {},
"source": [
"***\n",
"**==> RESTART THE NOTEBOOK!**\n",
"***"
]
},
{
"cell_type": "markdown",
"id": "90df85ff",
"metadata": {},
"source": [
"### 2. Downloading the Model to the HF Cache"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4d8bbd4e",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a7cab400fe6a417fb069007d5fe65493",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "15b7853934c24ffa8095a622d5c61539",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model-00001-of-00002.safetensors: 34%|###4 | 1.71G/4.97G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f08f2650640c4e44873a5a1cbf922467",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model-00002-of-00002.safetensors: 0%| | 0.00/2.67G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "86237206306541a98f7cc159c3ebc851",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4127d029a0624874beb3a3edf3bdb1bc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"generation_config.json: 0%| | 0.00/181 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoModelForCausalLM\n",
"\n",
"repo_id = 'microsoft/Phi-3-mini-4k-instruct'\n",
"\n",
"# Downloads the model to the Hugging Face cache, so it doesn't have to be downloaded during training\n",
"model = AutoModelForCausalLM.from_pretrained(repo_id, device_map='cpu')\n",
"\n",
"# Removes the model, as it will be loaded independently by each process\n",
"del model"
]
},
{
"cell_type": "markdown",
"id": "bfb0bb04",
"metadata": {},
"source": [
"### 3. Loading Tokenizer and Dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "10eefeff",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer\n",
"\n",
"repo_id = 'microsoft/Phi-3-mini-4k-instruct'\n",
"tokenizer = AutoTokenizer.from_pretrained(repo_id)\n",
"\n",
"dataset = load_dataset(\"dvgodoy/yoda_sentences\", split=\"train\")\n",
"dataset = dataset.rename_column(\"sentence\", \"prompt\")\n",
"dataset = dataset.rename_column(\"translation_extra\", \"completion\")\n",
"dataset = dataset.remove_columns([\"translation\"])"
]
},
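{
"cell_type": "markdown",
"id": "dataset-check-note",
"metadata": {},
"source": [
"A quick sanity check (this cell was not part of the original run): after the renaming, each example should expose `prompt` and `completion` keys, the prompt-completion format that `SFTTrainer` understands."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dataset-check",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the columns and the first example [optional sanity check]\n",
"print(dataset.column_names)\n",
"print(dataset[0])"
]
},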
{
"cell_type": "markdown",
"id": "ad5e0d06",
"metadata": {},
"source": [
"### 4. Main Training Function"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5342d605",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"# https://github.com/huggingface/alignment-handbook/blob/main/src/alignment/model_utils.py\n",
"def get_current_device():\n",
"    from accelerate import Accelerator\n",
"    #### Each process will have its own index, corresponding to a different GPU\n",
"    return Accelerator().local_process_index if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# https://github.com/huggingface/alignment-handbook/blob/main/src/alignment/model_utils.py\n",
"def get_kbit_device_map():\n",
"    return {\"\": get_current_device()} if torch.cuda.is_available() else None\n",
"\n",
"def main(repo_id, tokenizer, dataset):\n",
"    from transformers import BitsAndBytesConfig\n",
"    from trl import SFTConfig, SFTTrainer\n",
"    from peft import LoraConfig\n",
"    from accelerate import Accelerator\n",
"    import torch\n",
"\n",
"    ### MODEL\n",
"    #### Quantization Config\n",
"    bnb_config = BitsAndBytesConfig(\n",
"        load_in_4bit=True,\n",
"        bnb_4bit_quant_type=\"nf4\",\n",
"        bnb_4bit_use_double_quant=True,\n",
"        bnb_4bit_compute_dtype=torch.float32\n",
"    )\n",
"\n",
"    ### ORIGINAL MODEL\n",
"    # model = AutoModelForCausalLM.from_pretrained(repo_id,\n",
"    #                                              device_map=\"cuda:0\",\n",
"    #                                              quantization_config=bnb_config\n",
"    #                                              )\n",
"    #### Any other model arguments should be specified here\n",
"    #### The model itself will be created by SFTTrainer using the repo_id and these args\n",
"    model_kwargs = dict(\n",
"        device_map=get_kbit_device_map(),  # dynamically assigns this process's GPU\n",
"        quantization_config=bnb_config,\n",
"    )\n",
"    ### LoRA CONFIG\n",
"    #### The SFTTrainer will apply the config and call the prepare_model_for_kbit_training() method\n",
"    config = LoraConfig(\n",
"        r=8,  # the rank of the adapter; the lower it is, the fewer parameters you'll need to train\n",
"        lora_alpha=16,  # multiplier, usually 2*r\n",
"        bias=\"none\",  # BEWARE: training biases *modifies* the base model's behavior\n",
"        lora_dropout=0.05,\n",
"        task_type=\"CAUSAL_LM\",\n",
"        # Newer models, such as Phi-3 at the time of writing, may require\n",
"        # manually setting the target modules\n",
"        target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],\n",
"    )\n",
"\n",
"    ### SFTConfig\n",
"    sft_config = SFTConfig(\n",
"        ### Model initialization args, so SFTTrainer can dynamically create the\n",
"        ### model and assign it to the right GPU\n",
"        model_init_kwargs=model_kwargs,\n",
"\n",
"        ## GROUP 1: Memory usage\n",
"        # These arguments will squeeze the most out of your GPU's RAM\n",
"        # Checkpointing\n",
"        gradient_checkpointing=True,\n",
"        # this saves a LOT of memory\n",
"        # Set this to avoid exceptions in newer versions of PyTorch\n",
"        gradient_checkpointing_kwargs={'use_reentrant': False},\n",
"        # Gradient Accumulation / Batch size\n",
"        # The actual batch size (for updating) is the same (1x) as the micro-batch size\n",
"        gradient_accumulation_steps=1,\n",
"        # The initial (micro) batch size to start off with\n",
"        per_device_train_batch_size=16,\n",
"        # If the batch size would cause OOM, halves it until it works\n",
"        auto_find_batch_size=True,\n",
"\n",
"        ## GROUP 2: Dataset-related\n",
"        max_seq_length=64,\n",
"        # Dataset\n",
"        # packing a dataset means no padding is needed\n",
"        packing=True,\n",
"\n",
"        ## GROUP 3: These are typical training parameters\n",
"        num_train_epochs=10,\n",
"        learning_rate=3e-4,\n",
"        # Optimizer\n",
"        # 8-bit Adam optimizer - doesn't help much if you're using LoRA!\n",
"        optim='paged_adamw_8bit',\n",
"\n",
"        ## GROUP 4: Logging parameters\n",
"        logging_steps=10,\n",
"        logging_dir='./logs',\n",
"        output_dir='./phi3-mini-yoda-adapter',\n",
"        report_to='none'\n",
"    )\n",
"\n",
"    ### SFTTrainer\n",
"    trainer = SFTTrainer(\n",
"        model=repo_id,  ### this is a string, not a model\n",
"        processing_class=tokenizer,\n",
"        args=sft_config,\n",
"        train_dataset=dataset,\n",
"        peft_config=config,\n",
"    )\n",
"\n",
"    print(f'n_gpu: {sft_config.n_gpu}; Mode: {sft_config.parallel_mode}')\n",
"    print(f'Num Processes: {trainer.accelerator.num_processes}; Device: {trainer.accelerator.device}; Process Index: {trainer.accelerator.process_index}')\n",
"    print(f'Accel Type: {trainer.accelerator.distributed_type}')\n",
"\n",
"    ### Training\n",
"    train_result = trainer.train()\n",
"    ### Saving the model\n",
"    trainer.save_state()\n",
"    trainer.save_model(sft_config.output_dir)\n",
"    ### Saving the configuration\n",
"    if trainer.accelerator.is_main_process:\n",
"        trainer.model.config.save_pretrained(sft_config.output_dir)"
]
},
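{
"cell_type": "markdown",
"id": "ddp-batch-note",
"metadata": {},
"source": [
"Two details worth spelling out:\n",
"\n",
"- `get_kbit_device_map()` returns `{'': local_process_index}`: the empty-string key stands for the *whole* model, so each process loads its own full quantized copy onto its own GPU. This is data parallelism (DDP), not model sharding.\n",
"- The effective batch size is `per_device_train_batch_size` x number of processes x `gradient_accumulation_steps` = 16 x 2 x 1 = 32 sequences per optimizer step (assuming `auto_find_batch_size` doesn't have to shrink it)."
]
},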
{
"cell_type": "code",
"execution_count": 3,
"id": "0ab46ba0-1f88-4997-a71e-967a2c634cc8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Copy-and-paste the text below in your GitHub issue\n",
"\n",
"- `Accelerate` version: 1.3.0\n",
"- Platform: Linux-6.8.0-48-generic-x86_64-with-glibc2.35\n",
"- `accelerate` bash location: /opt/micromamba/envs/python_310/bin/accelerate\n",
"- Python version: 3.10.14\n",
"- Numpy version: 1.26.4\n",
"- PyTorch version (GPU?): 2.2.1 (True)\n",
"- PyTorch XPU available: False\n",
"- PyTorch NPU available: False\n",
"- PyTorch MLU available: False\n",
"- PyTorch MUSA available: False\n",
"- System RAM: 188.77 GB\n",
"- GPU type: NVIDIA GeForce RTX 3090\n",
"- `Accelerate` default config:\n",
"\t- compute_environment: LOCAL_MACHINE\n",
"\t- distributed_type: MULTI_GPU\n",
"\t- mixed_precision: no\n",
"\t- use_cpu: False\n",
"\t- debug: False\n",
"\t- num_processes: 2\n",
"\t- machine_rank: 0\n",
"\t- num_machines: 1\n",
"\t- rdzv_backend: static\n",
"\t- same_network: False\n",
"\t- main_training_function: main\n",
"\t- enable_cpu_affinity: False\n",
"\t- downcast_bf16: False\n",
"\t- tpu_use_cluster: False\n",
"\t- tpu_use_sudo: False\n"
]
}
],
"source": [
"# double-check your environment and setup\n",
"!accelerate env"
]
},
{
"cell_type": "markdown",
"id": "981510c3",
"metadata": {},
"source": [
"### 5. CUDA Status"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "69b34a1f-1d3a-4bfd-b7f8-c5360bbd666b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"# https://github.com/huggingface/accelerate/issues/940\n",
"torch.cuda.is_initialized() # must be FALSE"
]
},
{
"cell_type": "markdown",
"id": "3a765a8e",
"metadata": {},
"source": [
"If CUDA has already been initialized, you'll get an error when starting the `notebook_launcher`.\n",
"\n",
"If you got `True` in the cell above, **RESTART THE NOTEBOOK** and **SKIP DIRECTLY TO STEP 3!**"
]
},
{
"cell_type": "markdown",
"id": "5767d2e6",
"metadata": {},
"source": [
"### 6. Launch\n",
"\n",
"Even if you got `False` in the cell above, if you **still get an error when starting the `notebook_launcher`**, **RESTART THE NOTEBOOK ONCE AGAIN** and **SKIP DIRECTLY TO STEP 3!**\n",
"\n",
"If everything goes well, you should see the following messages after a few seconds (or maybe a minute), one set per process:\n",
"\n",
"```\n",
"Launching training on 2 GPUs.\n",
"\n",
"n_gpu: 1; Mode: ParallelMode.DISTRIBUTED\n",
"Num Processes: 2; Device: cuda:0; Process Index: 0\n",
"Accel Type: MULTI_GPU\n",
"n_gpu: 1; Mode: ParallelMode.DISTRIBUTED\n",
"Num Processes: 2; Device: cuda:1; Process Index: 1\n",
"Accel Type: MULTI_GPU\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a4c5d921",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Launching training on 2 GPUs.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/micromamba/envs/python_310/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a processing_class with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `processing_class.padding_side = 'right'` to your code.\n",
" warnings.warn(\n",
"/opt/micromamba/envs/python_310/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a processing_class with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `processing_class.padding_side = 'right'` to your code.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"n_gpu: 1; Mode: ParallelMode.DISTRIBUTED\n",
"Num Processes: 2; Device: cuda:0; Process Index: 0\n",
"Accel Type: MULTI_GPU\n",
"n_gpu: 1; Mode: ParallelMode.DISTRIBUTED\n",
"Num Processes: 2; Device: cuda:1; Process Index: 1\n",
"Accel Type: MULTI_GPU\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.\n",
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.\n",
"[rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n",
"[rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='110' max='110' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [110/110 03:09, Epoch 10/10]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Step</th>\n",
" <th>Training Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>3.021900</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>1.773200</td>\n",
" </tr>\n",
" <tr>\n",
" <td>30</td>\n",
" <td>1.524300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>40</td>\n",
" <td>1.398000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50</td>\n",
" <td>1.292600</td>\n",
" </tr>\n",
" <tr>\n",
" <td>60</td>\n",
" <td>1.151000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>70</td>\n",
" <td>0.988500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>80</td>\n",
" <td>0.802800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>90</td>\n",
" <td>0.629700</td>\n",
" </tr>\n",
" <tr>\n",
" <td>100</td>\n",
" <td>0.514300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>110</td>\n",
" <td>0.438000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='110' max='110' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [110/110 03:09, Epoch 10/10]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Step</th>\n",
" <th>Training Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>3.021900</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>1.773200</td>\n",
" </tr>\n",
" <tr>\n",
" <td>30</td>\n",
" <td>1.524300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>40</td>\n",
" <td>1.398000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50</td>\n",
" <td>1.292600</td>\n",
" </tr>\n",
" <tr>\n",
" <td>60</td>\n",
" <td>1.151000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>70</td>\n",
" <td>0.988500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>80</td>\n",
" <td>0.802800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>90</td>\n",
" <td>0.629700</td>\n",
" </tr>\n",
" <tr>\n",
" <td>100</td>\n",
" <td>0.514300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>110</td>\n",
" <td>0.438000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2025-02-03 11:07:52,626] torch.distributed.elastic.multiprocessing.api: [WARNING] Closing process 1701 via signal SIGTERM\n"
]
}
],
"source": [
"from accelerate import notebook_launcher\n",
"\n",
"args = (repo_id, tokenizer, dataset)\n",
"notebook_launcher(main, args, num_processes=2)"
]
},
{
"cell_type": "markdown",
"id": "ffa75830",
"metadata": {},
"source": [
"### 7. Reloading and Querying the Model"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9085e30a-8254-4457-be08-dc702114e8c3",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b2c7edfa012549daa6b6a247b296a101",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from peft import AutoPeftModelForCausalLM\n",
"\n",
"# Loads the base model and attaches the trained adapter on top of it\n",
"model = AutoPeftModelForCausalLM.from_pretrained('./phi3-mini-yoda-adapter', device_map='auto')"
]
},
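{
"cell_type": "markdown",
"id": "merge-note",
"metadata": {},
"source": [
"Note that the cell above does not merge anything: the LoRA weights remain separate from the base model's. If you want a single set of standalone weights, PEFT can fold the adapter into the base model. A minimal sketch (the output path is just an example):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "merge-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Folds the LoRA weights into the base model and drops the adapter wrappers\n",
"merged_model = model.merge_and_unload()\n",
"\n",
"# Optionally save the merged weights [example path]\n",
"# merged_model.save_pretrained('./phi3-mini-yoda-merged')"
]
},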
{
"cell_type": "code",
"execution_count": 7,
"id": "d807c856-279c-48bf-86da-42e6896c70a6",
"metadata": {},
"outputs": [],
"source": [
"def gen_prompt(tokenizer, sentence):\n",
"    converted_sample = [\n",
"        {\"role\": \"user\", \"content\": sentence},\n",
"    ]\n",
"    prompt = tokenizer.apply_chat_template(converted_sample,\n",
"                                           tokenize=False,\n",
"                                           add_generation_prompt=True)\n",
"    return prompt"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9b5d9228-5043-45b6-8e59-8b76027764b7",
"metadata": {},
"outputs": [],
"source": [
"def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):\n",
"    tokenized_input = tokenizer(prompt, add_special_tokens=False, return_tensors=\"pt\").to(model.device)\n",
"\n",
"    model.eval()\n",
"    generation_output = model.generate(**tokenized_input,\n",
"                                       eos_token_id=tokenizer.eos_token_id,\n",
"                                       max_new_tokens=max_new_tokens)\n",
"\n",
"    output = tokenizer.batch_decode(generation_output,\n",
"                                    skip_special_tokens=skip_special_tokens)\n",
"    return output[0]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c7312a38-a56b-4cbe-80d6-52f29e0dbd15",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<|user|>\n",
"There is bacon in this sandwich.<|end|>\n",
"<|assistant|>\n",
"\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"sentence = 'There is bacon in this sandwich.'\n",
"tokenizer = AutoTokenizer.from_pretrained('./phi3-mini-yoda-adapter')\n",
"prompt = gen_prompt(tokenizer, sentence)\n",
"print(prompt)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cff0b6ad-9638-47e4-b7fd-0bf37eaff078",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<|user|> There is bacon in this sandwich.<|end|><|assistant|> In this sandwich, bacon there is.<|end|><|endoftext|>\n"
]
}
],
"source": [
"print(generate(model, tokenizer, prompt))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
} |