jrobles98 · August 15, 2024 19:57
diff --git a/Qwen2 ablation working Jupyter NB b/Qwen2 ablation working Jupyter NB
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5ba52b5a-e51c-43e2-b3c6-b6bbabd5d720",
   "metadata": {},
   "source": [
    "# INIT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1696c4b-327b-4523-9d04-8415291281b0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "# transformers is pinned to the version this notebook was executed with.\n",
    "%pip install transformers==4.44.0 torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "id": "1e4ee1ef-3cdf-4c91-a43b-b40a910e7279",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load model directly\n",
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, Qwen2ForCausalLM, Qwen2Model, Qwen2Config\n",
    "import timeit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "id": "f3df2370-fa5f-4e63-922e-8bc7402a4859",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single source of truth for the checkpoint so all three loads stay in sync.\n",
    "MODEL_ID = \"Qwen/Qwen2-0.5B\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
    "# `model` serves as the baseline; `new_model` is a second load of the same\n",
    "# checkpoint whose config and layers are replaced in later cells.\n",
    "model = AutoModelForCausalLM.from_pretrained(MODEL_ID)\n",
    "new_model = AutoModelForCausalLM.from_pretrained(MODEL_ID)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "id": "7705bd5f-5a8c-4139-88d1-1e82d636d832",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model:\n",
      "Qwen2ForCausalLM(\n",
      "  (model): Qwen2Model(\n",
      "    (embed_tokens): Embedding(151936, 896)\n",
      "    (layers): ModuleList(\n",
      "      (0-23): 24 x Qwen2DecoderLayer(\n",
      "        (self_attn): Qwen2SdpaAttention(\n",
      "          (q_proj): Linear(in_features=896, out_features=896, bias=True)\n",
      "          (k_proj): Linear(in_features=896, out_features=128, bias=True)\n",
      "          (v_proj): Linear(in_features=896, out_features=128, bias=True)\n",
      "          (o_proj): Linear(in_features=896, out_features=896, bias=False)\n",
      "          (rotary_emb): Qwen2RotaryEmbedding()\n",
      "        )\n",
      "        (mlp): Qwen2MLP(\n",
      "          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
      "          (up_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
      "          (down_proj): Linear(in_features=4864, out_features=896, bias=False)\n",
      "          (act_fn): SiLU()\n",
      "        )\n",
      "        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
      "        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
      "      )\n",
      "    )\n",
      "    (norm): Qwen2RMSNorm((896,), eps=1e-06)\n",
      "  )\n",
      "  (lm_head): Linear(in_features=896, out_features=151936, bias=False)\n",
      ")\n",
      "\n",
      "\n",
      "Config:\n",
      "Qwen2Config {\n",
      "  \"_name_or_path\": \"Qwen/Qwen2-0.5B\",\n",
      "  \"architectures\": [\n",
      "    \"Qwen2ForCausalLM\"\n",
      "  ],\n",
      "  \"attention_dropout\": 0.0,\n",
      "  \"bos_token_id\": 151643,\n",
      "  \"eos_token_id\": 151643,\n",
      "  \"hidden_act\": \"silu\",\n",
      "  \"hidden_size\": 896,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 4864,\n",
      "  \"max_position_embeddings\": 131072,\n",
      "  \"max_window_layers\": 24,\n",
      "  \"model_type\": \"qwen2\",\n",
      "  \"num_attention_heads\": 14,\n",
      "  \"num_hidden_layers\": 24,\n",
      "  \"num_key_value_heads\": 2,\n",
      "  \"rms_norm_eps\": 1e-06,\n",
      "  \"rope_theta\": 1000000.0,\n",
      "  \"sliding_window\": null,\n",
      "  \"tie_word_embeddings\": true,\n",
      "  \"torch_dtype\": \"bfloat16\",\n",
      "  \"transformers_version\": \"4.44.0\",\n",
      "  \"use_cache\": true,\n",
      "  \"use_sliding_window\": false,\n",
      "  \"vocab_size\": 151936\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Dump the module tree and the configuration of the baseline model.\n",
    "print(f\"Model:\\n{model}\")\n",
    "print(f\"\\n\\nConfig:\\n{model.config}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47f47a98-e8e1-4e29-87be-5336e8764841",
   "metadata": {},
   "source": [
    "# Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "id": "b82fae4e-23ce-4270-8ea0-ddbf0ab4460e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declare the total number of layers to keep (including the last)\n",
    "\n",
    "n_total_layers = 20"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7938babc-f2a2-4529-b247-77a372ffb27f",
   "metadata": {},
   "source": [
    "> The following cell is not going to work, because the configuration is only used when the model and its layers are instantiated."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "id": "c2fe3b3f-fce3-4fae-9315-e2e52f54f72b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Param:                             \tMODEL   \t\tNEW_MODEL\n",
      "\n",
      "\n",
      "vocab_size                         \t{'vocab_size\t\t{'vocab_size':\n",
      "max_position_embeddings            \t32768          \t\t32768\n",
      "hidden_size                        \t4096           \t\t4096\n",
      "intermediate_size                  \t22016          \t\t22016\n",
      "num_hidden_layers                  \t32             --->\t20\n",
      "num_attention_heads                \t32             \t\t32\n",
      "use_sliding_window                 \tFalse          \t\tFalse\n",
      "sliding_window                     \tNone           \t\tNone\n",
      "max_window_layers                  \t28             --->\t20\n",
      "num_key_value_heads                \t32             \t\t32\n",
      "hidden_act                         \tsilu           \t\tsilu\n",
      "initializer_range                  \t0.02           \t\t0.02\n",
      "rms_norm_eps                       \t1e-06          \t\t1e-06\n",
      "use_cache                          \tTrue           \t\tTrue\n",
      "rope_theta                         \t10000.0        \t\t10000.0\n",
      "attention_dropout                  \t0.0            \t\t0.0\n",
      "return_dict                        \tTrue           \t\tTrue\n",
      "output_hidden_states               \tFalse          \t\tFalse\n",
      "output_attentions                  \tFalse          \t\tFalse\n",
      "torchscript                        \tFalse          \t\tFalse\n",
      "torch_dtype                        \tNone           \t\tNone\n",
      "use_bfloat16                       \tFalse          \t\tFalse\n",
      "tf_legacy_loss                     \tFalse          \t\tFalse\n",
      "pruned_heads                       \t{}             \t\t{}\n",
      "tie_word_embeddings                \tFalse          \t\tFalse\n",
      "chunk_size_feed_forward            \t0              \t\t0\n",
      "is_encoder_decoder                 \tFalse          \t\tFalse\n",
      "is_decoder                         \tFalse          \t\tFalse\n",
      "cross_attention_hidden_size        \tNone           \t\tNone\n",
      "add_cross_attention                \tFalse          \t\tFalse\n",
      "tie_encoder_decoder                \tFalse          \t\tFalse\n",
      "max_length                         \t20             \t\t20\n",
      "min_length                         \t0              \t\t0\n",
      "do_sample                          \tFalse          \t\tFalse\n",
      "early_stopping                     \tFalse          \t\tFalse\n",
      "num_beams                          \t1              \t\t1\n",
      "num_beam_groups                    \t1              \t\t1\n",
      "diversity_penalty                  \t0.0            \t\t0.0\n",
      "temperature                        \t1.0            \t\t1.0\n",
      "top_k                              \t50             \t\t50\n",
      "top_p                              \t1.0            \t\t1.0\n",
      "typical_p                          \t1.0            \t\t1.0\n",
      "repetition_penalty                 \t1.0            \t\t1.0\n",
      "length_penalty                     \t1.0            \t\t1.0\n",
      "no_repeat_ngram_size               \t0              \t\t0\n",
      "encoder_no_repeat_ngram_size       \t0              \t\t0\n",
      "bad_words_ids                      \tNone           \t\tNone\n",
      "num_return_sequences               \t1              \t\t1\n",
      "output_scores                      \tFalse          \t\tFalse\n",
      "return_dict_in_generate            \tFalse          \t\tFalse\n",
      "forced_bos_token_id                \tNone           \t\tNone\n",
      "forced_eos_token_id                \tNone           \t\tNone\n",
      "remove_invalid_values              \tFalse          \t\tFalse\n",
      "exponential_decay_length_penalty   \tNone           \t\tNone\n",
      "suppress_tokens                    \tNone           \t\tNone\n",
      "begin_suppress_tokens              \tNone           \t\tNone\n",
      "architectures                      \tNone           \t\tNone\n",
      "finetuning_task                    \tNone           \t\tNone\n",
      "id2label                           \t{0: 'LABEL_0\t\t{0: 'LABEL_0',\n",
      "label2id                           \t{'LABEL_0': \t\t{'LABEL_0': 0,\n",
      "tokenizer_class                    \tNone           \t\tNone\n",
      "prefix                             \tNone           \t\tNone\n",
      "bos_token_id                       \tNone           \t\tNone\n",
      "pad_token_id                       \tNone           \t\tNone\n",
      "eos_token_id                       \tNone           \t\tNone\n",
      "sep_token_id                       \tNone           \t\tNone\n",
      "decoder_start_token_id             \tNone           \t\tNone\n",
      "task_specific_params               \tNone           \t\tNone\n",
      "problem_type                       \tNone           \t\tNone\n",
      "_name_or_path                      \t               \t\t\n",
      "transformers_version               \t4.44.0         \t\t4.44.0\n",
      "model_type                         \tqwen2          \t\tqwen2\n"
     ]
    }
   ],
   "source": [
    "# Rebuild two independent configs from the loaded model's settings.\n",
    "# `from_dict` maps each key to its own config field; passing the dict\n",
    "# positionally binds the whole dict to the first parameter (`vocab_size`)\n",
    "# and leaves every other field at the class default.\n",
    "old_config = Qwen2Config.from_dict(model.config.to_dict())\n",
    "new_config = Qwen2Config.from_dict(model.config.to_dict())\n",
    "new_config.update(\n",
    "    {\n",
    "        \"num_hidden_layers\": n_total_layers,\n",
    "        \"max_window_layers\": n_total_layers,\n",
    "    }\n",
    ")\n",
    "\n",
    "# apply config on new model\n",
    "new_model.config = new_config\n",
    "\n",
    "# Side-by-side report, looked up by key; changed values are flagged with \"--->\".\n",
    "old_params = old_config.to_dict()\n",
    "new_params = new_config.to_dict()\n",
    "print(f\"\\n{'Param:':<35}\\t{'MODEL':<8}\\t\\tNEW_MODEL\\n\\n\")\n",
    "for param, old in old_params.items():\n",
    "    new = new_params.get(param)\n",
    "    marker = \"--->\" if old != new else \"\\t\"\n",
    "    # Truncate and pad with the same width so the columns stay aligned.\n",
    "    print(f\"{param:<35}\\t{str(old)[:12]:<15}{marker}\\t{str(new)[:14]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c77a8f2b-ebeb-440d-abb2-1c785ab543ea",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "# Simple test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "id": "6094fade-e963-4a6c-b6e2-8f3ab38ce411",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the attention mask in the encoding: `generate` warns that results may be\n",
    "# unreliable when the mask is not passed.\n",
    "inputs = tokenizer('''\n",
    "<|im_start|>user Hola<|im_end|>\n",
    "<|im_start|>assistant ''', return_tensors=\"pt\", return_attention_mask=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "id": "4a3b5522-831b-491b-bf1f-5a157a55e10c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "<|im_start|>user Hola<|im_end|>\n",
      "<|im_start|>assistant 1. The first sentence of the first paragraph is a title, so it is not a proper title. The second sentence is a subtitle, so it is not a proper subtitle. The third sentence is a paragraph, so it is a proper paragraph.\n",
      "9.49 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r1 -n1 \n",
    "# Time a single 50-token generation with the baseline model and show the decoded text.\n",
    "generated_ids = model.generate(**inputs, max_new_tokens=50)\n",
    "print(tokenizer.batch_decode(generated_ids)[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "981396e4-f91c-43fe-b7bf-8ee14b9f184a",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "# Layer decomposition/recomposition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "id": "508c36ec-016b-4905-801c-4c9026bf2aed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference (not a copy) to the decoder-layer ModuleList of `model`.\n",
    "old_layers = model.model.layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "id": "265fdc05-6164-492c-9669-d240d5598a29",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ModuleList(\n",
       "  (0-23): 24 x Qwen2DecoderLayer(\n",
       "    (self_attn): Qwen2SdpaAttention(\n",
       "      (q_proj): Linear(in_features=896, out_features=896, bias=True)\n",
       "      (k_proj): Linear(in_features=896, out_features=128, bias=True)\n",
       "      (v_proj): Linear(in_features=896, out_features=128, bias=True)\n",
       "      (o_proj): Linear(in_features=896, out_features=896, bias=False)\n",
       "      (rotary_emb): Qwen2RotaryEmbedding()\n",
       "    )\n",
       "    (mlp): Qwen2MLP(\n",
       "      (gate_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
       "      (up_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
       "      (down_proj): Linear(in_features=4864, out_features=896, bias=False)\n",
       "      (act_fn): SiLU()\n",
       "    )\n",
       "    (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
       "    (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 265,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the layer stack held in `old_layers`.\n",
    "old_layers"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76c30af4-d102-47da-b058-8249c7bed767",
   "metadata": {},
   "source": [
    "### Create a `ModuleList`\n",
    "> ###### We create a `ModuleList` with the first `n_total_layers - 1` layers plus the last one"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "id": "000d8119-06af-4493-9334-4a12c3ef6546",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '23']\n"
     ]
    }
   ],
   "source": [
    "# Take the layers from `new_model` itself rather than from `old_layers`\n",
    "# (which belongs to `model`): the re-indexing cell below mutates the layer\n",
    "# objects, and sharing them would also change the layer indices of `model`.\n",
    "source_layers = new_model.model.layers\n",
    "assert 1 <= n_total_layers <= len(source_layers), \"n_total_layers out of range\"\n",
    "\n",
    "# First n_total_layers - 1 layers, followed by the last layer.\n",
    "new_layers = torch.nn.ModuleList(\n",
    "    [*source_layers[:n_total_layers - 1], source_layers[-1]]\n",
    ")\n",
    "print([str(l.self_attn.layer_idx)for l in new_layers])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "id": "fdd95432-3de9-42bd-b046-2adbf35f1b01",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '23']\n"
     ]
    }
   ],
   "source": [
    "# TEST: to not use the final layer, only the first ones found.\n",
    "# Guarded so that Run All keeps the selection made in the previous cell;\n",
    "# set the flag to True to try this variant instead.\n",
    "USE_ONLY_FIRST_LAYERS = False\n",
    "if USE_ONLY_FIRST_LAYERS:\n",
    "    new_layers = torch.nn.ModuleList(\n",
    "        old_layers[:n_total_layers]\n",
    "    )\n",
    "print([str(l.self_attn.layer_idx)for l in new_layers])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4251e8dd-de8c-445a-a7cb-43d115548903",
   "metadata": {},
   "source": [
    "### Reassign the layer ids and also their config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "id": "06e88779-7905-4264-ab2c-a3eb0296b021",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19']\n"
     ]
    }
   ],
   "source": [
    "# Renumber the kept layers 0..N-1, on both the decoder layer and its attention module.\n",
    "for position, decoder_layer in enumerate(new_layers):\n",
    "    decoder_layer.layer_idx = position\n",
    "    decoder_layer.self_attn.layer_idx = position\n",
    "\n",
    "print([str(kept.self_attn.layer_idx) for kept in new_layers])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4380a691-08cf-47bd-bce9-da18b0bbf68f",
   "metadata": {},
   "source": [
    "# Model Interpolation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 268,
   "id": "f614cc02-c326-4663-8fb4-a72a87c192af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Qwen2Config {\n",
       "  \"attention_dropout\": 0.0,\n",
       "  \"hidden_act\": \"silu\",\n",
       "  \"hidden_size\": 4096,\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 22016,\n",
       "  \"max_position_embeddings\": 32768,\n",
       "  \"max_window_layers\": 20,\n",
       "  \"model_type\": \"qwen2\",\n",
       "  \"num_attention_heads\": 32,\n",
       "  \"num_hidden_layers\": 20,\n",
       "  \"num_key_value_heads\": 32,\n",
       "  \"rms_norm_eps\": 1e-06,\n",
       "  \"rope_theta\": 10000.0,\n",
       "  \"sliding_window\": null,\n",
       "  \"tie_word_embeddings\": false,\n",
       "  \"transformers_version\": \"4.44.0\",\n",
       "  \"use_cache\": true,\n",
       "  \"use_sliding_window\": false,\n",
       "  \"vocab_size\": {\n",
       "    \"_name_or_path\": \"Qwen/Qwen2-0.5B\",\n",
       "    \"add_cross_attention\": false,\n",
       "    \"architectures\": [\n",
       "      \"Qwen2ForCausalLM\"\n",
       "    ],\n",
       "    \"attention_dropout\": 0.0,\n",
       "    \"bad_words_ids\": null,\n",
       "    \"begin_suppress_tokens\": null,\n",
       "    \"bos_token_id\": 151643,\n",
       "    \"chunk_size_feed_forward\": 0,\n",
       "    \"cross_attention_hidden_size\": null,\n",
       "    \"decoder_start_token_id\": null,\n",
       "    \"diversity_penalty\": 0.0,\n",
       "    \"do_sample\": false,\n",
       "    \"early_stopping\": false,\n",
       "    \"encoder_no_repeat_ngram_size\": 0,\n",
       "    \"eos_token_id\": 151643,\n",
       "    \"exponential_decay_length_penalty\": null,\n",
       "    \"finetuning_task\": null,\n",
       "    \"forced_bos_token_id\": null,\n",
       "    \"forced_eos_token_id\": null,\n",
       "    \"hidden_act\": \"silu\",\n",
       "    \"hidden_size\": 896,\n",
       "    \"id2label\": {\n",
       "      \"0\": \"LABEL_0\",\n",
       "      \"1\": \"LABEL_1\"\n",
       "    },\n",
       "    \"initializer_range\": 0.02,\n",
       "    \"intermediate_size\": 4864,\n",
       "    \"is_decoder\": false,\n",
       "    \"is_encoder_decoder\": false,\n",
       "    \"label2id\": {\n",
       "      \"LABEL_0\": 0,\n",
       "      \"LABEL_1\": 1\n",
       "    },\n",
       "    \"length_penalty\": 1.0,\n",
       "    \"max_length\": 20,\n",
       "    \"max_position_embeddings\": 131072,\n",
       "    \"max_window_layers\": 24,\n",
       "    \"min_length\": 0,\n",
       "    \"model_type\": \"qwen2\",\n",
       "    \"no_repeat_ngram_size\": 0,\n",
       "    \"num_attention_heads\": 14,\n",
       "    \"num_beam_groups\": 1,\n",
       "    \"num_beams\": 1,\n",
       "    \"num_hidden_layers\": 24,\n",
       "    \"num_key_value_heads\": 2,\n",
       "    \"num_return_sequences\": 1,\n",
       "    \"output_attentions\": false,\n",
       "    \"output_hidden_states\": false,\n",
       "    \"output_scores\": false,\n",
       "    \"pad_token_id\": null,\n",
       "    \"prefix\": null,\n",
       "    \"problem_type\": null,\n",
       "    \"pruned_heads\": {},\n",
       "    \"remove_invalid_values\": false,\n",
       "    \"repetition_penalty\": 1.0,\n",
       "    \"return_dict\": true,\n",
       "    \"return_dict_in_generate\": false,\n",
       "    \"rms_norm_eps\": 1e-06,\n",
       "    \"rope_theta\": 1000000.0,\n",
       "    \"sep_token_id\": null,\n",
       "    \"sliding_window\": null,\n",
       "    \"suppress_tokens\": null,\n",
       "    \"task_specific_params\": null,\n",
       "    \"temperature\": 1.0,\n",
       "    \"tf_legacy_loss\": false,\n",
       "    \"tie_encoder_decoder\": false,\n",
       "    \"tie_word_embeddings\": true,\n",
       "    \"tokenizer_class\": null,\n",
       "    \"top_k\": 50,\n",
       "    \"top_p\": 1.0,\n",
       "    \"torch_dtype\": \"bfloat16\",\n",
       "    \"torchscript\": false,\n",
       "    \"transformers_version\": \"4.44.0\",\n",
       "    \"typical_p\": 1.0,\n",
       "    \"use_bfloat16\": false,\n",
       "    \"use_cache\": true,\n",
       "    \"use_sliding_window\": false,\n",
       "    \"vocab_size\": 151936\n",
       "  }\n",
       "}"
      ]
     },
     "execution_count": 268,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the config object currently attached to `new_model`.\n",
    "new_model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 269,
   "id": "ce4e50df-fa65-4f94-9212-29d86acc4862",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Swap the trimmed layer stack into `new_model`.\n",
    "new_model.model.layers = new_layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "id": "b3c58c14-7b51-4a6c-8f98-a36cfd9bca55",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Qwen2Model(\n",
       "  (embed_tokens): Embedding(151936, 896)\n",
       "  (layers): ModuleList(\n",
       "    (0-19): 20 x Qwen2DecoderLayer(\n",
       "      (self_attn): Qwen2SdpaAttention(\n",
       "        (q_proj): Linear(in_features=896, out_features=896, bias=True)\n",
       "        (k_proj): Linear(in_features=896, out_features=128, bias=True)\n",
       "        (v_proj): Linear(in_features=896, out_features=128, bias=True)\n",
       "        (o_proj): Linear(in_features=896, out_features=896, bias=False)\n",
       "        (rotary_emb): Qwen2RotaryEmbedding()\n",
       "      )\n",
       "      (mlp): Qwen2MLP(\n",
       "        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
       "        (up_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
       "        (down_proj): Linear(in_features=4864, out_features=896, bias=False)\n",
       "        (act_fn): SiLU()\n",
       "      )\n",
       "      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
       "      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
       "    )\n",
       "  )\n",
       "  (norm): Qwen2RMSNorm((896,), eps=1e-06)\n",
       ")"
      ]
     },
     "execution_count": 270,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the resulting module tree of `new_model`.\n",
    "new_model.model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf3b611e-5973-492e-9f12-923dc62a7b0c",
   "metadata": {},
   "source": [
    "# Final testing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20e81995-b63a-4547-a381-ee35defec98a",
   "metadata": {},
   "source": [
    "#### Generate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 274,
   "id": "9f9c5e19-c90f-4ae6-aac7-5c68b70f9286",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9.55 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# %%time (unlike %%timeit) runs the cell in the notebook namespace, so\n",
    "# `outputs` remains defined after this cell finishes.\n",
    "outputs = model.generate(**inputs, max_new_tokens=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "id": "e6059909-2993-46ea-ad2b-04cf3b95379b",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.36 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# %%time (unlike %%timeit) runs the cell in the notebook namespace, so\n",
    "# `outputs` remains defined after this cell finishes.\n",
    "outputs = new_model.generate(**inputs, max_new_tokens=50)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13b39e56-c1a5-4deb-9221-8dfa20fdec3f",
   "metadata": {},
   "source": [
    "#### decode the answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 275,
   "id": "b5f96806-6ec0-4f74-b78d-7b8bf212c04d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "<|im_start|>user Hola, te llamas Juana<|im_end|>\n",
      "<|im_start|>assistant 1. Hola, te llamas Juana\n"
     ]
    }
   ],
   "source": [
    "# Decode the first generated sequence back to text and show it.\n",
    "text = tokenizer.batch_decode(outputs)[0]\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aecdb247-ff40-44aa-abb6-9013c7425955",
   "metadata": {},
   "source": [
    "# CLEANUP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 276,
   "id": "867af9c0-0ec8-42b0-8893-994afaf3f324",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfuly deleted: \"model\".\n",
      "Successfuly deleted: \"new_model\".\n",
      "Successfuly deleted: \"old_config\".\n",
      "Successfuly deleted: \"new_config\".\n",
      "Successfuly deleted: \"old_layers\".\n",
      "Successfuly deleted: \"new_layers\".\n",
      "Successfuly deleted: \"tokenizer\".\n"
     ]
    }
   ],
   "source": [
    "# Remove the listed names from the notebook namespace.\n",
    "# The namespace dict is edited directly instead of building `del` statements for exec().\n",
    "var_bin = (\"model\", \"new_model\", \"old_config\", \"new_config\", \"old_layers\", \"new_layers\", \"tokenizer\")\n",
    "for variable in var_bin:\n",
    "    if variable in globals():\n",
    "        del globals()[variable]\n",
    "        print(f'Successfuly deleted: \"{variable}\".')\n",
    "    else:\n",
    "        print(f'The variable \"{variable}\" was not found.')\n",
    "del var_bin"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }