{
"cells": [
{
"cell_type": "markdown",
"id": "5ba52b5a-e51c-43e2-b3c6-b6bbabd5d720",
"metadata": {},
"source": [
"# INIT"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1696c4b-327b-4523-9d04-8415291281b0",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!pip install transformers torch"
]
},
{
"cell_type": "code",
"execution_count": 258,
"id": "1e4ee1ef-3cdf-4c91-a43b-b40a910e7279",
"metadata": {},
"outputs": [],
"source": [
"# Load model directly\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, Qwen2Config"
]
},
{
"cell_type": "code",
"execution_count": 260,
"id": "f3df2370-fa5f-4e63-922e-8bc7402a4859",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2-0.5B\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen2-0.5B\")\n",
"new_model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen2-0.5B\")"
]
},
{
"cell_type": "code",
"execution_count": 261,
"id": "7705bd5f-5a8c-4139-88d1-1e82d636d832",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model:\n",
"Qwen2ForCausalLM(\n",
"  (model): Qwen2Model(\n",
"    (embed_tokens): Embedding(151936, 896)\n",
"    (layers): ModuleList(\n",
"      (0-23): 24 x Qwen2DecoderLayer(\n",
"        (self_attn): Qwen2SdpaAttention(\n",
"          (q_proj): Linear(in_features=896, out_features=896, bias=True)\n",
"          (k_proj): Linear(in_features=896, out_features=128, bias=True)\n",
"          (v_proj): Linear(in_features=896, out_features=128, bias=True)\n",
"          (o_proj): Linear(in_features=896, out_features=896, bias=False)\n",
"          (rotary_emb): Qwen2RotaryEmbedding()\n",
"        )\n",
"        (mlp): Qwen2MLP(\n",
"          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
"          (up_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
"          (down_proj): Linear(in_features=4864, out_features=896, bias=False)\n",
"          (act_fn): SiLU()\n",
"        )\n",
"        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
"        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
"      )\n",
"    )\n",
"    (norm): Qwen2RMSNorm((896,), eps=1e-06)\n",
"  )\n",
"  (lm_head): Linear(in_features=896, out_features=151936, bias=False)\n",
")\n",
"\n",
"\n",
"Config:\n",
"Qwen2Config {\n",
"  \"_name_or_path\": \"Qwen/Qwen2-0.5B\",\n",
"  \"architectures\": [\n",
"    \"Qwen2ForCausalLM\"\n",
"  ],\n",
"  \"attention_dropout\": 0.0,\n",
"  \"bos_token_id\": 151643,\n",
"  \"eos_token_id\": 151643,\n",
"  \"hidden_act\": \"silu\",\n",
"  \"hidden_size\": 896,\n",
"  \"initializer_range\": 0.02,\n",
"  \"intermediate_size\": 4864,\n",
"  \"max_position_embeddings\": 131072,\n",
"  \"max_window_layers\": 24,\n",
"  \"model_type\": \"qwen2\",\n",
"  \"num_attention_heads\": 14,\n",
"  \"num_hidden_layers\": 24,\n",
"  \"num_key_value_heads\": 2,\n",
"  \"rms_norm_eps\": 1e-06,\n",
"  \"rope_theta\": 1000000.0,\n",
"  \"sliding_window\": null,\n",
"  \"tie_word_embeddings\": true,\n",
"  \"torch_dtype\": \"bfloat16\",\n",
"  \"transformers_version\": \"4.44.0\",\n",
"  \"use_cache\": true,\n",
"  \"use_sliding_window\": false,\n",
"  \"vocab_size\": 151936\n",
"}\n",
"\n"
]
}
],
"source": [
"print(\"Model:\\n\" + str(model))\n",
"print(\"\\n\\nConfig:\\n\" + str(model.config))"
]
},
{
"cell_type": "markdown",
"id": "47f47a98-e8e1-4e29-87be-5336e8764841",
"metadata": {},
"source": [
"# Config"
]
},
{
"cell_type": "code",
"execution_count": 262,
"id": "b82fae4e-23ce-4270-8ea0-ddbf0ab4460e",
"metadata": {},
"outputs": [],
"source": [
"# Declare the total numer of layers to keep (including the last)\n", | |
"\n", | |
"n_total_layers = 20" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "7938babc-f2a2-4529-b247-77a372ffb27f", | |
"metadata": {}, | |
"source": [ | |
"> This following cell is not going to work because the configuration is used only in the instantiation of the model and thier layers" | |
]
},
{
"cell_type": "code",
"execution_count": 263,
"id": "c2fe3b3f-fce3-4fae-9315-e2e52f54f72b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Param: \tMODEL \t\tNEW_MODEL\n",
"\n",
"\n",
"vocab_size \t{'vocab_size\t\t{'vocab_size':\n",
"max_position_embeddings \t32768 \t\t32768\n",
"hidden_size \t4096 \t\t4096\n",
"intermediate_size \t22016 \t\t22016\n",
"num_hidden_layers \t32 --->\t20\n",
"num_attention_heads \t32 \t\t32\n",
"use_sliding_window \tFalse \t\tFalse\n",
"sliding_window \tNone \t\tNone\n",
"max_window_layers \t28 --->\t20\n",
"num_key_value_heads \t32 \t\t32\n",
"hidden_act \tsilu \t\tsilu\n",
"initializer_range \t0.02 \t\t0.02\n",
"rms_norm_eps \t1e-06 \t\t1e-06\n",
"use_cache \tTrue \t\tTrue\n",
"rope_theta \t10000.0 \t\t10000.0\n",
"attention_dropout \t0.0 \t\t0.0\n",
"return_dict \tTrue \t\tTrue\n",
"output_hidden_states \tFalse \t\tFalse\n",
"output_attentions \tFalse \t\tFalse\n",
"torchscript \tFalse \t\tFalse\n",
"torch_dtype \tNone \t\tNone\n",
"use_bfloat16 \tFalse \t\tFalse\n",
"tf_legacy_loss \tFalse \t\tFalse\n",
"pruned_heads \t{} \t\t{}\n",
"tie_word_embeddings \tFalse \t\tFalse\n",
"chunk_size_feed_forward \t0 \t\t0\n",
"is_encoder_decoder \tFalse \t\tFalse\n",
"is_decoder \tFalse \t\tFalse\n",
"cross_attention_hidden_size \tNone \t\tNone\n",
"add_cross_attention \tFalse \t\tFalse\n",
"tie_encoder_decoder \tFalse \t\tFalse\n",
"max_length \t20 \t\t20\n",
"min_length \t0 \t\t0\n",
"do_sample \tFalse \t\tFalse\n",
"early_stopping \tFalse \t\tFalse\n",
"num_beams \t1 \t\t1\n",
"num_beam_groups \t1 \t\t1\n",
"diversity_penalty \t0.0 \t\t0.0\n",
"temperature \t1.0 \t\t1.0\n",
"top_k \t50 \t\t50\n",
"top_p \t1.0 \t\t1.0\n",
"typical_p \t1.0 \t\t1.0\n",
"repetition_penalty \t1.0 \t\t1.0\n",
"length_penalty \t1.0 \t\t1.0\n",
"no_repeat_ngram_size \t0 \t\t0\n",
"encoder_no_repeat_ngram_size \t0 \t\t0\n",
"bad_words_ids \tNone \t\tNone\n",
"num_return_sequences \t1 \t\t1\n",
"output_scores \tFalse \t\tFalse\n",
"return_dict_in_generate \tFalse \t\tFalse\n",
"forced_bos_token_id \tNone \t\tNone\n",
"forced_eos_token_id \tNone \t\tNone\n",
"remove_invalid_values \tFalse \t\tFalse\n",
"exponential_decay_length_penalty \tNone \t\tNone\n",
"suppress_tokens \tNone \t\tNone\n",
"begin_suppress_tokens \tNone \t\tNone\n",
"architectures \tNone \t\tNone\n",
"finetuning_task \tNone \t\tNone\n",
"id2label \t{0: 'LABEL_0\t\t{0: 'LABEL_0',\n",
"label2id \t{'LABEL_0': \t\t{'LABEL_0': 0,\n",
"tokenizer_class \tNone \t\tNone\n",
"prefix \tNone \t\tNone\n",
"bos_token_id \tNone \t\tNone\n",
"pad_token_id \tNone \t\tNone\n",
"eos_token_id \tNone \t\tNone\n",
"sep_token_id \tNone \t\tNone\n",
"decoder_start_token_id \tNone \t\tNone\n",
"task_specific_params \tNone \t\tNone\n",
"problem_type \tNone \t\tNone\n",
"_name_or_path \t \t\t\n",
"transformers_version \t4.44.0 \t\t4.44.0\n",
"model_type \tqwen2 \t\tqwen2\n"
]
}
],
"source": [
"old_config = Qwen2Config(model.config.to_dict())\n", | |
"new_config = Qwen2Config(model.config.to_dict())\n", | |
"new_config.update(\n", | |
" {\n", | |
" \"num_hidden_layers\":n_total_layers,\n", | |
" \"max_window_layers\":n_total_layers,\n", | |
" }\n", | |
")\n", | |
"\n", | |
"# apply config on new model\n", | |
"new_model.config = new_config\n", | |
"\n", | |
"# reporting with a header\n", | |
"print(\"\\nParam:\" + \" \"*29 + \"\\t\" + \"MODEL\" + \" \"*3 + \"\\t\\t\" + \"NEW_MODEL\\n\\n\")\n", | |
"_ = [print(param + \" \"*(35-len(param)) + \"\\t\" + str(old)[:12] + \" \"*(15-len(str(old))) + (\"--->\" if old!=new else \"\\t\") + \"\\t\" + str(new)[:14]) for param,old,new in zip(dict(old_config.to_dict()).keys(), dict(old_config.to_dict()).values(), dict(new_config.to_dict()).values())]" | |
] | |
}, | |
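{
"cell_type": "markdown",
"id": "0c1d2e3f-9a8b-4c7d-8e6f-aabbccdd0001",
"metadata": {},
"source": [
"> Added sanity check (a sketch, not part of the original run): reassigning `new_model.config` only swaps the config object. The 24 decoder layers that `from_pretrained` instantiated are untouched, which the assertion below makes explicit."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d2e3f4a-8b7c-4d6e-9f5a-aabbccdd0002",
"metadata": {},
"outputs": [],
"source": [
"# The config now claims 20 layers, but the module tree still holds 24:\n",
"# the configuration is only read when the layers are instantiated.\n",
"print(\"config num_hidden_layers:\", new_model.config.num_hidden_layers)\n",
"print(\"layers actually present: \", len(new_model.model.layers))\n",
"assert len(new_model.model.layers) == 24"
]
},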
{
"cell_type": "markdown",
"id": "c77a8f2b-ebeb-440d-abb2-1c785ab543ea",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Simple test"
]
},
{
"cell_type": "code",
"execution_count": 207,
"id": "6094fade-e963-4a6c-b6e2-8f3ab38ce411",
"metadata": {},
"outputs": [],
"source": [
"inputs = tokenizer('''\n",
"<|im_start|>user Hola<|im_end|>\n",
"<|im_start|>assistant ''', return_tensors=\"pt\", return_attention_mask=False)"
]
},
{
"cell_type": "code",
"execution_count": 208,
"id": "4a3b5522-831b-491b-bf1f-5a157a55e10c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"<|im_start|>user Hola<|im_end|>\n",
"<|im_start|>assistant 1. The first sentence of the first paragraph is a title, so it is not a proper title. The second sentence is a subtitle, so it is not a proper subtitle. The third sentence is a paragraph, so it is a proper paragraph.\n",
"9.49 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -r1 -n1\n",
"outputs = model.generate(**inputs, max_new_tokens=50)\n",
"text = tokenizer.batch_decode(outputs)[0]\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"id": "981396e4-f91c-43fe-b7bf-8ee14b9f184a",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"# Layer decomposition/recomposition"
]
},
{
"cell_type": "code",
"execution_count": 264,
"id": "508c36ec-016b-4905-801c-4c9026bf2aed",
"metadata": {},
"outputs": [],
"source": [
"old_layers = model.model.layers"
]
},
{
"cell_type": "code",
"execution_count": 265,
"id": "265fdc05-6164-492c-9669-d240d5598a29",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ModuleList(\n",
"  (0-23): 24 x Qwen2DecoderLayer(\n",
"    (self_attn): Qwen2SdpaAttention(\n",
"      (q_proj): Linear(in_features=896, out_features=896, bias=True)\n",
"      (k_proj): Linear(in_features=896, out_features=128, bias=True)\n",
"      (v_proj): Linear(in_features=896, out_features=128, bias=True)\n",
"      (o_proj): Linear(in_features=896, out_features=896, bias=False)\n",
"      (rotary_emb): Qwen2RotaryEmbedding()\n",
"    )\n",
"    (mlp): Qwen2MLP(\n",
"      (gate_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
"      (up_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
"      (down_proj): Linear(in_features=4864, out_features=896, bias=False)\n",
"      (act_fn): SiLU()\n",
"    )\n",
"    (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
"    (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
"  )\n",
")"
]
},
"execution_count": 265,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"old_layers"
]
},
{
"cell_type": "markdown",
"id": "76c30af4-d102-47da-b058-8249c7bed767",
"metadata": {},
"source": [
"### Create a `ModuleList`\n", | |
"> ###### We created `ModuleList` with the first `n` layers and also the last one" | |
]
},
{
"cell_type": "code",
"execution_count": 266,
"id": "000d8119-06af-4493-9334-4a12c3ef6546",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '23']\n"
]
}
],
"source": [
"new_layers = torch.nn.ModuleList(\n",
"    old_layers[:n_total_layers-1] + \n",
"    [\n",
"        old_layers[-1]\n",
"    ]\n",
")\n",
"print([str(l.self_attn.layer_idx) for l in new_layers])"
]
},
{
"cell_type": "code",
"execution_count": 216,
"id": "fdd95432-3de9-42bd-b046-2adbf35f1b01",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '23']\n"
]
}
],
"source": [
"# TEST: to not use the final layer, only the first ones found\n", | |
"new_layers = torch.nn.ModuleList(\n", | |
" old_layers[:n_total_layers]\n", | |
")\n", | |
"print([str(l.self_attn.layer_idx)for l in new_layers])" | |
] | |
}, | |
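{
"cell_type": "markdown",
"id": "2e3f4a5b-7c6d-4e5f-8a4b-aabbccdd0003",
"metadata": {},
"source": [
"> Added note (a sketch): slicing `old_layers` keeps references to the very same `Qwen2DecoderLayer` objects, so the index reassignment in the next section also mutates `model`. To leave `model` intact, deep-copy the kept layers first, at the cost of duplicating their weights in memory; `detached_layers` below is a hypothetical alternative that the rest of the notebook does not use."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f4a5b6c-6d5e-4f4a-9b3c-aabbccdd0004",
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"\n",
"# The sliced ModuleList shares module objects with the original model:\n",
"print(new_layers[0] is old_layers[0])  # -> True\n",
"\n",
"# Hypothetical alternative that leaves `model` untouched (costs memory):\n",
"detached_layers = torch.nn.ModuleList(copy.deepcopy(l) for l in new_layers)\n",
"print(detached_layers[0] is old_layers[0])  # -> False"
]
},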
{
"cell_type": "markdown",
"id": "4251e8dd-de8c-445a-a7cb-43d115548903",
"metadata": {},
"source": [
"### Reassign the layer indices"
]
},
{
"cell_type": "code",
"execution_count": 267,
"id": "06e88779-7905-4264-ab2c-a3eb0296b021",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19']\n"
]
}
],
"source": [
"for i, layer in enumerate(new_layers):\n",
"    layer.layer_idx = layer.self_attn.layer_idx = i\n",
"\n",
"print([str(l.self_attn.layer_idx) for l in new_layers])"
]
},
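{
"cell_type": "markdown",
"id": "4a5b6c7d-5e4f-4a3b-8c2d-aabbccdd0005",
"metadata": {},
"source": [
"> Added verification (a sketch): after the reassignment the indices should run contiguously from `0` to `n_total_layers - 1`, and the kept layers should still point at the original weight tensors, i.e. the slicing copied no parameters."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b6c7d8e-4f3a-4b2c-9d1e-aabbccdd0006",
"metadata": {},
"outputs": [],
"source": [
"# Contiguous indices, and the first kept layer's gate_proj weight is the\n",
"# same storage as in the original model (no copy was made).\n",
"assert [l.self_attn.layer_idx for l in new_layers] == list(range(n_total_layers))\n",
"assert (new_layers[0].mlp.gate_proj.weight.data_ptr()\n",
"        == old_layers[0].mlp.gate_proj.weight.data_ptr())"
]
},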
{
"cell_type": "markdown",
"id": "4380a691-08cf-47bd-bce9-da18b0bbf68f",
"metadata": {},
"source": [
"# Model Interpolation"
]
},
{
"cell_type": "code",
"execution_count": 268,
"id": "f614cc02-c326-4663-8fb4-a72a87c192af",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2Config {\n",
"  \"attention_dropout\": 0.0,\n",
"  \"hidden_act\": \"silu\",\n",
"  \"hidden_size\": 4096,\n",
"  \"initializer_range\": 0.02,\n",
"  \"intermediate_size\": 22016,\n",
"  \"max_position_embeddings\": 32768,\n",
"  \"max_window_layers\": 20,\n",
"  \"model_type\": \"qwen2\",\n",
"  \"num_attention_heads\": 32,\n",
"  \"num_hidden_layers\": 20,\n",
"  \"num_key_value_heads\": 32,\n",
"  \"rms_norm_eps\": 1e-06,\n",
"  \"rope_theta\": 10000.0,\n",
"  \"sliding_window\": null,\n",
"  \"tie_word_embeddings\": false,\n",
"  \"transformers_version\": \"4.44.0\",\n",
"  \"use_cache\": true,\n",
"  \"use_sliding_window\": false,\n",
"  \"vocab_size\": {\n",
"    \"_name_or_path\": \"Qwen/Qwen2-0.5B\",\n",
"    \"add_cross_attention\": false,\n",
"    \"architectures\": [\n",
"      \"Qwen2ForCausalLM\"\n",
"    ],\n",
"    \"attention_dropout\": 0.0,\n",
"    \"bad_words_ids\": null,\n",
"    \"begin_suppress_tokens\": null,\n",
"    \"bos_token_id\": 151643,\n",
"    \"chunk_size_feed_forward\": 0,\n",
"    \"cross_attention_hidden_size\": null,\n",
"    \"decoder_start_token_id\": null,\n",
"    \"diversity_penalty\": 0.0,\n",
"    \"do_sample\": false,\n",
"    \"early_stopping\": false,\n",
"    \"encoder_no_repeat_ngram_size\": 0,\n",
"    \"eos_token_id\": 151643,\n",
"    \"exponential_decay_length_penalty\": null,\n",
"    \"finetuning_task\": null,\n",
"    \"forced_bos_token_id\": null,\n",
"    \"forced_eos_token_id\": null,\n",
"    \"hidden_act\": \"silu\",\n",
"    \"hidden_size\": 896,\n",
"    \"id2label\": {\n",
"      \"0\": \"LABEL_0\",\n",
"      \"1\": \"LABEL_1\"\n",
"    },\n",
"    \"initializer_range\": 0.02,\n",
"    \"intermediate_size\": 4864,\n",
"    \"is_decoder\": false,\n",
"    \"is_encoder_decoder\": false,\n",
"    \"label2id\": {\n",
"      \"LABEL_0\": 0,\n",
"      \"LABEL_1\": 1\n",
"    },\n",
"    \"length_penalty\": 1.0,\n",
"    \"max_length\": 20,\n",
"    \"max_position_embeddings\": 131072,\n",
"    \"max_window_layers\": 24,\n",
"    \"min_length\": 0,\n",
"    \"model_type\": \"qwen2\",\n",
"    \"no_repeat_ngram_size\": 0,\n",
"    \"num_attention_heads\": 14,\n",
"    \"num_beam_groups\": 1,\n",
"    \"num_beams\": 1,\n",
"    \"num_hidden_layers\": 24,\n",
"    \"num_key_value_heads\": 2,\n",
"    \"num_return_sequences\": 1,\n",
"    \"output_attentions\": false,\n",
"    \"output_hidden_states\": false,\n",
"    \"output_scores\": false,\n",
"    \"pad_token_id\": null,\n",
"    \"prefix\": null,\n",
"    \"problem_type\": null,\n",
"    \"pruned_heads\": {},\n",
"    \"remove_invalid_values\": false,\n",
"    \"repetition_penalty\": 1.0,\n",
"    \"return_dict\": true,\n",
"    \"return_dict_in_generate\": false,\n",
"    \"rms_norm_eps\": 1e-06,\n",
"    \"rope_theta\": 1000000.0,\n",
"    \"sep_token_id\": null,\n",
"    \"sliding_window\": null,\n",
"    \"suppress_tokens\": null,\n",
"    \"task_specific_params\": null,\n",
"    \"temperature\": 1.0,\n",
"    \"tf_legacy_loss\": false,\n",
"    \"tie_encoder_decoder\": false,\n",
"    \"tie_word_embeddings\": true,\n",
"    \"tokenizer_class\": null,\n",
"    \"top_k\": 50,\n",
"    \"top_p\": 1.0,\n",
"    \"torch_dtype\": \"bfloat16\",\n",
"    \"torchscript\": false,\n",
"    \"transformers_version\": \"4.44.0\",\n",
"    \"typical_p\": 1.0,\n",
"    \"use_bfloat16\": false,\n",
"    \"use_cache\": true,\n",
"    \"use_sliding_window\": false,\n",
"    \"vocab_size\": 151936\n",
"  }\n",
"}"
]
},
"execution_count": 268,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_model.config"
]
},
{
"cell_type": "code",
"execution_count": 269,
"id": "ce4e50df-fa65-4f94-9212-29d86acc4862",
"metadata": {},
"outputs": [],
"source": [
"new_model.model.layers = new_layers"
]
},
{
"cell_type": "code",
"execution_count": 270,
"id": "b3c58c14-7b51-4a6c-8f98-a36cfd9bca55",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2Model(\n",
"  (embed_tokens): Embedding(151936, 896)\n",
"  (layers): ModuleList(\n",
"    (0-19): 20 x Qwen2DecoderLayer(\n",
"      (self_attn): Qwen2SdpaAttention(\n",
"        (q_proj): Linear(in_features=896, out_features=896, bias=True)\n",
"        (k_proj): Linear(in_features=896, out_features=128, bias=True)\n",
"        (v_proj): Linear(in_features=896, out_features=128, bias=True)\n",
"        (o_proj): Linear(in_features=896, out_features=896, bias=False)\n",
"        (rotary_emb): Qwen2RotaryEmbedding()\n",
"      )\n",
"      (mlp): Qwen2MLP(\n",
"        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
"        (up_proj): Linear(in_features=896, out_features=4864, bias=False)\n",
"        (down_proj): Linear(in_features=4864, out_features=896, bias=False)\n",
"        (act_fn): SiLU()\n",
"      )\n",
"      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
"      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)\n",
"    )\n",
"  )\n",
"  (norm): Qwen2RMSNorm((896,), eps=1e-06)\n",
")"
]
},
"execution_count": 270,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_model.model"
]
},
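{
"cell_type": "markdown",
"id": "6c7d8e9f-3a2b-4c1d-8e0f-aabbccdd0007",
"metadata": {},
"source": [
"> Added size check (a sketch): with 4 of the 24 decoder layers dropped, the pruned model should lose roughly the parameters of those 4 layers, while the embedding table (tied to `lm_head`) is kept in full."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d8e9f0a-2b1c-4d0e-9f8a-aabbccdd0008",
"metadata": {},
"outputs": [],
"source": [
"# Compare parameter counts before and after pruning.\n",
"n_old = sum(p.numel() for p in model.parameters())\n",
"n_new = sum(p.numel() for p in new_model.parameters())\n",
"print(f\"original: {n_old/1e6:.1f}M parameters\")\n",
"print(f\"pruned:   {n_new/1e6:.1f}M parameters ({n_new/n_old:.1%} of original)\")"
]
},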
{
"cell_type": "markdown",
"id": "cf3b611e-5973-492e-9f12-923dc62a7b0c",
"metadata": {},
"source": [
"# Final testing"
]
},
{
"cell_type": "markdown",
"id": "20e81995-b63a-4547-a381-ee35defec98a",
"metadata": {},
"source": [
"#### Generate"
]
},
{
"cell_type": "code",
"execution_count": 274,
"id": "9f9c5e19-c90f-4ae6-aac7-5c68b70f9286",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"9.55 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -r1 -n1\n",
"outputs = model.generate(**inputs, max_new_tokens=50)"
]
},
{
"cell_type": "code",
"execution_count": 272,
"id": "e6059909-2993-46ea-ad2b-04cf3b95379b",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"8.36 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%%timeit -r1 -n1\n",
"outputs = new_model.generate(**inputs, max_new_tokens=50)"
]
},
{
"cell_type": "markdown",
"id": "13b39e56-c1a5-4deb-9221-8dfa20fdec3f",
"metadata": {},
"source": [
"#### Decode the answer"
]
},
{
"cell_type": "code",
"execution_count": 275,
"id": "b5f96806-6ec0-4f74-b78d-7b8bf212c04d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"<|im_start|>user Hola, te llamas Juana<|im_end|>\n",
"<|im_start|>assistant 1. Hola, te llamas Juana\n"
]
}
],
"source": [ | |
"text = tokenizer.batch_decode(outputs)[0]\n", | |
"print(text) 1. Hola, te llamas Juana" | |
] | |
}, | |
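{
"cell_type": "markdown",
"id": "8e9f0a1b-1c0d-4e9f-8a7b-aabbccdd0009",
"metadata": {},
"source": [
"> Added persistence sketch: if the pruned model is worth keeping, the usual `save_pretrained` round trip should work now that the config and the module tree agree on 20 layers. The output path is illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f0a1b2c-0d9e-4f8a-9b6c-aabbccdd000a",
"metadata": {},
"outputs": [],
"source": [
"# Persist the pruned model and tokenizer (path is an example).\n",
"new_model.save_pretrained(\"./qwen2-0.5b-pruned-20L\")\n",
"tokenizer.save_pretrained(\"./qwen2-0.5b-pruned-20L\")"
]
},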
{
"cell_type": "markdown",
"id": "aecdb247-ff40-44aa-abb6-9013c7425955",
"metadata": {},
"source": [
"# CLEANUP"
]
},
{
"cell_type": "code",
"execution_count": 276,
"id": "867af9c0-0ec8-42b0-8893-994afaf3f324",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Successfuly deleted: \"model\".\n", | |
"Successfuly deleted: \"new_model\".\n", | |
"Successfuly deleted: \"old_config\".\n", | |
"Successfuly deleted: \"new_config\".\n", | |
"Successfuly deleted: \"old_layers\".\n", | |
"Successfuly deleted: \"new_layers\".\n", | |
"Successfuly deleted: \"tokenizer\".\n" | |
] | |
} | |
], | |
"source": [ | |
"var_bin = (\"model\", \"new_model\", \"old_config\", \"new_config\", \"old_layers\", \"new_layers\", \"tokenizer\")\n", | |
"for variable in var_bin:\n", | |
" try:\n", | |
" exec(f'del {variable} ')\n", | |
" print(f'Successfuly deleted: \"{variable}\".')\n", | |
" except NameError:\n", | |
" print(f'The variable \"{variable}\" was not found.')\n", | |
"del var_bin" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |