
@sekstini
Created July 19, 2023 21:58
LLaMA-2 7b weight comparison between original (bf16) and huggingface (fp16)
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "38224d5b",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import numpy as np\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "df124ad3",
"metadata": {},
"outputs": [],
"source": [
"sd_orig = torch.load(\"consolidated.00.pth\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0f4bc9d1",
"metadata": {},
"outputs": [],
"source": [
"sd_hf = torch.load(\"pytorch_model-00001-of-00002.bin\")"
]
},
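{
"cell_type": "markdown",
"id": "b2c91e77",
"metadata": {},
"source": [
"Quick check of the precisions named in the title (my addition; the key\n",
"names are taken from the mapping below): the original checkpoint should\n",
"be bf16 and the Hugging Face one fp16."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3d04f88",
"metadata": {},
"outputs": [],
"source": [
"print(sd_orig[\"layers.0.attention.wq.weight\"].dtype)          # expected: torch.bfloat16\n",
"print(sd_hf[\"model.layers.0.self_attn.q_proj.weight\"].dtype)  # expected: torch.float16"
]
},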
{
"cell_type": "code",
"execution_count": 4,
"id": "29afe0cf",
"metadata": {},
"outputs": [],
"source": [
"common_keys = sd_orig.keys() & sd_hf.keys()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "995c4063",
"metadata": {},
"outputs": [],
"source": [
"params = {\"dim\": 4096, \"multiple_of\": 256, \"n_heads\": 32, \"n_layers\": 32, \"norm_eps\": 1e-05, \"vocab_size\": -1}\n",
"num_shards = 1\n",
"n_layers = params[\"n_layers\"]\n",
"n_heads = params[\"n_heads\"]\n",
"n_heads_per_shard = n_heads // num_shards\n",
"dim = params[\"dim\"]\n",
"dims_per_head = dim // n_heads\n",
"base = 10000.0\n",
"inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))\n",
"\n",
"if \"n_kv_heads\" in params:\n",
" num_key_value_heads = params[\"n_kv_heads\"] # for GQA / MQA\n",
" num_local_key_value_heads = n_heads_per_shard // num_key_value_heads\n",
" key_value_dim = dim // num_key_value_heads\n",
"else: # compatibility with other checkpoints\n",
" num_key_value_heads = n_heads\n",
" num_local_key_value_heads = n_heads_per_shard\n",
" key_value_dim = dim\n",
"\n",
"def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):\n",
" return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)\n",
"\n",
"sd_orig_hf = {}\n",
"for l in range(n_layers):\n",
" sd_orig_hf.update({\n",
" f\"model.layers.{l}.self_attn.q_proj.weight\": permute(\n",
" sd_orig[f\"layers.{l}.attention.wq.weight\"]\n",
" ),\n",
" f\"model.layers.{l}.self_attn.k_proj.weight\": permute(\n",
" sd_orig[f\"layers.{l}.attention.wk.weight\"]\n",
" ),\n",
" f\"model.layers.{l}.self_attn.v_proj.weight\": sd_orig[f\"layers.{l}.attention.wv.weight\"],\n",
" f\"model.layers.{l}.self_attn.o_proj.weight\": sd_orig[f\"layers.{l}.attention.wo.weight\"],\n",
" f\"model.layers.{l}.mlp.gate_proj.weight\": sd_orig[f\"layers.{l}.feed_forward.w1.weight\"],\n",
" f\"model.layers.{l}.mlp.down_proj.weight\": sd_orig[f\"layers.{l}.feed_forward.w2.weight\"],\n",
" f\"model.layers.{l}.mlp.up_proj.weight\": sd_orig[f\"layers.{l}.feed_forward.w3.weight\"],\n",
" f\"model.layers.{l}.input_layernorm.weight\": sd_orig[f\"layers.{l}.attention_norm.weight\"],\n",
" f\"model.layers.{l}.post_attention_layernorm.weight\": sd_orig[f\"layers.{l}.ffn_norm.weight\"],\n",
" })"
]
},
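{
"cell_type": "markdown",
"id": "d4e5f6a7",
"metadata": {},
"source": [
"Sanity check of `permute` (my own sketch, not part of the conversion\n",
"logic above): it reorders the per-head rotary dimensions, so the\n",
"explicit inverse below should recover the input exactly."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5f6a7b8",
"metadata": {},
"outputs": [],
"source": [
"def unpermute(w, n_heads=n_heads, dim1=dim, dim2=dim):\n",
"    # reverse the per-head (pairs, halves) transpose applied by permute\n",
"    return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)\n",
"\n",
"w = torch.randn(dim, dim)\n",
"assert torch.equal(unpermute(permute(w)), w)"
]
},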
{
"cell_type": "code",
"execution_count": 6,
"id": "0f3e7ccf",
"metadata": {},
"outputs": [],
"source": [
"def sortkey(key: str):\n",
" parts = key.split(\".\")\n",
" parts[2] = int(parts[2])\n",
" return parts\n",
"\n",
"common_keys = list(sd_orig_hf.keys() & sd_hf.keys())\n",
"common_keys.sort(key=sortkey)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4c47312d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(216, 288, 241)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(common_keys), len(sd_orig_hf.keys()), len(sd_hf.keys())"
]
},
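{
"cell_type": "markdown",
"id": "f6a7b8c9",
"metadata": {},
"source": [
"A plausible accounting of these counts (inferred, since the key sets\n",
"themselves are not printed): `sd_orig_hf` holds 32 layers × 9 converted\n",
"tensors = 288; the diff output below only covers layers 0-23, so this HF\n",
"shard appears to contain 24 × 9 = 216 of those keys; the 241 − 216 = 25\n",
"extra HF keys would then be `model.embed_tokens.weight` plus 24 per-layer\n",
"`rotary_emb.inv_freq` buffers, which checkpoints from older transformers\n",
"versions include."
]
},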
{
"cell_type": "code",
"execution_count": 8,
"id": "5d53887c",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"layers.0.input_layernorm :: 4.470348358154297e-08\n",
"layers.0.mlp.down_proj :: 0.0002487435704097152\n",
"layers.0.mlp.gate_proj :: 0.0002498461399227381\n",
"layers.0.mlp.up_proj :: 0.0002581549051683396\n",
"layers.0.post_attention_layernorm :: 0.0\n",
"layers.0.self_attn.k_proj :: 0.00021169817773625255\n",
"layers.0.self_attn.o_proj :: 0.0002980003773700446\n",
"layers.0.self_attn.q_proj :: 0.0002925241133198142\n",
"layers.0.self_attn.v_proj :: 0.0001831730332924053\n",
"layers.1.input_layernorm :: 0.0\n",
"layers.1.mlp.down_proj :: 0.00023984009749256074\n",
"layers.1.mlp.gate_proj :: 0.0002312921133125201\n",
"layers.1.mlp.up_proj :: 0.00024525466142222285\n",
"layers.1.post_attention_layernorm :: 0.0\n",
"layers.1.self_attn.k_proj :: 9.451161895412952e-05\n",
"layers.1.self_attn.o_proj :: 0.0002728034742176533\n",
"layers.1.self_attn.q_proj :: 9.88328829407692e-05\n",
"layers.1.self_attn.v_proj :: 0.00020060034876223654\n",
"layers.2.input_layernorm :: 0.0\n",
"layers.2.mlp.down_proj :: 0.00023379421327263117\n",
"layers.2.mlp.gate_proj :: 0.00022194455959834158\n",
"layers.2.mlp.up_proj :: 0.00023320293985307217\n",
"layers.2.post_attention_layernorm :: 0.0\n",
"layers.2.self_attn.k_proj :: 8.56785336509347e-05\n",
"layers.2.self_attn.o_proj :: 0.00011266511864960194\n",
"layers.2.self_attn.q_proj :: 7.946790719870478e-05\n",
"layers.2.self_attn.v_proj :: 0.00010692673095036298\n",
"layers.3.input_layernorm :: 0.0\n",
"layers.3.mlp.down_proj :: 0.0002337423647986725\n",
"layers.3.mlp.gate_proj :: 0.0002152875968022272\n",
"layers.3.mlp.up_proj :: 0.00023548048920929432\n",
"layers.3.post_attention_layernorm :: 0.0\n",
"layers.3.self_attn.k_proj :: 7.839099271222949e-05\n",
"layers.3.self_attn.o_proj :: 0.0001229266927111894\n",
"layers.3.self_attn.q_proj :: 7.610922330059111e-05\n",
"layers.3.self_attn.v_proj :: 0.00011793316662078723\n",
"layers.4.input_layernorm :: 0.0\n",
"layers.4.mlp.down_proj :: 0.00023839841014705598\n",
"layers.4.mlp.gate_proj :: 0.00020985839364584535\n",
"layers.4.mlp.up_proj :: 0.00023889326257631183\n",
"layers.4.post_attention_layernorm :: 0.0\n",
"layers.4.self_attn.k_proj :: 7.34994318918325e-05\n",
"layers.4.self_attn.o_proj :: 0.00010853856656467542\n",
"layers.4.self_attn.q_proj :: 6.952149124117568e-05\n",
"layers.4.self_attn.v_proj :: 0.0001106640265788883\n",
"layers.5.input_layernorm :: 0.0\n",
"layers.5.mlp.down_proj :: 0.00023294253333006054\n",
"layers.5.mlp.gate_proj :: 0.00021902177832089365\n",
"layers.5.mlp.up_proj :: 0.00023804808733984828\n",
"layers.5.post_attention_layernorm :: 0.0\n",
"layers.5.self_attn.k_proj :: 6.887259951326996e-05\n",
"layers.5.self_attn.o_proj :: 0.00010860415932256728\n",
"layers.5.self_attn.q_proj :: 7.008601096458733e-05\n",
"layers.5.self_attn.v_proj :: 0.00010994714830303565\n",
"layers.6.input_layernorm :: 0.0\n",
"layers.6.mlp.down_proj :: 0.0002430058957543224\n",
"layers.6.mlp.gate_proj :: 0.00021106022177264094\n",
"layers.6.mlp.up_proj :: 0.0002357853518333286\n",
"layers.6.post_attention_layernorm :: 0.0\n",
"layers.6.self_attn.k_proj :: 7.34528002794832e-05\n",
"layers.6.self_attn.o_proj :: 0.00012069537478964776\n",
"layers.6.self_attn.q_proj :: 7.39123352104798e-05\n",
"layers.6.self_attn.v_proj :: 0.00011444320261944085\n",
"layers.7.input_layernorm :: 0.0\n",
"layers.7.mlp.down_proj :: 0.0002386124397162348\n",
"layers.7.mlp.gate_proj :: 0.0002129468193743378\n",
"layers.7.mlp.up_proj :: 0.0002342099032830447\n",
"layers.7.post_attention_layernorm :: 0.0\n",
"layers.7.self_attn.k_proj :: 7.436081796186045e-05\n",
"layers.7.self_attn.o_proj :: 0.000117763556772843\n",
"layers.7.self_attn.q_proj :: 7.579576049465686e-05\n",
"layers.7.self_attn.v_proj :: 0.00011647324572550133\n",
"layers.8.input_layernorm :: 0.0\n",
"layers.8.mlp.down_proj :: 0.00023561849957332015\n",
"layers.8.mlp.gate_proj :: 0.00021395857038442045\n",
"layers.8.mlp.up_proj :: 0.00023233317187987268\n",
"layers.8.post_attention_layernorm :: 0.0\n",
"layers.8.self_attn.k_proj :: 7.479614578187466e-05\n",
"layers.8.self_attn.o_proj :: 0.0001101200541597791\n",
"layers.8.self_attn.q_proj :: 7.198037928901613e-05\n",
"layers.8.self_attn.v_proj :: 0.00011036549403797835\n",
"layers.9.input_layernorm :: 0.0\n",
"layers.9.mlp.down_proj :: 0.00023483391851186752\n",
"layers.9.mlp.gate_proj :: 0.00021813629427924752\n",
"layers.9.mlp.up_proj :: 0.0002305333619005978\n",
"layers.9.post_attention_layernorm :: 0.0\n",
"layers.9.self_attn.k_proj :: 7.099700451362878e-05\n",
"layers.9.self_attn.o_proj :: 0.00010961489897454157\n",
"layers.9.self_attn.q_proj :: 6.99333322700113e-05\n",
"layers.9.self_attn.v_proj :: 0.00011098265531472862\n",
"layers.10.input_layernorm :: 0.0\n",
"layers.10.mlp.down_proj :: 0.0002355966134928167\n",
"layers.10.mlp.gate_proj :: 0.00021879498672205955\n",
"layers.10.mlp.up_proj :: 0.00022633779735770077\n",
"layers.10.post_attention_layernorm :: 0.0\n",
"layers.10.self_attn.k_proj :: 6.887767085572705e-05\n",
"layers.10.self_attn.o_proj :: 0.00010606015712255612\n",
"layers.10.self_attn.q_proj :: 7.245963206514716e-05\n",
"layers.10.self_attn.v_proj :: 0.00011047374573536217\n",
"layers.11.input_layernorm :: 0.0\n",
"layers.11.mlp.down_proj :: 0.00023263935872819275\n",
"layers.11.mlp.gate_proj :: 0.00022096707834862173\n",
"layers.11.mlp.up_proj :: 0.00022701549460180104\n",
"layers.11.post_attention_layernorm :: 0.0\n",
"layers.11.self_attn.k_proj :: 7.927079423097894e-05\n",
"layers.11.self_attn.o_proj :: 0.00010579710215097293\n",
"layers.11.self_attn.q_proj :: 7.568404544144869e-05\n",
"layers.11.self_attn.v_proj :: 0.00010630728502292186\n",
"layers.12.input_layernorm :: 0.0\n",
"layers.12.mlp.down_proj :: 0.000229398108785972\n",
"layers.12.mlp.gate_proj :: 0.00022061345225665718\n",
"layers.12.mlp.up_proj :: 0.00022570605506189167\n",
"layers.12.post_attention_layernorm :: 0.0\n",
"layers.12.self_attn.k_proj :: 7.188042945927009e-05\n",
"layers.12.self_attn.o_proj :: 0.00010446263331687078\n",
"layers.12.self_attn.q_proj :: 7.259837002493441e-05\n",
"layers.12.self_attn.v_proj :: 0.00010847383964573964\n",
"layers.13.input_layernorm :: 0.0\n",
"layers.13.mlp.down_proj :: 0.00023183136363513768\n",
"layers.13.mlp.gate_proj :: 0.00022038226597942412\n",
"layers.13.mlp.up_proj :: 0.00022204018023330718\n",
"layers.13.post_attention_layernorm :: 0.0\n",
"layers.13.self_attn.k_proj :: 7.277751865331084e-05\n",
"layers.13.self_attn.o_proj :: 9.949544619303197e-05\n",
"layers.13.self_attn.q_proj :: 7.417373853968456e-05\n",
"layers.13.self_attn.v_proj :: 9.998214954975992e-05\n",
"layers.14.input_layernorm :: 0.0\n",
"layers.14.mlp.down_proj :: 0.0002250982797704637\n",
"layers.14.mlp.gate_proj :: 0.0002234296698588878\n",
"layers.14.mlp.up_proj :: 0.0002275968436151743\n",
"layers.14.post_attention_layernorm :: 0.0\n",
"layers.14.self_attn.k_proj :: 7.394910790026188e-05\n",
"layers.14.self_attn.o_proj :: 0.00010180797835346311\n",
"layers.14.self_attn.q_proj :: 7.286618347279727e-05\n",
"layers.14.self_attn.v_proj :: 0.00010593021579552442\n",
"layers.15.input_layernorm :: 0.0\n",
"layers.15.mlp.down_proj :: 0.0002261872577946633\n",
"layers.15.mlp.gate_proj :: 0.00021966946951579303\n",
"layers.15.mlp.up_proj :: 0.0002207624347647652\n",
"layers.15.post_attention_layernorm :: 0.0\n",
"layers.15.self_attn.k_proj :: 7.03289988450706e-05\n",
"layers.15.self_attn.o_proj :: 0.0001009642583085224\n",
"layers.15.self_attn.q_proj :: 6.977833982091397e-05\n",
"layers.15.self_attn.v_proj :: 0.00010197132360190153\n",
"layers.16.input_layernorm :: 0.0\n",
"layers.16.mlp.down_proj :: 0.000229156285058707\n",
"layers.16.mlp.gate_proj :: 0.00022054999135434628\n",
"layers.16.mlp.up_proj :: 0.00022039496980141848\n",
"layers.16.post_attention_layernorm :: 0.0\n",
"layers.16.self_attn.k_proj :: 7.691039354540408e-05\n",
"layers.16.self_attn.o_proj :: 9.248219430446625e-05\n",
"layers.16.self_attn.q_proj :: 7.115134212654084e-05\n",
"layers.16.self_attn.v_proj :: 9.902598685584962e-05\n",
"layers.17.input_layernorm :: 2.2351741790771484e-08\n",
"layers.17.mlp.down_proj :: 0.00022360548609867692\n",
"layers.17.mlp.gate_proj :: 0.0002172273671021685\n",
"layers.17.mlp.up_proj :: 0.00022320033167488873\n",
"layers.17.post_attention_layernorm :: 0.0\n",
"layers.17.self_attn.k_proj :: 7.445142546202987e-05\n",
"layers.17.self_attn.o_proj :: 9.69550819718279e-05\n",
"layers.17.self_attn.q_proj :: 7.511652074754238e-05\n",
"layers.17.self_attn.v_proj :: 9.722042887005955e-05\n",
"layers.18.input_layernorm :: 0.0\n",
"layers.18.mlp.down_proj :: 0.00022297966643236578\n",
"layers.18.mlp.gate_proj :: 0.0002168940263800323\n",
"layers.18.mlp.up_proj :: 0.0002239357854705304\n",
"layers.18.post_attention_layernorm :: 0.0\n",
"layers.18.self_attn.k_proj :: 7.01848475728184e-05\n",
"layers.18.self_attn.o_proj :: 9.185199451167136e-05\n",
"layers.18.self_attn.q_proj :: 7.449802797054872e-05\n",
"layers.18.self_attn.v_proj :: 9.327943553216755e-05\n",
"layers.19.input_layernorm :: 0.0\n",
"layers.19.mlp.down_proj :: 0.00022434021229855716\n",
"layers.19.mlp.gate_proj :: 0.00021632449352182448\n",
"layers.19.mlp.up_proj :: 0.00022189474839251488\n",
"layers.19.post_attention_layernorm :: 0.0\n",
"layers.19.self_attn.k_proj :: 7.54313514335081e-05\n",
"layers.19.self_attn.o_proj :: 9.140757902059704e-05\n",
"layers.19.self_attn.q_proj :: 7.367075158981606e-05\n",
"layers.19.self_attn.v_proj :: 9.205481183016673e-05\n",
"layers.20.input_layernorm :: 0.0\n",
"layers.20.mlp.down_proj :: 0.0002232126862509176\n",
"layers.20.mlp.gate_proj :: 0.00021270349679980427\n",
"layers.20.mlp.up_proj :: 0.0002203828771598637\n",
"layers.20.post_attention_layernorm :: 0.0\n",
"layers.20.self_attn.k_proj :: 7.543408719357103e-05\n",
"layers.20.self_attn.o_proj :: 8.742884529056028e-05\n",
"layers.20.self_attn.q_proj :: 7.817970617907122e-05\n",
"layers.20.self_attn.v_proj :: 8.893346239347011e-05\n",
"layers.21.input_layernorm :: 0.0\n",
"layers.21.mlp.down_proj :: 0.00022195042402017862\n",
"layers.21.mlp.gate_proj :: 0.0002116999530699104\n",
"layers.21.mlp.up_proj :: 0.00021895463578402996\n",
"layers.21.post_attention_layernorm :: 0.0\n",
"layers.21.self_attn.k_proj :: 8.04948213044554e-05\n",
"layers.21.self_attn.o_proj :: 8.695253927726299e-05\n",
"layers.21.self_attn.q_proj :: 8.039720705710351e-05\n",
"layers.21.self_attn.v_proj :: 8.726368832867593e-05\n",
"layers.22.input_layernorm :: 0.0\n",
"layers.22.mlp.down_proj :: 0.00022601695673074573\n",
"layers.22.mlp.gate_proj :: 0.0002062570711132139\n",
"layers.22.mlp.up_proj :: 0.00021878430561628193\n",
"layers.22.post_attention_layernorm :: 0.0\n",
"layers.22.self_attn.k_proj :: 7.688651385251433e-05\n",
"layers.22.self_attn.o_proj :: 8.864752453519031e-05\n",
"layers.22.self_attn.q_proj :: 7.639191608177498e-05\n",
"layers.22.self_attn.v_proj :: 8.675439312355593e-05\n",
"layers.23.input_layernorm :: 0.0\n",
"layers.23.mlp.down_proj :: 0.00022117732441984117\n",
"layers.23.mlp.gate_proj :: 0.00020739230967592448\n",
"layers.23.mlp.up_proj :: 0.00022074722801335156\n",
"layers.23.post_attention_layernorm :: 0.0\n",
"layers.23.self_attn.k_proj :: 7.341142918448895e-05\n",
"layers.23.self_attn.o_proj :: 8.356582839041948e-05\n",
"layers.23.self_attn.q_proj :: 7.526070112362504e-05\n",
"layers.23.self_attn.v_proj :: 8.286593219963834e-05\n"
]
}
],
"source": [
"for k in common_keys:\n",
" a = sd_hf[k]\n",
" b = sd_orig_hf[k]\n",
" l1 = (a.float() - b.float()).abs().sum().item()\n",
" name = k.removeprefix(\"model.\").removesuffix(\".weight\")\n",
" print(f\"{name:40} :: {l1}\")"
]
}
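,
{
"cell_type": "markdown",
"id": "a7b8c9d0",
"metadata": {},
"source": [
"Follow-up sketch (my addition): the L1 sums above grow with tensor size,\n",
"so a per-element view makes layer norms (4,096 elements) and projection\n",
"matrices (16M-45M elements) directly comparable."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c9d0e1",
"metadata": {},
"outputs": [],
"source": [
"# mean absolute difference for layer 0's nine tensors\n",
"for k in common_keys[:9]:\n",
"    diff = (sd_hf[k].float() - sd_orig_hf[k].float()).abs()\n",
"    name = k.removeprefix(\"model.\").removesuffix(\".weight\")\n",
"    print(f\"{name:40} :: mean abs diff {diff.mean().item():.3e}\")"
]
}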
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}