Created
July 19, 2023 21:58
-
-
Save sekstini/540bd34141633be05340879fc3a40d05 to your computer and use it in GitHub Desktop.
LLaMA-2 7b weight comparison between original (bf16) and huggingface (fp16)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "38224d5b",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import numpy as np\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "df124ad3",
"metadata": {},
"outputs": [],
"source": [
"sd_orig = torch.load(\"consolidated.00.pth\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0f4bc9d1",
"metadata": {},
"outputs": [],
"source": [
"sd_hf = torch.load(\"pytorch_model-00001-of-00002.bin\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "29afe0cf",
"metadata": {},
"outputs": [],
"source": [
"common_keys = sd_orig.keys() & sd_hf.keys()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "995c4063",
"metadata": {},
"outputs": [],
"source": [
"params = {\"dim\": 4096, \"multiple_of\": 256, \"n_heads\": 32, \"n_layers\": 32, \"norm_eps\": 1e-05, \"vocab_size\": -1}\n",
"num_shards = 1\n",
"n_layers = params[\"n_layers\"]\n",
"n_heads = params[\"n_heads\"]\n",
"n_heads_per_shard = n_heads // num_shards\n",
"dim = params[\"dim\"]\n",
"dims_per_head = dim // n_heads\n",
"base = 10000.0\n",
"inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))\n",
"\n",
"if \"n_kv_heads\" in params:\n",
"    num_key_value_heads = params[\"n_kv_heads\"]  # for GQA / MQA\n",
"    num_local_key_value_heads = n_heads_per_shard // num_key_value_heads\n",
"    key_value_dim = dim // num_key_value_heads\n",
"else:  # compatibility with other checkpoints\n",
"    num_key_value_heads = n_heads\n",
"    num_local_key_value_heads = n_heads_per_shard\n",
"    key_value_dim = dim\n",
"\n",
"def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):\n",
"    return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)\n",
"\n",
"sd_orig_hf = {}\n",
"for l in range(n_layers):\n",
"    sd_orig_hf.update({\n",
"        f\"model.layers.{l}.self_attn.q_proj.weight\": permute(\n",
"            sd_orig[f\"layers.{l}.attention.wq.weight\"]\n",
"        ),\n",
"        f\"model.layers.{l}.self_attn.k_proj.weight\": permute(\n",
"            sd_orig[f\"layers.{l}.attention.wk.weight\"]\n",
"        ),\n",
"        f\"model.layers.{l}.self_attn.v_proj.weight\": sd_orig[f\"layers.{l}.attention.wv.weight\"],\n",
"        f\"model.layers.{l}.self_attn.o_proj.weight\": sd_orig[f\"layers.{l}.attention.wo.weight\"],\n",
"        f\"model.layers.{l}.mlp.gate_proj.weight\": sd_orig[f\"layers.{l}.feed_forward.w1.weight\"],\n",
"        f\"model.layers.{l}.mlp.down_proj.weight\": sd_orig[f\"layers.{l}.feed_forward.w2.weight\"],\n",
"        f\"model.layers.{l}.mlp.up_proj.weight\": sd_orig[f\"layers.{l}.feed_forward.w3.weight\"],\n",
"        f\"model.layers.{l}.input_layernorm.weight\": sd_orig[f\"layers.{l}.attention_norm.weight\"],\n",
"        f\"model.layers.{l}.post_attention_layernorm.weight\": sd_orig[f\"layers.{l}.ffn_norm.weight\"],\n",
"    })"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0f3e7ccf",
"metadata": {},
"outputs": [],
"source": [
"def sortkey(key: str):\n",
"    parts = key.split(\".\")\n",
"    parts[2] = int(parts[2])\n",
"    return parts\n",
"\n",
"common_keys = list(sd_orig_hf.keys() & sd_hf.keys())\n",
"common_keys.sort(key=sortkey)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4c47312d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(216, 288, 241)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(common_keys), len(sd_orig_hf.keys()), len(sd_hf.keys())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5d53887c",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"layers.0.input_layernorm :: 4.470348358154297e-08\n",
"layers.0.mlp.down_proj :: 0.0002487435704097152\n",
"layers.0.mlp.gate_proj :: 0.0002498461399227381\n",
"layers.0.mlp.up_proj :: 0.0002581549051683396\n",
"layers.0.post_attention_layernorm :: 0.0\n",
"layers.0.self_attn.k_proj :: 0.00021169817773625255\n",
"layers.0.self_attn.o_proj :: 0.0002980003773700446\n",
"layers.0.self_attn.q_proj :: 0.0002925241133198142\n",
"layers.0.self_attn.v_proj :: 0.0001831730332924053\n",
"layers.1.input_layernorm :: 0.0\n",
"layers.1.mlp.down_proj :: 0.00023984009749256074\n",
"layers.1.mlp.gate_proj :: 0.0002312921133125201\n",
"layers.1.mlp.up_proj :: 0.00024525466142222285\n",
"layers.1.post_attention_layernorm :: 0.0\n",
"layers.1.self_attn.k_proj :: 9.451161895412952e-05\n",
"layers.1.self_attn.o_proj :: 0.0002728034742176533\n",
"layers.1.self_attn.q_proj :: 9.88328829407692e-05\n",
"layers.1.self_attn.v_proj :: 0.00020060034876223654\n",
"layers.2.input_layernorm :: 0.0\n",
"layers.2.mlp.down_proj :: 0.00023379421327263117\n",
"layers.2.mlp.gate_proj :: 0.00022194455959834158\n",
"layers.2.mlp.up_proj :: 0.00023320293985307217\n",
"layers.2.post_attention_layernorm :: 0.0\n",
"layers.2.self_attn.k_proj :: 8.56785336509347e-05\n",
"layers.2.self_attn.o_proj :: 0.00011266511864960194\n",
"layers.2.self_attn.q_proj :: 7.946790719870478e-05\n",
"layers.2.self_attn.v_proj :: 0.00010692673095036298\n",
"layers.3.input_layernorm :: 0.0\n",
"layers.3.mlp.down_proj :: 0.0002337423647986725\n",
"layers.3.mlp.gate_proj :: 0.0002152875968022272\n",
"layers.3.mlp.up_proj :: 0.00023548048920929432\n",
"layers.3.post_attention_layernorm :: 0.0\n",
"layers.3.self_attn.k_proj :: 7.839099271222949e-05\n",
"layers.3.self_attn.o_proj :: 0.0001229266927111894\n",
"layers.3.self_attn.q_proj :: 7.610922330059111e-05\n",
"layers.3.self_attn.v_proj :: 0.00011793316662078723\n",
"layers.4.input_layernorm :: 0.0\n",
"layers.4.mlp.down_proj :: 0.00023839841014705598\n",
"layers.4.mlp.gate_proj :: 0.00020985839364584535\n",
"layers.4.mlp.up_proj :: 0.00023889326257631183\n",
"layers.4.post_attention_layernorm :: 0.0\n",
"layers.4.self_attn.k_proj :: 7.34994318918325e-05\n",
"layers.4.self_attn.o_proj :: 0.00010853856656467542\n",
"layers.4.self_attn.q_proj :: 6.952149124117568e-05\n",
"layers.4.self_attn.v_proj :: 0.0001106640265788883\n",
"layers.5.input_layernorm :: 0.0\n",
"layers.5.mlp.down_proj :: 0.00023294253333006054\n",
"layers.5.mlp.gate_proj :: 0.00021902177832089365\n",
"layers.5.mlp.up_proj :: 0.00023804808733984828\n",
"layers.5.post_attention_layernorm :: 0.0\n",
"layers.5.self_attn.k_proj :: 6.887259951326996e-05\n",
"layers.5.self_attn.o_proj :: 0.00010860415932256728\n",
"layers.5.self_attn.q_proj :: 7.008601096458733e-05\n",
"layers.5.self_attn.v_proj :: 0.00010994714830303565\n",
"layers.6.input_layernorm :: 0.0\n",
"layers.6.mlp.down_proj :: 0.0002430058957543224\n",
"layers.6.mlp.gate_proj :: 0.00021106022177264094\n",
"layers.6.mlp.up_proj :: 0.0002357853518333286\n",
"layers.6.post_attention_layernorm :: 0.0\n",
"layers.6.self_attn.k_proj :: 7.34528002794832e-05\n",
"layers.6.self_attn.o_proj :: 0.00012069537478964776\n",
"layers.6.self_attn.q_proj :: 7.39123352104798e-05\n",
"layers.6.self_attn.v_proj :: 0.00011444320261944085\n",
"layers.7.input_layernorm :: 0.0\n",
"layers.7.mlp.down_proj :: 0.0002386124397162348\n",
"layers.7.mlp.gate_proj :: 0.0002129468193743378\n",
"layers.7.mlp.up_proj :: 0.0002342099032830447\n",
"layers.7.post_attention_layernorm :: 0.0\n",
"layers.7.self_attn.k_proj :: 7.436081796186045e-05\n",
"layers.7.self_attn.o_proj :: 0.000117763556772843\n",
"layers.7.self_attn.q_proj :: 7.579576049465686e-05\n",
"layers.7.self_attn.v_proj :: 0.00011647324572550133\n",
"layers.8.input_layernorm :: 0.0\n",
"layers.8.mlp.down_proj :: 0.00023561849957332015\n",
"layers.8.mlp.gate_proj :: 0.00021395857038442045\n",
"layers.8.mlp.up_proj :: 0.00023233317187987268\n",
"layers.8.post_attention_layernorm :: 0.0\n",
"layers.8.self_attn.k_proj :: 7.479614578187466e-05\n",
"layers.8.self_attn.o_proj :: 0.0001101200541597791\n",
"layers.8.self_attn.q_proj :: 7.198037928901613e-05\n",
"layers.8.self_attn.v_proj :: 0.00011036549403797835\n",
"layers.9.input_layernorm :: 0.0\n",
"layers.9.mlp.down_proj :: 0.00023483391851186752\n",
"layers.9.mlp.gate_proj :: 0.00021813629427924752\n",
"layers.9.mlp.up_proj :: 0.0002305333619005978\n",
"layers.9.post_attention_layernorm :: 0.0\n",
"layers.9.self_attn.k_proj :: 7.099700451362878e-05\n",
"layers.9.self_attn.o_proj :: 0.00010961489897454157\n",
"layers.9.self_attn.q_proj :: 6.99333322700113e-05\n",
"layers.9.self_attn.v_proj :: 0.00011098265531472862\n",
"layers.10.input_layernorm :: 0.0\n",
"layers.10.mlp.down_proj :: 0.0002355966134928167\n",
"layers.10.mlp.gate_proj :: 0.00021879498672205955\n",
"layers.10.mlp.up_proj :: 0.00022633779735770077\n",
"layers.10.post_attention_layernorm :: 0.0\n",
"layers.10.self_attn.k_proj :: 6.887767085572705e-05\n",
"layers.10.self_attn.o_proj :: 0.00010606015712255612\n",
"layers.10.self_attn.q_proj :: 7.245963206514716e-05\n",
"layers.10.self_attn.v_proj :: 0.00011047374573536217\n",
"layers.11.input_layernorm :: 0.0\n",
"layers.11.mlp.down_proj :: 0.00023263935872819275\n",
"layers.11.mlp.gate_proj :: 0.00022096707834862173\n",
"layers.11.mlp.up_proj :: 0.00022701549460180104\n",
"layers.11.post_attention_layernorm :: 0.0\n",
"layers.11.self_attn.k_proj :: 7.927079423097894e-05\n",
"layers.11.self_attn.o_proj :: 0.00010579710215097293\n",
"layers.11.self_attn.q_proj :: 7.568404544144869e-05\n",
"layers.11.self_attn.v_proj :: 0.00010630728502292186\n",
"layers.12.input_layernorm :: 0.0\n",
"layers.12.mlp.down_proj :: 0.000229398108785972\n",
"layers.12.mlp.gate_proj :: 0.00022061345225665718\n",
"layers.12.mlp.up_proj :: 0.00022570605506189167\n",
"layers.12.post_attention_layernorm :: 0.0\n",
"layers.12.self_attn.k_proj :: 7.188042945927009e-05\n",
"layers.12.self_attn.o_proj :: 0.00010446263331687078\n",
"layers.12.self_attn.q_proj :: 7.259837002493441e-05\n",
"layers.12.self_attn.v_proj :: 0.00010847383964573964\n",
"layers.13.input_layernorm :: 0.0\n",
"layers.13.mlp.down_proj :: 0.00023183136363513768\n",
"layers.13.mlp.gate_proj :: 0.00022038226597942412\n",
"layers.13.mlp.up_proj :: 0.00022204018023330718\n",
"layers.13.post_attention_layernorm :: 0.0\n",
"layers.13.self_attn.k_proj :: 7.277751865331084e-05\n",
"layers.13.self_attn.o_proj :: 9.949544619303197e-05\n",
"layers.13.self_attn.q_proj :: 7.417373853968456e-05\n",
"layers.13.self_attn.v_proj :: 9.998214954975992e-05\n",
"layers.14.input_layernorm :: 0.0\n",
"layers.14.mlp.down_proj :: 0.0002250982797704637\n",
"layers.14.mlp.gate_proj :: 0.0002234296698588878\n",
"layers.14.mlp.up_proj :: 0.0002275968436151743\n",
"layers.14.post_attention_layernorm :: 0.0\n",
"layers.14.self_attn.k_proj :: 7.394910790026188e-05\n",
"layers.14.self_attn.o_proj :: 0.00010180797835346311\n",
"layers.14.self_attn.q_proj :: 7.286618347279727e-05\n",
"layers.14.self_attn.v_proj :: 0.00010593021579552442\n",
"layers.15.input_layernorm :: 0.0\n",
"layers.15.mlp.down_proj :: 0.0002261872577946633\n",
"layers.15.mlp.gate_proj :: 0.00021966946951579303\n",
"layers.15.mlp.up_proj :: 0.0002207624347647652\n",
"layers.15.post_attention_layernorm :: 0.0\n",
"layers.15.self_attn.k_proj :: 7.03289988450706e-05\n",
"layers.15.self_attn.o_proj :: 0.0001009642583085224\n",
"layers.15.self_attn.q_proj :: 6.977833982091397e-05\n",
"layers.15.self_attn.v_proj :: 0.00010197132360190153\n",
"layers.16.input_layernorm :: 0.0\n",
"layers.16.mlp.down_proj :: 0.000229156285058707\n",
"layers.16.mlp.gate_proj :: 0.00022054999135434628\n",
"layers.16.mlp.up_proj :: 0.00022039496980141848\n",
"layers.16.post_attention_layernorm :: 0.0\n",
"layers.16.self_attn.k_proj :: 7.691039354540408e-05\n",
"layers.16.self_attn.o_proj :: 9.248219430446625e-05\n",
"layers.16.self_attn.q_proj :: 7.115134212654084e-05\n",
"layers.16.self_attn.v_proj :: 9.902598685584962e-05\n",
"layers.17.input_layernorm :: 2.2351741790771484e-08\n",
"layers.17.mlp.down_proj :: 0.00022360548609867692\n",
"layers.17.mlp.gate_proj :: 0.0002172273671021685\n",
"layers.17.mlp.up_proj :: 0.00022320033167488873\n",
"layers.17.post_attention_layernorm :: 0.0\n",
"layers.17.self_attn.k_proj :: 7.445142546202987e-05\n",
"layers.17.self_attn.o_proj :: 9.69550819718279e-05\n",
"layers.17.self_attn.q_proj :: 7.511652074754238e-05\n",
"layers.17.self_attn.v_proj :: 9.722042887005955e-05\n",
"layers.18.input_layernorm :: 0.0\n",
"layers.18.mlp.down_proj :: 0.00022297966643236578\n",
"layers.18.mlp.gate_proj :: 0.0002168940263800323\n",
"layers.18.mlp.up_proj :: 0.0002239357854705304\n",
"layers.18.post_attention_layernorm :: 0.0\n",
"layers.18.self_attn.k_proj :: 7.01848475728184e-05\n",
"layers.18.self_attn.o_proj :: 9.185199451167136e-05\n",
"layers.18.self_attn.q_proj :: 7.449802797054872e-05\n",
"layers.18.self_attn.v_proj :: 9.327943553216755e-05\n",
"layers.19.input_layernorm :: 0.0\n",
"layers.19.mlp.down_proj :: 0.00022434021229855716\n",
"layers.19.mlp.gate_proj :: 0.00021632449352182448\n",
"layers.19.mlp.up_proj :: 0.00022189474839251488\n",
"layers.19.post_attention_layernorm :: 0.0\n",
"layers.19.self_attn.k_proj :: 7.54313514335081e-05\n",
"layers.19.self_attn.o_proj :: 9.140757902059704e-05\n",
"layers.19.self_attn.q_proj :: 7.367075158981606e-05\n",
"layers.19.self_attn.v_proj :: 9.205481183016673e-05\n",
"layers.20.input_layernorm :: 0.0\n",
"layers.20.mlp.down_proj :: 0.0002232126862509176\n",
"layers.20.mlp.gate_proj :: 0.00021270349679980427\n",
"layers.20.mlp.up_proj :: 0.0002203828771598637\n",
"layers.20.post_attention_layernorm :: 0.0\n",
"layers.20.self_attn.k_proj :: 7.543408719357103e-05\n",
"layers.20.self_attn.o_proj :: 8.742884529056028e-05\n",
"layers.20.self_attn.q_proj :: 7.817970617907122e-05\n",
"layers.20.self_attn.v_proj :: 8.893346239347011e-05\n",
"layers.21.input_layernorm :: 0.0\n",
"layers.21.mlp.down_proj :: 0.00022195042402017862\n",
"layers.21.mlp.gate_proj :: 0.0002116999530699104\n",
"layers.21.mlp.up_proj :: 0.00021895463578402996\n",
"layers.21.post_attention_layernorm :: 0.0\n",
"layers.21.self_attn.k_proj :: 8.04948213044554e-05\n",
"layers.21.self_attn.o_proj :: 8.695253927726299e-05\n",
"layers.21.self_attn.q_proj :: 8.039720705710351e-05\n",
"layers.21.self_attn.v_proj :: 8.726368832867593e-05\n",
"layers.22.input_layernorm :: 0.0\n",
"layers.22.mlp.down_proj :: 0.00022601695673074573\n",
"layers.22.mlp.gate_proj :: 0.0002062570711132139\n",
"layers.22.mlp.up_proj :: 0.00021878430561628193\n",
"layers.22.post_attention_layernorm :: 0.0\n",
"layers.22.self_attn.k_proj :: 7.688651385251433e-05\n",
"layers.22.self_attn.o_proj :: 8.864752453519031e-05\n",
"layers.22.self_attn.q_proj :: 7.639191608177498e-05\n",
"layers.22.self_attn.v_proj :: 8.675439312355593e-05\n",
"layers.23.input_layernorm :: 0.0\n",
"layers.23.mlp.down_proj :: 0.00022117732441984117\n",
"layers.23.mlp.gate_proj :: 0.00020739230967592448\n",
"layers.23.mlp.up_proj :: 0.00022074722801335156\n",
"layers.23.post_attention_layernorm :: 0.0\n",
"layers.23.self_attn.k_proj :: 7.341142918448895e-05\n",
"layers.23.self_attn.o_proj :: 8.356582839041948e-05\n",
"layers.23.self_attn.q_proj :: 7.526070112362504e-05\n",
"layers.23.self_attn.v_proj :: 8.286593219963834e-05\n"
]
}
],
"source": [
"for k in common_keys:\n",
"    a = sd_hf[k]\n",
"    b = sd_orig_hf[k]\n",
"    l1 = (a.float() - b.float()).abs().sum().item()\n",
"    name = k.removeprefix(\"model.\").removesuffix(\".weight\")\n",
"    print(f\"{name:40} :: {l1}\")"
]
}
],
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment