Created
April 29, 2025 12:39
-
-
Save city96/a05cb7ec6664a5085efb007497f2049b to your computer and use it in GitHub Desktop.
Attempt to recreate tngtech/DeepSeek-R1T-Chimera from quantized files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Attempt to recreate tngtech/DeepSeek-R1T-Chimera from quantized files
# based on https://huggingface.co/tngtech/DeepSeek-R1T-Chimera/discussions/1
# using:
# - https://huggingface.co/unsloth/DeepSeek-R1-GGUF
# - https://huggingface.co/unsloth/DeepSeek-V3-0324-GGUF
# NOTE: The key mapping might not be 100% correct, feel free to experiment
from contextlib import closing

import gguf
from tqdm import tqdm
# Input and output GGUF files, relative to the working directory.
# I merged the split files using `llama-gguf-split --merge` first
PATH_R1 = "DeepSeek-R1-UD-Q2_K_XL.gguf"  # source of the tensors mapped to "r1"
PATH_V3 = "DeepSeek-V3-0324-UD-Q2_K_XL.gguf"  # source of "v3" tensors and metadata
PATH_OUT = "DeepSeek-R1T-Chimera-UD-Q2_K_XL.gguf"  # merged file written by the script
# Maps a tensor-name pattern to the model ("r1" or "v3") that supplies the
# matching tensors. Patterns are tried in insertion order against each tensor
# name and the first hit decides, so more specific patterns must come first.
#
# The block patterns end with a dot on purpose: a bare "blk.1" is also a
# substring of "blk.10." .. "blk.19.", which would pull the routed experts of
# those layers from V3 instead of R1.
KEY_MAPPING = {
    "token_embd": "v3",
    "blk.0.": "v3",
    "blk.1.": "v3",
    "blk.2.": "v3",
    "shexp": "v3",
    "exps": "r1",
    "attn": "v3",
    "ffn_gate_inp": "v3",
    # Default is "v3" for the rest
}
def select_source(name, mapping, default="v3"):
    """Return the model tag that should supply the tensor called *name*.

    Patterns in *mapping* are tried in insertion order and the first match
    wins; *default* is returned when nothing matches.

    A pattern starting with "blk." is anchored as a name prefix that ends on
    a dot, so "blk.1" (or "blk.1.") selects block 1 only and not blocks
    10-19. Every other pattern is a plain substring test.
    """
    for pattern, source in mapping.items():
        if pattern.startswith("blk."):
            # A raw substring test would let "blk.1" match "blk.10.".
            prefix = pattern if pattern.endswith(".") else pattern + "."
            matched = name.startswith(prefix)
        else:
            matched = pattern in name
        if matched:
            return source
    return default


def main():
    """Merge the R1 and V3 GGUF files into PATH_OUT as directed by KEY_MAPPING.

    Raises:
        ValueError: if the two input files do not contain the same tensor names.
    """
    reader_r1 = gguf.GGUFReader(PATH_R1)
    tensors_r1 = {x.name: x for x in reader_r1.tensors}
    print(f"Read {len(tensors_r1)} tensors from R1 model")

    reader_v3 = gguf.GGUFReader(PATH_V3)
    tensors_v3 = {x.name: x for x in reader_v3.tensors}
    print(f"Read {len(tensors_v3)} tensors from V3 model")

    # Compare names, not just counts: equal counts with different names would
    # otherwise surface later as a bare KeyError during the per-tensor lookup.
    only_r1 = sorted(tensors_r1.keys() - tensors_v3.keys())
    only_v3 = sorted(tensors_v3.keys() - tensors_r1.keys())
    if only_r1 or only_v3:
        raise ValueError(
            "Tensor names differ between models "
            f"(only in R1: {only_r1}, only in V3: {only_v3})"
        )

    # Choose the source tensor for every name (sorted mostly for readability).
    sd = {}
    for key in sorted(tensors_r1):
        src = select_source(key, KEY_MAPPING)
        print(f"Using src:{src} for {key:40}")
        sd[key] = tensors_r1[key] if src == "r1" else tensors_v3[key]

    # closing() guarantees writer.close() runs even if a write step fails.
    with closing(gguf.GGUFWriter(PATH_OUT, arch="deepseek2")) as writer:
        # reuse v3 metadata
        for field in reader_v3.fields.values():
            # The architecture is already passed to the writer via arch=, and
            # "GGUF."-prefixed entries are presumably reader-side header
            # pseudo-fields rather than real key/value pairs -- skip both.
            if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith("GGUF."):
                continue
            writer.add_key_value(field.name, field.contents(), field.types[0])

        # add tensor info
        total_bytes = 0
        for tensor in sd.values():
            total_bytes += tensor.n_bytes
            writer.add_tensor_info(
                tensor.name,
                tensor.data.shape,
                tensor.data.dtype,
                tensor.data.nbytes,
                tensor.tensor_type,
            )

        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_ti_data_to_file()

        # add actual tensors, in the same order their info was registered
        with tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) as bar:
            for tensor in sd.values():
                writer.write_tensor_data(tensor.data)
                bar.update(tensor.n_bytes)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment