Last active
September 18, 2024 04:32
-
-
Save tail-call/a602fde6be9eb9097827dacd00a11dd5 to your computer and use it in GitHub Desktop.
Studying "Faster and Lighter LLMs: A Survey on Current Challenges and Way Forward": adapting a table for plotting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Method | Quantization Type | WM (GB) | RM (GB) | Tokens/sec | Perplexity | NVIDIA GPU | AMD GPU | Apple Silicon | CPU | Intel GPU | AWS Inferentia2 | WebGPU | WASM | Adreno Mali | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Llama.cpp | GGUF K-Quant 2bit | 2.36 | 3.69 | 102.15 | 6.96 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | |
1 | Llama.cpp | GGUF 4bit (check) | 3.56 | 4.88 | 128.97 | 5.96 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | |
2 | Llama.cpp | GGUF AWQ 4bit | 3.56 | 4.88 | 129.25 | 5.91 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | |
3 | Llama.cpp | GGUF K-Quant 4bit | 3.59 | 4.90 | 109.72 | 5.87 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | |
4 | Llama.cpp | GGUF 8bit | 6.67 | 7.78 | 93.39 | 5.79 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | |
5 | Llama.cpp | GGUF FP16 | 12.55 | 13.22 | 66.81 | 5.79 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | |
6 | ExLlama | GPTQ 4bit | 3.63 | 5.35 | 77.10 | 6.08 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
8 | ExLlamav2 | EXL2 2bit | 2.01 | 5.21 | 153.75 | 20.21 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
9 | ExLlamav2 | EXL2 4bit | 3.36 | 6.61 | 131.68 | 6.12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
10 | ExLlamav2 | GPTQ 4bit | 3.63 | 6.93 | 151.30 | 6.03 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
11 | ExLlamav2 | EXL2 8bit | 6.37 | 9.47 | 115.81 | 5.76 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
12 | ExLlamav2 | FP16 | 12.55 | 15.09 | 67.70 | 5.73 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
13 | vLLM | AWQ GEMM 4bit | 3.62 | 34.55 | 114.43 | 6.02 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
14 | vLLM | GPTQ 4bit | 3.63 | 36.51 | 172.88 | 6.08 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
15 | vLLM | FP16 | 12.55 | 35.92 | 79.74 | 5.85 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
16 | TensorRT-LLM | AWQ GEMM 4bit | 3.42 | 5.69 | 194.86 | 6.02 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
17 | TensorRT-LLM | GPTQ 4bit | 3.60 | 5.88 | 202.16 | 6.08 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
18 | TensorRT-LLM | INT8 | 6.53 | 8.55 | 143.57 | 5.89 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
19 | TensorRT-LLM | FP16 | 12.55 | 14.61 | 83.43 | 5.85 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
20 | TGI | AWQ GEMM 4bit | 3.62 | 7.97 | 30.80 | 6.02 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | |
21 | TGI | AWQ GEMV 4bit | 3.62 | 7.96 | 34.22 | 6.02 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | |
22 | TGI | GPTQ 4bit | 3.69 | 39.39 | 34.86 | 6.08 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | |
23 | TGI | FP4 | 12.55 | 17.02 | 34.38 | 6.15 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | |
24 | TGI | NF4 | 12.55 | 17.02 | 33.93 | 6.02 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | |
25 | TGI | INT8 | 12.55 | 11.66 | 5.39 | 5.89 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | |
26 | TGI | FP16 | 12.55 | 17.02 | 34.23 | 5.85 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | |
27 | MLC-LLM | OmniQuant 3bit | 3.2 | 5.1 | 83.4 | 6.65 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | |
28 | MLC-LLM | OmniQuant 4bit | 3.8 | 5.7 | 134.2 | 5.97 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | |
29 | MLC-LLM | AWQ GEMM 4bit | 3.62 | 6.50 | 23.62 | 6.02 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | |
30 | MLC-LLM | Q4F16 | 3.53 | 6.50 | 189.07 | NaN | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | |
31 | MLC-LLM | Q3F16 | 2.84 | 5.98 | 185.47 | NaN | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | |
32 | MLC-LLM | FP16 | 12.55 | 15.38 | 87.37 | 5.85 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"input = \"\"\"\n", | |
"Method\tHardware Support\tQuantization Type\tWM (GB)\tRM (GB)\tTokens/sec\tPerplexity\n", | |
"Llama.cpp\tNVIDIA GPU\tGGUF K-Quant 2bit\t2.36\t3.69\t102.15\t6.96\n", | |
"-\tAMD GPU\tGGUF 4bit (check)\t3.56\t4.88\t128.97\t5.96\n", | |
"-\tApple Silicon\tGGUF AWQ 4bit\t3.56\t4.88\t129.25\t5.91\n", | |
"-\tCPU\tGGUF K-Quant 4bit\t3.59\t4.90\t109.72\t5.87\n", | |
"-\t-\tGGUF 8bit\t6.67\t7.78\t93.39\t5.79\n", | |
"-\t-\tGGUF FP16\t12.55\t13.22\t66.81\t5.79\n", | |
"ExLlama\tNVIDIA GPU\tGPTQ 4bit\t3.63\t5.35\t77.10\t6.08\n", | |
"-\tAMD GPU\t-\t-\t-\t-\t-\n", | |
"ExLlamav2\tNVIDIA GPU\tEXL2 2bit\t2.01\t5.21\t153.75\t20.21\n", | |
"-\tAMD GPU\tEXL2 4bit\t3.36\t6.61\t131.68\t6.12\n", | |
"-\t-\tGPTQ 4bit\t3.63\t6.93\t151.30\t6.03\n", | |
"-\t-\tEXL2 8bit\t6.37\t9.47\t115.81\t5.76\n", | |
"-\t-\tFP16\t12.55\t15.09\t67.70\t5.73\n", | |
"vLLM\tNVIDIA GPU\tAWQ GEMM 4bit\t3.62\t34.55\t114.43\t6.02\n", | |
"-\tAMD GPU\tGPTQ 4bit\t3.63\t36.51\t172.88\t6.08\n", | |
"-\t-\tFP16\t12.55\t35.92\t79.74\t5.85\n", | |
"TensorRT-LLM\tNVIDIA GPU\tAWQ GEMM 4bit\t3.42\t5.69\t194.86\t6.02\n", | |
"-\t-\tGPTQ 4bit\t3.60\t5.88\t202.16\t6.08\n", | |
"-\t-\tINT8\t6.53\t8.55\t143.57\t5.89\n", | |
"-\t-\tFP16\t12.55\t14.61\t83.43\t5.85\n", | |
"TGI\tAMD GPU\tAWQ GEMM 4bit\t3.62\t7.97\t30.80\t6.02\n", | |
"-\tNVIDIA GPU\tAWQ GEMV 4bit\t3.62\t7.96\t34.22\t6.02\n", | |
"-\tIntel GPU\tGPTQ 4bit\t3.69\t39.39\t34.86\t6.08\n", | |
"-\tAWS Inferentia2\tFP4\t12.55\t17.02\t34.38\t6.15\n", | |
"-\t-\tNF4\t12.55\t17.02\t33.93\t6.02\n", | |
"-\t-\tINT8\t12.55\t11.66\t5.39\t5.89\n", | |
"-\t-\tFP16\t12.55\t17.02\t34.23\t5.85\n", | |
"MLC-LLM\tNVIDIA GPU\tOmniQuant 3bit\t3.2\t5.1\t83.4\t6.65\n", | |
"-\tAMD GPU,\tOmniQuant 4bit\t3.8\t5.7\t134.2\t5.97\n", | |
"-\tCPU, WebGPU,\tAWQ GEMM 4bit\t3.62\t6.50\t23.62\t6.02\n", | |
"-\tApple Silicon,\tQ4F16\t3.53\t6.50\t189.07\t-\n", | |
"-\tIntel GPU,\tQ3F16\t2.84\t5.98\t185.47\t-\n", | |
"-\tWASM, Adreno Mali\tFP16\t12.55\t15.38\t87.37\t5.85\n", | |
"\"\"\"\n", | |
"\n", | |
"lines = input.strip().split('\\n')\n", | |
"data = [line.split('\\t') for line in lines]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'Hardware support': {'Llama.cpp': ['NVIDIA GPU',\n", | |
" 'AMD GPU',\n", | |
" 'Apple Silicon',\n", | |
" 'CPU'],\n", | |
" 'ExLlama': ['NVIDIA GPU', 'AMD GPU'],\n", | |
" 'ExLlamav2': ['NVIDIA GPU', 'AMD GPU'],\n", | |
" 'vLLM': ['NVIDIA GPU', 'AMD GPU'],\n", | |
" 'TensorRT-LLM': ['NVIDIA GPU'],\n", | |
" 'TGI': ['AMD GPU', 'NVIDIA GPU', 'Intel GPU', 'AWS Inferentia2'],\n", | |
" 'MLC-LLM': ['NVIDIA GPU',\n", | |
" 'AMD GPU',\n", | |
" 'CPU',\n", | |
" 'WebGPU',\n", | |
" 'Apple Silicon',\n", | |
" 'Intel GPU',\n", | |
" 'WASM',\n", | |
" 'Adreno Mali']},\n", | |
" 'Possible values': ['NVIDIA GPU',\n", | |
" 'AMD GPU',\n", | |
" 'Apple Silicon',\n", | |
" 'CPU',\n", | |
" 'Intel GPU',\n", | |
" 'AWS Inferentia2',\n", | |
" 'WebGPU',\n", | |
" 'WASM',\n", | |
" 'Adreno Mali']}" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"header = data[0]\n", | |
"body = data[1:]\n", | |
"\n", | |
"method_name = None\n", | |
"hardware_support_dict = {}\n", | |
"possible_hardware = []\n", | |
"\n", | |
"for row in body:\n", | |
" if row[0] != '-':\n", | |
" method_name = row[0]\n", | |
" else:\n", | |
" row[0] = method_name\n", | |
"\n", | |
" if row[1] != '-':\n", | |
" if method_name not in hardware_support_dict:\n", | |
" hardware_support_dict[method_name] = []\n", | |
" for item in row[1].split(','):\n", | |
" if item != '':\n", | |
" hardware_support_dict[method_name].append(item.strip())\n", | |
"\n", | |
"for values in hardware_support_dict.values():\n", | |
" for value in values:\n", | |
" if value not in possible_hardware:\n", | |
" possible_hardware.append(value)\n", | |
"\n", | |
"{ 'Hardware support': hardware_support_dict, 'Possible values': possible_hardware }" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"vscode": { | |
"languageId": "shellscript" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: pandas in /Users/scales/.pyenv/versions/3.12.3/lib/python3.12/site-packages (2.2.2)\n", | |
"Requirement already satisfied: numpy>=1.26.0 in /Users/scales/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from pandas) (2.1.1)\n", | |
"Requirement already satisfied: python-dateutil>=2.8.2 in /Users/scales/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n", | |
"Requirement already satisfied: pytz>=2020.1 in /Users/scales/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from pandas) (2024.2)\n", | |
"Requirement already satisfied: tzdata>=2022.7 in /Users/scales/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from pandas) (2024.1)\n", | |
"Requirement already satisfied: six>=1.5 in /Users/scales/.pyenv/versions/3.12.3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", | |
"\n", | |
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", | |
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", | |
"Note: you may need to restart the kernel to use updated packages.\n" | |
] | |
} | |
], | |
"source": [ | |
"pip install pandas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Method</th>\n", | |
" <th>Quantization Type</th>\n", | |
" <th>WM (GB)</th>\n", | |
" <th>RM (GB)</th>\n", | |
" <th>Tokens/sec</th>\n", | |
" <th>Perplexity</th>\n", | |
" <th>NVIDIA GPU</th>\n", | |
" <th>AMD GPU</th>\n", | |
" <th>Apple Silicon</th>\n", | |
" <th>CPU</th>\n", | |
" <th>Intel GPU</th>\n", | |
" <th>AWS Inferentia2</th>\n", | |
" <th>WebGPU</th>\n", | |
" <th>WASM</th>\n", | |
" <th>Adreno Mali</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Llama.cpp</td>\n", | |
" <td>GGUF K-Quant 2bit</td>\n", | |
" <td>2.36</td>\n", | |
" <td>3.69</td>\n", | |
" <td>102.15</td>\n", | |
" <td>6.96</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Llama.cpp</td>\n", | |
" <td>GGUF 4bit (check)</td>\n", | |
" <td>3.56</td>\n", | |
" <td>4.88</td>\n", | |
" <td>128.97</td>\n", | |
" <td>5.96</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Llama.cpp</td>\n", | |
" <td>GGUF AWQ 4bit</td>\n", | |
" <td>3.56</td>\n", | |
" <td>4.88</td>\n", | |
" <td>129.25</td>\n", | |
" <td>5.91</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Llama.cpp</td>\n", | |
" <td>GGUF K-Quant 4bit</td>\n", | |
" <td>3.59</td>\n", | |
" <td>4.90</td>\n", | |
" <td>109.72</td>\n", | |
" <td>5.87</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Llama.cpp</td>\n", | |
" <td>GGUF 8bit</td>\n", | |
" <td>6.67</td>\n", | |
" <td>7.78</td>\n", | |
" <td>93.39</td>\n", | |
" <td>5.79</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Llama.cpp</td>\n", | |
" <td>GGUF FP16</td>\n", | |
" <td>12.55</td>\n", | |
" <td>13.22</td>\n", | |
" <td>66.81</td>\n", | |
" <td>5.79</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>ExLlama</td>\n", | |
" <td>GPTQ 4bit</td>\n", | |
" <td>3.63</td>\n", | |
" <td>5.35</td>\n", | |
" <td>77.10</td>\n", | |
" <td>6.08</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>ExLlamav2</td>\n", | |
" <td>EXL2 2bit</td>\n", | |
" <td>2.01</td>\n", | |
" <td>5.21</td>\n", | |
" <td>153.75</td>\n", | |
" <td>20.21</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>ExLlamav2</td>\n", | |
" <td>EXL2 4bit</td>\n", | |
" <td>3.36</td>\n", | |
" <td>6.61</td>\n", | |
" <td>131.68</td>\n", | |
" <td>6.12</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>ExLlamav2</td>\n", | |
" <td>GPTQ 4bit</td>\n", | |
" <td>3.63</td>\n", | |
" <td>6.93</td>\n", | |
" <td>151.30</td>\n", | |
" <td>6.03</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>ExLlamav2</td>\n", | |
" <td>EXL2 8bit</td>\n", | |
" <td>6.37</td>\n", | |
" <td>9.47</td>\n", | |
" <td>115.81</td>\n", | |
" <td>5.76</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>ExLlamav2</td>\n", | |
" <td>FP16</td>\n", | |
" <td>12.55</td>\n", | |
" <td>15.09</td>\n", | |
" <td>67.70</td>\n", | |
" <td>5.73</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>vLLM</td>\n", | |
" <td>AWQ GEMM 4bit</td>\n", | |
" <td>3.62</td>\n", | |
" <td>34.55</td>\n", | |
" <td>114.43</td>\n", | |
" <td>6.02</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>vLLM</td>\n", | |
" <td>GPTQ 4bit</td>\n", | |
" <td>3.63</td>\n", | |
" <td>36.51</td>\n", | |
" <td>172.88</td>\n", | |
" <td>6.08</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>vLLM</td>\n", | |
" <td>FP16</td>\n", | |
" <td>12.55</td>\n", | |
" <td>35.92</td>\n", | |
" <td>79.74</td>\n", | |
" <td>5.85</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>TensorRT-LLM</td>\n", | |
" <td>AWQ GEMM 4bit</td>\n", | |
" <td>3.42</td>\n", | |
" <td>5.69</td>\n", | |
" <td>194.86</td>\n", | |
" <td>6.02</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>TensorRT-LLM</td>\n", | |
" <td>GPTQ 4bit</td>\n", | |
" <td>3.60</td>\n", | |
" <td>5.88</td>\n", | |
" <td>202.16</td>\n", | |
" <td>6.08</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>TensorRT-LLM</td>\n", | |
" <td>INT8</td>\n", | |
" <td>6.53</td>\n", | |
" <td>8.55</td>\n", | |
" <td>143.57</td>\n", | |
" <td>5.89</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>TensorRT-LLM</td>\n", | |
" <td>FP16</td>\n", | |
" <td>12.55</td>\n", | |
" <td>14.61</td>\n", | |
" <td>83.43</td>\n", | |
" <td>5.85</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>TGI</td>\n", | |
" <td>AWQ GEMM 4bit</td>\n", | |
" <td>3.62</td>\n", | |
" <td>7.97</td>\n", | |
" <td>30.80</td>\n", | |
" <td>6.02</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>TGI</td>\n", | |
" <td>AWQ GEMV 4bit</td>\n", | |
" <td>3.62</td>\n", | |
" <td>7.96</td>\n", | |
" <td>34.22</td>\n", | |
" <td>6.02</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>TGI</td>\n", | |
" <td>GPTQ 4bit</td>\n", | |
" <td>3.69</td>\n", | |
" <td>39.39</td>\n", | |
" <td>34.86</td>\n", | |
" <td>6.08</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>TGI</td>\n", | |
" <td>FP4</td>\n", | |
" <td>12.55</td>\n", | |
" <td>17.02</td>\n", | |
" <td>34.38</td>\n", | |
" <td>6.15</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>TGI</td>\n", | |
" <td>NF4</td>\n", | |
" <td>12.55</td>\n", | |
" <td>17.02</td>\n", | |
" <td>33.93</td>\n", | |
" <td>6.02</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>TGI</td>\n", | |
" <td>INT8</td>\n", | |
" <td>12.55</td>\n", | |
" <td>11.66</td>\n", | |
" <td>5.39</td>\n", | |
" <td>5.89</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>TGI</td>\n", | |
" <td>FP16</td>\n", | |
" <td>12.55</td>\n", | |
" <td>17.02</td>\n", | |
" <td>34.23</td>\n", | |
" <td>5.85</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>MLC-LLM</td>\n", | |
" <td>OmniQuant 3bit</td>\n", | |
" <td>3.2</td>\n", | |
" <td>5.1</td>\n", | |
" <td>83.4</td>\n", | |
" <td>6.65</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>MLC-LLM</td>\n", | |
" <td>OmniQuant 4bit</td>\n", | |
" <td>3.8</td>\n", | |
" <td>5.7</td>\n", | |
" <td>134.2</td>\n", | |
" <td>5.97</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td>MLC-LLM</td>\n", | |
" <td>AWQ GEMM 4bit</td>\n", | |
" <td>3.62</td>\n", | |
" <td>6.50</td>\n", | |
" <td>23.62</td>\n", | |
" <td>6.02</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>30</th>\n", | |
" <td>MLC-LLM</td>\n", | |
" <td>Q4F16</td>\n", | |
" <td>3.53</td>\n", | |
" <td>6.50</td>\n", | |
" <td>189.07</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>MLC-LLM</td>\n", | |
" <td>Q3F16</td>\n", | |
" <td>2.84</td>\n", | |
" <td>5.98</td>\n", | |
" <td>185.47</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>32</th>\n", | |
" <td>MLC-LLM</td>\n", | |
" <td>FP16</td>\n", | |
" <td>12.55</td>\n", | |
" <td>15.38</td>\n", | |
" <td>87.37</td>\n", | |
" <td>5.85</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Method Quantization Type WM (GB) RM (GB) Tokens/sec Perplexity \\\n", | |
"0 Llama.cpp GGUF K-Quant 2bit 2.36 3.69 102.15 6.96 \n", | |
"1 Llama.cpp GGUF 4bit (check) 3.56 4.88 128.97 5.96 \n", | |
"2 Llama.cpp GGUF AWQ 4bit 3.56 4.88 129.25 5.91 \n", | |
"3 Llama.cpp GGUF K-Quant 4bit 3.59 4.90 109.72 5.87 \n", | |
"4 Llama.cpp GGUF 8bit 6.67 7.78 93.39 5.79 \n", | |
"5 Llama.cpp GGUF FP16 12.55 13.22 66.81 5.79 \n", | |
"6 ExLlama GPTQ 4bit 3.63 5.35 77.10 6.08 \n", | |
"8 ExLlamav2 EXL2 2bit 2.01 5.21 153.75 20.21 \n", | |
"9 ExLlamav2 EXL2 4bit 3.36 6.61 131.68 6.12 \n", | |
"10 ExLlamav2 GPTQ 4bit 3.63 6.93 151.30 6.03 \n", | |
"11 ExLlamav2 EXL2 8bit 6.37 9.47 115.81 5.76 \n", | |
"12 ExLlamav2 FP16 12.55 15.09 67.70 5.73 \n", | |
"13 vLLM AWQ GEMM 4bit 3.62 34.55 114.43 6.02 \n", | |
"14 vLLM GPTQ 4bit 3.63 36.51 172.88 6.08 \n", | |
"15 vLLM FP16 12.55 35.92 79.74 5.85 \n", | |
"16 TensorRT-LLM AWQ GEMM 4bit 3.42 5.69 194.86 6.02 \n", | |
"17 TensorRT-LLM GPTQ 4bit 3.60 5.88 202.16 6.08 \n", | |
"18 TensorRT-LLM INT8 6.53 8.55 143.57 5.89 \n", | |
"19 TensorRT-LLM FP16 12.55 14.61 83.43 5.85 \n", | |
"20 TGI AWQ GEMM 4bit 3.62 7.97 30.80 6.02 \n", | |
"21 TGI AWQ GEMV 4bit 3.62 7.96 34.22 6.02 \n", | |
"22 TGI GPTQ 4bit 3.69 39.39 34.86 6.08 \n", | |
"23 TGI FP4 12.55 17.02 34.38 6.15 \n", | |
"24 TGI NF4 12.55 17.02 33.93 6.02 \n", | |
"25 TGI INT8 12.55 11.66 5.39 5.89 \n", | |
"26 TGI FP16 12.55 17.02 34.23 5.85 \n", | |
"27 MLC-LLM OmniQuant 3bit 3.2 5.1 83.4 6.65 \n", | |
"28 MLC-LLM OmniQuant 4bit 3.8 5.7 134.2 5.97 \n", | |
"29 MLC-LLM AWQ GEMM 4bit 3.62 6.50 23.62 6.02 \n", | |
"30 MLC-LLM Q4F16 3.53 6.50 189.07 NaN \n", | |
"31 MLC-LLM Q3F16 2.84 5.98 185.47 NaN \n", | |
"32 MLC-LLM FP16 12.55 15.38 87.37 5.85 \n", | |
"\n", | |
" NVIDIA GPU AMD GPU Apple Silicon CPU Intel GPU AWS Inferentia2 \\\n", | |
"0 1 1 1 1 0 0 \n", | |
"1 1 1 1 1 0 0 \n", | |
"2 1 1 1 1 0 0 \n", | |
"3 1 1 1 1 0 0 \n", | |
"4 1 1 1 1 0 0 \n", | |
"5 1 1 1 1 0 0 \n", | |
"6 1 1 0 0 0 0 \n", | |
"8 1 1 0 0 0 0 \n", | |
"9 1 1 0 0 0 0 \n", | |
"10 1 1 0 0 0 0 \n", | |
"11 1 1 0 0 0 0 \n", | |
"12 1 1 0 0 0 0 \n", | |
"13 1 1 0 0 0 0 \n", | |
"14 1 1 0 0 0 0 \n", | |
"15 1 1 0 0 0 0 \n", | |
"16 1 0 0 0 0 0 \n", | |
"17 1 0 0 0 0 0 \n", | |
"18 1 0 0 0 0 0 \n", | |
"19 1 0 0 0 0 0 \n", | |
"20 1 1 0 0 1 1 \n", | |
"21 1 1 0 0 1 1 \n", | |
"22 1 1 0 0 1 1 \n", | |
"23 1 1 0 0 1 1 \n", | |
"24 1 1 0 0 1 1 \n", | |
"25 1 1 0 0 1 1 \n", | |
"26 1 1 0 0 1 1 \n", | |
"27 1 1 1 1 1 0 \n", | |
"28 1 1 1 1 1 0 \n", | |
"29 1 1 1 1 1 0 \n", | |
"30 1 1 1 1 1 0 \n", | |
"31 1 1 1 1 1 0 \n", | |
"32 1 1 1 1 1 0 \n", | |
"\n", | |
" WebGPU WASM Adreno Mali \n", | |
"0 0 0 0 \n", | |
"1 0 0 0 \n", | |
"2 0 0 0 \n", | |
"3 0 0 0 \n", | |
"4 0 0 0 \n", | |
"5 0 0 0 \n", | |
"6 0 0 0 \n", | |
"8 0 0 0 \n", | |
"9 0 0 0 \n", | |
"10 0 0 0 \n", | |
"11 0 0 0 \n", | |
"12 0 0 0 \n", | |
"13 0 0 0 \n", | |
"14 0 0 0 \n", | |
"15 0 0 0 \n", | |
"16 0 0 0 \n", | |
"17 0 0 0 \n", | |
"18 0 0 0 \n", | |
"19 0 0 0 \n", | |
"20 0 0 0 \n", | |
"21 0 0 0 \n", | |
"22 0 0 0 \n", | |
"23 0 0 0 \n", | |
"24 0 0 0 \n", | |
"25 0 0 0 \n", | |
"26 0 0 0 \n", | |
"27 1 1 1 \n", | |
"28 1 1 1 \n", | |
"29 1 1 1 \n", | |
"30 1 1 1 \n", | |
"31 1 1 1 \n", | |
"32 1 1 1 " | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"df = pd.DataFrame(body, columns=header)\n", | |
"\n", | |
"del df['Hardware Support']\n", | |
"\n", | |
"for hardware in possible_hardware:\n", | |
" df[hardware] = df['Method'].apply(lambda method: (\n", | |
" 1 if hardware in hardware_support_dict[method] else 0\n", | |
" ))\n", | |
"\n", | |
"df = df[df['Quantization Type'] != '-']\n", | |
"\n", | |
"df.replace('-', np.nan, inplace=True)\n", | |
"\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.to_csv('output.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://arxiv.org/html/2402.01799v1