Last active
October 8, 2025 01:25
-
-
Save AmosLewis/fae896f00d838f728c44327685662349 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
root@smci350-odcdh2-a05-1:/mlperf/harness# ./run_offline.sh --shortfin-config shortfin_405b_config_fp4.json --test-mode PerformanceOnly 2>&1 | tee server-time.log
Warning: Missing argument '--test-scenario'
Info: Defaulting to test scenario 'Offline'
Log started at: 2025-10-07 22:23:01
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:01.515306
INFO:root:####################################################################################################################################################################################
Running python3 harness_alt_mi355.py --devices 0,1,2,3,4,5,6,7 --scenario Offline --test_mode PerformanceOnly --bs 8 --user_conf_path user.conf --count 16 --tensor_path /data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl --logfile_outdir OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007 --debug False --verbose True --user_conf_path user.conf --shortfin_config shortfin_405b_config_fp4.json
##############################################################################################################################################################################################
DEBUG:root: | |
{'audit_conf_path': None, | |
'batcher_limit': 3, | |
'bs': '8', | |
'cores_per_devices': 1, | |
'count': 16, | |
'debug': False, | |
'detailed_logdir_name': True, | |
'devices': '0,1,2,3,4,5,6,7', | |
'enable_batcher': False, | |
'enable_numa': True, | |
'fibers_per_device': 1, | |
'log_mode': 'AsyncPoll', | |
'log_mode_async_poll_interval_ms': 1000, | |
'log_sample_get': False, | |
'logfile_outdir': 'OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007', | |
'logfile_prefix': 'mlperf_log_', | |
'logfile_suffix': '', | |
'mlperf_conf_path': '/mlperf/inference/mlperf.conf', | |
'mock_timeout_ms': None, | |
'model_path': 'Meta-Llama-3.1-405B-Instruct', | |
'model_weights': '/models/SDXL/official_pytorch/fp16/stable_diffusion_fp16', | |
'num_sample_loops': 1, | |
'qps': '', | |
'save_images': False, | |
'scenario': 'Offline', | |
'shark_engine': 'iree_python_api', | |
'shortfin_config': 'shortfin_405b_config_fp4.json', | |
'skip_warmup': False, | |
'tensor_path': '/data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl', | |
'test_mode': 'PerformanceOnly', | |
'time': 0, | |
'total_sample_count': 8313, | |
'user_conf_path': 'user.conf', | |
'verbose': True, | |
'workers_per_device': 1} | |
WARNING:root:Override count with 16 | |
INFO:Llama-405B-Dataset:Loading dataset... | |
INFO:Llama-405B-Dataset:Finished loading dataset. | |
[Server] init with [0, 1, 2, 3, 4, 5, 6, 7]x1 | |
[Server] ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | |
[Server] Server process job init started for 0:0 device | |
[Server] Server process job init started for 1:0 device | |
[Server] Server process job init started for 2:0 device | |
[Server] Server process job init started for 3:0 device | |
[Server] Server process job init started for 4:0 device | |
[Server] Server process job init started for 5:0 device | |
[Server] Server process job init started for 6:0 device | |
[Server] Server process job init started for 7:0 device | |
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.506842 | |
Traceback (most recent call last): | |
File "<string>", line 1, in <module> | |
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main | |
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.526478 | |
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.526496 | |
exitcode = _main(fd, parent_sentinel) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 131, in _main | |
prepare(preparation_data) | |
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 246, in prepare | |
_fixup_main_from_path(data['init_main_from_path']) | |
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 297, in _fixup_main_from_path | |
main_content = runpy.run_path(main_path, | |
^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "<frozen runpy>", line 286, in run_path | |
File "<frozen runpy>", line 98, in _run_module_code | |
File "<frozen runpy>", line 88, in _run_code | |
File "/mlperf/harness/harness_alt_mi355.py", line 29, in <module> | |
from llama_backend import LlamaShortfinService | |
File "/mlperf/harness/llama_backend.py", line 26, in <module> | |
from sample_processor import SampleRequest, SampleProcessor, SampleResponse | |
File "/mlperf/harness/sample_processor.py", line 17, in <module> | |
from shortfin_apps.llm.components.lifecycle import ShortfinLlmLifecycleManager | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/lifecycle.py", line 24, in <module> | |
from .service import LlmGenerateService | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 20, in <module> | |
from .service_debug_dumper import SERVICE_DEBUG_DUMPER | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service_debug_dumper.py", line 226, in <module> | |
SERVICE_DEBUG_DUMPER = ServiceDebugDumper() | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service_debug_dumper.py", line 32, in __init__ | |
self.dump_dir.mkdir(parents=True, exist_ok=False) | |
File "/usr/lib/python3.12/pathlib.py", line 1313, in mkdir | |
os.mkdir(self, mode) | |
FileExistsError: [Errno 17] File exists: '/root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.526478' | |
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.528460 | |
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.528490 | |
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.533388 | |
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.580782 | |
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}} | |
INFO:root:GPU: 0 | |
INFO:root:Nearest nodes = [1] | |
[Device 6:0] Initializing LlmManager | |
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}} | |
INFO:root:GPU: 0 | |
INFO:root:Nearest nodes = [0] | |
[Device 0:0] Initializing LlmManager | |
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}} | |
INFO:root:GPU: 0 | |
INFO:root:Nearest nodes = [1] | |
[Device 7:0] Initializing LlmManager | |
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}} | |
INFO:root:GPU: 0 | |
INFO:root:Nearest nodes = [1] | |
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}} | |
INFO:root:GPU: 0 | |
INFO:root:Nearest nodes = [0] | |
[Device 3:0] Initializing LlmManager | |
[Device 5:0] Initializing LlmManager | |
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}} | |
INFO:root:GPU: 0 | |
INFO:root:Nearest nodes = [0] | |
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}} | |
[Device 2:0] Initializing LlmManager | |
INFO:root:GPU: 0 | |
INFO:root:Nearest nodes = [1] | |
[Device 4:0] Initializing LlmManager | |
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices | |
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices | |
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices | |
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices | |
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices | |
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices | |
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices | |
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1]) | |
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1]) | |
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1]) | |
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1]) | |
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1]) | |
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1]) | |
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1]) | |
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa | |
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa | |
INFO:shortfin_apps.llm.components.manager:Starting system manager | |
INFO:shortfin_apps.llm.components.manager:Starting system manager | |
[Device 0:0] Initializing Service | |
[Device 6:0] Initializing Service | |
Process SampleProcessor-1: | |
Process SampleProcessor-7: | |
Traceback (most recent call last): | |
Traceback (most recent call last): | |
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/mlperf/harness/sample_processor.py", line 192, in run | |
self.init_processor() | |
File "/mlperf/harness/sample_processor.py", line 150, in init_processor | |
self.service = self.start_service(self.verbose_log) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/mlperf/harness/sample_processor.py", line 163, in start_service | |
self.service.start() | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start | |
self.inference_program = self.create_program( | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program | |
return sf.Program( | |
^^^^^^^^^^^ | |
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, 
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>; | |
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/mlperf/harness/sample_processor.py", line 192, in run | |
self.init_processor() | |
File "/mlperf/harness/sample_processor.py", line 150, in init_processor | |
self.service = self.start_service(self.verbose_log) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/mlperf/harness/sample_processor.py", line 163, in start_service | |
self.service.start() | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start | |
self.inference_program = self.create_program( | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program | |
return sf.Program( | |
^^^^^^^^^^^ | |
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, 
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>; | |
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa | |
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa | |
INFO:shortfin_apps.llm.components.manager:Starting system manager | |
INFO:shortfin_apps.llm.components.manager:Starting system manager | |
[Device 3:0] Initializing Service | |
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa | |
[Device 5:0] Initializing Service | |
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa | |
INFO:shortfin_apps.llm.components.manager:Starting system manager | |
INFO:shortfin_apps.llm.components.manager:Starting system manager | |
[Device 4:0] Initializing Service | |
[Device 2:0] Initializing Service | |
Process SampleProcessor-4: | |
Traceback (most recent call last): | |
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/mlperf/harness/sample_processor.py", line 192, in run | |
self.init_processor() | |
File "/mlperf/harness/sample_processor.py", line 150, in init_processor | |
self.service = self.start_service(self.verbose_log) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/mlperf/harness/sample_processor.py", line 163, in start_service | |
self.service.start() | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start | |
self.inference_program = self.create_program( | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program | |
return sf.Program( | |
^^^^^^^^^^^ | |
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, 
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>; | |
Process SampleProcessor-6: | |
Traceback (most recent call last): | |
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/mlperf/harness/sample_processor.py", line 192, in run | |
self.init_processor() | |
File "/mlperf/harness/sample_processor.py", line 150, in init_processor | |
self.service = self.start_service(self.verbose_log) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/mlperf/harness/sample_processor.py", line 163, in start_service | |
self.service.start() | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start | |
self.inference_program = self.create_program( | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program | |
return sf.Program( | |
^^^^^^^^^^^ | |
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, 
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>; | |
Process SampleProcessor-3: | |
Process SampleProcessor-5: | |
Traceback (most recent call last): | |
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/mlperf/harness/sample_processor.py", line 192, in run | |
self.init_processor() | |
File "/mlperf/harness/sample_processor.py", line 150, in init_processor | |
self.service = self.start_service(self.verbose_log) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/mlperf/harness/sample_processor.py", line 163, in start_service | |
self.service.start() | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start | |
self.inference_program = self.create_program( | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program | |
return sf.Program( | |
^^^^^^^^^^^ | |
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, 
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>; | |
Traceback (most recent call last): | |
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/mlperf/harness/sample_processor.py", line 192, in run | |
self.init_processor() | |
File "/mlperf/harness/sample_processor.py", line 150, in init_processor | |
self.service = self.start_service(self.verbose_log) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/mlperf/harness/sample_processor.py", line 163, in start_service | |
self.service.start() | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start | |
self.inference_program = self.create_program( | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program | |
return sf.Program( | |
^^^^^^^^^^^ | |
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, 
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>; | |
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa | |
INFO:shortfin_apps.llm.components.manager:Starting system manager | |
[Device 7:0] Initializing Service | |
Process SampleProcessor-8: | |
Traceback (most recent call last): | |
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
self.run() | |
File "/mlperf/harness/sample_processor.py", line 192, in run | |
self.init_processor() | |
File "/mlperf/harness/sample_processor.py", line 150, in init_processor | |
self.service = self.start_service(self.verbose_log) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/mlperf/harness/sample_processor.py", line 163, in start_service | |
self.service.start() | |
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start | |
self.inference_program = self.create_program( | |
^^^^^^^^^^^^^^^^^^^^ | |
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program | |
return sf.Program( | |
^^^^^^^^^^^ | |
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, 
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>; | |
^C^C |
Issue 1 solution:
Current (broken) MLPerf code for pp8:
- 8 worker processes are spawned
- Each worker sees only 1 GPU via ROCR_VISIBLE_DEVICES
- Each worker tries to load a model compiled for 8 GPUs, so program creation fails with "HAL device `__device_1` not found or unavailable"
What is needed for tensor/pipeline parallelism:
- A single worker process managing all 8 GPUs
This issue can be fixed by:
- In sample_processor.py - Remove GPU isolation:
def __init__(
self,
args,
device_id,
# ...
):
# ...
self.device_id = 0 # Always use device 0 (first in list)
self.real_device_id = device_id
# COMMENT OUT THIS LINE - Don't restrict visible devices
# os.environ["ROCR_VISIBLE_DEVICES"] = f"{device_id}"
- In harness_alt_mi355.py - Use only 1 worker:
server = LlamaShortfinService(
devices=[0], # <-- Only create 1 worker
dataset=dataset,
verbose=args.verbose,
cores_per_devices=1,
batch_size=args.bs,
shortfin_server_config=args.shortfin_config
)
- Update shortfin_405b_config_fp4.json:
{
"device_ids": "0 1 2 3 4 5 6 7",
"workers": "1"
}
Why this works:
With these changes, a single worker process is created, and it sees all 8 GPUs (no ROCR_VISIBLE_DEVICES restriction).
Shortfin then creates a system with all 8 devices visible,
so the pp8 model loads correctly across all GPUs,
and the single worker handles all inference requests using the 8-GPU model.
Issue 2:
root@smci350-odcdh2-a05-1:/mlperf/harness# ./run_offline.sh --shortfin-config shortfin_405b_config_fp4.json --test-mode PerformanceOnly 2>&1 | tee server-time.log
Warning: Missing argument '--test-scenario'
Info: Defaulting to test scenario 'Offline'
Log started at: 2025-10-08 01:08:53
INFO:root:####################################################################################################################################################################################
Running python3 harness_alt_mi355.py --devices 0,1,2,3,4,5,6,7 --scenario Offline --test_mode PerformanceOnly --bs 8 --user_conf_path user.conf --count 16 --tensor_path /data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl --logfile_outdir OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007 --debug False --verbose True --user_conf_path user.conf --shortfin_config shortfin_405b_config_fp4.json
##############################################################################################################################################################################################
DEBUG:root:
{'audit_conf_path': None,
'batcher_limit': 3,
'bs': '8',
'cores_per_devices': 1,
'count': 16,
'debug': False,
'detailed_logdir_name': True,
'devices': '0,1,2,3,4,5,6,7',
'enable_batcher': False,
'enable_numa': True,
'fibers_per_device': 1,
'log_mode': 'AsyncPoll',
'log_mode_async_poll_interval_ms': 1000,
'log_sample_get': False,
'logfile_outdir': 'OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007',
'logfile_prefix': 'mlperf_log_',
'logfile_suffix': '',
'mlperf_conf_path': '/mlperf/inference/mlperf.conf',
'mock_timeout_ms': None,
'model_path': 'Meta-Llama-3.1-405B-Instruct',
'model_weights': '/models/SDXL/official_pytorch/fp16/stable_diffusion_fp16',
'num_sample_loops': 1,
'qps': '',
'save_images': False,
'scenario': 'Offline',
'shark_engine': 'iree_python_api',
'shortfin_config': 'shortfin_405b_config_fp4.json',
'skip_warmup': False,
'tensor_path': '/data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl',
'test_mode': 'PerformanceOnly',
'time': 0,
'total_sample_count': 8313,
'user_conf_path': 'user.conf',
'verbose': True,
'workers_per_device': 1}
WARNING:root:Override count with 16
INFO:Llama-405B-Dataset:Loading dataset...
INFO:Llama-405B-Dataset:Finished loading dataset.
[Server] init with [0]x1
[Server] ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
[Server] Server process job init started for 0:0 device
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [0]
[Device 0:0] Initializing LlmManager
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:7:0@0', 'amdgpu:6:0@0', 'amdgpu:5:0@0', 'amdgpu:4:0@0', 'amdgpu:3:0@0', 'amdgpu:2:0@0', 'amdgpu:1:0@0', 'amdgpu:0:0@0'] devices
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:1:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:2:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 983040], dtype=float8_e4m3fn, size=2.6GiB) on DeviceAffinity(amdgpu:3:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:4:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:5:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:6:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 983040], dtype=float8_e4m3fn, size=2.6GiB) on DeviceAffinity(amdgpu:7:0@0[0x1])
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:shortfin_apps.llm.components.manager:Starting system manager
[Device 0:0] Initializing Service
dezhi Device(name='amdgpu:0:0@0', ordinal=0:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:1:0@0', ordinal=1:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:2:0@0', ordinal=2:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:3:0@0', ordinal=3:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:4:0@0', ordinal=4:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:5:0@0', ordinal=5:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:6:0@0', ordinal=6:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:7:0@0', ordinal=7:0, node_affinity=0, capabilities=0x0)
[Server] Server process job is ready for 0:0 device
[Device 0:0] process sample loop started
[Server] process response loop started[Server] Server init with [0]x1 finished
INFO:root:Start Test!
[Server] [Server] Received 16 samples
[Server] Pushed into the queue
INFO:micro_llama_process_samples:SampleResponder-1 Sending response
INFO:micro_llama_process_samples:SampleResponder-1 end time: 3684809.129486016
Exception in thread Thread-1 (process_response_loop):
Traceback (most recent call last):
File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
self.run()
File "/usr/lib/python3.12/threading.py", line 1010, in run
self._target(*self._args, **self._kwargs)
File "/mlperf/harness/llama_backend.py", line 274, in process_response_loop
_process_response(response)
File "/mlperf/harness/llama_backend.py", line 250, in _process_response
processed_output = self.dataset.postProcess(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/dataset.py", line 69, in postProcess
return [np.asarray(out, dtype=np.int32) for out in output_seq]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: invalid literal for int() with base 10: 'R'
INFO:micro_llama_process_samples:SampleResponder-2 Sending response
INFO:micro_llama_process_samples:SampleResponder-2 end time: 3684809.14639025
^C^C^C^C^Z
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
run_offline.sh
shortfin_405b_config_fp4.json
llama_backend.py: add chunk_block_size