root@smci350-odcdh2-a05-1:/mlperf/harness# ./run_offline.sh --shortfin-config shortfin_405b_config_fp4.json --test-mode PerformanceOnly 2>&1 | tee server-time.log
Warning: Missing argument '--test-scenario'
Info: Defaulting to test scenario 'Offline'
Log started at: 2025-10-07 22:23:01
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:01.515306
INFO:root:####################################################################################################################################################################################
Running python3 harness_alt_mi355.py --devices 0,1,2,3,4,5,6,7 --scenario Offline --test_mode PerformanceOnly --bs 8 --user_conf_path user.conf --count 16 --tensor_path /data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl --logfile_outdir OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007 --debug False --verbose True --user_conf_path user.conf --shortfin_config shortfin_405b_config_fp4.json
##############################################################################################################################################################################################
DEBUG:root:
{'audit_conf_path': None,
'batcher_limit': 3,
'bs': '8',
'cores_per_devices': 1,
'count': 16,
'debug': False,
'detailed_logdir_name': True,
'devices': '0,1,2,3,4,5,6,7',
'enable_batcher': False,
'enable_numa': True,
'fibers_per_device': 1,
'log_mode': 'AsyncPoll',
'log_mode_async_poll_interval_ms': 1000,
'log_sample_get': False,
'logfile_outdir': 'OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007',
'logfile_prefix': 'mlperf_log_',
'logfile_suffix': '',
'mlperf_conf_path': '/mlperf/inference/mlperf.conf',
'mock_timeout_ms': None,
'model_path': 'Meta-Llama-3.1-405B-Instruct',
'model_weights': '/models/SDXL/official_pytorch/fp16/stable_diffusion_fp16',
'num_sample_loops': 1,
'qps': '',
'save_images': False,
'scenario': 'Offline',
'shark_engine': 'iree_python_api',
'shortfin_config': 'shortfin_405b_config_fp4.json',
'skip_warmup': False,
'tensor_path': '/data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl',
'test_mode': 'PerformanceOnly',
'time': 0,
'total_sample_count': 8313,
'user_conf_path': 'user.conf',
'verbose': True,
'workers_per_device': 1}
WARNING:root:Override count with 16
INFO:Llama-405B-Dataset:Loading dataset...
INFO:Llama-405B-Dataset:Finished loading dataset.
[Server] init with [0, 1, 2, 3, 4, 5, 6, 7]x1
[Server] ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
[Server] Server process job init started for 0:0 device
[Server] Server process job init started for 1:0 device
[Server] Server process job init started for 2:0 device
[Server] Server process job init started for 3:0 device
[Server] Server process job init started for 4:0 device
[Server] Server process job init started for 5:0 device
[Server] Server process job init started for 6:0 device
[Server] Server process job init started for 7:0 device
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.506842
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.526478
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.526496
exitcode = _main(fd, parent_sentinel)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 131, in _main
prepare(preparation_data)
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 246, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "/usr/lib/python3.12/multiprocessing/spawn.py", line 297, in _fixup_main_from_path
main_content = runpy.run_path(main_path,
^^^^^^^^^^^^^^^^^^^^^^^^^
File "<frozen runpy>", line 286, in run_path
File "<frozen runpy>", line 98, in _run_module_code
File "<frozen runpy>", line 88, in _run_code
File "/mlperf/harness/harness_alt_mi355.py", line 29, in <module>
from llama_backend import LlamaShortfinService
File "/mlperf/harness/llama_backend.py", line 26, in <module>
from sample_processor import SampleRequest, SampleProcessor, SampleResponse
File "/mlperf/harness/sample_processor.py", line 17, in <module>
from shortfin_apps.llm.components.lifecycle import ShortfinLlmLifecycleManager
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/lifecycle.py", line 24, in <module>
from .service import LlmGenerateService
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 20, in <module>
from .service_debug_dumper import SERVICE_DEBUG_DUMPER
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service_debug_dumper.py", line 226, in <module>
SERVICE_DEBUG_DUMPER = ServiceDebugDumper()
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service_debug_dumper.py", line 32, in __init__
self.dump_dir.mkdir(parents=True, exist_ok=False)
File "/usr/lib/python3.12/pathlib.py", line 1313, in mkdir
os.mkdir(self, mode)
FileExistsError: [Errno 17] File exists: '/root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.526478'
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.528460
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.528490
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.533388
INFO:shortfin_apps.llm.components.service_debug_dumper:[debug_service.py] Please find debug dumps for service.py in /root/.shortfin/debug/llm_service_invocation_dumps/2025-10-07T22:23:03.580782
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [1]
[Device 6:0] Initializing LlmManager
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [0]
[Device 0:0] Initializing LlmManager
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [1]
[Device 7:0] Initializing LlmManager
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [1]
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [0]
[Device 3:0] Initializing LlmManager
[Device 5:0] Initializing LlmManager
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [0]
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
[Device 2:0] Initializing LlmManager
INFO:root:GPU: 0
INFO:root:Nearest nodes = [1]
[Device 4:0] Initializing LlmManager
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:0:0@0'] devices
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:shortfin_apps.llm.components.manager:Starting system manager
INFO:shortfin_apps.llm.components.manager:Starting system manager
[Device 0:0] Initializing Service
[Device 6:0] Initializing Service
Process SampleProcessor-1:
Process SampleProcessor-7:
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/mlperf/harness/sample_processor.py", line 192, in run
self.init_processor()
File "/mlperf/harness/sample_processor.py", line 150, in init_processor
self.service = self.start_service(self.verbose_log)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/sample_processor.py", line 163, in start_service
self.service.start()
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start
self.inference_program = self.create_program(
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program
return sf.Program(
^^^^^^^^^^^
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>;
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/mlperf/harness/sample_processor.py", line 192, in run
self.init_processor()
File "/mlperf/harness/sample_processor.py", line 150, in init_processor
self.service = self.start_service(self.verbose_log)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/sample_processor.py", line 163, in start_service
self.service.start()
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start
self.inference_program = self.create_program(
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program
return sf.Program(
^^^^^^^^^^^
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>;
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:shortfin_apps.llm.components.manager:Starting system manager
INFO:shortfin_apps.llm.components.manager:Starting system manager
[Device 3:0] Initializing Service
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
[Device 5:0] Initializing Service
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:shortfin_apps.llm.components.manager:Starting system manager
INFO:shortfin_apps.llm.components.manager:Starting system manager
[Device 4:0] Initializing Service
[Device 2:0] Initializing Service
Process SampleProcessor-4:
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/mlperf/harness/sample_processor.py", line 192, in run
self.init_processor()
File "/mlperf/harness/sample_processor.py", line 150, in init_processor
self.service = self.start_service(self.verbose_log)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/sample_processor.py", line 163, in start_service
self.service.start()
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start
self.inference_program = self.create_program(
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program
return sf.Program(
^^^^^^^^^^^
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>;
Process SampleProcessor-6:
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/mlperf/harness/sample_processor.py", line 192, in run
self.init_processor()
File "/mlperf/harness/sample_processor.py", line 150, in init_processor
self.service = self.start_service(self.verbose_log)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/sample_processor.py", line 163, in start_service
self.service.start()
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start
self.inference_program = self.create_program(
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program
return sf.Program(
^^^^^^^^^^^
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>;
Process SampleProcessor-3:
Process SampleProcessor-5:
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/mlperf/harness/sample_processor.py", line 192, in run
self.init_processor()
File "/mlperf/harness/sample_processor.py", line 150, in init_processor
self.service = self.start_service(self.verbose_log)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/sample_processor.py", line 163, in start_service
self.service.start()
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start
self.inference_program = self.create_program(
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program
return sf.Program(
^^^^^^^^^^^
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>;
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/mlperf/harness/sample_processor.py", line 192, in run
self.init_processor()
File "/mlperf/harness/sample_processor.py", line 150, in init_processor
self.service = self.start_service(self.verbose_log)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/sample_processor.py", line 163, in start_service
self.service.start()
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start
self.inference_program = self.create_program(
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program
return sf.Program(
^^^^^^^^^^^
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>;
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:shortfin_apps.llm.components.manager:Starting system manager
[Device 7:0] Initializing Service
Process SampleProcessor-8:
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/mlperf/harness/sample_processor.py", line 192, in run
self.init_processor()
File "/mlperf/harness/sample_processor.py", line 150, in init_processor
self.service = self.start_service(self.verbose_log)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mlperf/harness/sample_processor.py", line 163, in start_service
self.service.start()
File "/shark-ai/shortfin/python/shortfin_apps/llm/components/service.py", line 117, in start
self.inference_program = self.create_program(
^^^^^^^^^^^^^^^^^^^^
File "/shark-ai/shortfin/python/shortfin_apps/utils.py", line 531, in create_program
return sf.Program(
^^^^^^^^^^^
ValueError: <vm>:0: INCOMPATIBLE; HAL device `__device_1` not found or unavailable: #hal.device.target<"hip", {ordinal = 1 : index}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>, iree_codegen.target_info = #iree_gpu.target<arch = "gfx950", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x32_F16>, <MFMA_F32_32x32x16_F16>, <MFMA_F32_16x16x32_BF16>, <MFMA_F32_32x32x16_BF16>, <MFMA_F32_16x16x128_F8E5M2>, <MFMA_F32_16x16x128_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN>, <MFMA_F32_16x16x128_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2>, <MFMA_F32_32x32x64_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN>, <MFMA_F32_32x32x64_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x64_I8>, <MFMA_I32_32x32x32_I8>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2>, <MFMA_F32_16x16x32_F8E5M2_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN>, <MFMA_F32_16x16x32_F8E4M3FN_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2>, <MFMA_F32_32x32x16_F8E5M2_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN>, <MFMA_F32_32x32x16_F8E4M3FN_F8E5M2>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], scaled_mma = [<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E8M0FNU, rhs_elem_type = f8E8M0FNU, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2, rhs_elem_type = f8E5M2, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E5M2FNUZ, rhs_elem_type = f8E5M2FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f8E4M3FNUZ, rhs_elem_type = f8E4M3FNUZ, acc_elem_type = f32>, <intrinsic = MFMA_SCALE_F32_32x32x64_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 163840, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider, ukernels = "none"}>]>;
^C^C

AmosLewis commented Oct 7, 2025

run_offline.sh

echo "Log started at: $(date +"%Y-%m-%d %H:%M:%S")"
# rocprofv3 --output-format csv rocpd --disable-signal-handlers -r -- \
AMD_LOG_LEVEL=1 python3 -u harness_alt_mi355.py \
  --devices "0,1,2,3,4,5,6,7" \
  --scenario "$TEST_SCENARIO" \
  --test_mode "$TEST_MODE" \
	--bs 8 \
	--user_conf_path user.conf \
	--count 16 \
	--tensor_path /data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl \
	--logfile_outdir "Output${TEST_SCENARIO}${TEST_MODE}-a05-pp8-16samples-ps8_ds200_dc_4096_1007" \
  --debug "$DEBUG" \
  --verbose "$VERBOSE" \
  --user_conf_path "user.conf" \
	--shortfin_config "$SHORTFIN_CONFIG" 2>&1 | tee server.log
echo "Log end at: $(date +"%Y-%m-%d %H:%M:%S")"
#./run_offline.sh --shortfin-config shortfin_405b_config_fp4.json --test-mode PerformanceOnly 2>&1 | tee server-time.log

shortfin_405b_config_fp4.json

{
    "host": "0.0.0.0",
    "port": "8080",
    "model_config": "/artifacts/chi/f4/f4_ps8_ds200_dbc4096.iree1007.shark1007_b5c.json",
    "tokenizer_json": "/shark-dev/weights/fp4/tokenizer.json",
    "tokenizer_config_json": "/shark-dev/weights/fp4/tokenizer_config.json",
    "vmfb": "/artifacts/chi/f4/f4_ps8_ds200_dbc4096.iree1007.shark1007_b5c.vmfb",
    "parameters": [
        "/shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa"
    ],
    "device": "hip",
    "device_ids": "0",
    "workers": "1",
    "artifacts_path": "/artifacts",
    "chunk_block_size":"64"
}

llama_backend.py: add chunk_block_size

from dataclasses import asdict, dataclass
from typing import Optional


@dataclass(slots=True)
class ShortfinServerArgs:
    host: None | str
    port: str
    model_config: str
    tokenizer_json: str
    tokenizer_config_json: str
    vmfb: str
    parameters: list[str]
    device: str
    device_ids: str
    workers: str
    prefix_sharing_algorithm: Optional[str] = None
    amdgpu_allocators: Optional[str] = None
    artifacts_path: Optional[str] = None
    chunk_block_size: Optional[str] = None

    def to_args(self) -> list[str]:
        # Build the CLI argument list used to launch the shortfin server.
        args = [
            "--host",
            self.host,
            "--port",
            self.port,
            "--model_config",
            self.model_config,
            "--tokenizer_json",
            self.tokenizer_json,
            "--vmfb",
            self.vmfb,
            "--parameters",
            *self.parameters,
            "--device",
            self.device,
            "--device_ids",
            *[str(x) for x in self.device_ids.split(" ")],
            "--tokenizer_config_json",
            self.tokenizer_config_json,
            # "--num-workers",
            # self.workers,
        ]
        # Optional flags are only emitted when set, since they default to None.
        if self.chunk_block_size is not None:
            args.extend(["--chunk_block_size", self.chunk_block_size])

        if self.prefix_sharing_algorithm is not None:
            args.extend(["--prefix_sharing_algorithm", self.prefix_sharing_algorithm])

        if self.amdgpu_allocators is not None:
            args.extend(["--amdgpu_allocators", self.amdgpu_allocators])
        return args

    def to_dict(self):
        return asdict(self)
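
A minimal usage sketch (the JSON keys in shortfin_405b_config_fp4.json match the dataclass fields one-to-one, so the config can be splatted straight into the constructor; illustrative only):

import json

# Load the server config shown above and print the CLI argument list
# the shortfin server process would be launched with.
with open("shortfin_405b_config_fp4.json") as f:
    cfg = json.load(f)

server_args = ShortfinServerArgs(**cfg)
print(server_args.to_args())
# ['--host', '0.0.0.0', '--port', '8080', '--model_config', ...,
#  '--device_ids', '0', ..., '--chunk_block_size', '64']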


AmosLewis commented Oct 8, 2025

issue1 solution:
Current broken mlperf code for pp8:
- 8 worker processes
- each worker sees only 1 GPU via ROCR_VISIBLE_DEVICES
- each worker tries to load a model compiled for 8 GPUs, hence the "HAL device `__device_1` not found or unavailable" errors above

What tensor/pipeline parallel needs instead:
- a single worker process managing all 8 GPUs

This can be fixed with three changes:

1. In sample_processor.py, remove the GPU isolation:
def __init__(
    self,
    args,
    device_id,
    # ...
):
    # ...
    self.device_id = 0  # Always use device 0 (first in list)
    self.real_device_id = device_id
    
    # COMMENT OUT THIS LINE - Don't restrict visible devices
    # os.environ["ROCR_VISIBLE_DEVICES"] = f"{device_id}"
2. In harness_alt_mi355.py, use only one worker:
server = LlamaShortfinService(
    devices=[0],  # <-- Only create 1 worker
    dataset=dataset,
    verbose=args.verbose,
    cores_per_devices=1,
    batch_size=args.bs,
    shortfin_server_config=args.shortfin_config
)
3. Update shortfin_405b_config_fp4.json (device_ids is space-separated because to_args splits it on spaces):
{
    "device_ids": "0 1 2 3 4 5 6 7",
    "workers": "1"
}

Why this works
With these changes:
- one worker process is created, and it sees all 8 GPUs (no ROCR_VISIBLE_DEVICES restriction)
- Shortfin creates a local system with all 8 devices visible
- the pp8 model loads correctly across all GPUs
- the single worker handles all inference requests using the 8-GPU model
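
A minimal sketch of the environment-handling difference (illustrative only; the helper names are hypothetical, not harness API):

import os

# Broken per-worker isolation: each worker pins itself to a single GPU,
# so the HIP runtime enumerates one device and a vmfb compiled with
# eight hal.device.targets fails with "__device_1 not found".
def old_worker_env(device_id: int) -> dict:
    env = dict(os.environ)
    env["ROCR_VISIBLE_DEVICES"] = str(device_id)
    return env

# Fixed: leave device visibility untouched, so the single worker
# enumerates all 8 GPUs and every __device_N in the module resolves.
def fixed_worker_env() -> dict:
    return dict(os.environ)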


issue2:

root@smci350-odcdh2-a05-1:/mlperf/harness# ./run_offline.sh --shortfin-config shortfin_405b_config_fp4.json --test-mode PerformanceOnly 2>&1 | tee server-time.log
Warning: Missing argument '--test-scenario'
Info: Defaulting to test scenario 'Offline'
Log started at: 2025-10-08 01:08:53
INFO:root:####################################################################################################################################################################################
Running python3 harness_alt_mi355.py --devices 0,1,2,3,4,5,6,7 --scenario Offline --test_mode PerformanceOnly --bs 8 --user_conf_path user.conf --count 16 --tensor_path /data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl --logfile_outdir OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007 --debug False --verbose True --user_conf_path user.conf --shortfin_config shortfin_405b_config_fp4.json
##############################################################################################################################################################################################
DEBUG:root:
{'audit_conf_path': None,
 'batcher_limit': 3,
 'bs': '8',
 'cores_per_devices': 1,
 'count': 16,
 'debug': False,
 'detailed_logdir_name': True,
 'devices': '0,1,2,3,4,5,6,7',
 'enable_batcher': False,
 'enable_numa': True,
 'fibers_per_device': 1,
 'log_mode': 'AsyncPoll',
 'log_mode_async_poll_interval_ms': 1000,
 'log_sample_get': False,
 'logfile_outdir': 'OutputOfflinePerformanceOnly-a05-pp8-16samples-ps8_ds200_dc_4096_1007',
 'logfile_prefix': 'mlperf_log_',
 'logfile_suffix': '',
 'mlperf_conf_path': '/mlperf/inference/mlperf.conf',
 'mock_timeout_ms': None,
 'model_path': 'Meta-Llama-3.1-405B-Instruct',
 'model_weights': '/models/SDXL/official_pytorch/fp16/stable_diffusion_fp16',
 'num_sample_loops': 1,
 'qps': '',
 'save_images': False,
 'scenario': 'Offline',
 'shark_engine': 'iree_python_api',
 'shortfin_config': 'shortfin_405b_config_fp4.json',
 'skip_warmup': False,
 'tensor_path': '/data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl',
 'test_mode': 'PerformanceOnly',
 'time': 0,
 'total_sample_count': 8313,
 'user_conf_path': 'user.conf',
 'verbose': True,
 'workers_per_device': 1}

WARNING:root:Override count with 16
INFO:Llama-405B-Dataset:Loading dataset...
INFO:Llama-405B-Dataset:Finished loading dataset.
[Server] init with [0]x1
[Server] ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
[Server] Server process job init started for 0:0 device
INFO:root:NUMA hardware info: {'numa_node_distance': [[10, 32], [32, 10]], 'node_cpu_info': {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], 1: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]}}
INFO:root:GPU: 0
INFO:root:Nearest nodes = [0]
[Device 0:0] Initializing LlmManager
INFO:shortfin_apps.llm.components.manager:Created local system with ['amdgpu:7:0@0', 'amdgpu:6:0@0', 'amdgpu:5:0@0', 'amdgpu:4:0@0', 'amdgpu:3:0@0', 'amdgpu:2:0@0', 'amdgpu:1:0@0', 'amdgpu:0:0@0'] devices
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:0:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:1:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:2:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 983040], dtype=float8_e4m3fn, size=2.6GiB) on DeviceAffinity(amdgpu:3:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:4:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:5:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 1048576], dtype=float8_e4m3fn, size=2.8GiB) on DeviceAffinity(amdgpu:6:0@0[0x1])
INFO:root:Allocating page table (shape=[2816, 983040], dtype=float8_e4m3fn, size=2.6GiB) on DeviceAffinity(amdgpu:7:0@0[0x1])
INFO:root:Loading parameter fiber 'model' from: /shark-dev/weights/fp4/fp4_preshuffled_2025_09_12.irpa
INFO:shortfin_apps.llm.components.manager:Starting system manager
[Device 0:0] Initializing Service
dezhi Device(name='amdgpu:0:0@0', ordinal=0:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:1:0@0', ordinal=1:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:2:0@0', ordinal=2:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:3:0@0', ordinal=3:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:4:0@0', ordinal=4:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:5:0@0', ordinal=5:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:6:0@0', ordinal=6:0, node_affinity=0, capabilities=0x0)
dezhi Device(name='amdgpu:7:0@0', ordinal=7:0, node_affinity=0, capabilities=0x0)
[Server] Server process job is ready for 0:0 device
[Device 0:0] process sample loop started
[Server] process response loop started
[Server] Server init with [0]x1 finished
INFO:root:Start Test!
[Server] [Server] Received 16 samples
[Server] Pushed into the queue
INFO:micro_llama_process_samples:SampleResponder-1 Sending response
INFO:micro_llama_process_samples:SampleResponder-1 end time: 3684809.129486016
Exception in thread Thread-1 (process_response_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.12/threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "/mlperf/harness/llama_backend.py", line 274, in process_response_loop
    _process_response(response)
  File "/mlperf/harness/llama_backend.py", line 250, in _process_response
    processed_output = self.dataset.postProcess(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mlperf/harness/dataset.py", line 69, in postProcess
    return [np.asarray(out, dtype=np.int32) for out in output_seq]
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: invalid literal for int() with base 10: 'R'
INFO:micro_llama_process_samples:SampleResponder-2 Sending response
INFO:micro_llama_process_samples:SampleResponder-2 end time: 3684809.14639025
^C^C^C^C^Z
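
The ValueError comes from dataset.py postProcess casting each element of output_seq to int32. A minimal reproduction (assumption: the backend returned decoded text instead of token-id sequences, so iterating output_seq yields single characters such as 'R'):

import numpy as np

# What the traceback implies: output_seq is a string, so iteration
# yields characters and the int32 cast fails on the first one.
output_seq = "Response..."
try:
    [np.asarray(out, dtype=np.int32) for out in output_seq]
except ValueError as e:
    print(e)  # invalid literal for int() with base 10: 'R'

# What postProcess expects: sequences of token ids.
output_seq = [[128000, 791, 4320]]
print([np.asarray(out, dtype=np.int32) for out in output_seq])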
