vuiseng9 · July 18, 2022 05:27 · vuiseng9 · Jul 18, 2022 · vuiseng9 · Jul 18, 2022
diff --git a/bert-async-infer.py b/bert-async-infer.py
 import time
 import numpy as np
 import logging as log

 from openvino.runtime import AsyncInferQueue, Core, PartialShape
 from openvino.tools.benchmark.utils.constants import CPU_DEVICE_NAME

 # Rebind logging's module-level info() to print, so every log.info(...) call
 # in this script writes directly to stdout.
 log.info = print

 # Absolute, machine-specific path of the model file read later via core.read_model().
 model_path="/data1/vchua/jpqd-bert/r0.010-squad-bert-b-mvmt-8bit/ir/squad-BertForQuestionAnswering.cropped.8bit.onnx"

 def get_input_output_names(ports):
    """Collect the ``any_name`` of every port, preserving order.

    Args:
        ports: Iterable of objects exposing an ``any_name`` attribute.

    Returns:
        list: One name per port, in iteration order.
    """
    names = []
    for current_port in ports:
        names.append(current_port.any_name)
    return names

 def get_node_names(ports):
    """Collect ``node.friendly_name`` for every port, preserving order.

    Args:
        ports: Iterable of objects whose ``node`` attribute exposes
            ``friendly_name``.

    Returns:
        list: One friendly name per port, in iteration order.
    """
    friendly_names = []
    for current_port in ports:
        owning_node = current_port.node
        friendly_names.append(owning_node.friendly_name)
    return friendly_names

 def print_inputs_and_outputs_info(model):
    """Log name, precision, layout and shape of every model input and output.

    One ``log.info`` call is made per port: all inputs first, then all
    outputs, each in the order the model reports them.

    Args:
        model: Object exposing ``inputs`` and ``outputs`` sequences of ports.
            Each port must provide ``any_name``,
            ``element_type.get_type_name()``, ``node.layout`` and an
            iterable ``partial_shape``.
    """
    # Inputs and outputs share one message format; only the label differs,
    # so a single loop over the ports replaces two index-based loops.
    for label, ports in (("input", model.inputs), ("output", model.outputs)):
        for port in ports:
            dims = ' '.join(str(dim) for dim in port.partial_shape)
            log.info(f"Model {label} '{port.any_name}' precision {port.element_type.get_type_name()}, "
                     f"dimensions ({str(port.node.layout)}): "
                     f"{dims}")

 log.info('\nCreating OpenVINO Runtime Core')
 core = Core()
 # Properties applied to the CPU device before the model is compiled.
 device_config = {
                    CPU_DEVICE_NAME :
                        dict(
                                PERF_COUNT='NO',
                                PERFORMANCE_HINT='THROUGHPUT',
                                NUM_STREAMS='-1'
                            )
                }
 core.set_property(CPU_DEVICE_NAME, device_config[CPU_DEVICE_NAME])

 # Dump every property the device reports, except the meta-properties that
 # merely list other properties.
 keys = core.get_property(CPU_DEVICE_NAME, 'SUPPORTED_PROPERTIES')
 log.info(f'\nDEVICE: {CPU_DEVICE_NAME}')
 for k in keys:
    if k in ('SUPPORTED_METRICS', 'SUPPORTED_CONFIG_KEYS', 'SUPPORTED_PROPERTIES'):
        continue
    try:
        line = f'  {k}  , {core.get_property(CPU_DEVICE_NAME, k)}'
    except Exception:
        # Best-effort dump: a property that cannot be read or formatted is
        # skipped. `Exception` (not a bare except) lets Ctrl-C and
        # SystemExit still propagate.
        continue
    log.info(line)

 log.info(f'\nReading the model: {model_path}\n')
 model = core.read_model(model_path)

 ### !!! Toggle this variable
 dynamic_length = True

 # Second dimension requested for every input: -1 in dynamic mode, 384 otherwise.
 seqlen = -1 if dynamic_length is True else 384

 # Every model input is reshaped to [1, seqlen].
 new_shape_cfg = {
    iport.any_name: PartialShape([1, seqlen]) for iport in model.inputs
 }
 model.reshape(new_shape_cfg)

 compiled_model = core.compile_model(model, CPU_DEVICE_NAME)
 # Input names of the compiled model, in port order; create_input() keys its samples by these.
 input_port_names = [compiled_port.any_name for compiled_port in compiled_model.inputs]

 print_inputs_and_outputs_info(compiled_model)

 def create_input(seqlen):
    """Build one random input sample keyed by the first three input port names.

    Args:
        seqlen: Number of random values generated per input.

    Returns:
        dict: Maps ``input_port_names[0]``, ``[1]`` and ``[2]`` to int64
        arrays with a leading axis of size 1; values are drawn from
        [0, 999), [0, 2) and [0, 999) respectively.
    """
    def _random_row(upper_bound):
        # Draw the values first, then prepend the size-1 leading axis.
        drawn = np.random.randint(upper_bound, size=seqlen)
        return drawn[np.newaxis, ...].astype('int64')

    first_name = input_port_names[0]
    first_row = _random_row(999)
    second_name = input_port_names[1]
    second_row = _random_row(2)
    third_name = input_port_names[2]
    third_row = _random_row(999)
    return {first_name: first_row, second_name: second_row, third_name: third_row}

 # Number of pre-generated samples the benchmark loops cycle through.
 N_SAMPLE = 1024

 if dynamic_length is True:
    # Repeat the same four sequence lengths, in this order, until roughly
    # N_SAMPLE samples exist (exactly N_SAMPLE when it divides evenly).
    sl_list = [64, 192, 256, 384]
    loaded_samples = [
        create_input(sl)
        for _ in range(int(N_SAMPLE/len(sl_list)))
        for sl in sl_list
    ]
 else:
    # Static mode: every sample uses the single fixed sequence length.
    loaded_samples = [create_input(seqlen) for _ in range(N_SAMPLE)]

 # Pool of asynchronous infer requests.
 # NOTE(review): a job count of 0 presumably lets the runtime choose the pool
 # size -- confirm against the AsyncInferQueue documentation.
 infer_queue = AsyncInferQueue(compiled_model, 0)
 # Index modulo the real sample count so the loops stay in bounds even when
 # fewer than N_SAMPLE samples were generated.
 n_loaded = len(loaded_samples)

 # warmup
 for it in range(100):
    infer_queue.get_idle_request_id()
    infer_queue.start_async(inputs=loaded_samples[it % n_loaded])
 infer_queue.wait_all()

 niter = 2500
 # benchmark
 # perf_counter() is monotonic, so the measured interval cannot be distorted
 # by system clock adjustments the way time.time() can.
 start = time.perf_counter()
 for it in range(niter):
    infer_queue.get_idle_request_id()
    infer_queue.start_async(inputs=loaded_samples[it % n_loaded])
 # Stop the clock only after every in-flight request has finished.
 infer_queue.wait_all()
 e2e_elapse = time.perf_counter() - start

 log.info(   '\nSeqLen {} | {} iter '
            '| E2E: {:.3f} s '
            '| TPT: {:6.2f} fps'.format(
                seqlen, niter, e2e_elapse, niter/e2e_elapse)
            )
	import time
	import numpy as np
	import logging as log

	from openvino.runtime import AsyncInferQueue, Core, PartialShape
	from openvino.tools.benchmark.utils.constants import CPU_DEVICE_NAME

	# Rebind logging's module-level info() to print, so every log.info(...) call
	# in this script writes directly to stdout.
	log.info = print

	# Absolute, machine-specific path of the model file read later via core.read_model().
	model_path="/data1/vchua/jpqd-bert/r0.010-squad-bert-b-mvmt-8bit/ir/squad-BertForQuestionAnswering.cropped.8bit.onnx"

	def get_input_output_names(ports):
	    """Return the ``any_name`` of every port, preserving order.

	    Args:
	        ports: Iterable of objects exposing an ``any_name`` attribute.

	    Returns:
	        list: One name per port, in iteration order.
	    """
	    return [port.any_name for port in ports]

	def get_node_names(ports):
	    """Return ``node.friendly_name`` for every port, preserving order.

	    Args:
	        ports: Iterable of objects whose ``node`` attribute exposes
	            ``friendly_name``.

	    Returns:
	        list: One friendly name per port, in iteration order.
	    """
	    return [port.node.friendly_name for port in ports]

	def print_inputs_and_outputs_info(model):
	    """Log name, precision, layout and shape of every model input and output.

	    One ``log.info`` call is made per port: all inputs first, then all
	    outputs, each in the order the model reports them.

	    Args:
	        model: Object exposing ``inputs`` and ``outputs`` sequences of ports.
	            Each port must provide ``any_name``,
	            ``element_type.get_type_name()``, ``node.layout`` and an
	            iterable ``partial_shape``.
	    """
	    # Inputs and outputs share one message format; only the label differs,
	    # so a single loop over the ports replaces two index-based loops.
	    for label, ports in (("input", model.inputs), ("output", model.outputs)):
	        for port in ports:
	            dims = ' '.join(str(dim) for dim in port.partial_shape)
	            log.info(f"Model {label} '{port.any_name}' precision {port.element_type.get_type_name()}, "
	                     f"dimensions ({str(port.node.layout)}): "
	                     f"{dims}")

	log.info('\nCreating OpenVINO Runtime Core')
	core = Core()
	# Properties applied to the CPU device before the model is compiled.
	device_config = {
	    CPU_DEVICE_NAME :
	        dict(
	            PERF_COUNT='NO',
	            PERFORMANCE_HINT='THROUGHPUT',
	            NUM_STREAMS='-1'
	        )
	}
	core.set_property(CPU_DEVICE_NAME, device_config[CPU_DEVICE_NAME])

	# Dump every property the device reports, except the meta-properties that
	# merely list other properties.
	keys = core.get_property(CPU_DEVICE_NAME, 'SUPPORTED_PROPERTIES')
	log.info(f'\nDEVICE: {CPU_DEVICE_NAME}')
	for k in keys:
	    if k in ('SUPPORTED_METRICS', 'SUPPORTED_CONFIG_KEYS', 'SUPPORTED_PROPERTIES'):
	        continue
	    try:
	        line = f' {k} , {core.get_property(CPU_DEVICE_NAME, k)}'
	    except Exception:
	        # Best-effort dump: a property that cannot be read or formatted is
	        # skipped. `Exception` (not a bare except) lets Ctrl-C and
	        # SystemExit still propagate.
	        continue
	    log.info(line)

	log.info(f'\nReading the model: {model_path}\n')
	model = core.read_model(model_path)

	### !!! Toggle this variable
	dynamic_length = True

	# Second dimension requested for every input: -1 in dynamic mode, 384 otherwise.
	if dynamic_length is True:
	    seqlen = -1
	else:
	    seqlen = 384

	# Every model input is reshaped to [1, seqlen].
	new_shape_cfg = {}
	for iport in model.inputs:
	    new_shape_cfg[iport.any_name] = PartialShape([1, seqlen])
	model.reshape(new_shape_cfg)

	compiled_model = core.compile_model(model, CPU_DEVICE_NAME)
	# Input names of the compiled model, in port order; create_input() keys its samples by these.
	input_port_names = [iport.any_name for iport in compiled_model.inputs]

	print_inputs_and_outputs_info(compiled_model)

	def create_input(seqlen):
	    """Build one random input sample keyed by the first three input port names.

	    Args:
	        seqlen: Number of random values generated per input.

	    Returns:
	        dict: Maps ``input_port_names[0]``, ``[1]`` and ``[2]`` to int64
	        arrays with a leading axis of size 1; values are drawn from
	        [0, 999), [0, 2) and [0, 999) respectively.
	    """
	    return {
	        input_port_names[0]: np.expand_dims(np.random.randint(999, size=seqlen), axis=0).astype('int64'),
	        input_port_names[1]: np.expand_dims(np.random.randint(2, size=seqlen), axis=0).astype('int64'),
	        input_port_names[2]: np.expand_dims(np.random.randint(999, size=seqlen), axis=0).astype('int64'),
	    }

	# Number of pre-generated samples the benchmark loops cycle through.
	N_SAMPLE = 1024

	if dynamic_length is True:
	    # Repeat the same four sequence lengths, in this order, until roughly
	    # N_SAMPLE samples exist (exactly N_SAMPLE when it divides evenly).
	    loaded_samples = []
	    sl_list = [64, 192, 256, 384]
	    for _ in range(N_SAMPLE // len(sl_list)):
	        for sl in sl_list:
	            loaded_samples.append(create_input(sl))
	else:
	    # Static mode: every sample uses the single fixed sequence length.
	    loaded_samples = [create_input(seqlen) for _ in range(N_SAMPLE)]

	# Pool of asynchronous infer requests.
	# NOTE(review): a job count of 0 presumably lets the runtime choose the pool
	# size -- confirm against the AsyncInferQueue documentation.
	infer_queue = AsyncInferQueue(compiled_model, 0)
	# Index modulo the real sample count so the loops stay in bounds even when
	# fewer than N_SAMPLE samples were generated.
	n_loaded = len(loaded_samples)

	# warmup
	for it in range(100):
	    infer_queue.get_idle_request_id()
	    infer_queue.start_async(inputs=loaded_samples[it % n_loaded])
	infer_queue.wait_all()

	niter = 2500
	# benchmark
	# perf_counter() is monotonic, so the measured interval cannot be distorted
	# by system clock adjustments the way time.time() can.
	start = time.perf_counter()
	for it in range(niter):
	    infer_queue.get_idle_request_id()
	    infer_queue.start_async(inputs=loaded_samples[it % n_loaded])
	# Stop the clock only after every in-flight request has finished.
	infer_queue.wait_all()
	e2e_elapse = time.perf_counter() - start

	# Plain '|' separators: a backslash before '|' is an invalid escape sequence
	# and would print a literal backslash in the report.
	log.info('\nSeqLen {} | {} iter '
	         '| E2E: {:.3f} s '
	         '| TPT: {:6.2f} fps'.format(
	             seqlen, niter, e2e_elapse, niter/e2e_elapse)
	         )