Mr0grog · October 13, 2023 19:14
diff --git a/0-README.md b/0-README.md
diff --git a/bom-benchmark.py b/bom-benchmark.py
 import timeit
 from statistics import mean

 # ALGORITHMS ##################################################################
 def get_bom_encoding_a(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant A (baseline): a single flat ``if``/``elif``/``else``
    chain of ``startswith`` checks.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
    begin with any of the BOMs tested here.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        # Tested before UTF-16 LE because this 4-byte BOM begins with the
        # 2-byte UTF-16 LE BOM (FF FE).
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

 def get_bom_encoding_b(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant B (nesting): the shared FF FE prefix is tested once,
    and only then is the longer 4-byte UTF-32 LE BOM tested with a second
    ``startswith`` call.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
    begin with any of the BOMs tested here.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # FF FE is either UTF-16 LE or the first half of the UTF-32 LE BOM;
        # the longer match wins.
        if buffer.startswith(b"\xFF\xFE\x00\x00"):
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

 def get_bom_encoding_c(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant C (nesting with a substring): after matching the shared
    FF FE prefix, bytes 2-3 are compared as a slice to tell UTF-32 LE apart
    from UTF-16 LE.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
    begin with any of the BOMs tested here.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Slicing never raises on a short buffer: it just yields fewer than
        # two bytes, which compare unequal to the two NULs.
        if buffer[2:4] == b"\x00\x00":
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

 def get_bom_encoding_d(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant D (nesting with per-byte checks): after matching the
    shared FF FE prefix, bytes 2 and 3 are compared individually to tell
    UTF-32 LE apart from UTF-16 LE.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
    begin with any of the BOMs tested here.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Unlike slicing, indexing past the end raises IndexError, so make
        # sure bytes 2 and 3 exist before reading them (e.g. a buffer that is
        # nothing but the 2-byte UTF-16 LE BOM).
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

 def get_bom_encoding_e(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant E (early fail on empty input): identical to the flat
    ``startswith`` chain, but an empty buffer is rejected first by comparing
    it to ``b''``.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` is empty
    or does not begin with any of the BOMs tested here.
    """
    if buffer == b'':
        # Nothing to inspect; skip all of the prefix checks.
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        # Tested before UTF-16 LE because this 4-byte BOM begins with the
        # 2-byte UTF-16 LE BOM (FF FE).
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

 def get_bom_encoding_f(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant F (early fail on length): identical to the flat
    ``startswith`` chain, but buffers shorter than two bytes are rejected
    first.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` is too
    short or does not begin with any of the BOMs tested here.
    """
    if len(buffer) < 2:
        # The shortest BOM tested below is two bytes long, so anything
        # shorter cannot match.
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        # Tested before UTF-16 LE because this 4-byte BOM begins with the
        # 2-byte UTF-16 LE BOM (FF FE).
        return 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None

 def get_bom_encoding_g(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant G (no elses): the same checks in the same order as the
    flat chain, written as independent ``if`` statements that each return.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
    begin with any of the BOMs tested here.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    # Tested before UTF-16 LE because this 4-byte BOM begins with the 2-byte
    # UTF-16 LE BOM (FF FE).
    if buffer.startswith(b"\xFF\xFE\x00\x00"):
        return 'utf_32_le'
    if buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    if buffer.startswith(b"\xFF\xFE"):
        return 'utf_16_le'
    if buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    return None

 def get_bom_encoding_df(buffer: bytes):
    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

    Benchmark variant D+F: rejects buffers shorter than two bytes up front,
    then uses the nested per-byte comparison to tell UTF-32 LE apart from
    UTF-16 LE once the shared FF FE prefix has matched.

    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` is too
    short or does not begin with any of the BOMs tested here.
    """
    if len(buffer) < 2:
        # The shortest BOM tested below is two bytes long.
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # The length guard above only guarantees two bytes; indexing bytes 2
        # and 3 of a 2- or 3-byte buffer would raise IndexError, so check
        # that they exist first.
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None


 # BENCHMARK ###################################################################
 # Sample buffers fed to every algorithm on each benchmark pass: empty and
 # one-byte inputs, plain ASCII text, one buffer per recognised BOM, and two
 # legacy-codec encodings that carry no BOM.
 examples = (
    b'',
    bytes('a', 'ascii'),
    bytes('abcdefg', 'ascii'),
    # The utf_8_sig codec writes the UTF-8 BOM itself.
    bytes('abcdefg', 'utf_8_sig'),
    # The endian-specific UTF-16/32 codecs emit no BOM, so it is prepended.
    b'\xFF\xFE' + bytes('abcdefg', 'utf_16_le'),
    b'\xFE\xFF' + bytes('abcdefg', 'utf_16_be'),
    b'\xFF\xFE\x00\x00' + bytes('abcdefg', 'utf_32_le'),
    b'\x00\x00\xFE\xFF' + bytes('abcdefg', 'utf_32_be'),
    bytes('abcdefg', 'shift_jis'),
    bytes('abcdefg', 'big5'),
 )

 def runner(testfunc):
    """Build a zero-argument callable that runs ``testfunc`` on every sample.

    The returned function calls ``testfunc`` once per entry of the
    module-level ``examples`` tuple and discards the results, which is the
    shape ``timeit`` expects for a statement to time.
    """
    def wrapped():
        """Call the captured function on each sample buffer in turn."""
        for sample in examples:
            testfunc(sample)

    return wrapped

 def dotest(label, testfunc):
    """Time ``testfunc`` over the sample buffers and print a one-line summary.

    Takes 10 measurements of 100,000 passes each and prints the fastest and
    the average measurement, prefixed with ``label`` right-aligned to 25
    columns.
    """
    one_pass = runner(testfunc)
    times = timeit.repeat(one_pass, repeat=10, number=100_000)
    print(f'{label:>25}:: Min: {min(times):.5f}, mean: {mean(times):.5f}')

 def timeall():
    """Benchmark every BOM-detection variant and print the grouped timings.

    Three groups are printed in order: the nesting variants (A-D), the
    early-fail variants (A, E, F, G) and the combined D+F variant. The
    baseline A is timed in both of the first two groups.
    """
    print('Testing nesting...')
    for label, candidate in (
        ('Baseline (A)', get_bom_encoding_a),
        ('Nesting (B)', get_bom_encoding_b),
        ('Nesting w/ Substring (C)', get_bom_encoding_c),
        ('Nesting w/ each byte (D)', get_bom_encoding_d),
    ):
        dotest(label, candidate)
    print('')

    print('Testing early fail...')
    for label, candidate in (
        ('Baseline (A)', get_bom_encoding_a),
        ('Compare empty buffer (E)', get_bom_encoding_e),
        ('Check buffer length (F)', get_bom_encoding_f),
        ('No elses (G)', get_bom_encoding_g),
    ):
        dotest(label, candidate)
    print('')

    dotest('D+F', get_bom_encoding_df)

 # Run the whole benchmark suite; this executes whenever the file is run.
 timeall()
	import timeit
	from statistics import mean

	# ALGORITHMS ##################################################################
	def get_bom_encoding_a(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant A (baseline): a single flat ``if``/``elif``/``else``
	    chain of ``startswith`` checks.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
	    begin with any of the BOMs tested here.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
	        # Tested before UTF-16 LE because this 4-byte BOM begins with the
	        # 2-byte UTF-16 LE BOM (FF FE).
	        return 'utf_32_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_b(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant B (nesting): the shared FF FE prefix is tested once,
	    and only then is the longer 4-byte UTF-32 LE BOM tested with a second
	    ``startswith`` call.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
	    begin with any of the BOMs tested here.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # FF FE is either UTF-16 LE or the first half of the UTF-32 LE BOM;
	        # the longer match wins.
	        if buffer.startswith(b"\xFF\xFE\x00\x00"):
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_c(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant C (nesting with a substring): after matching the
	    shared FF FE prefix, bytes 2-3 are compared as a slice to tell
	    UTF-32 LE apart from UTF-16 LE.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
	    begin with any of the BOMs tested here.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # Slicing never raises on a short buffer: it just yields fewer than
	        # two bytes, which compare unequal to the two NULs.
	        if buffer[2:4] == b"\x00\x00":
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_d(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant D (nesting with per-byte checks): after matching the
	    shared FF FE prefix, bytes 2 and 3 are compared individually to tell
	    UTF-32 LE apart from UTF-16 LE.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
	    begin with any of the BOMs tested here.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # Unlike slicing, indexing past the end raises IndexError, so make
	        # sure bytes 2 and 3 exist before reading them (e.g. a buffer that
	        # is nothing but the 2-byte UTF-16 LE BOM).
	        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_e(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant E (early fail on empty input): identical to the flat
	    ``startswith`` chain, but an empty buffer is rejected first by
	    comparing it to ``b''``.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` is empty
	    or does not begin with any of the BOMs tested here.
	    """
	    if buffer == b'':
	        # Nothing to inspect; skip all of the prefix checks.
	        return None
	    elif buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
	        # Tested before UTF-16 LE because this 4-byte BOM begins with the
	        # 2-byte UTF-16 LE BOM (FF FE).
	        return 'utf_32_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_f(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant F (early fail on length): identical to the flat
	    ``startswith`` chain, but buffers shorter than two bytes are rejected
	    first.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` is too
	    short or does not begin with any of the BOMs tested here.
	    """
	    if len(buffer) < 2:
	        # The shortest BOM tested below is two bytes long, so anything
	        # shorter cannot match.
	        return None
	    elif buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
	        # Tested before UTF-16 LE because this 4-byte BOM begins with the
	        # 2-byte UTF-16 LE BOM (FF FE).
	        return 'utf_32_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None

	def get_bom_encoding_g(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant G (no elses): the same checks in the same order as
	    the flat chain, written as independent ``if`` statements that each
	    return.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` does not
	    begin with any of the BOMs tested here.
	    """
	    if buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    # Tested before UTF-16 LE because this 4-byte BOM begins with the
	    # 2-byte UTF-16 LE BOM (FF FE).
	    if buffer.startswith(b"\xFF\xFE\x00\x00"):
	        return 'utf_32_le'
	    if buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    if buffer.startswith(b"\xFF\xFE"):
	        return 'utf_16_le'
	    if buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    return None

	def get_bom_encoding_df(buffer: bytes):
	    """Return the codec name implied by a byte-order mark at the start of ``buffer``.

	    Benchmark variant D+F: rejects buffers shorter than two bytes up front,
	    then uses the nested per-byte comparison to tell UTF-32 LE apart from
	    UTF-16 LE once the shared FF FE prefix has matched.

	    Returns one of ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``,
	    ``'utf_16_le'`` or ``'utf_16_be'``, or ``None`` when ``buffer`` is too
	    short or does not begin with any of the BOMs tested here.
	    """
	    if len(buffer) < 2:
	        # The shortest BOM tested below is two bytes long.
	        return None
	    elif buffer.startswith(b"\xEF\xBB\xBF"):
	        return 'utf_8_sig'
	    elif buffer.startswith(b"\xFF\xFE"):
	        # The length guard above only guarantees two bytes; indexing bytes
	        # 2 and 3 of a 2- or 3-byte buffer would raise IndexError, so check
	        # that they exist first.
	        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
	            return 'utf_32_le'
	        return 'utf_16_le'
	    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
	        return 'utf_32_be'
	    elif buffer.startswith(b"\xFE\xFF"):
	        return 'utf_16_be'
	    else:
	        return None


	# BENCHMARK ###################################################################
	# Sample buffers fed to every algorithm on each benchmark pass: empty and
	# one-byte inputs, plain ASCII text, one buffer per recognised BOM, and two
	# legacy-codec encodings that carry no BOM.
	examples = (
	    b'',
	    bytes('a', 'ascii'),
	    bytes('abcdefg', 'ascii'),
	    # The utf_8_sig codec writes the UTF-8 BOM itself.
	    bytes('abcdefg', 'utf_8_sig'),
	    # The endian-specific UTF-16/32 codecs emit no BOM, so it is prepended.
	    b'\xFF\xFE' + bytes('abcdefg', 'utf_16_le'),
	    b'\xFE\xFF' + bytes('abcdefg', 'utf_16_be'),
	    b'\xFF\xFE\x00\x00' + bytes('abcdefg', 'utf_32_le'),
	    b'\x00\x00\xFE\xFF' + bytes('abcdefg', 'utf_32_be'),
	    bytes('abcdefg', 'shift_jis'),
	    bytes('abcdefg', 'big5'),
	)

	def runner(testfunc):
	    """Build a zero-argument callable that runs ``testfunc`` on every sample.

	    The returned function calls ``testfunc`` once per entry of the
	    module-level ``examples`` tuple and discards the results, which is the
	    shape ``timeit`` expects for a statement to time.
	    """
	    def wrapped():
	        """Call the captured function on each sample buffer in turn."""
	        for raw in examples:
	            testfunc(raw)
	    return wrapped

	def dotest(label, testfunc):
	    """Time ``testfunc`` over the sample buffers and print a one-line summary.

	    Takes 10 measurements of 100,000 passes each and prints the fastest and
	    the average measurement, prefixed with ``label`` right-aligned to 25
	    columns.
	    """
	    times = timeit.repeat(runner(testfunc), number=100_000, repeat=10)
	    print(f'{label:>25}:: Min: {min(times):.5f}, mean: {mean(times):.5f}')

	def timeall():
	    """Benchmark every BOM-detection variant and print the grouped timings.

	    Three groups are printed in order: the nesting variants (A-D), the
	    early-fail variants (A, E, F, G) and the combined D+F variant. The
	    baseline A is timed in both of the first two groups.
	    """
	    print('Testing nesting...')
	    dotest('Baseline (A)', get_bom_encoding_a)
	    dotest('Nesting (B)', get_bom_encoding_b)
	    dotest('Nesting w/ Substring (C)', get_bom_encoding_c)
	    dotest('Nesting w/ each byte (D)', get_bom_encoding_d)
	    print('')
	    print('Testing early fail...')
	    dotest('Baseline (A)', get_bom_encoding_a)
	    dotest('Compare empty buffer (E)', get_bom_encoding_e)
	    dotest('Check buffer length (F)', get_bom_encoding_f)
	    dotest('No elses (G)', get_bom_encoding_g)
	    print('')
	    dotest('D+F', get_bom_encoding_df)

	# Run the whole benchmark suite; this executes whenever the file is run.
	timeall()