|
import timeit |
|
from statistics import mean |
|
|
|
# ALGORITHMS ################################################################## |
|
def get_bom_encoding_a(buffer: bytes):
    """Identify the encoding announced by a byte-order mark (BOM).

    Benchmark variant "Baseline (A)": a single flat ``if``/``elif`` chain
    of ``startswith`` tests.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        encoding = 'utf_8_sig'
    # The 4-byte UTF-32 LE BOM begins with the 2-byte UTF-16 LE BOM, so the
    # longer pattern has to be tried before the shorter one.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        encoding = 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        encoding = 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        encoding = 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        encoding = 'utf_16_be'
    else:
        # No recognised BOM at the start of the buffer.
        encoding = None
    return encoding
|
|
|
def get_bom_encoding_b(buffer: bytes):
    """Identify the encoding announced by a byte-order mark (BOM).

    Benchmark variant "Nesting (B)": the shared ``FF FE`` prefix is tested
    once, and a second ``startswith`` inside that branch tells UTF-32 LE
    apart from UTF-16 LE.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        detected = 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Both little-endian BOMs share these two bytes; two trailing zero
        # bytes turn the match into the UTF-32 LE BOM.
        is_utf_32 = buffer.startswith(b"\xFF\xFE\x00\x00")
        detected = 'utf_32_le' if is_utf_32 else 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        detected = 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        detected = 'utf_16_be'
    else:
        detected = None
    return detected
|
|
|
def get_bom_encoding_c(buffer: bytes):
    """Identify the encoding announced by a byte-order mark (BOM).

    Benchmark variant "Nesting w/ Substring (C)": after matching the shared
    ``FF FE`` prefix, the slice ``buffer[2:4]`` is compared against two
    zero bytes to tell UTF-32 LE apart from UTF-16 LE.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        detected = 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Slicing never raises: a buffer shorter than four bytes yields a
        # slice that simply fails the comparison.
        tail = buffer[2:4]
        detected = 'utf_32_le' if tail == b"\x00\x00" else 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        detected = 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        detected = 'utf_16_be'
    else:
        detected = None
    return detected
|
|
|
def get_bom_encoding_d(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Benchmark variant "Nesting w/ each byte (D)": after matching the shared
    ``FF FE`` prefix, bytes 2 and 3 are inspected individually to tell
    UTF-32 LE apart from UTF-16 LE.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    if buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # Indexing bytes 2 and 3 is only valid when at least four bytes are
        # present; without the length guard a 2- or 3-byte buffer such as
        # b"\xFF\xFE" raised IndexError instead of reporting UTF-16 LE.
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
|
|
|
def get_bom_encoding_e(buffer: bytes):
    """Identify the encoding announced by a byte-order mark (BOM).

    Benchmark variant "Compare empty buffer (E)": an equality test against
    ``b''`` short-circuits before any ``startswith`` test runs.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    # Early exit: an empty buffer cannot carry a BOM.
    if buffer == b'':
        return None

    if buffer.startswith(b"\xEF\xBB\xBF"):
        encoding = 'utf_8_sig'
    # The 4-byte UTF-32 LE BOM begins with the 2-byte UTF-16 LE BOM, so the
    # longer pattern has to be tried before the shorter one.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        encoding = 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        encoding = 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        encoding = 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        encoding = 'utf_16_be'
    else:
        encoding = None
    return encoding
|
|
|
def get_bom_encoding_f(buffer: bytes):
    """Identify the encoding announced by a byte-order mark (BOM).

    Benchmark variant "Check buffer length (F)": buffers shorter than two
    bytes are rejected before any ``startswith`` test runs.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    # Early exit: the shortest BOM tested below is two bytes long.
    if len(buffer) < 2:
        return None

    if buffer.startswith(b"\xEF\xBB\xBF"):
        encoding = 'utf_8_sig'
    # The 4-byte UTF-32 LE BOM begins with the 2-byte UTF-16 LE BOM, so the
    # longer pattern has to be tried before the shorter one.
    elif buffer.startswith(b"\xFF\xFE\x00\x00"):
        encoding = 'utf_32_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        encoding = 'utf_32_be'
    elif buffer.startswith(b"\xFF\xFE"):
        encoding = 'utf_16_le'
    elif buffer.startswith(b"\xFE\xFF"):
        encoding = 'utf_16_be'
    else:
        encoding = None
    return encoding
|
|
|
def get_bom_encoding_g(buffer: bytes):
    """Identify the encoding announced by a byte-order mark (BOM).

    Benchmark variant "No elses (G)": each BOM is checked by an independent
    early-return test; no ``elif``/``else`` branches are used.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    # Order matters: the 4-byte UTF-32 LE BOM begins with the 2-byte
    # UTF-16 LE BOM, so the longer pattern is listed first.
    bom_table = (
        (b"\xEF\xBB\xBF", 'utf_8_sig'),
        (b"\xFF\xFE\x00\x00", 'utf_32_le'),
        (b"\x00\x00\xFE\xFF", 'utf_32_be'),
        (b"\xFF\xFE", 'utf_16_le'),
        (b"\xFE\xFF", 'utf_16_be'),
    )
    for bom, encoding in bom_table:
        if buffer.startswith(bom):
            return encoding
    return None
|
|
|
def get_bom_encoding_df(buffer: bytes):
    """Return the encoding detected from a BOM at the start of some bytes.

    Benchmark variant "D+F": rejects buffers shorter than two bytes up
    front, then, after matching the shared ``FF FE`` prefix, inspects bytes
    2 and 3 individually to tell UTF-32 LE apart from UTF-16 LE.

    Args:
        buffer: Raw bytes whose leading bytes are inspected.

    Returns:
        ``'utf_8_sig'``, ``'utf_32_le'``, ``'utf_32_be'``, ``'utf_16_le'``
        or ``'utf_16_be'`` when ``buffer`` starts with the matching BOM,
        otherwise ``None``.
    """
    if len(buffer) < 2:
        # Too short to hold even the 2-byte UTF-16 BOMs.
        return None
    elif buffer.startswith(b"\xEF\xBB\xBF"):
        return 'utf_8_sig'
    elif buffer.startswith(b"\xFF\xFE"):
        # The guard above only guarantees two bytes. Indexing bytes 2 and 3
        # needs four, otherwise a 2- or 3-byte buffer such as b"\xFF\xFE"
        # raised IndexError instead of reporting UTF-16 LE.
        if len(buffer) >= 4 and buffer[2] == 0 and buffer[3] == 0:
            return 'utf_32_le'
        return 'utf_16_le'
    elif buffer.startswith(b"\x00\x00\xFE\xFF"):
        return 'utf_32_be'
    elif buffer.startswith(b"\xFE\xFF"):
        return 'utf_16_be'
    else:
        return None
|
|
|
|
|
# BENCHMARK ################################################################### |
|
# Benchmark inputs, each built as ``prefix + text.encode(codec)``. The rows
# cover an empty buffer, BOM-less text, and one buffer per recognised BOM.
examples = tuple(
    prefix + text.encode(codec)
    for prefix, text, codec in (
        (b'', '', 'ascii'),                             # empty buffer
        (b'', 'a', 'ascii'),                            # single byte, no BOM
        (b'', 'abcdefg', 'ascii'),                      # plain text, no BOM
        (b'', 'abcdefg', 'utf_8_sig'),                  # codec writes the BOM
        (b'\xFF\xFE', 'abcdefg', 'utf_16_le'),          # BOM added by hand
        (b'\xFE\xFF', 'abcdefg', 'utf_16_be'),          # BOM added by hand
        (b'\xFF\xFE\x00\x00', 'abcdefg', 'utf_32_le'),  # BOM added by hand
        (b'\x00\x00\xFE\xFF', 'abcdefg', 'utf_32_be'),  # BOM added by hand
        (b'', 'abcdefg', 'shift_jis'),                  # legacy codec, no BOM
        (b'', 'abcdefg', 'big5'),                       # legacy codec, no BOM
    )
)
|
|
|
def runner(testfunc):
    """Build a zero-argument callable that feeds every example to ``testfunc``.

    Args:
        testfunc: Callable invoked with one item of the module-level
            ``examples`` sequence at a time; its return value is discarded.

    Returns:
        A function taking no arguments that calls ``testfunc`` once per
        example, in order, and returns ``None``.
    """
    def run_examples():
        # ``examples`` is looked up each time the callable runs, not when
        # it is built.
        for sample in examples:
            testfunc(sample)

    return run_examples
|
|
|
def dotest(label, testfunc, *, number=100_000, repeat=10):
    """Time ``testfunc`` over all benchmark examples and print a summary.

    The function is wrapped with ``runner`` so that one timed call passes
    every example to ``testfunc``. The minimum and the mean of the
    collected timings are printed with five decimal places.

    Args:
        label: Text printed in front of the figures, right-aligned in a
            25-character column.
        testfunc: Callable accepting a single example buffer.
        number: How many times the wrapped runner is executed per timing
            sample. Defaults to the previously hard-coded 100_000.
        repeat: How many timing samples are collected. Defaults to the
            previously hard-coded 10.

    Returns:
        None. The result is written to standard output.
    """
    times = timeit.repeat(runner(testfunc), number=number, repeat=repeat)
    print(f'{label:>25}:: Min: {min(times):.5f}, mean: {mean(times):.5f}')
|
|
|
def timeall():
    """Run every benchmark group and print the timings.

    Output order: the "nesting" group (A, B, C, D), a blank line, the
    "early fail" group (A, E, F, G), a blank line, then the combined D+F
    variant. Each measurement is produced by ``dotest``.
    """
    print('Testing nesting...')
    nesting_group = (
        ('Baseline (A)', get_bom_encoding_a),
        ('Nesting (B)', get_bom_encoding_b),
        ('Nesting w/ Substring (C)', get_bom_encoding_c),
        ('Nesting w/ each byte (D)', get_bom_encoding_d),
    )
    for label, variant in nesting_group:
        dotest(label, variant)
    print('')

    print('Testing early fail...')
    # The baseline is timed again so this group has its own reference.
    early_fail_group = (
        ('Baseline (A)', get_bom_encoding_a),
        ('Compare empty buffer (E)', get_bom_encoding_e),
        ('Check buffer length (F)', get_bom_encoding_f),
        ('No elses (G)', get_bom_encoding_g),
    )
    for label, variant in early_fail_group:
        dotest(label, variant)
    print('')

    dotest('D+F', get_bom_encoding_df)
|
|
|
# Run the benchmark only when this file is executed as a script, so that
# importing the module (e.g. to reuse a detector) does not start the
# multi-second timing run.
if __name__ == "__main__":
    timeall()