Last active
July 1, 2022 17:31
-
-
Save roeniss/0d16c94c75bb107087326798fcc4e922 to your computer and use it in GitHub Desktop.
Tony Finch - tolower() in bulk at speed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is based on the post "2022-06-27 – tolower() in bulk at speed" by Tony Finch.
# Link: https://dotat.at/@/2022-06-27-tolower-swar.html
## I found the post quite hard to follow without visual aids, so I made some.
## You can run this with python3 in a terminal; I hope it helps you, too.
# Sample input for the demo: eight ASCII characters, i.e. exactly one 64-bit word.
OCTETS = "ABSOLUTE"
def show_ascii(s):
    """Print the eight bytes of the 64-bit word *s* as characters.

    The word is read big-endian, so the most significant byte is printed
    first. Each byte is converted with ``chr``; byte values 0x80-0xFF
    therefore print as the code points U+0080-U+00FF.

    Only the low 64 bits of *s* are used, which means a negative value
    (for example the result of ``~x``) is shown as its 64-bit
    two's-complement bit pattern instead of producing garbage.
    """
    word = s & 0xFFFFFFFFFFFFFFFF
    print("".join(map(chr, word.to_bytes(8, "big"))))
OKGREEN = "\033[92m"  # ANSI SGR sequence: bright-green foreground
ENDC = "\033[0m"  # ANSI SGR sequence: reset all attributes


def color(s):
    """Return *s* with every ``"1"`` character wrapped in green ANSI codes.

    Each ``"1"`` becomes ``OKGREEN + "1" + ENDC``; every other character is
    copied through unchanged. An empty string yields an empty string.
    """
    return "".join(OKGREEN + ch + ENDC if ch == "1" else ch for ch in s)
def show(s, memo=""):
    """Print the 64-bit word *s* as eight groups of 8 bits, then a memo.

    The groups run from the most significant byte to the least significant
    one, each followed by a single space. The line ends with ``" # "`` plus
    *memo*. Both the bit groups and the memo text are passed through
    ``color``, so any ``"1"`` character in *memo* is highlighted as well.

    Only the low 64 bits of *s* are shown, so a negative value (for example
    the result of ``~x``) is rendered as its 64-bit two's-complement pattern.
    """
    bits = format(s & 0xFFFFFFFFFFFFFFFF, "064b")
    for start in range(0, 64, 8):
        print(color(bits[start:start + 8]), end=" ")
    print(color(" # " + memo))
def to_hex(s):
    """Return the UTF-8 encoding of *s* interpreted as one big-endian integer.

    The first character of *s* ends up in the most significant byte. An
    empty string yields 0.
    """
    return int.from_bytes(s.encode("utf-8"), "big")
# Multiplying a one-byte constant by all_bytes replicates it into all eight bytes.
all_bytes = 0x0101010101010101
octets = to_hex(OCTETS)
# show(all_bytes, "all_bytes")
show(octets, "octets")

# Clear the top bit of every byte so the per-byte additions below cannot
# carry into the neighbouring byte.
steal_all_0x80 = 0x7F * all_bytes
show(steal_all_0x80, "0x7F * all_bytes")
heptets = octets & steal_all_0x80
show(heptets, "heptets = octets & (0x7F * all_bytes)")
print("-"*60)

# show(0x7F, "0x7F")
# show(to_hex('A'), "A")
# show(to_hex('Z'), "Z")
# print("-"*60)

# Adding (0x7F - 'Z') to a 7-bit value sets that byte's top bit exactly when
# the value is greater than 'Z'.
is_gt_Z = heptets + (0x7F - to_hex('Z')) * all_bytes
show(is_gt_Z, "is_gt_Z = heptets + (0x7F - 'Z') * all_bytes -- (each first bit is true if heptet - 'Z' > 0)")
# Adding (0x80 - 'A') sets the top bit exactly when the value is at least 'A'.
is_ge_A = heptets + (0x80 - to_hex('A')) * all_bytes
show(is_ge_A, "is_ge_A = heptets + (0x80 - 'A') * all_bytes -- (each first bit is true if heptet - 'A' >= 0)")
print(" -- above two example works only when the original char is btw 0 ~ 127 (ascii)")
print("-"*60)

# Python ints are unbounded, so ~octets would be a negative number; XOR with
# all-ones produces the 64-bit complement instead.
is_ascii = (0xFF * all_bytes) ^ octets
show(octets, "octets")
show(is_ascii, "is_ascii = ~octets -- (each first bit is true if it's ascii char)")
print("-"*60)

# Top bit per byte: ASCII, >= 'A', and not > 'Z'. The lower bits of is_upper
# are meaningless here and are masked away in the next step.
is_upper = is_ascii & (is_ge_A ^ is_gt_Z)
show(is_upper, "is_upper = is_ascii & (is_ge_A ^ is_gt_Z) -- (true only if is_ge_A=1 and is_gt_Z=0")
# Move the 0x80 flag down to 0x20, the bit that separates upper from lower case.
to_lower = (is_upper >> 2) & (0x20 * all_bytes)
show(to_lower, "to_lower = (is_upper >> 2) & (0x20 * all_bytes)")
# show(((is_ascii >> 2) & ((is_ge_A >> 2) ^ (is_gt_Z >> 2)) & (0x20 * all_bytes)), "same result")
show(octets, "octets")
result = octets | to_lower
show(result, "octets | to_lower")
print("-"*60)
show_ascii(result)
print("-"*60)

# Another way: mask is_ascii down to the top bit of each byte first, so
# is_upper carries only the flag bits and needs no masking after the shift.
is_ascii = ~octets & (0x80 * all_bytes)
show(is_ascii, "is_ascii = ~octets & (0x80 * all_bytes)")
is_upper = is_ascii & (is_ge_A ^ is_gt_Z)
# `>>` binds tighter than `|`; the parentheses only make that explicit.
result = octets | (is_upper >> 2)
show(result, "octets | is_upper >> 2")
print("-"*60)
show_ascii(result)  # same result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
actually you don't even need to run it. Here's the output: