Skip to content

Instantly share code, notes, and snippets.

@roeniss
Last active July 1, 2022 17:31
Show Gist options
  • Save roeniss/0d16c94c75bb107087326798fcc4e922 to your computer and use it in GitHub Desktop.
Save roeniss/0d16c94c75bb107087326798fcc4e922 to your computer and use it in GitHub Desktop.
Tony Finch - tolower() in bulk at speed
# This is based on the post "2022-06-27 – tolower() in bulk at speed" by Tony Finch
# link: https://dotat.at/@/2022-06-27-tolower-swar.html
## I found the post quite hard to follow without some visual aids, so I wrote this script.
## You can run it with python3 in a terminal; I hope it helps you, too.
# Input word for the demo. The display helpers below format every value as
# 64 bits (eight 8-bit groups), so this is an eight-character ASCII string.
OCTETS = "ABSOLUTE"
def show_ascii(s):
    """Print the eight bytes of the 64-bit integer *s* as characters.

    The value is rendered as a 64-digit binary string (zero-padded on the
    left), cut into eight 8-bit groups from most to least significant, and
    each group is converted to the character with that code point. The
    eight characters are written on one line followed by a newline.
    """
    bins = bin(s)[2:].zfill(64)
    # Build the whole line first so it is emitted with a single print call.
    print("".join(chr(int(bins[i:i + 8], 2)) for i in range(0, 64, 8)))
# ANSI escape sequences: switch to bright green / reset attributes.
OKGREEN = "\033[92m"
ENDC = "\033[0m"


def color(s):
    """Return *s* with every "1" character wrapped in green ANSI escapes.

    All other characters are copied through unchanged, so set bits stand
    out when a binary string is printed to a terminal. An empty string
    yields an empty string.
    """
    # str.join avoids rebuilding the result string on every character.
    return "".join(OKGREEN + ch + ENDC if ch == "1" else ch for ch in s)
def show(s, memo=""):
    """Print *s* as eight space-separated 8-bit groups followed by a memo.

    *s* is rendered as a 64-digit binary string (zero-padded on the left)
    and split into eight groups, most significant first. Each group and the
    trailing " # <memo>" text are passed through ``color``, so every "1"
    (including any in the memo) is highlighted.
    """
    bits = bin(s)[2:].zfill(64)
    groups = [color(bits[i:i + 8]) for i in range(0, 64, 8)]
    # Each group is followed by one space, then the memo (which itself
    # starts with a space), all on a single output line.
    print(" ".join(groups) + " " + color(" # " + memo))
def to_hex(s):
    """Return the UTF-8 bytes of *s* as one big-endian integer.

    The first character ends up in the most significant byte, e.g.
    ``to_hex("AB") == 0x4142``. An empty string yields 0.
    """
    # int.from_bytes skips the hex-string round trip and, unlike
    # int("", 16), does not raise on empty input.
    return int.from_bytes(s.encode("utf-8"), "big")
# Step-by-step walk-through of lower-casing eight bytes at once inside one
# 64-bit word. Each intermediate value is printed with show().

# 0x01 in each of the eight byte lanes; multiplying a byte value by this
# constant copies that value into every lane.
all_bytes = 0x0101010101010101
# The input word packed into one integer, first character in the top byte.
octets = to_hex(OCTETS)
# show(all_bytes, "all_bytes")
show(octets, "octets")
# Mask with 0x7F in every lane: keeps the low seven bits of each byte.
steal_all_0x80 = 0x7F * all_bytes
show(steal_all_0x80, "0x7F * all_bytes")
# Clear the top bit of every byte so the per-lane additions below cannot
# carry into the neighbouring lane.
heptets = octets & steal_all_0x80
show(heptets, "heptets = octets & (0x7F * all_bytes)")
print("-"*60)
# show(0x7F, "0x7F")
# show(to_hex('A'), "A")
# show(to_hex('Z'), "Z")
# print("-"*60)
# Adding (0x7F - 'Z') to a 7-bit value sets that lane's top bit (0x80)
# exactly when the value is greater than 'Z'.
is_gt_Z = heptets + (0x7F - to_hex('Z')) * all_bytes
show(is_gt_Z, "is_gt_Z = heptets + (0x7F - 'Z') * all_bytes -- (each first bit is true if heptet - 'Z' > 0)")
# Adding (0x80 - 'A') sets the lane's top bit exactly when the value is
# greater than or equal to 'A'.
is_ge_A = heptets + (0x80 - to_hex('A')) * all_bytes
show(is_ge_A, "is_ge_A = heptets + (0x80 - 'A') * all_bytes -- (each first bit is true if heptet - 'A' >= 0)")
print(" -- above two example works only when the original char is btw 0 ~ 127 (ascii)")
print("-"*60)
# Invert every bit of the word: a lane's top bit is now set exactly when the
# original byte had its top bit clear (i.e. was ASCII).
is_ascii = (0xFF * all_bytes) ^ octets # plain ~octets is a negative int in Python, so XOR with all-ones instead
show(octets, "octets")
show(is_ascii, "is_ascii = ~octets -- (each first bit is true if it's ascii char)")
print("-"*60)
# Top bit per lane: ASCII, and (>= 'A') differs from (> 'Z'), which means
# the byte lies in 'A'..'Z'. The lower bits of each lane are not meaningful here.
is_upper = is_ascii & (is_ge_A ^ is_gt_Z)
show(is_upper, "is_upper = is_ascii & (is_ge_A ^ is_gt_Z) -- (true only if is_ge_A=1 and is_gt_Z=0")
# Move each lane's 0x80 flag down to 0x20 and mask away everything else.
to_lower = (is_upper >> 2) & (0x20 * all_bytes)
show(to_lower, "to_lower = (is_upper >> 2) & (0x20 * all_bytes)")
# show(((is_ascii >> 2) & ((is_ge_A >> 2) ^ (is_gt_Z >> 2)) & (0x20 * all_bytes)), "same result")
show(octets, "octets")
# Setting bit 0x20 in an upper-case ASCII letter gives its lower-case form.
result = octets | to_lower
show(result, "octets | to_lower")
print("-"*60)
show_ascii(result)
print("-"*60)
# Another way: mask is_ascii down to the 0x80 bit of each lane up front, so
# no separate 0x20 mask is needed after the shift. The & with a positive
# mask also makes the negative result of ~octets harmless.
is_ascii = ~octets & (0x80 * all_bytes)
show(is_ascii, "is_ascii = ~octets & (0x80 * all_bytes)")
is_upper = is_ascii & (is_ge_A ^ is_gt_Z)
# `>>` binds tighter than `|`, so this is octets | (is_upper >> 2).
result = octets | is_upper >> 2
show(result, "octets | is_upper >> 2")
print("-"*60)
show_ascii(result) # same result as the first approach
@roeniss
Copy link
Author

roeniss commented Jul 1, 2022

actually you don't even need to run it. Here's the output:

CleanShot 2022-07-02 at 02 30 56

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment