Created
April 6, 2018 21:24
-
-
Save JordanReiter/f99105fa73c6fd82729ed4406e0e39c0 to your computer and use it in GitHub Desktop.
This function can be used to process text that may or may not be base64 encoded
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This function can be used to process text that may or may not be base64 encoded.
# In general, if you use base64 decode on normal text, you end up with random bytes
# that don't resemble a standard encoding. This function makes use of the chardet
# library to recognize decoded text that matches an expected encoding.
# Note that it tends to fail for very short inputs.
import binascii | |
import base64 | |
import chardet | |
def clean_base64(input):
    """Decode *input* if it looks like base64-encoded text, else return it as-is.

    The value is strictly base64-decoded (line breaks are tolerated, any other
    non-alphabet character is not). The decoded bytes are then passed to
    ``chardet.detect``; only when it reports an encoding with a confidence
    above 0.5 is the decoded text returned.

    Args:
        input: A ``str`` or bytes-like value that may or may not be base64.
            Any other kind of value (``None``, numbers, ...) is passed through.

    Returns:
        The decoded ``str`` when the input is recognised as base64-encoded
        text; otherwise the original *input* object, unchanged (so a bytes
        input that is not base64 comes back as bytes).
    """
    # Work on bytes: str values are encoded (UTF-8 by default); values
    # without .encode() are assumed to be bytes-like already.
    try:
        raw = input.encode()
    except AttributeError:
        raw = input
    except ValueError:
        # e.g. UnicodeEncodeError for a str holding lone surrogates
        return input

    # normally, b64decode ignores non-base64 characters
    # we don't want it to, but we do want it to ignore line breaks
    # so validate the string with all line breaks removed
    try:
        raw = raw.replace(b'\r', b'').replace(b'\n', b'')
    except (AttributeError, TypeError):
        # neither text nor bytes-like: there is nothing to decode
        return input

    try:
        decoded = base64.b64decode(raw, validate=True)
        detected = chardet.detect(decoded)
        if detected['encoding'] and detected['confidence'] > 0.5:
            try:
                # first assume utf-8; chardet often wrongly guesses windows-1252
                return decoded.decode()
            except UnicodeError:
                # LookupError is raised here if the detected encoding name
                # is not a codec Python knows; it is handled below.
                return decoded.decode(detected['encoding'])
    except (binascii.Error, UnicodeError, ValueError, LookupError):
        # the value is definitely not base64 (or cannot be decoded as text)
        pass
    return input
# Self-check: each pair is (value passed to clean_base64, value it must return).
tests = [
    # a long plain word made up solely of base64-alphabet letters
    (
        "Tweebuffelsmeteenskootmorsdoodgeskietfontein",
        "Tweebuffelsmeteenskootmorsdoodgeskietfontein",
    ),
    # the same sentence as plain text, base64 bytes and base64 str
    ("Hello, how are you?", "Hello, how are you?"),
    (b'SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
    ('SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
    # base64 first, plain text last
    (b'SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
    ('SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
    ("Hi there, friend", "Hi there, friend"),
    # a short value in all three forms
    ("Hello", "Hello"),
    (b'SGVsbG8=', "Hello"),
    ('SGVsbG8=', "Hello"),
]

for test, expected in tests:
    assert clean_base64(test) == expected

# Fails for some very short strings
# NOTE(review): per the remark above, the two asserts below are presumably
# expected to fail and demonstrate the limitation -- confirm before relying
# on this module importing cleanly.
assert clean_base64('yoyo') == 'yoyo'
assert clean_base64('yoyoyoyo') == 'yoyoyoyo'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment