JordanReiter · April 6, 2018 21:24
diff --git a/clean_base64.py b/clean_base64.py
 # This function can be used to process text that may or may not be base64 encoded
 # In general, if you use base64 decode on normal text, you end up with random bytes
 # that don't resemble a standard encoding. This function makes use of the chardet
 # library to recognize decoded text that matches an expected encoding.

 # Note that it tends to fail for very short inputs.

 import binascii
 import base64

 import chardet


 def clean_base64(input):
    """Decode ``input`` if it looks like base64-encoded text.

    The value is strictly base64-decoded (line breaks are ignored, any
    other non-alphabet character is rejected) and chardet is then asked
    whether the decoded bytes resemble text in a known encoding.

    ``input`` may be ``str`` or bytes.  The parameter name shadows the
    builtin of the same name; it is kept so keyword callers keep working.

    Returns the decoded ``str`` when decoding succeeds and chardet reports
    an encoding with confidence above 0.5; otherwise returns ``input``
    exactly as it was passed in.  Values that are neither text nor bytes
    (``None``, numbers, ...) are also returned unchanged instead of
    raising ``AttributeError``.

    Very short inputs tend to be misclassified.
    """
    try:
        payload = input.encode()
    except AttributeError:
        # no .encode(): assume the value is already bytes
        payload = input
    except UnicodeError:
        # text that cannot be encoded is certainly not base64
        return input

    try:
        # normally, b64decode ignores non-base64 characters
        # we don't want it to, but we do want it to ignore line breaks
        # so validate the string with all line breaks removed
        payload = payload.replace(b'\r', b'').replace(b'\n', b'')
        decoded = base64.b64decode(payload, validate=True)
    except (AttributeError, TypeError):
        # not a str/bytes-like value at all (None, int, ...)
        return input
    except (binascii.Error, ValueError):
        # the value is definitely not base64
        return input

    detected = chardet.detect(decoded)
    if not (detected['encoding'] and detected['confidence'] > 0.5):
        return input

    try:
        # first assume utf-8; chardet often wrongly guesses windows-1252
        return decoded.decode()
    except UnicodeError:
        pass
    try:
        return decoded.decode(detected['encoding'])
    except (UnicodeError, LookupError):
        # undecodable, or an encoding name Python has no codec for
        return input


 # (input, expected) pairs: base64 payloads, given as str or bytes, should
 # come back decoded; ordinary text should come back untouched.
 tests = [
    # ordinary text built only from base64-alphabet characters
    (
        "Tweebuffelsmeteenskootmorsdoodgeskietfontein",
        "Tweebuffelsmeteenskootmorsdoodgeskietfontein",
    ),
    # ordinary sentence, then the same sentence base64-encoded
    ("Hello, how are you?", "Hello, how are you?"),
    (b'SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
    ('SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
    # encoded first, plain afterwards
    (b'SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
    ('SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
    ("Hi there, friend", "Hi there, friend"),
    # a single short word
    ("Hello", "Hello"),
    (b'SGVsbG8=', "Hello"),
    ('SGVsbG8=', "Hello"),
 ]

 # Run every pair through clean_base64; the first mismatch raises AssertionError.
 for test, expected in tests:
    assert clean_base64(test) == expected

 # Fails for some very short strings: these are plain text and should be
 # returned as-is, but they are also well-formed base64.
 assert clean_base64('yoyo') == 'yoyo'
 assert clean_base64('yoyoyoyo') == 'yoyoyoyo'
	# This function can be used to process text that may or may not be base64 encoded
	# In general, if you use base64 decode on normal text, you end up with random bytes
	# that don't resemble a standard encoding. This function makes use of the chardet
	# library to recognize decoded text that matches an expected encoding.

	# Note that it tends to fail for very short inputs.

	import binascii
	import base64

	import chardet


	def clean_base64(input):
	    """Decode ``input`` if it looks like base64-encoded text.

	    The value is strictly base64-decoded (line breaks are ignored, any
	    other non-alphabet character is rejected) and chardet is then asked
	    whether the decoded bytes resemble text in a known encoding.

	    ``input`` may be ``str`` or bytes.  The parameter name shadows the
	    builtin of the same name; it is kept so keyword callers keep working.

	    Returns the decoded ``str`` when decoding succeeds and chardet reports
	    an encoding with confidence above 0.5; otherwise returns ``input``
	    exactly as it was passed in.  Values that are neither text nor bytes
	    (``None``, numbers, ...) are also returned unchanged instead of
	    raising ``AttributeError``.

	    Very short inputs tend to be misclassified.
	    """
	    try:
	        payload = input.encode()
	    except AttributeError:
	        # no .encode(): assume the value is already bytes
	        payload = input
	    except UnicodeError:
	        # text that cannot be encoded is certainly not base64
	        return input

	    try:
	        # normally, b64decode ignores non-base64 characters
	        # we don't want it to, but we do want it to ignore line breaks
	        # so validate the string with all line breaks removed
	        payload = payload.replace(b'\r', b'').replace(b'\n', b'')
	        decoded = base64.b64decode(payload, validate=True)
	    except (AttributeError, TypeError):
	        # not a str/bytes-like value at all (None, int, ...)
	        return input
	    except (binascii.Error, ValueError):
	        # the value is definitely not base64
	        return input

	    detected = chardet.detect(decoded)
	    if not (detected['encoding'] and detected['confidence'] > 0.5):
	        return input

	    try:
	        # first assume utf-8; chardet often wrongly guesses windows-1252
	        return decoded.decode()
	    except UnicodeError:
	        pass
	    try:
	        return decoded.decode(detected['encoding'])
	    except (UnicodeError, LookupError):
	        # undecodable, or an encoding name Python has no codec for
	        return input


	# (input, expected) pairs: base64 payloads, given as str or bytes, should
	# come back decoded; ordinary text should come back untouched.
	tests = [
	    ("Tweebuffelsmeteenskootmorsdoodgeskietfontein", "Tweebuffelsmeteenskootmorsdoodgeskietfontein"),
	    ("Hello, how are you?", "Hello, how are you?"),
	    (b'SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
	    ('SGVsbG8sIGhvdyBhcmUgeW91Pw==', "Hello, how are you?"),
	    (b'SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
	    ('SGkgdGhlcmUsIGZyaWVuZA==', "Hi there, friend"),
	    ("Hi there, friend", "Hi there, friend"),
	    ("Hello", "Hello"),
	    (b'SGVsbG8=', "Hello"),
	    ('SGVsbG8=', "Hello"),
	]

	# Run every pair through clean_base64; the first mismatch raises AssertionError.
	for test, expected in tests:
	    assert clean_base64(test) == expected

	# Fails for some very short strings: these are plain text and should be
	# returned as-is, but they are also well-formed base64.
	assert clean_base64('yoyo') == 'yoyo'
	assert clean_base64('yoyoyoyo') == 'yoyoyoyo'