Created
November 30, 2016 21:53
-
-
Save jhyland87/1f47dfb8123a2f78a2705bc02644c180 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import config | |
import string | |
import sys | |
# Python 2 only: force the process-wide default codec to UTF-8 so implicit
# str <-> unicode conversions do not raise on non-ASCII text.  reload() is
# required because the interpreter removes sys.setdefaultencoding during
# start-up.  Python 3 has neither a builtin reload() nor
# sys.setdefaultencoding() (its default codec is already UTF-8), so the hack is
# skipped there instead of crashing the module at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")
class BaseServer(Lockable):
    """Base class providing helpers that reduce user input to plain ASCII.

    ``Lockable`` and ``config`` are defined elsewhere in the project.
    """

    # Logger object supplied by the project's config module.
    _logger = config.default_logger

    # Known non-ASCII characters and the ASCII text each one is replaced with.
    # A list of commonly used unicode characters can be found here:
    # https://gist.github.com/jhyland87/5f7ed9d91875280e4f973a033dcf8db0
    #
    # NOTE: only use ``#`` comments inside this literal.  A bare string placed
    # before the first key is implicitly concatenated with that key and
    # silently corrupts the entry.
    _REPLACEMENT_CHARS = {
        # General Punctuation
        u'\u2016': u'|',    # DOUBLE VERTICAL LINE
        u'\u2017': u'_',    # DOUBLE LOW LINE
        u'\u2022': u'-',    # BULLET
        u'\u2024': u'-',    # ONE DOT LEADER
        u'\u2025': u'-',    # TWO DOT LEADER
        u'\u2026': u'-',    # HORIZONTAL ELLIPSIS (Three dot leader)
        u'\u2027': u'-',    # HYPHENATION POINT
        u'\u2032': u"'",    # PRIME (minutes, feet)
        u'\u2033': u'"',    # DOUBLE PRIME (seconds, inches)
        u'\u2034': u"'''",  # TRIPLE PRIME (old measure, 1/12 of an inch)
        u'\u2038': u'^',    # CARET
        u'\u2053': u'~',    # SWUNG DASH
        u'\u2055': u'*',    # FLOWER PUNCTUATION MARK
        u'\u204F': u';',    # REVERSED SEMICOLON
        # Quotation marks and apostrophe
        u'\u2018': u"'",    # LEFT SINGLE QUOTATION MARK
        u'\u2019': u"'",    # RIGHT SINGLE QUOTATION MARK
        u'\u201A': u"'",    # SINGLE LOW-9 QUOTATION MARK
        u'\u201B': u"'",    # SINGLE HIGH-REVERSED-9 QUOTATION MARK
        u'\u201C': u'"',    # LEFT DOUBLE QUOTATION MARK
        u'\u201D': u'"',    # RIGHT DOUBLE QUOTATION MARK
        u'\u201E': u'"',    # DOUBLE LOW-9 QUOTATION MARK
        u'\u201F': u'"',    # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        # Dashes/Hyphens
        u'\u2010': u'-',    # HYPHEN
        u'\u2011': u'-',    # NON-BREAKING HYPHEN
        u'\u2012': u'-',    # FIGURE DASH
        u'\u2013': u'-',    # EN DASH
        u'\u2014': u'-',    # EM DASH
        u'\u2015': u'-',    # HORIZONTAL BAR
        # Quotation Marks
        u'\u2039': u'<',    # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        u'\u203A': u'>',    # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        # Punctuation marks
        u'\u203C': u'!!',   # DOUBLE EXCLAMATION MARK
        u'\u2048': u'?!',   # QUESTION EXCLAMATION MARK
        u'\u2049': u'!?',   # EXCLAMATION QUESTION MARK
        # Math stuff
        u'\u00D7': u'x',    # MULTIPLICATION SIGN
        u'\u2044': u'/',    # FRACTION SLASH
        # Bracket pieces
        u'\u239B': u'(',    # LEFT PARENTHESIS UPPER HOOK
        u'\u239C': u'(',    # LEFT PARENTHESIS EXTENSION
        u'\u239D': u'(',    # LEFT PARENTHESIS LOWER HOOK
        u'\u239E': u')',    # RIGHT PARENTHESIS UPPER HOOK
        u'\u239F': u')',    # RIGHT PARENTHESIS EXTENSION
        u'\u23A0': u')',    # RIGHT PARENTHESIS LOWER HOOK
        u'\u23A2': u'|',    # LEFT SQUARE BRACKET EXTENSION
        u'\u23A5': u'|',    # RIGHT SQUARE BRACKET EXTENSION
        u'\u23AA': u'|',    # CURLY BRACKET EXTENSION
        # Light and heavy dashed lines
        u'\u2504': u'-',    # BOX DRAWINGS LIGHT TRIPLE DASH HORIZONTAL
        u'\u2505': u'-',    # BOX DRAWINGS HEAVY TRIPLE DASH HORIZONTAL
        u'\u2506': u'|',    # BOX DRAWINGS LIGHT TRIPLE DASH VERTICAL
        u'\u2507': u'|',    # BOX DRAWINGS HEAVY TRIPLE DASH VERTICAL
        u'\u2508': u'-',    # BOX DRAWINGS LIGHT QUADRUPLE DASH HORIZONTAL
        u'\u2509': u'-',    # BOX DRAWINGS HEAVY QUADRUPLE DASH HORIZONTAL
        u'\u250A': u'|',    # BOX DRAWINGS LIGHT QUADRUPLE DASH VERTICAL
        u'\u250B': u'|',    # BOX DRAWINGS HEAVY QUADRUPLE DASH VERTICAL
        u'\u254C': u'-',    # BOX DRAWINGS LIGHT DOUBLE DASH HORIZONTAL
        u'\u254D': u'-',    # BOX DRAWINGS HEAVY DOUBLE DASH HORIZONTAL
        u'\u254E': u'|',    # BOX DRAWINGS LIGHT DOUBLE DASH VERTICAL
        u'\u254F': u'|',    # BOX DRAWINGS HEAVY DOUBLE DASH VERTICAL
        # Character cell diagonals
        u'\u2571': u'/',    # BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
        u'\u2572': u'\\',   # BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
        u'\u2573': u'X',    # BOX DRAWINGS LIGHT DIAGONAL CROSS
        # Superscripts
        u'\u207D': u'(',    # SUPERSCRIPT LEFT PARENTHESIS
        u'\u207E': u')',    # SUPERSCRIPT RIGHT PARENTHESIS
        u'\u207C': u'=',    # SUPERSCRIPT EQUALS SIGN
        u'\xab': u'<<',     # Double left arrows
        u'\xbb': u'>>',     # Double right arrows
        u'\xa6': u'|',      # "Broken" pipe
        u'\xb7': u'-',      # Middle dot
        # Other..
        u'\u02DC': u'~',    # SMALL TILDE
        u'\u00A0': u' ',    # NO-BREAK SPACE
        u'\ufffd': u'-',    # REPLACEMENT CHARACTER (what a mangled MS Word hyphen usually decodes to)
        # Fullwidth ASCII variants
        u'\uFF01': u'!',    # FULLWIDTH EXCLAMATION MARK
        u'\uFF02': u'"',    # FULLWIDTH QUOTATION MARK
        u'\uFF03': u'#',    # FULLWIDTH NUMBER SIGN
        u'\uFF04': u'$',    # FULLWIDTH DOLLAR SIGN
        u'\uFF05': u'%',    # FULLWIDTH PERCENT SIGN
        u'\uFF06': u'&',    # FULLWIDTH AMPERSAND
        u'\uFF07': u"'",    # FULLWIDTH APOSTROPHE
        u'\uFF08': u'(',    # FULLWIDTH LEFT PARENTHESIS
        u'\uFF09': u')',    # FULLWIDTH RIGHT PARENTHESIS
        u'\uFF0A': u'*',    # FULLWIDTH ASTERISK
        u'\uFF0B': u'+',    # FULLWIDTH PLUS SIGN
        u'\uFF0C': u',',    # FULLWIDTH COMMA
        u'\uFF0D': u'-',    # FULLWIDTH HYPHEN-MINUS
        u'\uFF0E': u'.',    # FULLWIDTH FULL STOP (period?)
        u'\uFF0F': u'/',    # FULLWIDTH SOLIDUS
        u'\uFF1A': u':',    # FULLWIDTH COLON
        u'\uFF1B': u';',    # FULLWIDTH SEMICOLON
        u'\uFF1C': u'<',    # FULLWIDTH LESS-THAN SIGN
        u'\uFF1D': u'=',    # FULLWIDTH EQUALS SIGN
        u'\uFF1E': u'>',    # FULLWIDTH GREATER-THAN SIGN
        u'\uFF1F': u'?',    # FULLWIDTH QUESTION MARK
        u'\uFF20': u'@',    # FULLWIDTH COMMERCIAL AT
        u'\uFF3B': u'[',    # FULLWIDTH LEFT SQUARE BRACKET
        u'\uFF3C': u'\\',   # FULLWIDTH REVERSE SOLIDUS
        u'\uFF3D': u']',    # FULLWIDTH RIGHT SQUARE BRACKET
        u'\uFF3E': u'^',    # FULLWIDTH CIRCUMFLEX ACCENT
        u'\uFF3F': u'_',    # FULLWIDTH LOW LINE
        u'\uFF40': u'`',    # FULLWIDTH GRAVE ACCENT
        u'\uFF5B': u'{',    # FULLWIDTH LEFT CURLY BRACKET
        u'\uFF5C': u'|',    # FULLWIDTH VERTICAL LINE
        u'\uFF5D': u'}',    # FULLWIDTH RIGHT CURLY BRACKET
        u'\uFF5E': u'~',    # FULLWIDTH TILDE
        u'\uFF5F': u'(',    # FULLWIDTH LEFT WHITE PARENTHESIS
        u'\uFF60': u')',    # FULLWIDTH RIGHT WHITE PARENTHESIS
        u'\uFFE4': u'|',    # FULLWIDTH BROKEN BAR
        # Small form variants
        u'\uFE50': u',',    # SMALL COMMA
        u'\uFE51': u',',    # SMALL IDEOGRAPHIC COMMA
        u'\uFE52': u'.',    # SMALL FULL STOP
        u'\uFE53': u';',    # SMALL SEMICOLON
        u'\uFE54': u':',    # SMALL COLON
        u'\uFE55': u'?',    # SMALL QUESTION MARK
        u'\uFE56': u'!',    # SMALL EXCLAMATION MARK
        u'\uFE57': u'-',    # SMALL EM DASH
        u'\uFE58': u'(',    # SMALL LEFT PARENTHESIS
        u'\uFE59': u')',    # SMALL RIGHT PARENTHESIS
        u'\uFE5A': u'{',    # SMALL LEFT CURLY BRACKET
        u'\uFE5B': u'}',    # SMALL RIGHT CURLY BRACKET
        u'\uFE5C': u'(',    # SMALL LEFT TORTOISE SHELL BRACKET
        u'\uFE5D': u')',    # SMALL RIGHT TORTOISE SHELL BRACKET
        u'\uFE5F': u'#',    # SMALL NUMBER SIGN
        u'\uFE60': u'&',    # SMALL AMPERSAND
        u'\uFE61': u'*',    # SMALL ASTERISK
        u'\uFE62': u'+',    # SMALL PLUS SIGN
        u'\uFE63': u'-',    # SMALL HYPHEN-MINUS
        u'\uFE64': u'<',    # SMALL LESS-THAN SIGN
        u'\uFE65': u'>',    # SMALL GREATER-THAN SIGN
        u'\uFE66': u'=',    # SMALL EQUALS SIGN
        u'\uFE68': u'\\',   # SMALL REVERSE SOLIDUS
        u'\uFE69': u'$',    # SMALL DOLLAR SIGN
        u'\uFE6A': u'%',    # SMALL PERCENT SIGN
        u'\uFE6B': u'@',    # SMALL COMMERCIAL AT
    }

    # The same mapping keyed by code point, which is the form translate()
    # expects; built once so every call replaces all characters in one pass.
    _TRANSLATION_TABLE = {
        ord(char): subst for char, subst in _REPLACEMENT_CHARS.items()
    }

    @staticmethod
    def isNumeric( s ):
        """ Check if a value is numeric. Floats, integers, decimal strings and negative values all return True

        Args:
            s (mixed)   Value to check

        Returns (bool):
            - int or float instance                        => True
            - bool                                         => False
            - text or byte string that float() can parse   => True
            - any other string                             => False
            - any other object                             => True only if str(s) consists solely of digits
        """
        # bool is a subclass of int but has never been reported as numeric here
        if isinstance( s, bool ):
            return False
        if isinstance( s, ( int, float ) ):
            return True
        # type(u'') is unicode on Python 2 and str on Python 3
        if isinstance( s, ( bytes, type( u'' ) ) ):
            # Every string int() accepts is also accepted by float(), so a single
            # attempt covers both integer and decimal notation
            try:
                float( s )
            except ValueError:
                return False
            return True
        # Legacy digit check for remaining types (e.g. a positive Python 2 long)
        return str( s ).isdigit()

    @staticmethod
    def is_ascii(s):
        """ Return True when every element of s is within the 7-bit ASCII range

        Args:
            s (str|bytes)   Value to inspect; an empty value yields True
        """
        # Iterating bytes on Python 3 yields ints, everything else yields characters
        return all( ( c if isinstance( c, int ) else ord( c ) ) < 128 for c in s )

    @staticmethod
    def _toText( value ):
        """ Convert any value to a text string (unicode on Python 2, str on Python 3)

        Byte strings are decoded as UTF-8; undecodable bytes become U+FFFD rather than raising.
        """
        if not isinstance( value, ( bytes, type( u'' ) ) ):
            value = str( value )
        if isinstance( value, bytes ):
            value = value.decode( 'utf-8', 'replace' )
        return value

    def _sanitizeStr( self, dirty_string, unknown_subst = '' ):
        """ Reduce a single value to an ASCII-only native string

        Args:
            dirty_string    (mixed) Value to clean; non-strings are converted with str() first
            unknown_subst   (str)   Replacement for each character without an entry in _REPLACEMENT_CHARS

        Returns (str):
            The cleaned string. It is ASCII-only as long as unknown_subst itself is ASCII
        """
        text = self._toText( dirty_string )

        # Only do any work if the string contains characters outside the ascii range (128)
        if not self.is_ascii( text ):
            # Swap every known character for its ASCII equivalent in a single pass
            text = text.translate( self._TRANSLATION_TABLE )
            # Whatever is still non-ascii is unknown: substitute it once per character (not once
            # per encoded byte), which leaves genuine '?' characters in the input untouched
            subst = self._toText( unknown_subst )
            text = u''.join( ch if ord( ch ) < 128 else subst for ch in text )

        # Python 2: hand back a byte str, as this method always has
        if not isinstance( text, str ):
            text = text.encode( 'utf-8' )
        return text

    def sanitizeData( self, data, unknown_subst = '' ):
        r""" Method to sanitize user input data. Can accept any type of data (str, int, list, dict, etc)

        Args:
            data            (mixed) Data to sanitize
            unknown_subst   (str)   Character to replace any unknown unicode characters with

        Returns (mixed):
            - list / dict:        sanitized IN PLACE (values only, dict keys are left alone) and returned;
                                  nested containers are processed recursively with the same unknown_subst
            - None, bool, number: returned unchanged
            - anything else:      converted to a string and returned as a sanitized str

        Example:
            >>> self.sanitizeData( u'\u2018' + 'In Quotes' + u'\u2019' )
            'In Quotes'
            >>> self.sanitizeData( 'No Unknown' + u'\u014A' + ' Char' )
            No Unknown Char
            >>> self.sanitizeData( 'Unknown Char: ' + u'\u014A', '?' )
            Unknown Char: ?
        """
        if isinstance( data, list ):
            for index, item in enumerate( data ):
                data[ index ] = self.sanitizeData( item, unknown_subst )
            return data

        if isinstance( data, dict ):
            # Snapshot the keys so the dict is not iterated while its values are reassigned
            for key in list( data ):
                data[ key ] = self.sanitizeData( data[ key ], unknown_subst )
            return data

        # Values that carry no text keep their type instead of being stringified
        if data is None or isinstance( data, bool ):
            return data

        # Strings are always cleaned, even numeric looking ones: non-ascii digits satisfy
        # isNumeric() and would otherwise be returned unsanitized
        is_string = isinstance( data, ( bytes, type( u'' ) ) )
        if not is_string and self.isNumeric( data ):
            return data

        return self._sanitizeStr( data, unknown_subst )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment