Skip to content

Instantly share code, notes, and snippets.

@jhyland87
Created November 30, 2016 21:53
Show Gist options
  • Save jhyland87/1f47dfb8123a2f78a2705bc02644c180 to your computer and use it in GitHub Desktop.
Save jhyland87/1f47dfb8123a2f78a2705bc02644c180 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import config
import string
import sys
# Python 2 only: force the process-wide implicit str<->unicode codec to UTF-8.
# ``site`` deletes ``sys.setdefaultencoding`` at interpreter start-up, so the
# module has to be reloaded to get the function back before calling it.
# Python 3 has no ``reload`` builtin and its default encoding is already
# UTF-8, so the hack is skipped there instead of raising NameError on import.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")
class BaseServer(Lockable):
    """Server base class carrying helpers that reduce user input to plain ASCII.

    The helpers are written to behave the same on Python 2 and Python 3 and do
    not depend on the interpreter's default encoding: byte strings are decoded
    as UTF-8 explicitly before they are inspected.
    """

    _logger = config.default_logger

    # The text (unicode) string type: ``unicode`` on Python 2, ``str`` on Python 3.
    _TEXT_TYPE = type(u'')

    # Unicode Replacement Dictionary
    #
    # Maps a single non-ASCII character to the ASCII text that replaces it.
    # Most of the common unicode characters should be included here. A list of
    # commonly used unicode characters can be found here:
    # https://gist.github.com/jhyland87/5f7ed9d91875280e4f973a033dcf8db0
    #
    # NOTE: this description must stay a comment. As a string literal placed
    # inside the braces it is implicitly concatenated with the first key, which
    # silently disables that entry.
    _REPLACEMENT_CHARS = {
        # General Punctuation
        u'\u2016': '|',    # DOUBLE VERTICAL LINE
        u'\u2017': '_',    # DOUBLE LOW LINE
        u'\u2022': '-',    # BULLET
        u'\u2024': '-',    # ONE DOT LEADER
        u'\u2025': '-',    # TWO DOT LEADER
        u'\u2026': '-',    # HORIZONTAL ELLIPSIS (three dot leader)
        u'\u2027': '-',    # HYPHENATION POINT
        u'\u2032': "'",    # PRIME (minutes, feet)
        u'\u2033': '"',    # DOUBLE PRIME (seconds, inches)
        u'\u2034': "'''",  # TRIPLE PRIME (old measure, 1/12 of an inch)
        u'\u2038': '^',    # CARET
        u'\u2053': '~',    # SWUNG DASH
        u'\u2055': '*',    # FLOWER PUNCTUATION MARK
        u'\u204F': ';',    # REVERSED SEMICOLON
        # Quotation marks and apostrophe
        u'\u2018': "'",    # LEFT SINGLE QUOTATION MARK
        u'\u2019': "'",    # RIGHT SINGLE QUOTATION MARK
        u'\u201A': "'",    # SINGLE LOW-9 QUOTATION MARK
        u'\u201B': "'",    # SINGLE HIGH-REVERSED-9 QUOTATION MARK
        u'\u201C': '"',    # LEFT DOUBLE QUOTATION MARK
        u'\u201D': '"',    # RIGHT DOUBLE QUOTATION MARK
        u'\u201E': '"',    # DOUBLE LOW-9 QUOTATION MARK
        u'\u201F': '"',    # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
        # Dashes/Hyphens
        u'\u2010': '-',    # HYPHEN
        u'\u2011': '-',    # NON-BREAKING HYPHEN
        u'\u2012': '-',    # FIGURE DASH
        u'\u2013': '-',    # EN DASH
        u'\u2014': '-',    # EM DASH
        u'\u2015': '-',    # HORIZONTAL BAR
        # Angle quotation marks
        u'\u2039': '<',    # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        u'\u203A': '>',    # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        # Punctuation marks
        u'\u203C': '!!',   # DOUBLE EXCLAMATION MARK
        u'\u2048': '?!',   # QUESTION EXCLAMATION MARK
        u'\u2049': '!?',   # EXCLAMATION QUESTION MARK
        # Math stuff
        u'\u00D7': 'x',    # MULTIPLICATION SIGN
        u'\u2044': '/',    # FRACTION SLASH
        # Bracket pieces
        u'\u239B': '(',    # LEFT PARENTHESIS UPPER HOOK
        u'\u239C': '(',    # LEFT PARENTHESIS EXTENSION
        u'\u239D': '(',    # LEFT PARENTHESIS LOWER HOOK
        u'\u239E': ')',    # RIGHT PARENTHESIS UPPER HOOK
        u'\u239F': ')',    # RIGHT PARENTHESIS EXTENSION
        u'\u23A0': ')',    # RIGHT PARENTHESIS LOWER HOOK
        u'\u23A2': '|',    # LEFT SQUARE BRACKET EXTENSION
        u'\u23A5': '|',    # RIGHT SQUARE BRACKET EXTENSION
        u'\u23AA': '|',    # CURLY BRACKET EXTENSION
        # Light and heavy dashed lines
        u'\u2504': '-',    # BOX DRAWINGS LIGHT TRIPLE DASH HORIZONTAL
        u'\u2505': '-',    # BOX DRAWINGS HEAVY TRIPLE DASH HORIZONTAL
        u'\u2506': '|',    # BOX DRAWINGS LIGHT TRIPLE DASH VERTICAL
        u'\u2507': '|',    # BOX DRAWINGS HEAVY TRIPLE DASH VERTICAL
        u'\u2508': '-',    # BOX DRAWINGS LIGHT QUADRUPLE DASH HORIZONTAL
        u'\u2509': '-',    # BOX DRAWINGS HEAVY QUADRUPLE DASH HORIZONTAL
        u'\u250A': '|',    # BOX DRAWINGS LIGHT QUADRUPLE DASH VERTICAL
        u'\u250B': '|',    # BOX DRAWINGS HEAVY QUADRUPLE DASH VERTICAL
        u'\u254C': '-',    # BOX DRAWINGS LIGHT DOUBLE DASH HORIZONTAL
        u'\u254D': '-',    # BOX DRAWINGS HEAVY DOUBLE DASH HORIZONTAL
        u'\u254E': '|',    # BOX DRAWINGS LIGHT DOUBLE DASH VERTICAL
        u'\u254F': '|',    # BOX DRAWINGS HEAVY DOUBLE DASH VERTICAL
        # Character cell diagonals
        u'\u2571': '/',    # BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
        u'\u2572': '\\',   # BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
        u'\u2573': 'X',    # BOX DRAWINGS LIGHT DIAGONAL CROSS
        # Superscripts
        u'\u207D': '(',    # SUPERSCRIPT LEFT PARENTHESIS
        u'\u207E': ')',    # SUPERSCRIPT RIGHT PARENTHESIS
        u'\u207C': '=',    # SUPERSCRIPT EQUALS SIGN
        # Latin-1 punctuation
        u'\xab': '<<',     # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
        u'\xbb': '>>',     # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
        u'\xa6': '|',      # BROKEN BAR
        u'\xb7': '-',      # MIDDLE DOT
        # Other..
        u'\u02DC': '~',    # SMALL TILDE
        u'\u00A0': ' ',    # NO-BREAK SPACE
        u'\ufffd': '-',    # REPLACEMENT CHARACTER (original note: shows up where
                           # MS Word text had a hyphen-like character)
        # Fullwidth ASCII variants
        u'\uFF01': '!',    # FULLWIDTH EXCLAMATION MARK
        u'\uFF02': '"',    # FULLWIDTH QUOTATION MARK
        u'\uFF03': '#',    # FULLWIDTH NUMBER SIGN
        u'\uFF04': '$',    # FULLWIDTH DOLLAR SIGN
        u'\uFF05': '%',    # FULLWIDTH PERCENT SIGN
        u'\uFF06': '&',    # FULLWIDTH AMPERSAND
        u'\uFF07': "'",    # FULLWIDTH APOSTROPHE
        u'\uFF08': '(',    # FULLWIDTH LEFT PARENTHESIS
        u'\uFF09': ')',    # FULLWIDTH RIGHT PARENTHESIS
        u'\uFF0A': '*',    # FULLWIDTH ASTERISK
        u'\uFF0B': '+',    # FULLWIDTH PLUS SIGN
        u'\uFF0C': ',',    # FULLWIDTH COMMA
        u'\uFF0D': '-',    # FULLWIDTH HYPHEN-MINUS
        u'\uFF0E': '.',    # FULLWIDTH FULL STOP
        u'\uFF0F': '/',    # FULLWIDTH SOLIDUS
        u'\uFF1A': ':',    # FULLWIDTH COLON
        u'\uFF1B': ';',    # FULLWIDTH SEMICOLON
        u'\uFF1C': '<',    # FULLWIDTH LESS-THAN SIGN
        u'\uFF1D': '=',    # FULLWIDTH EQUALS SIGN
        u'\uFF1E': '>',    # FULLWIDTH GREATER-THAN SIGN
        u'\uFF1F': '?',    # FULLWIDTH QUESTION MARK
        u'\uFF20': '@',    # FULLWIDTH COMMERCIAL AT
        u'\uFF3B': '[',    # FULLWIDTH LEFT SQUARE BRACKET
        u'\uFF3C': '\\',   # FULLWIDTH REVERSE SOLIDUS
        u'\uFF3D': ']',    # FULLWIDTH RIGHT SQUARE BRACKET
        u'\uFF3E': '^',    # FULLWIDTH CIRCUMFLEX ACCENT
        u'\uFF3F': '_',    # FULLWIDTH LOW LINE
        u'\uFF40': '`',    # FULLWIDTH GRAVE ACCENT
        u'\uFF5B': '{',    # FULLWIDTH LEFT CURLY BRACKET
        u'\uFF5C': '|',    # FULLWIDTH VERTICAL LINE
        u'\uFF5D': '}',    # FULLWIDTH RIGHT CURLY BRACKET
        u'\uFF5E': '~',    # FULLWIDTH TILDE
        u'\uFF5F': '(',    # FULLWIDTH LEFT WHITE PARENTHESIS
        u'\uFF60': ')',    # FULLWIDTH RIGHT WHITE PARENTHESIS
        u'\uFFE4': '|',    # FULLWIDTH BROKEN BAR
        # Small form variants (U+FE53 is unassigned, so the run resumes at U+FE54)
        u'\uFE50': ',',    # SMALL COMMA
        u'\uFE51': ',',    # SMALL IDEOGRAPHIC COMMA
        u'\uFE52': '.',    # SMALL FULL STOP
        u'\uFE54': ';',    # SMALL SEMICOLON
        u'\uFE55': ':',    # SMALL COLON
        u'\uFE56': '?',    # SMALL QUESTION MARK
        u'\uFE57': '!',    # SMALL EXCLAMATION MARK
        u'\uFE58': '-',    # SMALL EM DASH
        u'\uFE59': '(',    # SMALL LEFT PARENTHESIS
        u'\uFE5A': ')',    # SMALL RIGHT PARENTHESIS
        u'\uFE5B': '{',    # SMALL LEFT CURLY BRACKET
        u'\uFE5C': '}',    # SMALL RIGHT CURLY BRACKET
        u'\uFE5D': '(',    # SMALL LEFT TORTOISE SHELL BRACKET
        u'\uFE5E': ')',    # SMALL RIGHT TORTOISE SHELL BRACKET
        u'\uFE5F': '#',    # SMALL NUMBER SIGN
        u'\uFE60': '&',    # SMALL AMPERSAND
        u'\uFE61': '*',    # SMALL ASTERISK
        u'\uFE62': '+',    # SMALL PLUS SIGN
        u'\uFE63': '-',    # SMALL HYPHEN-MINUS
        u'\uFE64': '<',    # SMALL LESS-THAN SIGN
        u'\uFE65': '>',    # SMALL GREATER-THAN SIGN
        u'\uFE66': '=',    # SMALL EQUALS SIGN
        u'\uFE68': '\\',   # SMALL REVERSE SOLIDUS
        u'\uFE69': '$',    # SMALL DOLLAR SIGN
        u'\uFE6A': '%',    # SMALL PERCENT SIGN
        u'\uFE6B': '@',    # SMALL COMMERCIAL AT
    }

    @staticmethod
    def isNumeric(s):
        """Check whether a value is numeric.

        Args:
            s (mixed) Value to check

        Returns (bool):
            - ``int`` / ``float`` instances => True (``bool`` => False)
            - Strings (byte or unicode) that ``float()`` can parse, which
              covers integers, decimals, negative values and exponents => True
            - Any other object => True only when ``str(s)`` consists solely of
              digits (this is what keeps a Python 2 ``long`` numeric)
            - Everything else => False
        """
        # bool subclasses int, but True/False are not treated as numbers here.
        if isinstance(s, bool):
            return False
        if isinstance(s, (int, float)):
            return True
        if isinstance(s, (bytes, type(u''))):
            # float() accepts every decimal integer literal int() accepts, so a
            # single conversion attempt is enough.
            try:
                float(s)
            except ValueError:
                return False
            return True
        return str(s).isdigit()

    @staticmethod
    def is_ascii(s):
        """Return True when every character (or byte) of ``s`` is below 128.

        Args:
            s (str|unicode|bytes|bytearray) Sequence to inspect

        Returns (bool):
            True for pure-ASCII input, including the empty string
        """
        if isinstance(s, (bytes, bytearray)):
            # Iterating bytes yields ints on Python 3 but 1-char strings on
            # Python 2; bytearray yields ints on both.
            return all(byte < 128 for byte in bytearray(s))
        return all(ord(c) < 128 for c in s)

    @staticmethod
    def _toText(value):
        """Coerce any value to a text (unicode) string, decoding bytes as UTF-8."""
        if isinstance(value, type(u'')):
            return value
        if not isinstance(value, bytes):
            value = str(value)
        # On Python 2 str() returns bytes, so this branch also covers the
        # stringified objects from the line above.
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'replace')
        return value

    def _sanitizeStr(self, dirty_string, unknown_subst=''):
        """Return ``dirty_string`` as a native ``str`` with non-ASCII characters replaced.

        Characters listed in ``_REPLACEMENT_CHARS`` become their ASCII
        equivalent; every other non-ASCII character becomes ``unknown_subst``
        (one substitution per character). ASCII characters, including '?',
        are left untouched.
        """
        text = self._toText(dirty_string)
        # Only rewrite the string if it contains characters outside the ASCII range.
        if not self.is_ascii(text):
            fallback = self._toText(unknown_subst)
            table = self._REPLACEMENT_CHARS
            text = u''.join(
                char if ord(char) < 128 else table.get(char, fallback)
                for char in text
            )
        if str is bytes:
            # Python 2: hand back a byte string, as callers of the original
            # implementation received.
            return text.encode('utf-8')
        return text

    def sanitizeData(self, data, unknown_subst=''):
        r""" Method to sanitize user input data. Can accept any type of data (str, int, list, dict, etc)

        Args:
            data            (mixed) Data to sanitize
            unknown_subst   (str)   Text to replace any unknown unicode characters with

        Returns (mixed):
            Returns a sanitized version of the provided data:
            - None, booleans and numeric values are returned unchanged
            - Lists and dicts are sanitized recursively IN PLACE (dict values
              only, keys are left as they are) and the same object is returned
            - Strings come back as a native ``str``
            - Any other object is converted with ``str()`` and sanitized

        Example:
            >>> self.sanitizeData( u'\u2018' + 'In Quotes' + u'\u2019' )
            'In Quotes'
            >>> self.sanitizeData( 'No Unknown' + u'\u014A' + ' Char' )
            No Unknown Char
            >>> self.sanitizeData( 'Unknown Char: ' +u'\u014A', '?' )
            Unknown Char: ?
        """
        # Stringifying these would turn them into 'None' / 'True' / 'False'.
        if data is None or isinstance(data, bool):
            return data

        if isinstance(data, list):
            for index, item in enumerate(data):
                # Forward unknown_subst so nested values use the caller's choice.
                data[index] = self.sanitizeData(item, unknown_subst)
            return data

        if isinstance(data, dict):
            # Snapshot the keys; only values are reassigned.
            for key in list(data):
                data[key] = self.sanitizeData(data[key], unknown_subst)
            return data

        if self.isNumeric(data):
            is_string = isinstance(data, (bytes, self._TEXT_TYPE))
            # A numeric string made of non-ASCII digits still needs sanitizing.
            if not is_string or self.is_ascii(data):
                return data

        return self._sanitizeStr(data, unknown_subst)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment