alexfriant · August 31, 2022 21:21
diff --git a/patternEyes.py b/patternEyes.py
 #####################################################################################
 #
 # Requirements: You'll need Python 3.5.1 or higher to run this
 # 
 # This script will provide you a basic understanding of the alphanumeric patterns
 # which exist in a list. You might get this list from a SQL query or something like
 # that.
 #
 # INPUT: Give this script a text file of ID-type strings, one per line. Commas
 # within a line are treated as separators between strings.
 # EXAMPLE (from command line):
 #   > python patternEyes.py "c:\temp\id_list.txt"
 #
 # OUTPUT: A printed report listing each pattern and how often it occurs. Digits are
 # shown as "#" and letters as "X"; all punctuation is left unchanged.
 #
 # For example, if you want to see if all records are phone numbers, you might expect
 # to see something like this:
 #         (###)-###-####
 # But if you also see something like this, you know the data isn't as "clean" as
 # you were hoping, requiring further investigation:
 #         ##-XXX-######
 #
 #####################################################################################

 import re, os.path, sys
 from collections import defaultdict
 from pathlib import Path


 def patternEyes( filePath = r'c:\temp\id_list.txt'):
     """Print a ranked report of the character patterns found in a file.

     Every line of the file is split on commas. In each resulting string,
     digits are replaced by "#" and the letters a-z (either case) by "X";
     all other characters are kept as they are. Identical patterns are
     counted and printed from most to least frequent, with ties listed in
     the order they were first seen. Empty strings are counted under the
     label "No Data".

     Args:
         filePath: Path of the text file to analyse.

     Returns:
         None. The report is written to stdout; if filePath is not an
         existing file, a short "no file here" message is printed instead.
     """
     input_file = filePath

     # Guard clause: nothing to analyse when the path is not a regular file.
     if not os.path.isfile(input_file):
         print("\nSorry, there is no file here: {}".format(input_file))
         return

     digit_re = re.compile(r'\d')
     alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

     pattern_counts = defaultdict(int)
     # The context manager closes the file even if reading raises part-way.
     with open(input_file, 'r') as handle:
         for line in handle:
             for string in line.strip('\n').split(','):
                 # Digits first, then letters: "#" and "X" never feed back
                 # into the digit pattern, so the order is safe.
                 pattern = alpha_re.sub('X', digit_re.sub('#', string))
                 pattern_counts[pattern if pattern != '' else 'No Data'] += 1

     # sorted() is stable, so equal counts keep first-seen order.
     pattern_rank = sorted(
         pattern_counts.items(), key=lambda item: item[1], reverse=True
     )

     print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
     print("\n{0:20} | {1:10}".format("PATTERN", "COUNT"))
     print("-"*30)
     for pattern, count in pattern_rank:
         print("{0:20} | {1:10}".format(pattern, str(count)))

 def main( inputs ):
     """Dispatch an argv-style list to patternEyes.

     When inputs holds exactly two entries, the second one is handed to
     patternEyes as the file path. For any other length patternEyes is
     called with no argument, so it uses its own default path.
     """
     if len(inputs) != 2:
         patternEyes()
         return
     patternEyes(inputs[1])
        
 # Script entry point: pass the raw argument vector to main().
 if __name__ == "__main__":
     main(sys.argv)
	#####################################################################################
	#
	# Requirements: You'll need Python 3.5.1 or higher to run this
	#
	# This script will provide you a basic understanding of the alphanumeric patterns
	# which exist in a list. You might get this list from a SQL query or something like
	# that.
	#
	# INPUT: Give this script a text file of ID-type strings, one per line. Commas
	# within a line are treated as separators between strings.
	# EXAMPLE (from command line):
	# > python patternEyes.py "c:\temp\id_list.txt"
	#
	# OUTPUT: A printed report listing each pattern and how often it occurs. Digits are
	# shown as "#" and letters as "X"; all punctuation is left unchanged.
	#
	# For example, if you want to see if all records are phone numbers, you might expect
	# to see something like this:
	# (###)-###-####
	# But if you also see something like this, you know the data isn't as "clean" as
	# you were hoping, requiring further investigation:
	# ##-XXX-######
	#
	#####################################################################################

	import re, os.path, sys
	from collections import defaultdict
	from pathlib import Path


	def patternEyes( filePath = r'c:\temp\id_list.txt'):
	    """Print a ranked report of the character patterns found in a file.

	    Every line of the file is split on commas. In each resulting string,
	    digits are replaced by "#" and the letters a-z (either case) by "X";
	    all other characters are kept as they are. Identical patterns are
	    counted and printed from most to least frequent, with ties listed in
	    the order they were first seen. Empty strings are counted under the
	    label "No Data".

	    Args:
	        filePath: Path of the text file to analyse.

	    Returns:
	        None. The report is written to stdout; if filePath is not an
	        existing file, a short "no file here" message is printed instead.
	    """
	    input_file = filePath

	    # Guard clause: nothing to analyse when the path is not a regular file.
	    if not os.path.isfile(input_file):
	        print("\nSorry, there is no file here: {}".format(input_file))
	        return

	    digit_re = re.compile(r'\d')
	    alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

	    pattern_counts = defaultdict(int)
	    # The context manager closes the file even if reading raises part-way.
	    with open(input_file, 'r') as handle:
	        for line in handle:
	            for string in line.strip('\n').split(','):
	                # Digits first, then letters: "#" and "X" never feed back
	                # into the digit pattern, so the order is safe.
	                pattern = alpha_re.sub('X', digit_re.sub('#', string))
	                pattern_counts[pattern if pattern != '' else 'No Data'] += 1

	    # sorted() is stable, so equal counts keep first-seen order.
	    pattern_rank = sorted(
	        pattern_counts.items(), key=lambda item: item[1], reverse=True
	    )

	    print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
	    # Plain "|" separator: a backslash before it would be printed literally.
	    print("\n{0:20} | {1:10}".format("PATTERN", "COUNT"))
	    print("-"*30)
	    for pattern, count in pattern_rank:
	        print("{0:20} | {1:10}".format(pattern, str(count)))

	def main( inputs ):
	    """Dispatch an argv-style list to patternEyes.

	    When inputs holds exactly two entries, the second one is handed to
	    patternEyes as the file path. For any other length patternEyes is
	    called with no argument, so it uses its own default path.
	    """
	    if len( inputs ) == 2:
	        patternEyes( inputs[1] )
	    else:
	        patternEyes()

	# Script entry point: pass the raw argument vector to main().
	if __name__ == "__main__":
	    main(sys.argv)