Last active
August 31, 2022 21:21
-
-
Save alexfriant/3d1bc1102d770361d21e66850a1146aa to your computer and use it in GitHub Desktop.
This Python script will provide a summary of alphanumeric patterns which exist in a list of values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#####################################################################################
#
# Requirements: You'll need Python 3.5.1 or higher to run this.
#
# This script will give you a basic understanding of the alphanumeric patterns
# which exist in a list of values. You might get this list from a SQL query or
# something like that.
#
# INPUT: Give this script a file that has a single column of ID type strings.
# EXAMPLE (from command line):
# > python patternEyes.py "c:\temp\id_list.txt"
#
# OUTPUT: Comes in the form of a printed report in which every digit is converted
# to "#" and every alpha character to "X". All punctuation stays as it exists.
#
# For example, if you want to see if all records are phone numbers, you might expect
# to see something like this:
#     (###)-###-####
# But if you also see something like this, you know the data isn't as "clean" as
# you were hoping, requiring further investigation:
#     ##-XXX-######
#
#####################################################################################
import os.path
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
def patternEyes(filePath=r'c:\temp\id_list.txt'):
    """Print a ranked summary of the alphanumeric patterns found in a file.

    Every line of the file is split on commas. In each resulting value,
    digits are replaced with "#" and the letters a-z (matched
    case-insensitively) with "X"; all other characters are kept as they
    are. Values that are empty are counted under the label "No Data".
    The distinct patterns are printed with their counts, most frequent
    first.

    Args:
        filePath: Path of the text file to analyse.

    Returns:
        None. The report, or a short message when ``filePath`` is not an
        existing file, is written to standard output.
    """
    input_file = filePath

    # Guard clause: nothing to analyse when the path is not a regular file.
    if not os.path.isfile(input_file):
        print("\nSorry, there is no file here: {}".format(input_file))
        return

    digit_re = re.compile(r'\d')
    alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

    pattern_counts = Counter()
    # The context manager closes the file even if reading raises part-way.
    with open(input_file, 'r') as source:
        for line in source:
            for value in line.strip('\n').split(','):
                # Digits are masked first, then letters.
                pattern = alpha_re.sub('X', digit_re.sub('#', value))
                pattern_counts[pattern or 'No Data'] += 1

    row = "{0:20} | {1:10}"
    print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
    print("\n" + row.format("PATTERN", "COUNT"))
    print("-" * 30)
    # most_common() sorts by descending count.
    for pattern, count in pattern_counts.most_common():
        print(row.format(pattern, str(count)))
def main(inputs):
    """Run the pattern report for a command-line style argument list.

    Args:
        inputs: Argument sequence such as ``sys.argv``. When it holds
            exactly two items, the second is passed to ``patternEyes``
            as the file path; for any other length ``patternEyes`` is
            called with its default path.
    """
    if len(inputs) != 2:
        patternEyes()
        return
    patternEyes(inputs[1])
# Run the report only when this file is executed as a script.
if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment