Created
November 24, 2024 23:00
-
-
Save WomB0ComB0/b45a78eb6efd75173958b9f2b03c3e8e to your computer and use it in GitHub Desktop.
Python script that takes a flat (one-dimensional) collection of terms and suggests the regex patterns that best match the inputted data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from collections import Counter
from typing import Dict, List, Tuple
def analyze_patterns(
    terms_dict: Dict[str, str],
) -> Tuple[Dict[str, Dict[str, int]], List[str]]:
    """Analyze the keys of *terms_dict* for common prefixes, suffixes, and infixes.

    Args:
        terms_dict: Mapping of terms to their descriptions. Only the keys
            (the terms themselves) are analyzed; values are ignored.

    Returns:
        A tuple ``(common_patterns, suggested_patterns)``:

        * ``common_patterns`` maps the categories ``"prefixes"``,
          ``"suffixes"`` and ``"infixes"`` to ``{pattern: count}`` dicts,
          keeping only patterns seen at least 3 times.
        * ``suggested_patterns`` is a list of regex fragments built from the
          highest-frequency patterns — anchored ``^prefix`` / ``suffix$``
          fragments plus bare infixes, with regex metacharacters escaped.
    """
    # Lowercase every term so pattern counting is case-insensitive.
    words = [word.lower() for word in terms_dict.keys()]

    # Count prefixes of length 1-4.
    prefix_patterns = Counter()
    for length in range(1, 5):
        prefix_patterns.update(
            word[:length] for word in words if len(word) >= length
        )

    # Count suffixes of length 2-4.
    suffix_patterns = Counter()
    for length in range(2, 5):
        suffix_patterns.update(
            word[-length:] for word in words if len(word) >= length
        )

    # Count internal substrings (infixes) of length 3-6.
    infix_patterns = Counter()
    for word in words:
        for i in range(len(word) - 2):
            for j in range(i + 3, min(i + 7, len(word) + 1)):
                infix_patterns[word[i:j]] += 1

    # Keep only patterns that occur at least 3 times.
    common_patterns = {
        "prefixes": {k: v for k, v in prefix_patterns.items() if v >= 3},
        "suffixes": {k: v for k, v in suffix_patterns.items() if v >= 3},
        "infixes": {k: v for k, v in infix_patterns.items() if v >= 3},
    }

    suggested_patterns: List[str] = []

    # Top 5 prefixes (length >= 2), anchored at the start.  re.escape guards
    # against terms that contain regex metacharacters such as '.' or '+'.
    for prefix, _ in sorted(
        common_patterns["prefixes"].items(), key=lambda x: x[1], reverse=True
    )[:5]:
        if len(prefix) >= 2:  # single characters are too weak as anchors
            suggested_patterns.append(f"^{re.escape(prefix)}")

    # Top 5 suffixes (length >= 2), anchored at the end.
    for suffix, _ in sorted(
        common_patterns["suffixes"].items(), key=lambda x: x[1], reverse=True
    )[:5]:
        if len(suffix) >= 2:
            suggested_patterns.append(f"{re.escape(suffix)}$")

    # Top 10 infixes, unanchored.  All infixes are length 3-6 by
    # construction, so no extra length filter is needed.
    for infix, _ in sorted(
        common_patterns["infixes"].items(), key=lambda x: x[1], reverse=True
    )[:10]:
        suggested_patterns.append(re.escape(infix))

    return common_patterns, suggested_patterns
# Sample data: replace with real input.  analyze_patterns expects a
# dict[str, str] mapping terms to descriptions (only the keys are analyzed).
terms = {}

patterns, suggested_regex = analyze_patterns(terms)

print("Most Common Patterns:")

# Report the top 10 entries of each pattern category, most frequent first.
for header, category in (
    ("\nPrefixes:", "prefixes"),
    ("\nSuffixes:", "suffixes"),
    ("\nCommon Infixes:", "infixes"),
):
    print(header)
    ranked = sorted(patterns[category].items(), key=lambda kv: kv[1], reverse=True)
    for part, count in ranked[:10]:
        print(f"'{part}': {count} occurrences")

print("\nSuggested Regex Patterns:")
for pattern in suggested_regex:
    print(f"/.*{pattern}.*/i")

# Fold every suggestion into one alternation regex (JS-style /.../i literal).
optimized_regex = "/" + "|".join(suggested_regex) + "/i"
print("\nOptimized Combined Regex:")
print(optimized_regex)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment