Skip to content

Instantly share code, notes, and snippets.

@WomB0ComB0
Created November 24, 2024 23:00
Show Gist options
  • Save WomB0ComB0/b45a78eb6efd75173958b9f2b03c3e8e to your computer and use it in GitHub Desktop.
Save WomB0ComB0/b45a78eb6efd75173958b9f2b03c3e8e to your computer and use it in GitHub Desktop.
Python script that takes a one-dimensional collection of terms and suggests regex patterns matching the most common prefixes, suffixes, and infixes in that data
from collections import Counter
from typing import Dict, List, Tuple
# Characters that carry special meaning in regular-expression syntax.
_REGEX_METACHARACTERS = frozenset("\\^$.|?*+()[]{}")


def _escape_literal(fragment: str) -> str:
    """Return *fragment* with every regex metacharacter backslash-escaped."""
    # Only true metacharacters are escaped, so ordinary characters such as
    # spaces stay readable in the suggested patterns.
    return "".join(
        "\\" + char if char in _REGEX_METACHARACTERS else char
        for char in fragment
    )


def _top_patterns(counts: Dict[str, int], min_length: int, limit: int) -> List[str]:
    """Return up to *limit* escaped keys of *counts*, most frequent first.

    Keys shorter than *min_length* are dropped BEFORE the cut-off is applied,
    so they cannot use up slots in the result. Ties keep the insertion order
    of *counts* (the sort is stable).
    """
    eligible = [item for item in counts.items() if len(item[0]) >= min_length]
    eligible.sort(key=lambda item: item[1], reverse=True)
    return [_escape_literal(fragment) for fragment, _ in eligible[:limit]]


def analyze_patterns(
    terms_dict: Dict[str, str],
) -> Tuple[Dict[str, Dict[str, int]], List[str]]:
    """
    Analyzes terms to find common patterns and potential regex optimizations.

    Only the keys of ``terms_dict`` are inspected; the values are ignored.
    Keys are lower-cased first, so keys that differ only by case count as
    repeated occurrences of the same word.

    Returns a tuple ``(common_patterns, suggested_patterns)``:

    * ``common_patterns`` maps ``"prefixes"``, ``"suffixes"`` and
      ``"infixes"`` to ``{fragment: count}`` dicts holding every fragment
      seen at least 3 times. Prefixes are 1-4 characters long, suffixes
      2-4 characters, and infixes are substrings of 3-6 characters taken at
      every position of every word. Fragments are stored unescaped.
    * ``suggested_patterns`` is a list of regex source strings with
      metacharacters escaped: up to 5 ``^prefix`` patterns (prefixes of
      length 2 or more), up to 5 ``suffix$`` patterns and up to 10 bare
      infixes, each group ordered from most to least frequent.
    """
    # Convert all terms to lowercase for consistency
    words = [word.lower() for word in terms_dict]

    prefix_patterns: Counter = Counter()
    suffix_patterns: Counter = Counter()
    infix_patterns: Counter = Counter()

    # Count length by length (not word by word) so that shorter fragments are
    # inserted first; insertion order decides ties when ranking later.
    for length in range(1, 5):
        prefix_patterns.update(
            word[:length] for word in words if len(word) >= length
        )

    for length in range(2, 5):
        suffix_patterns.update(
            word[-length:] for word in words if len(word) >= length
        )

    # Every substring of 3 to 6 characters, at every start position.
    for word in words:
        for start in range(len(word) - 2):
            for end in range(start + 3, min(start + 7, len(word) + 1)):
                infix_patterns[word[start:end]] += 1

    # Filter patterns by frequency
    common_patterns = {
        "prefixes": {k: v for k, v in prefix_patterns.items() if v >= 3},
        "suffixes": {k: v for k, v in suffix_patterns.items() if v >= 3},
        "infixes": {k: v for k, v in infix_patterns.items() if v >= 3},
    }

    # Single-character prefixes are excluded before the top-5 cut: their
    # counts are never lower than those of the longer prefixes they start,
    # so they would otherwise fill the top slots and then be thrown away.
    suggested_patterns = [
        f"^{prefix}"
        for prefix in _top_patterns(common_patterns["prefixes"], 2, 5)
    ]
    suggested_patterns += [
        f"{suffix}$"
        for suffix in _top_patterns(common_patterns["suffixes"], 2, 5)
    ]
    suggested_patterns += _top_patterns(common_patterns["infixes"], 3, 10)

    return common_patterns, suggested_patterns
# ❗❗❗Sample data
# Takes in dict[str, str]
terms = {}

patterns, suggested_regex = analyze_patterns(terms)

print("Most Common Patterns:")
# One report section per pattern family: (heading, key in `patterns`).
for heading, family in (
    ("\nPrefixes:", "prefixes"),
    ("\nSuffixes:", "suffixes"),
    ("\nCommon Infixes:", "infixes"),
):
    print(heading)
    ranked = sorted(patterns[family].items(), key=lambda x: x[1], reverse=True)
    # Show at most the ten most frequent fragments of each family.
    for fragment, count in ranked[:10]:
        print(f"'{fragment}': {count} occurrences")

# The patterns are printed as slash-delimited literals, so a "/" inside a
# pattern has to be escaped or it would terminate the literal early.
literal_safe = [pattern.replace("/", "\\/") for pattern in suggested_regex]

print("\nSuggested Regex Patterns:")
for pattern in literal_safe:
    print(f"/.*{pattern}.*/i")

# Joining zero patterns would render as "//i", an empty pattern rather than
# a useful suggestion, so the combined regex is left empty in that case.
if literal_safe:
    optimized_regex = f"/{'|'.join(literal_safe)}/i"
else:
    optimized_regex = ""

print("\nOptimized Combined Regex:")
print(optimized_regex if optimized_regex else "(no patterns found)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment