Skip to content

Instantly share code, notes, and snippets.

Last active February 8, 2025 22:02
Show Gist options
  • Save jasonsnell/7df8d908b6c466e916c3ccce7a9e736d to your computer and use it in GitHub Desktop.
Save jasonsnell/7df8d908b6c466e916c3ccce7a9e736d to your computer and use it in GitHub Desktop.
Count Duplicates in List
import subprocess
import re
import sys
from Levenshtein import ratio
from collections import defaultdict, Counter
# Configuration: Adjust as needed

# True  -> include parentheticals in the output labels; False -> exclude them.
INCLUDE_PARENTHESES = False

# True  -> copy the output to the clipboard; False -> print it to the console.
OUTPUT_TO_CLIPBOARD = True
def get_clipboard_data():
    """Retrieve clipboard content using pbpaste on macOS.

    Returns:
        The text printed by ``pbpaste`` with leading/trailing whitespace
        stripped, or an empty string when ``pbpaste`` cannot be started or
        exits with a non-zero status (the error is printed, not raised).
    """
    try:
        # Pass an argument list so no shell is involved in launching the tool.
        return subprocess.check_output(["pbpaste"], universal_newlines=True).strip()
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/unexecutable ``pbpaste`` binary, which would
        # otherwise escape the best-effort "return empty string" contract.
        print("Error accessing clipboard:", e)
        return ""
def set_clipboard_data(data):
    """Set clipboard content using pbcopy on macOS.

    Args:
        data: Text to place on the clipboard; it is sent to ``pbcopy`` on
            stdin encoded as UTF-8.

    Returns:
        None. This is best-effort: any failure (tool missing, non-zero exit,
        encoding problem) is printed instead of raised.
    """
    try:
        # run() writes the payload, closes stdin and waits for pbcopy to exit;
        # check=True turns a non-zero exit status into a reported error.
        subprocess.run(["pbcopy"], input=data.encode("utf-8"), check=True)
    except Exception as e:  # deliberate catch-all: never crash on clipboard errors
        print("Error setting clipboard:", e)
def normalize_response(response):
    """Normalize a response so near-identical spellings compare equal.

    Steps, in order:
      1. strip surrounding whitespace and lowercase;
      2. delete every character that is not ``a-z``, ``0-9`` or whitespace;
      3. collapse whitespace runs to one space and trim the ends;
      4. drop a single leading ``"the "``.

    Args:
        response: Raw response text.

    Returns:
        The normalized string (possibly empty).
    """
    response = response.strip().lower()
    # NOTE: the character class is ASCII-only, so accented/non-Latin letters
    # are removed together with punctuation.
    response = re.sub(r"[^a-z0-9\s]", "", response)  # Remove punctuation
    # Collapse and trim BEFORE the article check: removing punctuation can
    # leave stray leading spaces ("- hello"), and "the" may be followed by
    # several spaces or a tab, both of which previously leaked a leading
    # space into the result.
    response = re.sub(r"\s+", " ", response).strip()
    if response.startswith("the "):
        response = response[4:]  # Remove "the " if present
    return response
def group_similar_responses(responses, threshold=0.75):
    """Group near-duplicate responses and count the members of each group.

    Each non-blank response is normalized with ``normalize_response`` and
    compared, via ``Levenshtein.ratio``, against the normalized key of every
    existing group in creation order. It joins the first group whose score
    is at least ``threshold``; otherwise it starts a new group.

    Args:
        responses: Iterable of raw response strings. Blank entries are skipped.
        threshold: Minimum ``ratio`` score for joining an existing group.

    Returns:
        A dict mapping a display label to the number of responses in the
        group. The label is the title-cased most frequent spelling; when
        ``INCLUDE_PARENTHESES`` is true and other spellings exist, they are
        appended in parentheses, sorted and comma-separated.
    """
    grouped_responses = defaultdict(list)

    # Iterate through all non-blank responses
    for response in responses:
        cleaned = response.strip()
        if not cleaned:  # Skip blank lines
            continue
        normalized_response = normalize_response(cleaned)

        # Compare with existing groups; first sufficiently similar key wins.
        for key in grouped_responses:
            if ratio(normalized_response, key) >= threshold:
                grouped_responses[key].append(cleaned)
                break
        else:
            # No similar group found, create a new group
            grouped_responses[normalized_response].append(cleaned)

    # Format grouped_responses with unique, case-insensitive coalesced responses
    formatted_groups = {}
    for items in grouped_responses.values():
        # Count frequency of each variant, ignoring case
        count_variants = Counter(map(str.lower, items))
        most_common = count_variants.most_common(1)[0][0]  # Most frequent item
        most_common_title = next(
            item for item in items if item.lower() == most_common
        ).title()

        # List other unique variants, excluding the most common one
        other_variants = sorted(
            {item.title() for item in items if item.lower() != most_common}
        )

        if INCLUDE_PARENTHESES and other_variants:
            label = f"{most_common_title} ({', '.join(other_variants)})"
        else:
            label = most_common_title

        # Accumulate rather than assign: two distinct groups can end up with
        # the same label, and plain assignment would drop one group's count.
        formatted_groups[label] = formatted_groups.get(label, 0) + len(items)

    return formatted_groups
def process_responses(input_data):
    """Process and format responses.

    Splits ``input_data`` into lines, groups similar lines with
    ``group_similar_responses`` (threshold 0.75) and renders one
    ``label<TAB>count`` row per group, highest count first.

    Args:
        input_data: Text containing one response per line.

    Returns:
        None. The rendered table is copied to the clipboard when
        ``OUTPUT_TO_CLIPBOARD`` is true, otherwise printed to the console.
    """
    responses = input_data.splitlines()
    result = group_similar_responses(responses, threshold=0.75)

    # Prepare output
    ranked = sorted(result.items(), key=lambda x: x[1], reverse=True)
    output = "\n".join(f"{key}\t{count}" for key, count in ranked)

    # Output to clipboard or console
    if OUTPUT_TO_CLIPBOARD:
        set_clipboard_data(output)
        print("Output copied to clipboard.")
    else:
        print(output)
# Main function to handle input and output options
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Accept input directly as an argument (all arguments joined by spaces)
        input_data = " ".join(sys.argv[1:])
    else:
        # Use clipboard as input source
        input_data = get_clipboard_data()

    if not input_data:
        print("No clipboard data found!")
    else:
        # Only process when there is something to group.
        process_responses(input_data)
Copy link

I assume ChatGPT wrote that part ;-)

Copy link

foresmac commented Jan 16, 2025

Lines 31–32 could be replaced with:
response = response.removeprefix("the ")

I also think you could get rid of the map call to lower when making the counter, since you lowercase all the strings when you normalize the data.
I see; the normalized form is only used to group items as a dict key.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment