Created
August 25, 2023 10:50
-
-
Save yosignals/8ce4c2d1a8a39f08d65db84b080cdc65 to your computer and use it in GitHub Desktop.
Twitter dump duplicate joiner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re

# Directory path where the txt files are stored
folder_path = '.'

# Regular expressions matching the "Email: ..." and "ScreenName: ..." fields
# of a dump line. Each captures only the value, not the label.
email_pattern = re.compile(r'Email: ([\w\.-]+@[\w\.-]+)')
screen_name_pattern = re.compile(r'ScreenName: (\S+)')


def collect_screen_names(directory):
    """Scan every ``.txt`` file in *directory* and group screen names by email.

    Only the first email and the first screen name on a line are used, and a
    line contributes only when it contains both.

    Args:
        directory: Path of the folder holding the dump files. It is not
            searched recursively.

    Returns:
        A dict mapping each email address to ``{'screen_names': set_of_names}``.
        Emails are compared exactly as written (case-sensitively).
    """
    found = {}
    # Sorted so that the order of emails in the result (and hence in the
    # report) does not depend on the arbitrary order os.listdir returns.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip non-txt entries and directories that merely end in ".txt",
        # which would otherwise make open() raise.
        if not filename.endswith('.txt') or not os.path.isfile(path):
            continue
        # Dumps are frequently not clean text: decode as UTF-8 and substitute
        # undecodable bytes instead of aborting the whole run on one bad byte.
        with open(path, 'r', encoding='utf-8', errors='replace') as file:
            for line in file:
                email_match = email_pattern.search(line)
                screen_name_match = screen_name_pattern.search(line)
                if email_match and screen_name_match:
                    entry = found.setdefault(
                        email_match.group(1), {'screen_names': set()}
                    )
                    # A set keeps each screen name once per email.
                    entry['screen_names'].add(screen_name_match.group(1))
    return found


def write_summary(emails, output_path='output.txt'):
    """Write the emails that map to more than one screen name to *output_path*.

    Args:
        emails: Mapping in the shape returned by ``collect_screen_names``.
        output_path: File to (over)write with the report.

    Returns:
        True if at least one email had multiple unique screen names,
        otherwise False (in which case a "none found" line is written).
    """
    has_duplicates = False
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for email, data in emails.items():
            # Checking if there are multiple unique screen names for the email
            if len(data['screen_names']) > 1:
                has_duplicates = True
                outfile.write(f"Email: {email} has accounts with the following unique screen names:\n")
                # Sorted so the report is reproducible between runs.
                for screen_name in sorted(data['screen_names']):
                    outfile.write(f"- {screen_name}\n")
                outfile.write("\n")
        if not has_duplicates:
            outfile.write("No email addresses with multiple unique screen names found.\n")
    return has_duplicates


# Dictionary of email addresses and their associated unique screen names
email_dict = collect_screen_names(folder_path)

# Write the summary to output.txt
found_duplicates = write_summary(email_dict, 'output.txt')
print("Summary written to output.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment