Skip to content

Instantly share code, notes, and snippets.

@atodev
Created March 17, 2023 03:53
Show Gist options
  • Save atodev/9b8a62934f7abb467a3a6a14b1c829c5 to your computer and use it in GitHub Desktop.
Save atodev/9b8a62934f7abb467a3a6a14b1c829c5 to your computer and use it in GitHub Desktop.
[LLC]
-- First, take every row that lists multiple recipients and expand it into one row per recipient:
----------------------------------------------
[email protected] [email protected],[email protected],[email protected],[email protected],[email protected],[email protected] inbound 1670537679
Create a Python script that replaces that line with:
[email protected] [email protected] inbound 1670537679
[email protected] [email protected] inbound 1670537679
[email protected] [email protected] inbound 1670537679
[email protected] [email protected] inbound 1670537679
[email protected] [email protected] inbound 1670537679
[email protected] [email protected] inbound 1670537679
-----------------------------------------------------------
# Split rows where needed: every row that lists several comma-separated
# recipients is expanded into one row per recipient.
input_file_name = "../Data/pfizerv2_2.llc"
output_file_name = "output_file22.txt"


def split_recipient_row(line):
    """Expand one tab-separated input row into one output row per recipient.

    The row layout is ``sender<TAB>recipients<TAB>direction<TAB>timestamp``,
    where ``recipients`` is a comma-separated list.

    Args:
        line: A raw line read from the input file (may end with a newline).

    Returns:
        A list of newline-terminated output rows. Blank input lines yield an
        empty list.

    Raises:
        IndexError: If a non-blank line has fewer than four tab-separated
            fields.
    """
    stripped = line.strip()
    if not stripped:
        # Blank lines carry no record; skip them instead of crashing.
        return []

    fields = stripped.split("\t")
    if len(fields) < 4:
        raise IndexError(
            f"expected 4 tab-separated fields, got {len(fields)}: {line!r}"
        )
    sender = fields[0]
    direction = fields[2]
    timestamp = fields[3]

    # Tolerate "a, b" and stray commas ("a,b,"), which would otherwise produce
    # rows with padded or empty recipients. If the field holds no address at
    # all, keep a single row with an empty recipient so the record is not lost.
    recipients = [entry.strip() for entry in fields[1].split(",")]
    recipients = [entry for entry in recipients if entry] or [""]

    return [
        f"{sender}\t{recipient}\t{direction}\t{timestamp}\n"
        for recipient in recipients
    ]


def split_recipient_rows(input_path, output_path):
    """Write the rows of ``input_path`` to ``output_path``, one per recipient.

    Args:
        input_path: Path of the tab-separated file to read.
        output_path: Path of the file to create (overwritten if it exists).
    """
    with open(input_path, "r") as input_file, open(output_path, "w") as output_file:
        for line in input_file:
            output_file.writelines(split_recipient_row(line))


if __name__ == "__main__":
    split_recipient_rows(input_file_name, output_file_name)
------------------------------------------------------------------------
----- Now filter the file for in-scope emails: keep each line that contains at least one in-scope address
import re
import csv
# Create an address file ("address.csv") listing every subject in scope,
# one address per line.
def load_addresses(csv_path):
    """Return the non-empty first-column values of the CSV file at ``csv_path``.

    Blank rows and rows whose first column is empty are skipped, and
    surrounding whitespace is removed from each address.
    """
    with open(csv_path, "r", newline="") as csv_file:
        first_columns = (row[0].strip() for row in csv.reader(csv_file) if row)
        return [address for address in first_columns if address]


def build_address_regex(addresses):
    """Compile a pattern that matches any of ``addresses`` as literal text.

    Args:
        addresses: Iterable of address strings to look for.

    Returns:
        A compiled regular expression, or ``None`` when there is nothing to
        match (an empty alternation would otherwise match every line).
    """
    # Escape each address: characters such as "." and "+" are regex
    # metacharacters and must be matched literally.
    literals = [re.escape(address) for address in addresses if address]
    if not literals:
        return None
    return re.compile("|".join(literals))


def filter_in_scope(address_path, input_path, output_path):
    """Copy the lines of ``input_path`` that mention an in-scope address.

    Args:
        address_path: CSV file whose first column holds the in-scope addresses.
        input_path: File to filter, read line by line.
        output_path: File to create (overwritten if it exists). It is left
            empty when the address file contains no addresses.
    """
    regex = build_address_regex(load_addresses(address_path))
    with open(input_path, "r") as input_file, open(output_path, "w") as output_file:
        if regex is not None:
            # Stream the file lazily; only the matching lines are written.
            output_file.writelines(
                line for line in input_file if regex.search(line)
            )


if __name__ == "__main__":
    # The input is the one-recipient-per-row file produced by the split step.
    # To filter the client's original LLC file instead, pass
    # "../Data/pfizerv2_2.llc" as the input path.
    filter_in_scope("address.csv", "output_file22.txt", "output_file33.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment