atodev · March 17, 2023 03:53
diff --git a/filter file for inscope emails b/filter file for inscope emails
 -- First, expand every row that has multiple recipients into one row per recipient:
 ----------------------------------------------
 [email protected]	[email protected],[email protected],[email protected],[email protected],[email protected],[email protected]	 inbound	1670537679
 Create a Python script that replaces that line with:

 [email protected]  [email protected]	inbound	1670537679
 [email protected]  [email protected]	inbound	1670537679
 [email protected]  [email protected]	inbound	1670537679
 [email protected]  [email protected]	inbound	1670537679
 [email protected]  [email protected]	inbound	1670537679
 [email protected]  [email protected]	inbound	1670537679
 -----------------------------------------------------------
 # Split rows where needed: expand each multi-recipient row into one row per
 # recipient. Input rows are tab-separated:
 #   sender, comma-separated recipients, direction, timestamp
 input_file_name = "../Data/pfizerv2_2.llc"
 output_file_name = "output_file22.txt"

 with open(input_file_name, "r") as input_file, open(output_file_name, "w") as output_file:
     for line_number, line in enumerate(input_file, start=1):
         if not line.strip():
             # Blank line: nothing to expand (previously crashed with IndexError).
             continue
         # Strip each field on its own: the data can carry stray spaces around
         # a field (e.g. "\t inbound") that must not leak into the output.
         line_parts = [part.strip() for part in line.rstrip("\n").split("\t")]
         if len(line_parts) < 4:
             # Fail loudly with the location instead of an opaque IndexError,
             # so a malformed row is never silently dropped.
             raise ValueError(
                 f"{input_file_name}:{line_number}: expected 4 tab-separated "
                 f"fields, found {len(line_parts)}"
             )
         sender, recipient_field, direction, timestamp = line_parts[:4]
         for recipient in recipient_field.split(","):
             recipient = recipient.strip()
             if not recipient:
                 # Empty entry left by a trailing or doubled comma.
                 continue
             output_file.write(f"{sender}\t{recipient}\t{direction}\t{timestamp}\n")
            
 ------------------------------------------------------------------------

 -----  Now filter the file for in-scope emails: keep every line that contains at least one in-scope address

 import re
 import csv

 # Load the in-scope addresses from the address file - one address per line.
 with open("address.csv", "r") as csv_file:
     # Skip blank rows and empty cells: a blank row would raise IndexError on
     # row[0], and an empty alternative in the regex would match every line.
     items_to_match = [
         row[0].strip() for row in csv.reader(csv_file) if row and row[0].strip()
     ]


 # Open the file to read from. The commented-out line reads the client's
 # original .llc file directly; the active line reads the one-recipient-per-line
 # file instead.
 #with open("../Data/pfizerv2_2.llc", "r") as input_file, open("output_file.txt", "w") as output_file:
 with open("output_file22.txt", "r") as input_file, open("output_file33.txt", "w") as output_file:

     # With no in-scope addresses nothing can match, so the output stays empty
     # (an empty pattern would otherwise match, and copy, every line).
     if items_to_match:
         # Match any of the addresses literally: re.escape stops "." from
         # matching any character and "+" from acting as a quantifier.
         regex = re.compile("|".join(re.escape(item) for item in items_to_match))

         # Stream the input line by line (the file object is already buffered)
         # and write only the lines that contain at least one in-scope address.
         output_file.writelines(line for line in input_file if regex.search(line))
	-- First, expand every row that has multiple recipients into one row per recipient:
	----------------------------------------------
	[email protected] [email protected],[email protected],[email protected],[email protected],[email protected],[email protected] inbound 1670537679
	Create a Python script that replaces that line with:

	[email protected] [email protected] inbound 1670537679
	[email protected] [email protected] inbound 1670537679
	[email protected] [email protected] inbound 1670537679
	[email protected] [email protected] inbound 1670537679
	[email protected] [email protected] inbound 1670537679
	[email protected] [email protected] inbound 1670537679
	-----------------------------------------------------------
	# Split rows where needed: expand each multi-recipient row into one row per
	# recipient. Input rows are tab-separated:
	#   sender, comma-separated recipients, direction, timestamp
	input_file_name = "../Data/pfizerv2_2.llc"
	output_file_name = "output_file22.txt"

	with open(input_file_name, "r") as input_file, open(output_file_name, "w") as output_file:
	    for line_number, line in enumerate(input_file, start=1):
	        if not line.strip():
	            # Blank line: nothing to expand (would otherwise raise IndexError).
	            continue
	        # Strip each field on its own: the data can carry stray spaces around
	        # a field (e.g. "\t inbound") that must not leak into the output.
	        line_parts = [part.strip() for part in line.rstrip("\n").split("\t")]
	        if len(line_parts) < 4:
	            # Fail loudly with the location instead of an opaque IndexError,
	            # so a malformed row is never silently dropped.
	            raise ValueError(
	                f"{input_file_name}:{line_number}: expected 4 tab-separated "
	                f"fields, found {len(line_parts)}"
	            )
	        sender, recipient_field, direction, timestamp = line_parts[:4]
	        for recipient in recipient_field.split(","):
	            recipient = recipient.strip()
	            if not recipient:
	                # Empty entry left by a trailing or doubled comma.
	                continue
	            output_file.write(f"{sender}\t{recipient}\t{direction}\t{timestamp}\n")

	------------------------------------------------------------------------

	----- Now filter the file for in-scope emails: keep every line that contains at least one in-scope address

	import re
	import csv

	# Load the in-scope addresses from the address file - one address per line.
	with open("address.csv", "r") as csv_file:
	    # Skip blank rows and empty cells: a blank row would raise IndexError on
	    # row[0], and an empty alternative in the regex would match every line.
	    items_to_match = [
	        row[0].strip() for row in csv.reader(csv_file) if row and row[0].strip()
	    ]


	# Open the file to read from. The commented-out line reads the client's
	# original .llc file directly; the active line reads the one-recipient-per-line
	# file instead.
	#with open("../Data/pfizerv2_2.llc", "r") as input_file, open("output_file.txt", "w") as output_file:
	with open("output_file22.txt", "r") as input_file, open("output_file33.txt", "w") as output_file:

	    # With no in-scope addresses nothing can match, so the output stays empty
	    # (an empty pattern would otherwise match, and copy, every line).
	    if items_to_match:
	        # Join the alternatives with a plain "|" (a "\|" separator would only
	        # match a literal pipe character), and re.escape each address so "."
	        # and "+" are matched literally.
	        regex = re.compile("|".join(re.escape(item) for item in items_to_match))

	        # Stream the input line by line (the file object is already buffered)
	        # and write only the lines that contain at least one in-scope address.
	        output_file.writelines(line for line in input_file if regex.search(line))