Created
March 17, 2023 03:53
-
-
Save atodev/9b8a62934f7abb467a3a6a14b1c829c5 to your computer and use it in GitHub Desktop.
[LLC]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- First, take all rows with multiple recipients and convert them to one recipient per line: | |
---------------------------------------------- | |
[email protected] [email protected],[email protected],[email protected],[email protected],[email protected],[email protected] inbound 1670537679 | |
Create a Python script to replace that line with: | |
[email protected] [email protected] inbound 1670537679 | |
[email protected] [email protected] inbound 1670537679 | |
[email protected] [email protected] inbound 1670537679 | |
[email protected] [email protected] inbound 1670537679 | |
[email protected] [email protected] inbound 1670537679 | |
[email protected] [email protected] inbound 1670537679 | |
----------------------------------------------------------- | |
# Split rows where needed: expand every multi-recipient row into one row per
# recipient, copying the sender, direction and timestamp onto each new row.
input_file_name = "../Data/pfizerv2_2.llc"
output_file_name = "output_file22.txt"

with open(input_file_name, "r") as input_file, open(output_file_name, "w") as output_file:
    for line in input_file:
        stripped_line = line.strip()
        # Blank lines carry no record; indexing into their fields would raise
        # IndexError, so skip them.
        if not stripped_line:
            continue

        # Fields are tab-separated:
        #   sender <TAB> comma-separated recipients <TAB> direction <TAB> timestamp
        line_parts = stripped_line.split("\t")
        sender = line_parts[0]
        recipients = line_parts[1].split(",")
        direction = line_parts[2]
        timestamp = line_parts[3]

        for recipient in recipients:
            # Tolerate "a, b" and trailing commas: trim surrounding whitespace
            # and never emit a row with an empty recipient.
            recipient = recipient.strip()
            if not recipient:
                continue
            output_file.write(f"{sender}\t{recipient}\t{direction}\t{timestamp}\n")
------------------------------------------------------------------------ | |
----- Now filter the file for in-scope emails - keep only lines that contain at least one in-scope address | |
import re | |
import csv | |
# Create an address file (address.csv) with all subjects in scope - one per
# line; only the first column of each row is used.
with open("address.csv", "r") as csv_file:
    # Skip blank rows and empty cells: row[0] on an empty row raises
    # IndexError, and an empty item would add an empty alternative to the
    # pattern below, which matches every line.
    items_to_match = [
        row[0].strip()
        for row in csv.reader(csv_file)
        if row and row[0].strip()
    ]

# Read the one-recipient-per-line file written by the split step
# (output_file22.txt) and write the in-scope lines to output_file33.txt.
with open("output_file22.txt", "r") as input_file, open("output_file33.txt", "w") as output_file:
    # Build one pattern that matches any of the in-scope items. The items are
    # literal text, so escape them: otherwise "." matches any character and
    # "+" acts as a quantifier, producing false matches.
    # With no items the joined pattern would be empty and match every line,
    # so fall back to a pattern that can never match.
    if items_to_match:
        regex = re.compile("|".join(re.escape(item) for item in items_to_match))
    else:
        regex = re.compile(r"(?!)")

    # NOTE(review): this is a substring search over the whole line, so an
    # in-scope item also matches inside a longer address - confirm that is
    # acceptable.
    # Stream the file line by line (file iteration is already buffered) and
    # keep only lines that contain at least one in-scope item.
    output_file.writelines(line for line in input_file if regex.search(line))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment