Last active
May 3, 2023 20:52
-
-
Save thanhleviet/64ea913b8e02b186b336258d113c21d8 to your computer and use it in GitHub Desktop.
Simple Python script that scans a directory for paired-end (PE) FASTQ files matching a list of file-name patterns and writes a CSV file with three columns: sample_id, forward, reverse.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import csv | |
import argparse | |
def _mate_suffix(pattern):
    """Return the reverse-read suffix matching a forward-read *pattern*."""
    for forward_tag, reverse_tag in (("_R1", "_R2"), ("_1", "_2")):
        if pattern.startswith(forward_tag):
            return reverse_tag + pattern[len(forward_tag):]
    # Legacy fallback: keep the historical "drop two characters, prepend _2"
    # rule for patterns that start with neither recognised read tag.
    return "_2" + pattern[2:]


def scan_paired_end_files(dir_path, pattern_list):
    """Find paired-end read files in *dir_path* and list them in a CSV file.

    A file is treated as a forward read when its name ends with one of the
    suffixes in *pattern_list*. The sample ID is the file name with that
    suffix removed, and the reverse read is expected in the same directory
    under the sample ID plus the mate suffix (``_1...`` -> ``_2...``,
    ``_R1...`` -> ``_R2...``).

    Every complete pair is written as one row to
    ``<dir_path>/paired_end_files.csv`` with the columns ``sample_id``,
    ``forward`` and ``reverse``. Forward reads without a mate are reported
    on standard output and left out of the CSV.

    Args:
        dir_path: Directory to scan (not recursive); also receives the CSV.
        pattern_list: Forward-read file name suffixes, tried in order.
            Empty strings are ignored.

    Returns:
        None. The result is the CSV file written to *dir_path*.
    """
    samples = []

    # An empty suffix would match every file and, because name[:-0] == "",
    # produce an empty sample ID, so such patterns are dropped up front.
    patterns = [pattern for pattern in pattern_list if pattern]

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # CSV row order reproducible between runs and platforms.
    for file_name in sorted(os.listdir(dir_path)):
        for pattern in patterns:
            if not file_name.endswith(pattern):
                continue

            sample_id = file_name[:-len(pattern)]
            # The forward file came from the directory listing, so only the
            # mate needs an existence check.
            forward_path = os.path.join(dir_path, file_name)
            reverse_path = os.path.join(dir_path, sample_id + _mate_suffix(pattern))

            if os.path.exists(reverse_path):
                samples.append((sample_id, forward_path, reverse_path))
                # One row per forward file: stop at the first pattern that
                # yields a complete pair so overlapping patterns cannot
                # emit duplicates.
                break

            print(f"Error: Paired-end files not found for sample ID {sample_id}")

    # Write the sample IDs and file paths to a CSV file next to the reads.
    output_file = os.path.join(dir_path, "paired_end_files.csv")
    with open(output_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["sample_id", "forward", "reverse"])
        writer.writerows(samples)
if __name__ == "__main__":
    # Command-line entry point: parse the directory and the forward-read
    # suffix pattern(s), then write the paired-end CSV into that directory.
    parser = argparse.ArgumentParser(description="Scan paired-end fastq files and generate a CSV file.")
    parser.add_argument("dir_path", help="the directory containing the fastq files")
    parser.add_argument(
        "--pattern",
        # "+" rather than "*": a bare --pattern with no values would give an
        # empty list and silently produce a header-only CSV.
        nargs="+",
        default=["_1.non_host.fastq.gz", "_1_non_host.fastq.gz"],
        help="the file name pattern(s) to search for",
    )
    args = parser.parse_args()

    # Fail with a usage message instead of a traceback from os.listdir().
    if not os.path.isdir(args.dir_path):
        parser.error(f"not a directory: {args.dir_path}")

    scan_paired_end_files(args.dir_path, args.pattern)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment