Created
March 2, 2021 00:30
-
-
Save sminot/7dc131a3f8382a876b60cfeb6c194dce to your computer and use it in GitHub Desktop.
Merge data from multiple files containing MetaPhlAn2 outputs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import pandas as pd | |
import sys | |
def read_metaphlan(fp): | |
"""Function to read a single file with metaphlan2 results encoded as a TSV.""" | |
return pd.read_csv( # Function to read tabular data | |
fp, # Specify the file path | |
sep="\t", # Columns are delimited with tabs | |
comment="#", # Skip all columns starting with a # comment character | |
header=None, # Provide our own custom header (on the next line) | |
names=['clade', 'relative_abundance', 'coverage', 'average_genome_length_in_the_clade', 'estimated_number_of_reads_from_the_clade'] | |
) | |
def read_metaphlan_folder(folder, file_suffix=".metaphlan2.tsv"): | |
"""Function to read in and merge all of the metaphlan2 results in a particular folder.""" | |
# Format all data as a dict -- keys are sample names, values are the metaphlan abundances | |
dat = dict() | |
# Iterate over each of the files in the input folder | |
for fp in os.listdir(folder): | |
# If the folder ends with the specified file suffix | |
if fp.endswith(file_suffix): | |
# Parse the sample name from the file path | |
sample_name = fp.replace(file_suffix, "") | |
# Read in and save the data from this file | |
dat[sample_name] = read_metaphlan(os.path.join(folder, fp)) | |
# Return all of the data that has been read in | |
return dat | |
# The user should provide two inputs, the path to the folder containing all input files, and the output path | |
# Make sure that there are two input arguments | |
assert len(sys.argv) == 3, "Please specify input folder and output path." | |
# Parse the arguments | |
input_folder = sys.argv[1] | |
output_path = sys.argv[2] | |
# Make sure the input folder exists | |
assert os.path.exists(input_folder), f"Input folder does not exist: {input_folder}" | |
# Make sure the output file does not exist (so that we don't overwrite it) | |
assert not os.path.exists(output_path), f"Output path already exists ({output_path}), please specify another." | |
# Read in all of the input data | |
dat = read_metaphlan_folder(input_folder) | |
print(f"Read in data for {len(dat):,} specimens") | |
# Make sure that we were able to read in some data | |
assert len(dat) > 0, f"No files were found to parse in the input folder: {input_folder}" | |
# Format the output by merging all tables into a DataFrame | |
df = pd.DataFrame({ | |
sample_name: sample_dat.set_index("clade")["relative_abundance"] | |
for sample_name, sample_dat in dat.items() | |
}).fillna(0) | |
print(f"Read in data for {df.shape[0]:,} taxa") | |
# Write out to a file | |
df.to_csv(output_path) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment