sminot · March 2, 2021 00:30
diff --git a/merge_metaphlan_tables.py b/merge_metaphlan_tables.py
 #!/usr/bin/env python3

 import os
 import pandas as pd
 import sys


 def read_metaphlan(fp):
    """Function to read a single file with metaphlan2 results encoded as a TSV."""

    return pd.read_csv(   # Function to read tabular data
        fp,               # Specify the file path
        sep="\t",         # Columns are delimited with tabs
        comment="#",      # Skip all columns starting with a # comment character
        header=None,      # Provide our own custom header (on the next line)
        names=['clade', 'relative_abundance', 'coverage', 'average_genome_length_in_the_clade', 'estimated_number_of_reads_from_the_clade']
    )


 def read_metaphlan_folder(folder, file_suffix=".metaphlan2.tsv"):
    """Function to read in and merge all of the metaphlan2 results in a particular folder."""

    # Format all data as a dict -- keys are sample names, values are the metaphlan abundances
    dat = dict()

    # Iterate over each of the files in the input folder
    for fp in os.listdir(folder):

        # If the folder ends with the specified file suffix
        if fp.endswith(file_suffix):

            # Parse the sample name from the file path
            sample_name = fp.replace(file_suffix, "")

            # Read in and save the data from this file
            dat[sample_name] = read_metaphlan(os.path.join(folder, fp))

    # Return all of the data that has been read in
    return dat

 # The user should provide two inputs, the path to the folder containing all input files, and the output path

 # Make sure that there are two input arguments
 assert len(sys.argv) == 3, "Please specify input folder and output path."

 # Parse the arguments
 input_folder = sys.argv[1]
 output_path = sys.argv[2]

 # Make sure the input folder exists
 assert os.path.exists(input_folder), f"Input folder does not exist: {input_folder}"

 # Make sure the output file does not exist (so that we don't overwrite it)
 assert not os.path.exists(output_path), f"Output path already exists ({output_path}), please specify another."

 # Read in all of the input data
 dat = read_metaphlan_folder(input_folder)
 print(f"Read in data for {len(dat):,} specimens")

 # Make sure that we were able to read in some data
 assert len(dat) > 0, f"No files were found to parse in the input folder: {input_folder}"

 # Format the output by merging all tables into a DataFrame
 df = pd.DataFrame({
    sample_name: sample_dat.set_index("clade")["relative_abundance"]
    for sample_name, sample_dat in dat.items()
 }).fillna(0)

 print(f"Read in data for {df.shape[0]:,} taxa")

 # Write out to a file
 df.to_csv(output_path)
	#!/usr/bin/env python3

	import os
	import pandas as pd
	import sys


	def read_metaphlan(fp):
	"""Function to read a single file with metaphlan2 results encoded as a TSV."""

	return pd.read_csv( # Function to read tabular data
	fp, # Specify the file path
	sep="\t", # Columns are delimited with tabs
	comment="#", # Skip all columns starting with a # comment character
	header=None, # Provide our own custom header (on the next line)
	names=['clade', 'relative_abundance', 'coverage', 'average_genome_length_in_the_clade', 'estimated_number_of_reads_from_the_clade']
	)


	def read_metaphlan_folder(folder, file_suffix=".metaphlan2.tsv"):
	"""Function to read in and merge all of the metaphlan2 results in a particular folder."""

	# Format all data as a dict -- keys are sample names, values are the metaphlan abundances
	dat = dict()

	# Iterate over each of the files in the input folder
	for fp in os.listdir(folder):

	# If the folder ends with the specified file suffix
	if fp.endswith(file_suffix):

	# Parse the sample name from the file path
	sample_name = fp.replace(file_suffix, "")

	# Read in and save the data from this file
	dat[sample_name] = read_metaphlan(os.path.join(folder, fp))

	# Return all of the data that has been read in
	return dat

	# The user should provide two inputs, the path to the folder containing all input files, and the output path

	# Make sure that there are two input arguments
	assert len(sys.argv) == 3, "Please specify input folder and output path."

	# Parse the arguments
	input_folder = sys.argv[1]
	output_path = sys.argv[2]

	# Make sure the input folder exists
	assert os.path.exists(input_folder), f"Input folder does not exist: {input_folder}"

	# Make sure the output file does not exist (so that we don't overwrite it)
	assert not os.path.exists(output_path), f"Output path already exists ({output_path}), please specify another."

	# Read in all of the input data
	dat = read_metaphlan_folder(input_folder)
	print(f"Read in data for {len(dat):,} specimens")

	# Make sure that we were able to read in some data
	assert len(dat) > 0, f"No files were found to parse in the input folder: {input_folder}"

	# Format the output by merging all tables into a DataFrame
	df = pd.DataFrame({
	sample_name: sample_dat.set_index("clade")["relative_abundance"]
	for sample_name, sample_dat in dat.items()
	}).fillna(0)

	print(f"Read in data for {df.shape[0]:,} taxa")

	# Write out to a file
	df.to_csv(output_path)
No results found