import os import numpy as np import pandas as pd # Directory to read in xlsx files from data_dir = r"data" output_file = r"output.csv" # Indexes of interested columns columns = { "TP1": 1, "TP2": 2, "TP3": 3, } # Indexes of interested rows rows = { "TEST": 5, "BP": 6, "AMBIOLIC": 7, "SOMETHING": 14, "WELP": 15, } # Define entry key key = { "": 0, "x": 1, "xx": 2, "xxx": 3, "o": -1, } final_df = None # Iterate over files in data directory, recursively for subdir, dirs, files in os.walk(data_dir): for filename in files: # Only look at the xlsx files if filename.endswith(".xlsx"): # Get the full file path from the file system walk filepath = os.path.join(subdir, filename) # Open the file into a pandas data frame data = pd.read_excel(filepath).replace(np.nan, "", regex=True) # Get the id from the filename id = os.path.splitext(filename)[0] # Make a temporary data frame to hold the data entry = pd.DataFrame(columns=[id]) # Iterate over the interested rows and columns for row in rows: for col in columns: # Generate the unique variable name variable = "{}_{}".format(row, col) # Set the variable in the temp data frame using the lookup key entry.loc[variable] = key[data.iloc[rows[row], columns[col]]] # Transpose entry to be a single row with several columns entry = entry.T # If a final dataframe hasn't been started, start it if final_df is None: final_df = entry # If there is a final data frame, append the new entry to it else: final_df = final_df.append(entry) # Print the final dataframe print(final_df) # Output generated data frame as a CSV file if final_df is not None: final_df.to_csv(output_file, index_label="id")