Last active
January 15, 2022 15:21
-
-
Save tommylees112/602399b2ca05e44633278cca56a7b999 to your computer and use it in GitHub Desktop.
Code to read csv files and combine them all together
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from pathlib import Path | |
import numpy as np | |
from datetime import datetime | |
# CHANGE THIS >>>>>>>>> | |
DATA_DIR = "/Users/tommylees/Downloads" | |
USER = "kouji" | |
FIRST_ROW_TO_READ = 13 # remember it is 0-based indexing | |
# >>>>>>>>>>> | |
def make_filename() -> str: | |
time_str = datetime.now().strftime("%Y%m%d_%H-%M-%S") | |
output_filename = f"{USER}_{time_str}_counting-scatter-joined.csv" | |
return output_filename | |
def process_the_raw_dataframe(df: pd.DataFrame, filename: str) -> pd.DataFrame: | |
# 1. find the end of the table | |
# defined by the `search_string` in `search_in_column` | |
search_string: str = "total positive clusters" | |
search_in_column: str = "population phenotypes" | |
# run the search | |
row_match: int = np.where(df[search_in_column] == search_string)[0][0] | |
# 2. select that sub-table | |
sub_df = df.loc[:row_match].iloc[:, :3] | |
# 3. fix the column names | |
sub_df.columns = ["positive", "population phenotypes", filename] | |
# 4. set the index for a join | |
sub_df = sub_df.set_index(["positive", "population phenotypes"]) | |
return sub_df | |
if __name__ == "__main__": | |
data_dir = Path(DATA_DIR) | |
# find all the paths to the csv files | |
# (note: you can remove the "counting-scatter" to just look for all csv files) | |
csv_files = list(data_dir.glob("*counting-scatter.csv")) | |
# Read the data | |
# NB: nrows=100 allows for these datatables to be different lengths, | |
# but on_bad_lines="skip" will skip the lines with tokenising errors | |
all_dfs = [ | |
pd.read_csv(f, skiprows=FIRST_ROW_TO_READ, nrows=100, on_bad_lines="skip") | |
for f in csv_files | |
] | |
# get the sub-table from within the dataframe | |
all_sub_dfs = [ | |
process_the_raw_dataframe(df, csv_files[ix].name) | |
for (ix, df) in enumerate(all_dfs) | |
] | |
# join all pd.join(all_sub_dfs) in list of dataframes | |
df = pd.concat(all_sub_dfs, axis=1).reset_index() | |
# save the dataframe to csv | |
filename = make_filename() | |
(data_dir / "OUTPUT").mkdir(exist_ok=True, parents=True) | |
df.to_csv(data_dir / "OUTPUT" / filename) | |
print(f'Saved combined file to: {data_dir / "OUTPUT" / filename}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To run:
You need a python environment with
pandas
andnumpy
Then you can run:
python kouji_code.py