Skip to content

Instantly share code, notes, and snippets.

@tommylees112
Last active January 15, 2022 15:21
Show Gist options
  • Save tommylees112/602399b2ca05e44633278cca56a7b999 to your computer and use it in GitHub Desktop.
Code to read CSV files and combine them all into a single joined table.
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import datetime
# CHANGE THIS >>>>>>>>>
# Directory that is searched (non-recursively) for "*counting-scatter.csv" files.
DATA_DIR: str = "/Users/tommylees/Downloads"
# Tag prepended to the combined output filename (see make_filename).
USER: str = "kouji"
# Number of leading rows skipped when reading each csv (passed to pd.read_csv skiprows).
FIRST_ROW_TO_READ: int = 13 # remember it is 0-based indexing
# >>>>>>>>>>>
def make_filename() -> str:
    """Build a timestamped filename for the combined output csv.

    Returns:
        A filename of the form
        ``{USER}_{YYYYMMDD_HH-MM-SS}_counting-scatter-joined.csv``.
    """
    # The timestamp makes repeated runs produce distinct output files
    # instead of silently overwriting the previous result.
    time_str = datetime.now().strftime("%Y%m%d_%H-%M-%S")
    output_filename = f"{USER}_{time_str}_counting-scatter-joined.csv"
    return output_filename
def process_the_raw_dataframe(
    df: pd.DataFrame,
    filename: str,
    search_string: str = "total positive clusters",
    search_in_column: str = "population phenotypes",
) -> pd.DataFrame:
    """Extract the counting sub-table from a raw csv dataframe.

    Args:
        df: Raw dataframe read from one "counting-scatter" csv. Must
            contain a column named `search_in_column`.
        filename: Source filename; becomes the name of the single data
            column so the per-file tables can be joined side by side.
        search_string: Value in `search_in_column` that marks the last
            row of the table of interest.
        search_in_column: Column searched for `search_string`.

    Returns:
        The first three columns of `df`, up to and including the row
        matching `search_string`, indexed by
        ("positive", "population phenotypes") ready for a join.

    Raises:
        ValueError: If `search_string` never appears in `search_in_column`.
    """
    # 1. find the end of the table: the first row where
    #    `search_in_column` equals `search_string`.
    matches = np.where(df[search_in_column] == search_string)[0]
    if len(matches) == 0:
        raise ValueError(
            f"Could not find {search_string!r} in column {search_in_column!r}"
        )
    row_match = int(matches[0])
    # 2. select that sub-table (rows up to the match, first 3 columns).
    # NOTE: .loc slicing by `row_match` assumes the default RangeIndex
    # produced by pd.read_csv, where label == position.
    sub_df = df.loc[:row_match].iloc[:, :3]
    # 3. fix the column names; the data column carries the source filename
    sub_df.columns = ["positive", "population phenotypes", filename]
    # 4. set the index for a join
    sub_df = sub_df.set_index(["positive", "population phenotypes"])
    return sub_df
if __name__ == "__main__":
data_dir = Path(DATA_DIR)
# find all the paths to the csv files
# (note: you can remove the "counting-scatter" to just look for all csv files)
csv_files = list(data_dir.glob("*counting-scatter.csv"))
# Read the data
# NB: nrows=100 allows for these datatables to be different lengths,
# but on_bad_lines="skip" will skip the lines with tokenising errors
all_dfs = [
pd.read_csv(f, skiprows=FIRST_ROW_TO_READ, nrows=100, on_bad_lines="skip")
for f in csv_files
]
# get the sub-table from within the dataframe
all_sub_dfs = [
process_the_raw_dataframe(df, csv_files[ix].name)
for (ix, df) in enumerate(all_dfs)
]
# join all pd.join(all_sub_dfs) in list of dataframes
df = pd.concat(all_sub_dfs, axis=1).reset_index()
# save the dataframe to csv
filename = make_filename()
(data_dir / "OUTPUT").mkdir(exist_ok=True, parents=True)
df.to_csv(data_dir / "OUTPUT" / filename)
print(f'Saved combined file to: {data_dir / "OUTPUT" / filename}')
@tommylees112
Copy link
Author

tommylees112 commented Jan 15, 2022

To run:

You need a python environment with
pandas and numpy

Then you can run:
python kouji_code.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment