Skip to content

Instantly share code, notes, and snippets.

@tommylees112
Last active January 15, 2022 15:21
Show Gist options
  • Save tommylees112/602399b2ca05e44633278cca56a7b999 to your computer and use it in GitHub Desktop.
Code to read CSV files and combine them all into a single joined table.
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import datetime
# CHANGE THIS >>>>>>>>>
# Directory that is searched (non-recursively) for "*counting-scatter.csv" files.
DATA_DIR: str = "/Users/tommylees/Downloads"
# Tag prepended to the combined output filename (see make_filename).
USER: str = "kouji"
# Number of leading rows skipped when reading each csv (passed to pd.read_csv skiprows).
FIRST_ROW_TO_READ: int = 13 # remember it is 0-based indexing
# >>>>>>>>>>>
def make_filename() -> str:
    """Build a timestamped filename for the combined output csv.

    Returns:
        A filename of the form
        ``{USER}_{YYYYMMDD_HH-MM-SS}_counting-scatter-joined.csv``.
    """
    # The timestamp makes repeated runs produce distinct output files
    # instead of silently overwriting the previous result.
    time_str = datetime.now().strftime("%Y%m%d_%H-%M-%S")
    output_filename = f"{USER}_{time_str}_counting-scatter-joined.csv"
    return output_filename
def process_the_raw_dataframe(
    df: pd.DataFrame,
    filename: str,
    search_string: str = "total positive clusters",
    search_in_column: str = "population phenotypes",
) -> pd.DataFrame:
    """Extract the counting sub-table from a raw csv dataframe.

    Args:
        df: Raw dataframe read from one "counting-scatter" csv. Must
            contain a column named `search_in_column`.
        filename: Source filename; becomes the name of the single data
            column so the per-file tables can be joined side by side.
        search_string: Value in `search_in_column` that marks the last
            row of the table of interest.
        search_in_column: Column searched for `search_string`.

    Returns:
        The first three columns of `df`, up to and including the row
        matching `search_string`, indexed by
        ("positive", "population phenotypes") ready for a join.

    Raises:
        ValueError: If `search_string` never appears in `search_in_column`.
    """
    # 1. find the end of the table: the first row where
    #    `search_in_column` equals `search_string`.
    matches = np.where(df[search_in_column] == search_string)[0]
    if len(matches) == 0:
        raise ValueError(
            f"Could not find {search_string!r} in column {search_in_column!r}"
        )
    row_match = int(matches[0])
    # 2. select that sub-table (rows up to the match, first 3 columns).
    # NOTE: .loc slicing by `row_match` assumes the default RangeIndex
    # produced by pd.read_csv, where label == position.
    sub_df = df.loc[:row_match].iloc[:, :3]
    # 3. fix the column names; the data column carries the source filename
    sub_df.columns = ["positive", "population phenotypes", filename]
    # 4. set the index for a join
    sub_df = sub_df.set_index(["positive", "population phenotypes"])
    return sub_df
if __name__ == "__main__":
data_dir = Path(DATA_DIR)
# find all the paths to the csv files
# (note: you can remove the "counting-scatter" to just look for all csv files)
csv_files = list(data_dir.glob("*counting-scatter.csv"))
# Read the data
# NB: nrows=100 allows for these datatables to be different lengths,
# but on_bad_lines="skip" will skip the lines with tokenising errors
all_dfs = [
pd.read_csv(f, skiprows=FIRST_ROW_TO_READ, nrows=100, on_bad_lines="skip")
for f in csv_files
]
# get the sub-table from within the dataframe
all_sub_dfs = [
process_the_raw_dataframe(df, csv_files[ix].name)
for (ix, df) in enumerate(all_dfs)
]
# join all pd.join(all_sub_dfs) in list of dataframes
df = pd.concat(all_sub_dfs, axis=1).reset_index()
# save the dataframe to csv
filename = make_filename()
(data_dir / "OUTPUT").mkdir(exist_ok=True, parents=True)
df.to_csv(data_dir / "OUTPUT" / filename)
print(f'Saved combined file to: {data_dir / "OUTPUT" / filename}')
@tommylees112
Copy link
Author

tommylees112 commented Jan 15, 2022

To run:

You need a python environment with
pandas and numpy

Then you can run:
python kouji_code.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment