Last active
June 1, 2017 13:20
-
-
Save gireeshkbogu/4bf83fc4331bbe621bb22189708f70f9 to your computer and use it in GitHub Desktop.
join multiple files (by using their location instead of specifying each file name) with similar repeat_ids and renames the columns with file names.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Gireesh Bogu | |
# Location: CRG, Barcelona | |
# Date: June 1, 2017 | |
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
# What it does: joins multiple files on their shared repeat_id values. The files are found by a glob
# pattern on their location (no need to name each file); each file's norm_exp column is renamed after the file.
# file1 = SRRX10101 | |
# repeat_id gtex_id norm_exp | |
# LTR GTEX1234 2.4 | |
# file2 = SRRX10102 | |
# repeat_id gtex_id norm_exp | |
# LTR GTEX1235 0 | |
# file3 = SRRX10103 | |
# repeat_id gtex_id norm_exp | |
# LTR GTEX1236 12 | |
# output | |
# repeat_id SRRX10101 SRRX10102 SRRX10103 | |
# LTR 2.4 0 12 | |
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
import os | |
import glob | |
import pandas as pd | |
# Set the location of our input files.
# glob.glob() returns matches in arbitrary order, so sort them to keep the
# column order of the merged table reproducible from run to run.
files = sorted(glob.glob(r'/full_path/*.norm.exp'))
def merge_files(files, **kwargs):
    """Merge per-sample expression files into one table indexed by repeat_id.

    Each file is read as tab-delimited text; only its ``repeat_id`` and
    ``norm_exp`` columns are used. ``repeat_id`` becomes the index and
    ``norm_exp`` is renamed after the file, then all tables are concatenated
    column-wise so that every input file contributes one column.

    Args:
        files: Iterable of paths to tab-delimited files that each contain
            ``repeat_id`` and ``norm_exp`` columns.
        **kwargs: Accepted for backward compatibility; currently ignored.

    Returns:
        A ``pandas.DataFrame`` indexed by ``repeat_id`` with one column per
        input file, in the order the files were given.

    Raises:
        ValueError: If ``files`` is empty.
    """
    files = list(files)
    if not files:
        # pd.concat() would raise a ValueError here as well; fail earlier
        # with a message that points at the likely cause (a glob pattern
        # that matched nothing).
        raise ValueError('merge_files() received no input files to merge')

    suffix = '.norm.exp'
    dfs = []
    for f in files:
        # read the tab delimited file
        df = pd.read_csv(f, sep='\t', usecols=['repeat_id', 'norm_exp'],
                         index_col=['repeat_id'])
        # Name the column after the sample. os.path.splitext() alone only
        # removes '.exp' and would leave 'SRRX10101.norm', so strip the whole
        # '.norm.exp' suffix when present and fall back to splitext otherwise.
        base = os.path.basename(f)
        if base.endswith(suffix) and len(base) > len(suffix):
            sample = base[:-len(suffix)]
        else:
            sample = os.path.splitext(base)[0]
        dfs.append(df.rename(columns={'norm_exp': sample}))
    # join the files column-wise, aligned on the repeat_id index
    return pd.concat(dfs, axis=1)
# Merge all files into one master table; reset_index() turns repeat_id back
# into an ordinary column so it is written out as the first field.
result = merge_files(files).reset_index()
# print output to the console
# print(result)
# Save the output as a tab delimited file.
# index=False: after reset_index() the frame's index is just 0..n-1, and
# writing it would add an unnamed leading column to the output.
# The default write mode is used instead of append because the table is
# written in one call; appending would duplicate the header and rows every
# time the script is re-run.
result.to_csv('file.output', sep='\t', index=False)
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment