gireeshkbogu · June 1, 2017 13:20
diff --git a/multi_join.py b/multi_join.py
 # Author: Gireesh Bogu                                                                   
 # Location: CRG, Barcelona                                                               
 # Date: June 1, 2017                                                                     

 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

 # What it does: joins multiple files (found via a glob pattern, so individual file names need not be listed)
 # on their shared repeat_id column and names each value column after the file it came from.

 # file1 =  SRRX10101
 # repeat_id     gtex_id norm_exp
 # LTR   GTEX1234        2.4

 # file2 =  SRRX10102
 # repeat_id     gtex_id norm_exp
 # LTR   GTEX1235        0

 # file3 =  SRRX10103
 # repeat_id     gtex_id norm_exp
 # LTR   GTEX1236        12

 # output
 # repeat_id     SRRX10101       SRRX10102       SRRX10103
 # LTR   2.4     0       12

 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

 import os
 import glob
 import pandas as pd

 # set the location of our files; sorted() because glob returns matches in an
 # arbitrary (filesystem-dependent) order, and that order becomes the column order
 files = sorted(glob.glob(r'/full_path/*.norm.exp'))

 def merge_files(files, **kwargs):
     """Join per-sample expression tables side by side on ``repeat_id``.

     Each file is read as a tab-delimited table; only the ``repeat_id`` and
     ``norm_exp`` columns are kept, ``repeat_id`` becomes the index, and
     ``norm_exp`` is renamed to the sample name taken from the file name
     (the text before the first '.', e.g. ``SRRX10101.norm.exp`` ->
     ``SRRX10101``).

     Parameters:
         files: iterable of paths to the input tables.
         **kwargs: extra keyword arguments passed on to ``pandas.read_csv``;
             they override the defaults used here (``sep``, ``usecols``,
             ``index_col``).

     Returns:
         A DataFrame indexed by ``repeat_id`` with one column per input file.
         ``pd.concat(..., axis=1)`` aligns rows on the index, so an id missing
         from a file shows up as NaN in that file's column.

     Raises:
         ValueError: if ``files`` is empty.
     """
     # materialise once so generators work and emptiness can be checked
     paths = list(files)
     if not paths:
         raise ValueError('merge_files: no input files to merge')

     # defaults for reading the tab delimited files; caller kwargs take precedence
     read_opts = {
         'sep': '\t',
         'usecols': ['repeat_id', 'norm_exp'],
         'index_col': ['repeat_id'],
     }
     read_opts.update(kwargs)

     dfs = []
     for f in paths:
         df = pd.read_csv(f, **read_opts)
         # splitext() would only drop the last extension ('X.norm.exp' -> 'X.norm');
         # cut at the first dot instead, falling back to the full name for dot-files
         base = os.path.basename(f)
         sample = base.split('.', 1)[0] or base
         dfs.append(df.rename(columns={'norm_exp': sample}))
     return pd.concat(dfs, axis=1)

 # merge all files into one master table; reset_index() turns repeat_id back
 # into an ordinary column
 result = merge_files(files).reset_index()
 # print output to the console
 # print(result)
 # save the output as a tab delimited file. index=False keeps the 0..n-1 row
 # numbers left behind by reset_index() out of the file, so the first column
 # is repeat_id.
 # NOTE: mode='a' appends to an existing 'file.output' rather than replacing
 # it; remove the old file before re-running to avoid a second stacked table.
 result.to_csv('file.output', mode='a', sep='\t', index=False)

 #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	# Author: Gireesh Bogu
	# Location: CRG, Barcelona
	# Date: June 1, 2017

	#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

	# What it does: joins multiple files (found via a glob pattern, so individual file names need not be listed)
	# on their shared repeat_id column and names each value column after the file it came from.

	# file1 = SRRX10101
	# repeat_id gtex_id norm_exp
	# LTR GTEX1234 2.4

	# file2 = SRRX10102
	# repeat_id gtex_id norm_exp
	# LTR GTEX1235 0

	# file3 = SRRX10103
	# repeat_id gtex_id norm_exp
	# LTR GTEX1236 12

	# output
	# repeat_id SRRX10101 SRRX10102 SRRX10103
	# LTR 2.4 0 12

	#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

	import os
	import glob
	import pandas as pd

	# set the location of our files; sorted() because glob returns matches in an
	# arbitrary (filesystem-dependent) order, and that order becomes the column order
	files = sorted(glob.glob(r'/full_path/*.norm.exp'))

	def merge_files(files, **kwargs):
	    """Join per-sample expression tables side by side on ``repeat_id``.

	    Each file is read as a tab-delimited table; only the ``repeat_id`` and
	    ``norm_exp`` columns are kept, ``repeat_id`` becomes the index, and
	    ``norm_exp`` is renamed to the sample name taken from the file name
	    (the text before the first '.', e.g. ``SRRX10101.norm.exp`` ->
	    ``SRRX10101``).

	    Parameters:
	        files: iterable of paths to the input tables.
	        **kwargs: extra keyword arguments passed on to ``pandas.read_csv``;
	            they override the defaults used here (``sep``, ``usecols``,
	            ``index_col``).

	    Returns:
	        A DataFrame indexed by ``repeat_id`` with one column per input file.
	        ``pd.concat(..., axis=1)`` aligns rows on the index, so an id missing
	        from a file shows up as NaN in that file's column.

	    Raises:
	        ValueError: if ``files`` is empty.
	    """
	    # materialise once so generators work and emptiness can be checked
	    paths = list(files)
	    if not paths:
	        raise ValueError('merge_files: no input files to merge')

	    # defaults for reading the tab delimited files; caller kwargs take precedence
	    read_opts = {
	        'sep': '\t',
	        'usecols': ['repeat_id', 'norm_exp'],
	        'index_col': ['repeat_id'],
	    }
	    read_opts.update(kwargs)

	    dfs = []
	    for f in paths:
	        df = pd.read_csv(f, **read_opts)
	        # splitext() would only drop the last extension ('X.norm.exp' -> 'X.norm');
	        # cut at the first dot instead, falling back to the full name for dot-files
	        base = os.path.basename(f)
	        sample = base.split('.', 1)[0] or base
	        dfs.append(df.rename(columns={'norm_exp': sample}))
	    return pd.concat(dfs, axis=1)

	# merge all files into one master table; reset_index() turns repeat_id back
	# into an ordinary column
	result = merge_files(files).reset_index()
	# print output to the console
	# print(result)
	# save the output as a tab delimited file. index=False keeps the 0..n-1 row
	# numbers left behind by reset_index() out of the file, so the first column
	# is repeat_id.
	# NOTE: mode='a' appends to an existing 'file.output' rather than replacing
	# it; remove the old file before re-running to avoid a second stacked table.
	result.to_csv('file.output', mode='a', sep='\t', index=False)

	#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@