@gireeshkbogu
Last active May 31, 2017 09:04
How to convert a big data file (200 million rows, three columns) into a matrix file
# Author: Gireesh Bogu, Date: 27th May 2017, Place: CRG, Barcelona
# Tested on a 50 GB file with 200 million rows and 3 columns (Python 2.7)
# Make sure the file has exactly 3 columns with a proper header,
# contains no duplicate (repeat_id, gtex_id) pairs, and is tab-delimited.
###################################################################################
# USE THIS IF IT IS NOT A BIG FILE
import pandas as pd
# read the tab-delimited file (read_csv already returns a DataFrame,
# so no extra pd.DataFrame() conversion is needed)
my_data1 = pd.read_csv("fullpath/input_filename", sep='\t')
# pivot the data: one row per repeat_id, one column per gtex_id
my_data3 = my_data1.pivot(index='repeat_id', columns='gtex_id', values='norm_exp')
# save the file
my_data3.to_csv("fullpath/output_filename", sep='\t')
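As a quick sanity check of the small-file path, here is a minimal sketch on made-up data (the column names repeat_id, gtex_id, and norm_exp come from the gist; the values are invented):

```python
import pandas as pd

# Tiny long-format table standing in for the real input file.
long_df = pd.DataFrame({
    "repeat_id": ["r1", "r1", "r2", "r2"],
    "gtex_id":   ["s1", "s2", "s1", "s2"],
    "norm_exp":  [1.0, 2.0, 3.0, 4.0],
})

# Pivot long format into a matrix: rows = repeats, columns = samples.
matrix = long_df.pivot(index="repeat_id", columns="gtex_id", values="norm_exp")
```

Each (repeat_id, gtex_id) pair must be unique, otherwise `pivot` raises a ValueError — hence the "no duplicates" note at the top.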
# USE THIS IF IT IS A BIG FILE
import pandas as pd
# split the big file into pieces
chunker = pd.read_csv('test.input', sep='\t', chunksize=1000)
# save the output as a dataframe
tot=pd.DataFrame()
# pivot each piece and accumulate it into a single DataFrame,
# summing overlapping cells and filling missing cells with 0
for piece in chunker:
    tot = tot.add(piece.pivot(index='repeat_id', columns='gtex_id', values='norm_exp'), fill_value=0)
# remove the repeats with zero expression across all samples
tot=tot.loc[(tot!=0).any(axis=1)]
# cast the accumulated counts back to integers
tot = tot.astype(int)
# write the result as a tab-delimited file (mode='a' appends to
# test.output if it already exists, so remove any stale copy first)
tot.to_csv('test.output', mode='a', sep='\t')
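The chunked accumulation above can be verified end-to-end on a small stand-in for the big file. This sketch uses an in-memory buffer instead of a 50 GB file (data values are made up; column names follow the gist) and checks that the chunked result matches a single-pass pivot:

```python
import io
import pandas as pd

# Small tab-delimited buffer standing in for the 50 GB input file.
tsv = (
    "repeat_id\tgtex_id\tnorm_exp\n"
    "r1\ts1\t1\n"
    "r1\ts2\t2\n"
    "r2\ts1\t3\n"
    "r2\ts2\t4\n"
)

# Pivot each chunk and sum the aligned pieces, as in the gist.
tot = pd.DataFrame()
for piece in pd.read_csv(io.StringIO(tsv), sep="\t", chunksize=2):
    tot = tot.add(
        piece.pivot(index="repeat_id", columns="gtex_id", values="norm_exp"),
        fill_value=0,
    )

# Single-pass pivot of the whole buffer, for comparison.
full = pd.read_csv(io.StringIO(tsv), sep="\t").pivot(
    index="repeat_id", columns="gtex_id", values="norm_exp"
)
```

Note that `fill_value=0` means a repeat absent from one chunk contributes 0 for that chunk's samples, so the union of all chunks reconstructs the full matrix as long as each (repeat_id, gtex_id) pair appears only once in the whole file.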