Created
May 23, 2017 12:32
-
-
Save geocarvalho/e6fcaa8b0194e4e14aca533ba4e76879 to your computer and use it in GitHub Desktop.
Script that sorts FASTQ files into one directory per exam and builds the tab-separated input file for the QIAGEN site, taking the bioinfo worklist as its only argument.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

import pandas as pd

# Column names given to the header-less worklist file, in file order.
WORKLIST_COLUMNS = ['pool', 'analysis_type', 'exame', 'control', 'gender']
# Worklist gender codes and the labels written to the output file.
GENDER_LABELS = {'F': 'Female', 'M': 'Male', '-': 'None'}
# Columns holding the FASTQ file names of each pool.
READ_COLUMNS = ['read file 1', 'read file 2']
# Columns (and their order) written to each exam's tab-separated file.
OUTPUT_COLUMNS = ['read file 1', 'read file 2', 'sample name', 'gender']


def _load_worklist(worklist_path):
    """Read the worklist into a DataFrame indexed by pool name."""
    worklist_df = pd.read_csv(
        worklist_path, header=None, names=WORKLIST_COLUMNS,
        # Pool names are compared with file-name prefixes, so keep them as
        # text even when they look like numbers.
        dtype={'pool': str})
    worklist_df.set_index(['pool'], inplace=True)
    worklist_df['sample name'] = worklist_df.index
    # None (not 0) marks "no file found yet": the columns stay object typed,
    # so file names can be assigned without a dtype change.
    for column in READ_COLUMNS:
        worklist_df[column] = None
    worklist_df['gender'] = worklist_df['gender'].replace(GENDER_LABELS)
    return worklist_df


def _collect_fastqs(worklist_df, top="."):
    """Record every FASTQ found under *top* against its worklist pool.

    The pool is the part of the file name before the first underscore. A
    file whose stem contains "R1" goes to 'read file 1', any other FASTQ to
    'read file 2'. Files whose pool is not in the worklist are ignored, so
    no rows are added to *worklist_df*.

    Returns a dict mapping each recorded file name to the path where it was
    found, so files located in subdirectories can still be moved.
    """
    found_paths = {}
    for root, _dirs, file_names in os.walk(top):
        for file_name in file_names:
            if not file_name.endswith("fastq.gz"):
                continue
            pool_name = file_name.split("_")[0]
            if pool_name not in worklist_df.index:
                continue
            stem = file_name.split(".")[0]
            column = READ_COLUMNS[0] if "R1" in stem else READ_COLUMNS[1]
            worklist_df.loc[pool_name, column] = file_name
            found_paths[file_name] = os.path.join(root, file_name)
    return found_paths


def main(argv=None):
    """Sort paired FASTQs into per-exam folders and write the QIAGEN files.

    *argv* is the argument list without the program name (defaults to
    ``sys.argv[1:]``); its first item is the path of the worklist file.
    FASTQs are searched for under the current directory. For every exam
    that has at least one pool with both read files, a folder named after
    the exam is created in the current directory, the read files are moved
    into it, and ``qiagen-<exam>.txt`` (tab separated) is written there.
    Exits with a usage message when no worklist is given.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: %s <bioinfo worklist>" % sys.argv[0])

    worklist_df = _load_worklist(args[0])
    found_paths = _collect_fastqs(worklist_df)

    # Keep only pools for which both reads were found.
    has_both_reads = (worklist_df[READ_COLUMNS[0]].notna()
                      & worklist_df[READ_COLUMNS[1]].notna())
    paired_df = worklist_df[has_both_reads].fillna("None")

    for exame in paired_df['exame'].unique():
        exam_df = paired_df[paired_df['exame'] == exame]
        exam_dir = str(exame)
        # Check and create in the same place (the current directory).
        if not os.path.isdir(exam_dir):
            os.makedirs(exam_dir)
        # Move all FASTQ files to the exam folder.
        for _pool, row in exam_df.iterrows():
            for column in READ_COLUMNS:
                file_name = row[column]
                os.rename(found_paths[file_name],
                          os.path.join(exam_dir, file_name))
        # Create the txt file inside the exam folder.
        exam_df[OUTPUT_COLUMNS].to_csv(
            os.path.join(exam_dir, 'qiagen-%s.txt' % exame),
            index=False, sep='\t')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment