Created
October 27, 2017 09:45
-
-
Save avrilcoghlan/e7e8d49dcf6c716199ffba1a60321517 to your computer and use it in GitHub Desktop.
Python script to submit CRISPresso jobs on a compute farm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from collections import defaultdict | |
#====================================================================# | |
# define a function to read in a list of files: | |
def read_file_list(input_file_list): | |
file_list = [] | |
fileObj = open(input_file_list, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
file_list.append(line) | |
fileObj.close() | |
return(file_list) | |
#====================================================================# | |
# define a function to read in sequences from a file. | |
# The file has two columns, a barcode (1-96) in the first column, | |
# then a space, then the sequence (if any) in the second column. | |
def read_seqs(seq_file): | |
seq = defaultdict() | |
fileObj = open(seq_file, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
temp = line.split() | |
if len(temp) == 2: # there is some sequence for this barcode | |
barcode = int(temp[0]) # convert to an integer | |
sequence = temp[1] | |
assert(barcode not in seq) | |
seq[barcode] = sequence | |
fileObj.close() | |
return seq | |
#====================================================================# | |
def submit_crispresso_job(amplicon, gRNA, barcode, plate_number, fastq_dir, lane_id): | |
# get the current directory: | |
current_dir = os.getcwd() | |
# submit the CRISPresso job: | |
output_dir = "plate%d_barcode%d" % (plate_number, barcode) | |
fastq_file1 = "%s/%s#%d_1.fastq.gz" % (fastq_dir, lane_id, barcode) | |
fastq_file2 = "%s/%s#%d_2.fastq.gz" % (fastq_dir, lane_id, barcode) | |
# This uses a 50 bp window (suggested by Peter Keen). Patrick said "That way we are being generous in our definition of an indel event". | |
# This uses the --hide_mutations_outside_window_NHEJ option. Patrick said "I think '--hide_mutations_outside_window_NHEJ' is a great option. | |
# If it isn’t happening around the double stranded break site it probably isn’t happening.". | |
command1 = "/nfs/team87/farm3_lims2_vms/software/python_local/bin/CRISPResso -w 50 --hide_mutations_outside_window_NHEJ -o %s -r1 %s -r2 %s -a %s -g %s" % (output_dir, fastq_file1, fastq_file2, amplicon, gRNA) | |
# specify the bsub output and error file names: | |
bsub_out = "plate%d_barcode%d.o" % (plate_number, barcode) | |
bsub_err = "plate%d_barcode%d.e" % (plate_number, barcode) | |
# submit job, here I requested 1000 Mbyte of RAM | |
command2 = 'bsub -o %s -e %s -R "select[mem>1000] rusage[mem=1000]" -M1000 "%s"' % (bsub_out, bsub_err, command1) | |
print(command2) | |
os.system(command2) | |
return | |
#====================================================================# | |
# define a function to submit the CRISPresso jobs for a plate: | |
def submit_crispresso_jobs_for_plate(amplicon_file, gRNA_file, plate_number, fastq_dir, lane_id): | |
# first read in the amplicon sequences: | |
amplicons = read_seqs(amplicon_file) | |
# read in the gRNA sequences: | |
gRNAs = read_seqs(gRNA_file) | |
# submit the CRISPresso jobs for barcodes 1 - 96: | |
for barcode in range(1, 97): # index will go from 1 to 96 | |
if barcode in amplicons: | |
assert(barcode in gRNAs) | |
amplicon = amplicons[barcode] | |
gRNA = gRNAs[barcode] | |
# submit the CRISPResso job for this amplicon, barcode and gRNA: | |
submit_crispresso_job(amplicon, gRNA, barcode, plate_number, fastq_dir, lane_id) | |
else: | |
assert(barcode not in gRNAs) | |
return | |
#====================================================================# | |
def main(): | |
# check the command-line arguments: | |
if len(sys.argv) != 5 or os.path.exists(sys.argv[1]) == False or os.path.exists(sys.argv[2]) == False or os.path.exists(sys.argv[3]) == False: | |
print("Usage: %s input_amplicon_file_list input_gRNA_file_list fastq_dir lane_id" % sys.argv[0]) | |
sys.exit(1) | |
input_amplicon_file_list = sys.argv[1] # input list of amplicon files | |
input_gRNA_file_list = sys.argv[2] # input list of gRNA files | |
fastq_dir = sys.argv[3] # directory containing the fastq files | |
lane_id = sys.argv[4] # the lane id., eg. 23528_1 | |
# read in the amplicon files: | |
amplicon_files = read_file_list(input_amplicon_file_list) | |
# read in the gRNA files: | |
gRNA_files = read_file_list(input_gRNA_file_list) | |
# each plate has 96 wells, look at the data plate by plate: | |
# (each plate has an amplicon file, and a gRNA file) | |
for index, amplicon_file in enumerate(amplicon_files): | |
plate_number = index + 1 | |
# find the corresponding gRNA file: | |
gRNA_file = gRNA_files[index] | |
# submit the CRISPresso jobs for this plate: | |
submit_crispresso_jobs_for_plate(amplicon_file, gRNA_file, plate_number, fastq_dir, lane_id) | |
print("FINISHED") | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment