Skip to content

Instantly share code, notes, and snippets.

@sinamajidian
Created March 29, 2022 10:46
Show Gist options
  • Save sinamajidian/6b21476561704443bead0e90224f36d5 to your computer and use it in GitHub Desktop.
Save sinamajidian/6b21476561704443bead0e90224f36d5 to your computer and use it in GitHub Desktop.
Chop each read in a fastq file into few chunks
#!/usr/bin/python3
import numpy as np
from sys import argv
file_fq_input_addrss = argv[1]
file_fq_output_addrss = argv[2]
file_fq_input= open(file_fq_input_addrss,'r');
file_fq_ouput= open(file_fq_output_addrss,'w');
chunk_length = int(argv[3]) # 3000
record = []
for line in file_fq_input:
line_strip = line.strip()
# if line_strip.startswith("@"): # new recrod started
# record = [read_name_new_record]
if len(record)<4:
record.append(line_strip)
else:
read_name,sequence,pluse_line,quality = record
read_length = len(sequence)
if read_length != len(quality):
print("ERROR, length of sequence !=quality ",len(sequence), len(quality),read_name[:10])
# exit()
chunk_num = int(read_length/chunk_length)
for chunk_i in range(chunk_num):
if chunk_i < chunk_num-1 :
sequence_i = sequence[chunk_length*chunk_i:chunk_length*(chunk_i+1)]
quality_i = quality[chunk_length*chunk_i:chunk_length*(chunk_i+1)]
else:
sequence_i = sequence[chunk_length*chunk_i:]
quality_i = quality[chunk_length*chunk_i:]
file_fq_ouput.write(read_name+"_part_"+str(chunk_i)+"\n")
file_fq_ouput.write(sequence_i+"\n"+pluse_line+"\n"+quality_i+"\n")
read_name_new_record=line_strip
record = [read_name_new_record]
# last record
read_name,sequence,pluse_line,quality = record
read_length = len(sequence)
chunk_num = int(read_length/chunk_length)+1
for chunk_i in range(chunk_num):
if chunk_i < chunk_num-1:
sequence_i = sequence[chunk_length*chunk_i:chunk_length*(chunk_i+1)]
quality_i = quality[chunk_length*chunk_i:chunk_length*(chunk_i+1)]
else:
sequence_i = sequence[chunk_length*chunk_i:]
quality_i = quality[chunk_length*chunk_i:]
file_fq_ouput.write(read_name+"_part_"+str(chunk_i)+"\n")
file_fq_ouput.write(sequence_i+"\n"+pluse_line+"\n"+quality_i+"\n")
file_fq_input.close()
file_fq_ouput.close()
print("The chopped version is in ",file_fq_output_addrss)

The goal of this code is to chop each read in a fastq file into few chunks.

python3 read_chop.py input.fq  output.fq 3000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment