Last active
July 3, 2024 21:19
-
-
Save fomightez/8cd6d9ba88f975b64e43eba562894dec to your computer and use it in GitHub Desktop.
snippets for dealing with FASTA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is not presently all encompassing as it was started well after my sequence work repo | |
# at https://github.com/fomightez/sequencework , where much of this related code is. | |
# For making FASTA files/entriees out of dataframes, see 'specific dataframe contents saved as formatted text file example' | |
# in my useful pandas snippets gist https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4 | |
# For reading in FASTA files/entries | |
# -See `mine_mito_features` function in `mine_mito_features_from_transcriptome.py` | |
# https://github.com/fomightez/sequencework/blob/master/Extract_Details_or_Annotation/mine_mito_features_from_transcriptome.py | |
# -See `get_fasta_seq` and `get_seq_from_URL` functions in `find_sequence_element_occurrences_in_sequence.py` in | |
# the https://github.com/fomightez/sequencework/tree/master/FindSequence repo | |
# (direct url: https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py ) | |
# From `patmatch-binder`, `specifically PatMatch nucleic handling flags demystified.ipynb`, and | |
# highly based on similar handling in `find_sequence_element_occurrences_in_sequence.py` | |
# (https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py) & | |
# now adapted into a proper (slightly better) script with command line provideable arguments at | |
# https://github.com/fomightez/sequencework/blob/master/ConvertSeq/convert_fasta_to_reverse_complement.py : | |
## READ IN FASTA FILE AND CONVERT TO REVERSE COMPLEMENT USING BIOPYTHON ** | |
from Bio import SeqIO | |
from Bio.Seq import Seq # for reverse complement | |
def get_seq_from_file(file_name): | |
''' | |
takes a file name and gets the sequence RECORD. | |
''' | |
fasta_iterator = SeqIO.parse(file_name, "fasta") | |
# Use of `next()` on next line to get first FASTA -formatted sequence is | |
# based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html | |
# I think difference from `SeqIO.read()` in this approach is that it won't | |
# give an error if more than one entry is in the file. | |
record = next(fasta_iterator) | |
#return record.seq | |
#return record.id | |
return record | |
# Read FASTA file | |
input_file_name = "chr15.fsa" | |
sequence_record = get_seq_from_file(input_file_name) | |
#print(sequence_record) #FOR DEBUGGING | |
# Read multi FASTA file ((in other words single FASTA file with multiple sequences/records multiFASTA) | |
records = [] | |
for record in SeqIO.parse("TAIR9_pep_20090619.fa", "fasta"): | |
records.append(record) | |
thale_cress_prot_length = [] | |
for record in records: | |
#print (len(record.seq)) | |
thale_cress_prot_length.append(len(record.seq)) | |
# Get reverse complement | |
seq_rev_compl_record = sequence_record.reverse_complement() | |
seq_rev_compl_record.id = sequence_record.id #record needs id for writing FASTA | |
# or even easier, see https://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html#reverse_complement | |
# and use argruments to keep id and description when reverse complementing | |
seq_rev_compl_record = sequence_record.reverse_complement(id=True,description=True) | |
#print(seq_rev_compl_record) #FOR DEBUGGING | |
# Save FASTA file for reverse complement | |
output_file_name = "chr15.revcompl.fa" | |
SeqIO.write(seq_rev_compl_record, output_file_name, "fasta"); | |
# Save multi fasta (in other words single FASTA file with multiple sequences/records) | |
SeqIO.write(records,file_name, "fasta"); | |
# based on https://www.biostars.org/p/48797/#48801 "If that's the case then | |
# note SeqIO.write() can take a list or a generator of SeqRecords so you should pass one of those" | |
# related to above , but dealing with URL source or file too: | |
def get_seq_from_URL(url): | |
''' | |
takes a URL and gets the sequence | |
''' | |
try: | |
from StringIO import StringIO | |
except ImportError: | |
from io import StringIO | |
# Getting html originally for just Python 3, adapted from | |
# https://stackoverflow.com/a/17510727/8508004 and then updated from to | |
# handle Python 2 and 3 according to same link. | |
try: | |
# For Python 3.0 and later | |
from urllib.request import urlopen | |
except ImportError: | |
# Fall back to Python 2's urllib2 | |
from urllib2 import urlopen | |
html = urlopen(url) | |
fasta_iterator = SeqIO.parse(StringIO( | |
html.read().decode(encoding='UTF-8')), "fasta") | |
# Use of `next()` on next line to get first FASTA -formatted sequence is | |
# based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html | |
# I think difference from `SeqIO.read()` in this approach is that it won't | |
# give an error if more than one entry is in the html. | |
# I found I needed `StringIO()` or got issues with trying to handle long file name. | |
record = next(fasta_iterator) | |
return record.seq | |
def get_fasta_seq(source): | |
''' | |
Takes a source URL or filepath/ file name and gets sequence if it is in | |
FASTA format. | |
It won't return anything if what is provided isn't FASTA format because | |
Biopython handles both trying to extract FASTA from URL and from file. | |
See https://stackoverflow.com/a/44294079/8508004. | |
Placing it in a function it easy to then check and provide some feedback. | |
''' | |
if source.lower().startswith("http"): | |
return get_seq_from_URL(source) | |
else: | |
# Read sequence, treating source as a filepath. | |
# Use of `with` on next line based on http://biopython.org/wiki/SeqIO , | |
# under "Sequence Input". Otherwise, backbone based on | |
# https://www.biostars.org/p/209383/, and fact `rU` mode depecated. | |
with open(source, "r") as handle: | |
for record in SeqIO.parse(handle, "fasta"): | |
# print(record.seq) # for debugging | |
return record.seq | |
# remove gaps and make a string into a SeqRecord that can be written to FASTA with `SeqIO.write(records,file_name, "fasta")` | |
from Bio.Alphabet import generic_dna | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
new_record = SeqRecord(Seq(text_string, generic_dna).ungap("-"), id= id_, description=id_descript))# based | |
# on https://www.biostars.org/p/48797/ and `.ungap()` method, see | |
# https://github.com/biopython/biopython/issues/1511 , and `description` | |
# from what I've seen for `id` plus https://biopython.org/wiki/SeqIO | |
# Note that you can leave out `id=` part and just put the necessary string in that | |
# position in function call because it is a required positoinal argument | |
#If the 'sequence' is already a Bio.Seq.Seq object, it can be made into a record with an id, or optionnally also a description | |
new_record = SeqRecord(seq_obj, "XS56782")) | |
#-or- | |
new_record = SeqRecord(seq_obj, id= "XS56782", description=id_descript)) | |
#note that a `record.decscription` will also contain the `record.id`; can be removed with `description_wo_id=record.description.split(record.id,1)[1]` so it won't show up twice | |
# Editing description line of FASTA from within a jupyter notebooks cell | |
# add identifiers to each `chr` so results for each strain clear later | |
chromosome_id_prefix = "chr" | |
def add_strain_id_to_description_line(file,strain_id): | |
''' | |
Takes a file and edits every description line to add | |
strain_id after the caret. | |
Saves the fixed file | |
''' | |
import sys | |
output_file_name = "temp.txt" | |
# prepare output file for saving so it will be open and ready | |
with open(output_file_name, 'w') as output_file: | |
# read in the input file | |
with open(file, 'r') as input_handler: | |
# prepare to give feeback later or allow skipping to certain start | |
lines_processed = 0 | |
for line in input_handler: | |
lines_processed += 1 | |
if line.startswith(">"): | |
rest_o_line = line.split(">") | |
new_line = ">"+strain_id + rest_o_line[1] | |
else: | |
new_line = line | |
# Send text to output | |
output_file.write(new_line) | |
# replace the original file with edited | |
!mv temp.txt {file} | |
# Feedback | |
sys.stderr.write("\n{} chromosome identifiers tagged.".format(file)) | |
for s in yue_et_al_strains: | |
add_strain_id_to_description_line(s+".genome.fa",s) | |
#function to get description line text from a FASTA file | |
def get_descriptionline(filename): | |
''' | |
Takes the name of file in FASTA format and gets the first description line, | |
i.e., the text after the caret on the first line. | |
Returns the text after the '>' symbol. | |
''' | |
import sys | |
output_file_name = "temp.txt" | |
# prepare output file for saving so it will be open and ready | |
with open(output_file_name, 'w') as output_file: | |
# read in the input file | |
with open(filename, 'r') as input_handler: | |
# prepare to give feeback later or allow skipping to certain start | |
lines_processed = 0 | |
for line in input_handler: | |
lines_processed += 1 | |
if lines_process == 1 and line.startswith(">"): | |
return line.split(">")[0].strip() | |
# save the protein sequence as FASTA | |
chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to | |
# multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does) | |
# Note that I generalized the chunking to a function that can handle any type of | |
# residue in `get_seq_from_multiFASTA_with_match_in_description.py` | |
prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range( | |
0, len(prot_seq),chunk_size)] | |
prot_seq_fa = ">" + prot_descr + "\n"+ "\n".join(prot_seq_chunks) | |
p_output_file_name = strain_id +"_" + gene_name + "_protein_ortholog.fa" | |
with open(p_output_file_name, 'w') as output: | |
output.write(prot_seq_fa) | |
prot_seqs_fn_list.append(p_output_file_name) | |
sys.stderr.write("\n\nProtein sequence saved as " | |
"`{}`.".format(p_output_file_name)) | |
# above code made into function for working with pyfaidx; used in `blast-binder` repo to make it easier for users | |
# who already had a multi-FASTA sequence file of sequeneces of interest to plug into a workflow at a later | |
# point | |
def seq_to_file(prot_seq, description, name_suffix_to_use): | |
''' | |
function to save a sequence as a file in FASTA-format | |
''' | |
# save the protein sequence as FASTA | |
chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to | |
# multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does) | |
# Note that I generalized the chunking to a function that can handle any type of | |
# residue in `get_seq_from_multiFASTA_with_match_in_description.py` | |
prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range( | |
0, len(prot_seq),chunk_size)] | |
prot_seq_fa = ">" + description + "\n"+ "\n".join(prot_seq_chunks) | |
p_output_file_name = description.split()[0] + name_suffix_to_use | |
with open(p_output_file_name, 'w') as output: | |
output.write(prot_seq_fa) | |
sys.stderr.write("\n\nProtein sequence saved as " | |
"`{}`.".format(p_output_file_name)) | |
from pyfaidx import Fasta | |
records = Fasta(sequence) | |
for seq in records: | |
# make individual file (also see `!faidx --split-files {seq_file}` below) | |
seq_to_file(str(seq), seq.long_name, "_protein_ortholog.fa") # I am using `seq.long_name` here because it give more options for adapting the code to make a file name one prefers; however, in developing some other code I became aware that if the FASTA files are non-standard and have an empty line above the description line, that `seq.long_name` will included the caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name, just change here to `seq.name` | |
# do SOMETHING WITH THAT FILE | |
''' | |
BLOCK TO DO SOMETHING HERE | |
''' | |
# One metric to assess is count all the letters present (because shows number of Ns too) | |
# [BASIC VERSION] | |
from pyfaidx import Fasta | |
import collections | |
for g in genomes: | |
concatenated_seqs = "" | |
chrs = Fasta(g) | |
for x in chrs: | |
#print(x.name) | |
concatenated_seqs += str(x) | |
print(g,":",collections.Counter(concatenated_seqs)) | |
# ['EXPANDED' VERSION] | |
from pyfaidx import Fasta | |
import pandas as pd | |
import collections | |
nt_counts = {} | |
for g in genomes: | |
strain_id = g.split(".genome.fa")[0] | |
concatenated_seqs = "" | |
chrs = Fasta(g) | |
for x in chrs: | |
#print(x.name) | |
concatenated_seqs += str(x) | |
nt_counts[strain_id] = collections.Counter(concatenated_seqs) | |
nt_count_df = pd.DataFrame.from_dict(nt_counts, orient='index').fillna(0) | |
nt_count_df["Total_nts"] = nt_count_df.sum(1) | |
def percent_calc(items): | |
''' | |
takes a list of two items and calculates percentage of first item | |
within total (second item) | |
''' | |
return items[0]/items[1] | |
nt_count_df['% N'] = nt_count_df[['N','Total_nts']].apply(percent_calc, axis=1) | |
nt_count_df = nt_count_df.style.format({'Total_nts':'{:.2E}','% N':'{:.2%}'}) | |
# Alternatives for handling description lines, i.e.,lines before sequence beginning with `>`, with pyfaidx: | |
# By default, `.name` attribute just goes to first encountered space in entry description line. You can access full | |
# description with `.long_name` attribute. <===NOTE while developing some other code I became aware that if the FASTA | |
# files are non-standard and have an empty line above the description line, that `seq.long_name` will included the | |
# caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name, | |
# it may be easiest to just change to `seq.name` or strip off `>` if the starts with `>` because the first one won't have it. | |
from pyfaidx import Fasta | |
genes = Fasta('tests/data/genes.fasta') | |
for record in genes: | |
print(record.name) | |
print(record.long_name) | |
# new in v0.4.9, you can specify the `.name` attribute be the full description line if you specify `read_long_names=True`. | |
from pyfaidx import Fasta | |
genes = Fasta('tests/data/genes.fasta', read_long_names=True) | |
for record in genes: | |
print(record.name) | |
# You can also filter out based on the contents of the description line using 'filt_function'. The have | |
# simple example in the documentation now that filters on first letter in that line. Below is fancier | |
# where filters records where description line ends in patterns like '0-0' or '4182-4182'. | |
seqrecords = Fasta(fn, read_long_names=True, | |
filt_function = lambda x: x.split( | |
"extracted_coordinates: ")[1].split("-")[0] != x.split( | |
"extracted_coordinates: ")[1].split("-")[1]) | |
# Note that in this case I had details on the sequence in the description line. This won't always | |
# be the case so I could just use an if statement after that to conditionally skip those where | |
# sequence in the record isn't greater than 1, example `if len(record) > 1:`. I wanted to avoid | |
# adding yet another if conditional to the processing I was doing where I chose to use `filt_function` to | |
# filter. It seems I could also interate on the `Fasta` | |
# object only collecting to a new list the records where the length of the sequence was greater than 1 and then | |
# iterate over that list of records. Those should be fine for iterating even if I haven't figured out | |
# how to cast the list of records (each `<class 'pyfaidx.FastaRecord'>`) back to the the `class 'pyfaidx.Fasta'` | |
# count frequency of blocks of Ns in a string (presumably sequence) | |
import re | |
from collections import defaultdict | |
t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN" | |
matches = [] | |
len_match_dict = defaultdict(int) | |
min_number_Ns_in_row_to_collect = 1 | |
pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adpated from | |
# code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on | |
# https://stackoverflow.com/a/250306/8508004 | |
for m in pattern_obj.finditer(t): | |
len_match_dict[len(m.group())] += 1 | |
matches.append(m.group()) | |
print(len_match_dict) | |
print(collections.Counter(matches)) | |
# Split a file using pyfaidx command line utility that is available in Jupyter when install pyfaidx module (note this may not split every fasta entry in a multifasta because it seems to keep regions together, like faSplit seems to maybe do, too. See https://github.com/fomightez/Fasta2Structure-cli/blob/4bbc18dff4d487ca3eeaeaad9511a1767a6eea4d/binder/postBuild) | |
seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa" | |
!faidx --split-files {seq_file} | |
# Package up a lot of sequence files (FASTA) into one archive for easy downloading , PLUS DEMONSTRATES | |
# merging a lot of inidvidual sequence files into one FASTA file (for Jupyter session) | |
#Archive the sequences (FASTA format) collected and any dataframes made | |
# Pickle each dataframe and also save as `tsv` for possible use elsewhere | |
strd_dataframes_fn_list = [] | |
def pickle_df_and_store_as_table(dataframe, prefix): | |
''' | |
Take a dataframe and a filename prefix and save a pickled form of that | |
dataframe and a text tablular data version (tab-sepearated values). | |
Returns the name of the pickled and text file. | |
''' | |
dataframe.to_pickle(prefix + ".pkl") | |
dataframe.to_csv(prefix + ".tsv", sep='\t',index = False) | |
return prefix + ".pkl", prefix + ".tsv" | |
# To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix | |
# associated as the value | |
df_n_fn_dict = { | |
"CTD_seq_of_protein_orthologs": CTD_seq_df, | |
"first_heptad_of_protein_orthologs": first_7_df, | |
"heptads_ofCTD_seq_of_protein_orthologs": repeat_df, | |
"main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df, | |
"fraction_matching_consensus_per_CTD": fraction_consensus_df, | |
} | |
import pandas as pd | |
for prefix, dataframe in df_n_fn_dict.items(): | |
#pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix) | |
strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix)) | |
# store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling | |
CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json" | |
import json | |
with open(CTD_seqs_fn_list_storedfn, 'w') as f: | |
json.dump(CTD_seqs_fn_list, f) | |
#for ease in aligning or other uses later save the all the CTDs as a concatenated file | |
cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa" | |
!cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn} | |
archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn] | |
archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz" | |
!tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command | |
sys.stderr.write("\nCollected CTD sequences" | |
" and tables of details gathered and saved as " | |
"`{}`.".format(archive_file_name)) | |
# Generate / create/ make multi-record FASTA file (multiFASTA. i.e., single FASTA file with multiple sequence entries) with mock / fake / simulate/ random sequence | |
#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/nullseq.py | |
#!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/NullSeq_Functions.py | |
#%run nullseq.py -l 436 | |
!pip install fire | |
!curl -O https://raw.githubusercontent.com/mauriceling/bactome/master/randomseq.py | |
#!python randomseq.py FLS -- --help #???<-- That didn't match typical Python help syntax I was familar with but it is noted at https://github.com/google/python-fire | |
!python randomseq.py FLS --length=706 --n=3 > mock_seqs.fa | |
# store FASTA seqeunces as strings within Python scripts but get the sequence back for length | |
# or sequence as a string | |
def stringFASTA2seq(s): | |
''' | |
Takes a FASTA file contents that are currently as a Python string and | |
converts it to the sequence string. In other words, it removes the | |
description line and the line breaks and just makes a sequence string. | |
''' | |
l = s.split("\n") | |
return "".join(l[1:]) | |
example_sequence = '''>OMEGAminusHEG S.cerevisiae omega with homing endonuclease gene and associated DNA removedSPECULATIVE | |
ATTTACCCCCTTGTCCCATTATATTGAAAAATATAATTATTCAATTAATTATTTAATTGA | |
AGTAAATTGGGTGAATTGCTTAGATATCCATATAGATAAAAATAATGGACAATAAGCAGC | |
GAAGCTTATAACAACTTTCATATATGTATATATACGGTTATAAGAACGTTCAACGACTAG | |
ATGATGAGTGGAGTTAACAATAATTCATCCACGAGCGCCCAATGTCGAATAAATAAAATA | |
TTAAATAAATTTTTATTTGATAATGATATAGTCTGAACAATATAGTAATATATTGAAGTA | |
ATTATTTAAATGTAATTACGATAACAAAAAATTTGA | |
''' | |
length_example_sequence = len(stringFASTA2seq(example_sequence)) | |
# see my file `about parsing fasta formatted sequence data in biopython.md` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment