Skip to content

Instantly share code, notes, and snippets.

@dansondergaard
Last active May 25, 2017 23:28
Show Gist options
  • Select an option

  • Save dansondergaard/1a2996f9cf6c19281759f160e7dbcfd5 to your computer and use it in GitHub Desktop.

Select an option

Save dansondergaard/1a2996f9cf6c19281759f160e7dbcfd5 to your computer and use it in GitHub Desktop.
"""Converting fasta files to data frames and back again."""
import pandas as pd
import skbio as sk
def fasta_to_dataframe(path, id_col='id', description_col='description', sequence_col='sequence'):
    """Load the records of a FASTA file into a pandas data frame.

    Each record read from `path` becomes one row holding three values: the
    record's ``id`` metadata, its ``description`` metadata, and the sequence
    itself converted to a string. The resulting columns are named by
    `id_col`, `description_col` and `sequence_col`, in that order.

    This function is the inverse of :func:`dataframe_to_fasta`.
    """
    # Records are produced lazily; pandas consumes the generator directly.
    records = (
        (seq.metadata['id'], seq.metadata['description'], str(seq))
        for seq in sk.io.read(path, format='fasta')
    )
    return pd.DataFrame.from_records(
        records,
        columns=[id_col, description_col, sequence_col],
    )
def dataframe_to_fasta(df, path, id_col='id', description_col='description', sequence_col='sequence'):
    """Write a pandas data frame to a FASTA file.

    Every row of `df` is turned into one sequence record: the value in
    `sequence_col` is the sequence, and the values in `id_col` and
    `description_col` are stored as the record's ``id`` and ``description``
    metadata. The records are written to `path` in FASTA format.

    This function is the inverse of :func:`fasta_to_dataframe`.
    """

    def _make_sequence(row):
        return sk.Sequence(
            row[sequence_col],
            metadata={'id': row[id_col], 'description': row[description_col]}
        )

    # ``iterrows()`` yields ``(index, row)`` pairs; unpack them so the helper
    # receives the row itself rather than the tuple, which cannot be indexed
    # by column name.
    data = (_make_sequence(row) for _, row in df.iterrows())
    sk.io.write(data, format='fasta', into=path)
def parse_cdhit(path):
    """Parse a CD-HIT clustering file.

    A line starting with ``>`` opens a new cluster whose name is the rest of
    that line, stripped of surrounding whitespace. Every other non-blank line
    is treated as a member entry: the sequence identifier is the text between
    the first ``>`` and the first ``...`` on the line, and an entry whose
    line ends with ``*`` is recorded as the representative of the current
    cluster. Blank lines are ignored.

    Return a tuple `(assignments, representatives)` where `assignments` is
    a dictionary mapping sequence identifiers to cluster names, and
    `representatives` is a dictionary mapping cluster names to the sequence
    identifier of the representative sequence.

    Raises :class:`ValueError` if a non-blank member line contains no ``>``
    or no ``...``.

    TODO: Change this to return data frames instead.
    """
    assignments = {}
    representatives = {}
    current_cluster = None
    with open(path) as fileobj:
        for line in fileobj:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no entry and
                # would otherwise make the ``index`` lookups below fail.
                continue
            if line.startswith('>'):
                current_cluster = line[1:].strip()
                continue
            acc = line[line.index('>') + 1:line.index('...')]
            if stripped.endswith('*'):
                representatives[current_cluster] = acc
            assignments[acc] = current_cluster
    return assignments, representatives
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment