Last active
May 25, 2017 23:28
-
-
Save dansondergaard/1a2996f9cf6c19281759f160e7dbcfd5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Converting fasta files to data frames and back again.""" | |
| import pandas as pd | |
| import skbio as sk | |
def fasta_to_dataframe(path, id_col='id', description_col='description', sequence_col='sequence'):
    """Load the records of a FASTA file as rows of a pandas data frame.

    Every record read from ``path`` contributes one row made of its ``'id'``
    metadata, its ``'description'`` metadata and the record rendered with
    ``str()``. The three resulting columns are named by ``id_col``,
    ``description_col`` and ``sequence_col`` respectively.

    Counterpart of :func:`dataframe_to_fasta`.
    """
    # Rows are produced lazily; from_records consumes the generator.
    rows = (
        (record.metadata['id'], record.metadata['description'], str(record))
        for record in sk.io.read(path, format='fasta')
    )
    header = [id_col, description_col, sequence_col]
    return pd.DataFrame.from_records(data=rows, columns=header)
def dataframe_to_fasta(df, path, id_col='id', description_col='description', sequence_col='sequence'):
    """Write a pandas data frame to a FASTA file.

    Each row of ``df`` becomes one record: the value in ``sequence_col`` is
    the sequence, while the values in ``id_col`` and ``description_col`` are
    stored as the record's ``'id'`` and ``'description'`` metadata. The
    records are written to ``path`` in FASTA format.

    This function is the inverse of :func:`fasta_to_dataframe`.
    """
    def _make_sequence(row):
        return sk.Sequence(
            row[sequence_col],
            metadata={'id': row[id_col], 'description': row[description_col]}
        )

    # DataFrame.iterrows() yields (index, row) pairs; the index is not part
    # of the FASTA record, so only the row is passed on. Passing the whole
    # pair would fail when it is indexed by column name.
    data = (_make_sequence(row) for _, row in df.iterrows())
    sk.io.write(data, format='fasta', into=path)
def parse_cdhit(path):
    """Parse a CD-HIT clustering file.

    A line starting with ``>`` opens a new cluster; the rest of that line
    (stripped) is the cluster name. Every other non-blank line describes a
    member of the current cluster: its sequence identifier is the text
    between the first ``>`` and the ``...`` that follows it, and a line
    whose stripped form ends with ``*`` marks the cluster's representative.
    Blank lines are ignored.

    Return a tuple `(assignments, representatives)` where `assignments` is
    a dictionary mapping sequence identifiers to cluster names, and
    `representatives` is a dictionary mapping cluster names to the sequence
    identifier of the representative sequence.

    Raises ``ValueError`` if a non-blank member line lacks the ``>`` or
    the subsequent ``...`` delimiter.

    TODO: Change this to return data frames instead.
    """
    assignments = {}
    representatives = {}
    current_cluster = None
    with open(path) as fileobj:
        for line in fileobj:
            # Tolerate empty lines (e.g. a trailing newline at end of file)
            # instead of failing on the delimiter lookup below.
            if not line.strip():
                continue
            if line.startswith('>'):
                current_cluster = line[1:].strip()
                continue
            # Search for the terminator only after the opening '>' so the
            # two delimiters can never be matched out of order.
            start = line.index('>') + 1
            acc = line[start:line.index('...', start)]
            if line.strip().endswith('*'):
                representatives[current_cluster] = acc
            assignments[acc] = current_cluster
    return assignments, representatives
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment