dansondergaard · May 25, 2017 23:28
diff --git a/bioconv.py b/bioconv.py
 """Converting fasta files to data frames and back again."""

 import pandas as pd
 import skbio as sk


 def fasta_to_dataframe(path, id_col='id', description_col='description', sequence_col='sequence'):
    """Load the entries of a FASTA file into a pandas data frame.

    Every entry read from ``path`` becomes one row with three columns: the
    entry's ``'id'`` metadata, its ``'description'`` metadata, and the entry
    converted to ``str``. The column labels are taken from ``id_col``,
    ``description_col`` and ``sequence_col``, in that order.

    This function is the inverse of :func:`dataframe_to_fasta`.
    """
    # The generator keeps the conversion lazy: each entry is turned into a
    # plain tuple only when pandas pulls it.
    records = (
        (seq.metadata['id'], seq.metadata['description'], str(seq))
        for seq in sk.io.read(path, format='fasta')
    )
    return pd.DataFrame.from_records(
        data=records,
        columns=[id_col, description_col, sequence_col],
    )


 def dataframe_to_fasta(df, path, id_col='id', description_col='description', sequence_col='sequence'):
    """Write a pandas data frame to a FASTA file.

    Each row of ``df`` is turned into an ``sk.Sequence`` built from the value
    in ``sequence_col``, with the values in ``id_col`` and ``description_col``
    stored as the ``'id'`` and ``'description'`` metadata. The sequences are
    handed to ``sk.io.write`` with ``format='fasta'`` and ``into=path``.

    This function is the inverse of :func:`fasta_to_dataframe`.

    Raises ``KeyError`` if one of the named columns is missing from ``df``.
    """
    def _make_sequence(row):
        return sk.Sequence(
            row[sequence_col],
            metadata={'id': row[id_col], 'description': row[description_col]}
        )
    # iterrows() yields (index, row) pairs; only the row Series can be
    # indexed by column label, so the index is discarded here.
    data = (_make_sequence(row) for _, row in df.iterrows())
    sk.io.write(data, format='fasta', into=path)

    
 def parse_cdhit(path):
    """Parse a CD-HIT clustering file.

    Return a tuple `(assignments, representatives)` where `assignments` is
    a dictionary mapping sequence identifiers to cluster names, and
    `representatives` is a dictionary mapping cluster names to the sequence
    identifier of the representative sequence.

    Lines starting with ``>`` open a new cluster; the cluster name is the
    rest of that line, stripped. Every other non-blank line is a member line:
    the sequence identifier is the text between the first ``>`` and the next
    ``...``, and a line ending in ``*`` marks the cluster representative.
    Blank lines are ignored.

    Raises ``ValueError`` if a non-blank member line has no ``>`` or no
    ``...`` after it.

    TODO: Change this to return data frames instead.
    """
    assignments = {}
    representatives = {}
    with open(path) as fileobj:
        current_cluster = None
        for line in fileobj:
            if not line.strip():
                # Blank (e.g. trailing) lines carry no record; without this
                # guard line.index('>') below would raise ValueError.
                continue
            if line.startswith('>'):
                current_cluster = line[1:].strip()
                continue
            start = line.index('>') + 1
            # Look for the '...' terminator only after the '>' so that an
            # earlier '...' on the line cannot yield a bogus slice.
            acc = line[start:line.index('...', start)]
            if line.strip().endswith('*'):
                representatives[current_cluster] = acc
            assignments[acc] = current_cluster
    return assignments, representatives
	"""Converting fasta files to data frames and back again."""

	import pandas as pd
	import skbio as sk


	def fasta_to_dataframe(path, id_col='id', description_col='description', sequence_col='sequence'):
	    """Read a FASTA file into a pandas data frame.

	    Every entry read from ``path`` becomes one row holding the entry's
	    ``'id'`` metadata, its ``'description'`` metadata, and the entry
	    converted to ``str``. The columns are labelled ``id_col``,
	    ``description_col`` and ``sequence_col``, in that order.

	    This function is the inverse of :func:`dataframe_to_fasta`.
	    """
	    def _make_tuple(entry):
	        return (entry.metadata['id'], entry.metadata['description'], str(entry))
	    entries = sk.io.read(path, format='fasta')
	    columns = [id_col, description_col, sequence_col]
	    # Generator keeps the per-entry conversion lazy until pandas consumes it.
	    data = (_make_tuple(entry) for entry in entries)
	    return pd.DataFrame.from_records(columns=columns, data=data)


	def dataframe_to_fasta(df, path, id_col='id', description_col='description', sequence_col='sequence'):
	    """Write a pandas data frame to a FASTA file.

	    Each row of ``df`` is turned into an ``sk.Sequence`` built from the
	    value in ``sequence_col``, with the values in ``id_col`` and
	    ``description_col`` stored as the ``'id'`` and ``'description'``
	    metadata. The sequences are handed to ``sk.io.write`` with
	    ``format='fasta'`` and ``into=path``.

	    This function is the inverse of :func:`fasta_to_dataframe`.

	    Raises ``KeyError`` if one of the named columns is missing from ``df``.
	    """
	    def _make_sequence(row):
	        return sk.Sequence(
	            row[sequence_col],
	            metadata={'id': row[id_col], 'description': row[description_col]}
	        )
	    # iterrows() yields (index, row) pairs; only the row Series can be
	    # indexed by column label, so the index is discarded here.
	    data = (_make_sequence(row) for _, row in df.iterrows())
	    sk.io.write(data, format='fasta', into=path)


	def parse_cdhit(path):
	    """Parse a CD-HIT clustering file.

	    Return a tuple `(assignments, representatives)` where `assignments` is
	    a dictionary mapping sequence identifiers to cluster names, and
	    `representatives` is a dictionary mapping cluster names to the sequence
	    identifier of the representative sequence.

	    Lines starting with ``>`` open a new cluster; the cluster name is the
	    rest of that line, stripped. Every other non-blank line is a member
	    line: the sequence identifier is the text between the first ``>`` and
	    the next ``...``, and a line ending in ``*`` marks the cluster
	    representative. Blank lines are ignored.

	    Raises ``ValueError`` if a non-blank member line has no ``>`` or no
	    ``...`` after it.

	    TODO: Change this to return data frames instead.
	    """
	    assignments = {}
	    representatives = {}
	    with open(path) as fileobj:
	        current_cluster = None
	        for line in fileobj:
	            if not line.strip():
	                # Blank (e.g. trailing) lines carry no record; without this
	                # guard line.index('>') below would raise ValueError.
	                continue
	            if line.startswith('>'):
	                current_cluster = line[1:].strip()
	                continue
	            start = line.index('>') + 1
	            # Look for the '...' terminator only after the '>' so that an
	            # earlier '...' on the line cannot yield a bogus slice.
	            acc = line[start:line.index('...', start)]
	            if line.strip().endswith('*'):
	                representatives[current_cluster] = acc
	            assignments[acc] = current_cluster
	    return assignments, representatives
No results found