Skip to content

Instantly share code, notes, and snippets.

@pansapiens
Last active February 20, 2025 21:45
Show Gist options
  • Save pansapiens/91057057beb5c9d4e38d6c5d809ad72e to your computer and use it in GitHub Desktop.
Save pansapiens/91057057beb5c9d4e38d6c5d809ad72e to your computer and use it in GitHub Desktop.
HMMER tblout / domtblout to pandas.DataFrame
#!/usr/bin/env python
# MIT License
# Copyright (c) 2022 Zebulun Arendsee (rhmmer code)
# Copyright (c) 2025 Andrew Perry (port to Python)
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
This is a quick LLM-guided port using parts of https://github.com/arendsee/rhmmer
There are alternatives, like parsing with Bio.SearchIO.HmmerIO or using PyHMMER.
This one is a no-nonsense tblout/domtblout to pandas.DataFrame converter.
>>> hits_df = parse_hmmer_output("my_hits.tblout")
>>> hits_df.head()
"""
from typing import Literal, Dict, Any, Union, TextIO
import logging
import sys
import argparse
from pathlib import Path
import pandas as pd
import numpy as np
def detect_hmmer_format(
    file: Union[str, Path, "TextIO"]
) -> Literal["tblout", "domtblout"]:
    """Detect if a file is in HMMER tblout or domtblout format based on header.

    The input is scanned line by line for a line containing
    ``--- full sequence ---``. If that line also contains ``this domain``
    the format is "domtblout"; if it contains ``best 1 domain`` it is
    "tblout".

    When a file-like object is given, it is rewound to position 0 before
    this function returns or raises, provided the object supports seeking.
    Objects without a usable ``seek`` (pipes, plain iterables of lines) are
    left as they are.

    Args:
        file: Path to HMMER output file, or a file-like object / iterable
            yielding ``str`` or ``bytes`` lines.

    Returns:
        Format type: either "tblout" or "domtblout"

    Raises:
        ValueError: If format cannot be detected
    """
    if isinstance(file, (str, Path)):
        with open(file) as f:
            return detect_hmmer_format(f)

    try:
        for line in file:
            # Binary streams yield bytes; decode so the substring tests work.
            if isinstance(line, bytes):
                line = line.decode()
            if "--- full sequence ---" in line:
                if "this domain" in line:
                    return "domtblout"
                if "best 1 domain" in line:
                    return "tblout"
    finally:
        # Rewind on success as well as failure so the caller can re-read the
        # stream from the top. Skip quietly when rewinding is impossible
        # instead of masking the real outcome with a seek error.
        seek = getattr(file, "seek", None)
        if seek is not None and getattr(file, "seekable", lambda: True)():
            seek(0)

    raise ValueError(
        "Could not detect HMMER output format - file may be empty or invalid"
    )
def parse_hmmer_output(
    file: Union[str, Path, "TextIO"],
    fmt_type: Union[Literal["tblout", "domtblout"], Literal["auto"]] = "auto",
) -> pd.DataFrame:
    """Parse HMMER tblout or domtblout format files into a pandas DataFrame.

    Lines starting with ``#`` are ignored. Every remaining line is split on
    whitespace into the fixed columns of the chosen format; any text after
    the last fixed column is kept verbatim as that row's description. Lines
    with fewer fields than the format requires (including blank lines) are
    skipped.

    A ``-`` in any fixed column becomes a missing value. Numeric columns
    are converted with unparsable entries coerced to missing, then cast to
    their declared dtype (``float`` or nullable ``"Int64"``). A
    ``description`` column is added only if at least one row carries a
    description; rows without one get a missing value there.

    Args:
        file: Path to HMMER output file, or a file-like object yielding
            ``str`` or ``bytes`` lines. A seekable file-like object is
            rewound to position 0 after reading.
        fmt_type: Format type, either "tblout", "domtblout" or "auto"
            (default). "auto" delegates to ``detect_hmmer_format``.

    Returns:
        DataFrame containing the parsed HMMER results

    Raises:
        ValueError: If ``fmt_type`` is not one of the supported values, or
            if auto-detection fails.
    """
    if fmt_type == "auto":
        fmt_type = detect_hmmer_format(file)
        logging.info(f"Auto-detected format: {fmt_type}")
    if fmt_type not in ("tblout", "domtblout"):
        # Previously any unknown value was silently parsed as domtblout.
        raise ValueError(
            f"Unknown HMMER format {fmt_type!r}: "
            "expected 'tblout', 'domtblout' or 'auto'"
        )

    # Column specifications for different formats. The dict order is the
    # column order in the file.
    tblout_dtypes: Dict[str, Any] = {
        "domain_name": str,
        "domain_accession": str,
        "query_name": str,
        "query_accession": str,
        "sequence_evalue": float,
        "sequence_score": float,
        "sequence_bias": float,
        "best_domain_evalue": float,
        "best_domain_score": float,
        # NOTE(review): "bis" looks like a misspelling of "bias", but this
        # key is the column name callers see, so it is left unchanged.
        "best_domain_bis": float,
        "domain_number_exp": float,
        "domain_number_reg": "Int64",
        "domain_number_clu": "Int64",
        "domain_number_ov": "Int64",
        "domain_number_env": "Int64",
        "domain_number_dom": "Int64",
        "domain_number_rep": "Int64",
        "domain_number_inc": str,
    }
    domtblout_dtypes: Dict[str, Any] = {
        "domain_name": str,
        "domain_accession": str,
        "domain_len": "Int64",
        "query_name": str,
        "query_accession": str,
        "qlen": "Int64",
        "sequence_evalue": float,
        "sequence_score": float,
        "sequence_bias": float,
        "domain_N": "Int64",
        "domain_of": "Int64",
        "domain_cevalue": float,
        "domain_ievalue": float,
        "domain_score": float,
        "domain_bias": float,
        "hmm_from": "Int64",
        "hmm_to": "Int64",
        "ali_from": "Int64",
        "ali_to": "Int64",
        "env_from": "Int64",
        "env_to": "Int64",
        "acc": float,
    }
    dtypes = tblout_dtypes if fmt_type == "tblout" else domtblout_dtypes
    n_fields = len(dtypes)

    # Read file and filter comment lines
    if isinstance(file, (str, Path)):
        with open(file) as f:
            lines = [line for line in f if not line.startswith("#")]
    else:
        lines = []
        for line in file:
            # Convert bytes to str if needed
            if isinstance(line, bytes):
                line = line.decode()
            if not line.startswith("#"):
                lines.append(line)
        # Reset file position for potential reuse, but only when the object
        # can actually be rewound (pipes and plain iterables cannot).
        seek = getattr(file, "seek", None)
        if seek is not None and getattr(file, "seekable", lambda: True)():
            seek(0)

    # Extract the main data fields. `descriptions` gets exactly one entry
    # per kept row so it always lines up with `data`.
    data = []
    descriptions = []
    for line in lines:
        # maxsplit leaves everything after the fixed columns as a single
        # trailing field, preserving whitespace inside the description.
        fields = line.strip().split(maxsplit=n_fields)
        if len(fields) < n_fields:
            continue
        data.append(fields[:n_fields])
        descriptions.append(fields[n_fields] if len(fields) > n_fields else None)

    # Create DataFrame
    df = pd.DataFrame(data, columns=list(dtypes.keys()))

    # Convert types
    for col, dtype in dtypes.items():
        cleaned = df[col].replace("-", pd.NA)
        if dtype is str:
            df[col] = cleaned
            continue
        numeric = pd.to_numeric(cleaned, errors="coerce")
        # Apply the declared dtype so the schema does not depend on which
        # values happen to be present (e.g. gaps turning ints into floats).
        target = float if dtype is float else "Int64"
        try:
            df[col] = numeric.astype(target)
        except (TypeError, ValueError):
            # E.g. a non-integral value in an integer column: keep the
            # coerced numeric data rather than failing the whole parse.
            df[col] = numeric

    # Add descriptions only when the input actually contained some.
    if any(text is not None for text in descriptions):
        df["description"] = descriptions
    return df
def read_tblout(file: Union[str, Path]) -> pd.DataFrame:
    """Load a HMMER tblout file as a DataFrame.

    Thin wrapper around ``parse_hmmer_output`` with the format fixed to
    "tblout", so no format auto-detection is attempted.

    Args:
        file: Path to the tblout file

    Returns:
        DataFrame containing the parsed results
    """
    hits = parse_hmmer_output(file, fmt_type="tblout")
    return hits
def read_domtblout(file: Union[str, Path]) -> pd.DataFrame:
    """Load a HMMER domtblout file as a DataFrame.

    Thin wrapper around ``parse_hmmer_output`` with the format fixed to
    "domtblout", so no format auto-detection is attempted.

    Args:
        file: Path to the domtblout file

    Returns:
        DataFrame containing the parsed results
    """
    domain_hits = parse_hmmer_output(file, fmt_type="domtblout")
    return domain_hits
def main():
    """Command-line entry point: convert a HMMER table to TSV.

    Reads the positional input file, parses it with ``parse_hmmer_output``
    using the format chosen via ``--format`` (auto-detected by default), and
    writes tab-separated output without the index to ``--output``, where
    ``-`` (the default) means stdout. Log messages go to stderr. Any
    exception raised while parsing or writing is logged and the process
    exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Parse HMMER tblout/domtblout files to TSV"
    )
    cli.add_argument("file", help="Input HMMER output file")
    cli.add_argument(
        "--format",
        "-f",
        default="auto",
        choices=["tblout", "domtblout", "auto"],
        help="HMMER output format (default: auto-detect)",
    )
    cli.add_argument(
        "--output",
        "-o",
        default="-",
        help="Output file (default: stdout)",
    )
    opts = cli.parse_args()

    # Keep diagnostics on stderr so they never mix with TSV sent to stdout.
    logging.basicConfig(
        stream=sys.stderr,
        format="%(levelname)s: %(message)s",
        level=logging.INFO,
    )

    try:
        table = parse_hmmer_output(opts.file, opts.format)
        # "-" selects stdout; anything else is passed to pandas as a path.
        destination = sys.stdout if opts.output == "-" else opts.output
        table.to_csv(destination, sep="\t", index=False)
    except Exception as err:
        logging.error(f"Error processing file: {err}")
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment