pansapiens · February 20, 2025 21:45
diff --git a/hmmer_tblout_parser.py b/hmmer_tblout_parser.py
 #!/usr/bin/env python
 # MIT License
 # Copyright (c) 2022 Zebulun Arendsee (rhmmer code)
 # Copyright (c) 2025 Andrew Perry (port to Python)

 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:

 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.

 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 """
 This is a quick LLM-guided port using parts of https://github.com/arendsee/rhmmer
 There are alternatives, like parsing with Bio.SearchIO.HmmerIO or using PyHMMER.
 This one is a no nonsense tblout/domtblout to a pandas.DataFrame converter.

 >>> hits_df = parse_hmmer_output("my_hits.tblout")
 >>> hits_df.head()
 """

 from typing import Literal, Dict, Any, Union, TextIO
 import logging
 import sys
 import argparse
 from pathlib import Path

 import pandas as pd
 import numpy as np


 def detect_hmmer_format(
    file: Union[str, Path, "TextIO"]
 ) -> Literal["tblout", "domtblout"]:
    """Detect if a file is in HMMER tblout or domtblout format based on header.

    The header line containing "--- full sequence ---" identifies the format:
    it additionally contains "this domain" for domtblout and "best 1 domain"
    for tblout.

    Args:
        file: Path to HMMER output file or file-like object yielding text or
            bytes lines. A file-like object is rewound to position 0 before
            this function returns or raises (when it supports seeking), so
            it can be read again afterwards.

    Returns:
        Format type: either "tblout" or "domtblout"

    Raises:
        ValueError: If format cannot be detected
    """
    if isinstance(file, (str, Path)):
        with open(file) as f:
            return detect_hmmer_format(f)

    try:
        for line in file:
            if isinstance(line, bytes):
                line = line.decode()
            if "--- full sequence ---" not in line:
                continue
            if "this domain" in line:
                return "domtblout"
            if "best 1 domain" in line:
                return "tblout"
    finally:
        # Rewind on every exit path, not only on failure, so the caller gets
        # the stream back at its start. Streams that cannot seek (pipes,
        # closed files, plain iterables) are left as they are.
        try:
            file.seek(0)
        except (AttributeError, OSError, ValueError):
            pass
    raise ValueError(
        "Could not detect HMMER output format - file may be empty or invalid"
    )


 def parse_hmmer_output(
    file: Union[str, Path, "TextIO"],
    fmt_type: Union[Literal["tblout", "domtblout"], Literal["auto"]] = "auto",
 ) -> pd.DataFrame:
    """Parse HMMER tblout or domtblout format files into a pandas DataFrame.

    Lines starting with "#" are ignored, as are lines with fewer
    whitespace-separated fields than the format defines. A "-" in any fixed
    column is treated as missing. Whatever follows the last fixed column is
    kept verbatim as that row's description.

    Args:
        file: Path to HMMER output file or file-like object yielding text or
            bytes lines. A file-like object is rewound to position 0 after
            reading when it supports seeking.
        fmt_type: Format type, either "tblout", "domtblout" or "auto" (default)

    Returns:
        DataFrame containing the parsed HMMER results. Numeric columns are
        cast to float64 or nullable Int64 as declared in the column
        specification. A "description" column is present when at least one
        row carries description text; rows without one hold a missing value.

    Raises:
        ValueError: If fmt_type is not a supported value, or if
            auto-detection cannot identify the format.
    """
    if fmt_type == "auto":
        fmt_type = detect_hmmer_format(file)
        logging.info(f"Auto-detected format: {fmt_type}")

    # Column specifications for different formats
    tblout_dtypes: Dict[str, Any] = {
        "domain_name": str,
        "domain_accession": str,
        "query_name": str,
        "query_accession": str,
        "sequence_evalue": float,
        "sequence_score": float,
        "sequence_bias": float,
        "best_domain_evalue": float,
        "best_domain_score": float,
        # NOTE(review): "bis" looks like a typo for "bias"; the key is kept
        # unchanged because it is a public column name of the result.
        "best_domain_bis": float,
        "domain_number_exp": float,
        "domain_number_reg": "Int64",
        "domain_number_clu": "Int64",
        "domain_number_ov": "Int64",
        "domain_number_env": "Int64",
        "domain_number_dom": "Int64",
        "domain_number_rep": "Int64",
        "domain_number_inc": str,
    }

    domtblout_dtypes: Dict[str, Any] = {
        "domain_name": str,
        "domain_accession": str,
        "domain_len": "Int64",
        "query_name": str,
        "query_accession": str,
        "qlen": "Int64",
        "sequence_evalue": float,
        "sequence_score": float,
        "sequence_bias": float,
        "domain_N": "Int64",
        "domain_of": "Int64",
        "domain_cevalue": float,
        "domain_ievalue": float,
        "domain_score": float,
        "domain_bias": float,
        "hmm_from": "Int64",
        "hmm_to": "Int64",
        "ali_from": "Int64",
        "ali_to": "Int64",
        "env_from": "Int64",
        "env_to": "Int64",
        "acc": float,
    }

    column_specs = {"tblout": tblout_dtypes, "domtblout": domtblout_dtypes}
    if fmt_type not in column_specs:
        # Previously any unknown value was silently parsed as domtblout.
        raise ValueError(
            f"Unknown fmt_type {fmt_type!r}; expected 'tblout', 'domtblout' or 'auto'"
        )
    dtypes = column_specs[fmt_type]
    n_fields = len(dtypes)

    # Read file and filter comment lines
    if isinstance(file, (str, Path)):
        with open(file) as f:
            lines = [line for line in f if not line.startswith("#")]
    else:
        lines = []
        for line in file:
            # Convert bytes to str if needed
            if isinstance(line, bytes):
                line = line.decode()
            if not line.startswith("#"):
                lines.append(line)
        # Reset file position for potential reuse; streams that cannot seek
        # (pipes, plain iterables) are simply left where they are.
        try:
            file.seek(0)
        except (AttributeError, OSError, ValueError):
            pass

    # Split each row into the fixed columns plus an optional free-text tail.
    data = []
    descriptions = []
    has_description = False
    for line in lines:
        # maxsplit keeps the whole description (spaces included) as one item
        fields = line.strip().split(maxsplit=n_fields)
        if len(fields) < n_fields:
            continue
        data.append(fields[:n_fields])
        # Always record one entry per row so the list stays aligned with
        # `data` even when only some rows have a description.
        if len(fields) > n_fields:
            descriptions.append(fields[n_fields])
            has_description = True
        else:
            descriptions.append(pd.NA)

    # Create DataFrame
    df = pd.DataFrame(data, columns=list(dtypes))

    # Convert types
    for col, dtype in dtypes.items():
        values = df[col].replace("-", pd.NA)
        if dtype is str:
            df[col] = values
            continue
        numeric = pd.to_numeric(values, errors="coerce")
        target = "Int64" if dtype == "Int64" else "float64"
        try:
            # Apply the declared dtype; to_numeric alone infers int64 or
            # float64 depending on the data it happens to see.
            numeric = numeric.astype(target)
        except (TypeError, ValueError):
            # E.g. a non-integral value in an integer column: keep the
            # best-effort numeric result rather than failing the parse.
            pass
        df[col] = numeric

    # Add descriptions (same handling for both formats)
    if has_description:
        df["description"] = descriptions

    return df


 def read_tblout(file: Union[str, Path]) -> pd.DataFrame:
    """Load a HMMER per-target table (tblout) as a DataFrame.

    Thin convenience wrapper that calls parse_hmmer_output with the format
    fixed to "tblout", so no format auto-detection takes place.

    Args:
        file: Location of the tblout file

    Returns:
        The DataFrame produced by parse_hmmer_output
    """
    table = parse_hmmer_output(file, fmt_type="tblout")
    return table


 def read_domtblout(file: Union[str, Path]) -> pd.DataFrame:
    """Load a HMMER per-domain table (domtblout) as a DataFrame.

    Thin convenience wrapper that calls parse_hmmer_output with the format
    fixed to "domtblout", so no format auto-detection takes place.

    Args:
        file: Location of the domtblout file

    Returns:
        The DataFrame produced by parse_hmmer_output
    """
    table = parse_hmmer_output(file, fmt_type="domtblout")
    return table


 def main():
    """Command-line entry point: parse a HMMER table and write it out as TSV.

    Reads the input path, the format and the output destination from the
    command line. The table goes to stdout when the output is "-", otherwise
    to the named file. Any exception raised while parsing or writing is
    logged and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Parse HMMER tblout/domtblout files to TSV"
    )
    cli.add_argument("file", help="Input HMMER output file")
    cli.add_argument(
        "--format",
        "-f",
        choices=["tblout", "domtblout", "auto"],
        default="auto",
        help="HMMER output format (default: auto-detect)",
    )
    cli.add_argument(
        "--output", "-o", default="-", help="Output file (default: stdout)"
    )
    options = cli.parse_args()

    # Log to stderr so messages never mix with TSV written to stdout
    logging.basicConfig(
        level=logging.INFO, format="%(levelname)s: %(message)s", stream=sys.stderr
    )

    try:
        table = parse_hmmer_output(options.file, options.format)
        destination = sys.stdout if options.output == "-" else options.output
        table.to_csv(destination, sep="\t", index=False)
    except Exception as exc:
        logging.error(f"Error processing file: {exc}")
        sys.exit(1)


 # Run the command-line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# MIT License
	# Copyright (c) 2022 Zebulun Arendsee (rhmmer code)
	# Copyright (c) 2025 Andrew Perry (port to Python)

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	"""
	This is a quick LLM-guided port using parts of https://github.com/arendsee/rhmmer
	There are alternatives, like parsing with Bio.SearchIO.HmmerIO or using PyHMMER.
	This one is a no nonsense tblout/domtblout to a pandas.DataFrame converter.

	>>> hits_df = parse_hmmer_output("my_hits.tblout")
	>>> hits_df.head()
	"""

	from typing import Literal, Dict, Any, Union, TextIO
	import logging
	import sys
	import argparse
	from pathlib import Path

	import pandas as pd
	import numpy as np


	def detect_hmmer_format(
	    file: Union[str, Path, "TextIO"]
	) -> Literal["tblout", "domtblout"]:
	    """Detect if a file is in HMMER tblout or domtblout format based on header.

	    The header line containing "--- full sequence ---" identifies the format:
	    it additionally contains "this domain" for domtblout and "best 1 domain"
	    for tblout.

	    Args:
	        file: Path to HMMER output file or file-like object yielding text or
	            bytes lines. A file-like object is rewound to position 0 before
	            this function returns or raises (when it supports seeking), so
	            it can be read again afterwards.

	    Returns:
	        Format type: either "tblout" or "domtblout"

	    Raises:
	        ValueError: If format cannot be detected
	    """
	    if isinstance(file, (str, Path)):
	        with open(file) as f:
	            return detect_hmmer_format(f)

	    try:
	        for line in file:
	            if isinstance(line, bytes):
	                line = line.decode()
	            if "--- full sequence ---" not in line:
	                continue
	            if "this domain" in line:
	                return "domtblout"
	            if "best 1 domain" in line:
	                return "tblout"
	    finally:
	        # Rewind on every exit path, not only on failure, so the caller gets
	        # the stream back at its start. Streams that cannot seek (pipes,
	        # closed files, plain iterables) are left as they are.
	        try:
	            file.seek(0)
	        except (AttributeError, OSError, ValueError):
	            pass
	    raise ValueError(
	        "Could not detect HMMER output format - file may be empty or invalid"
	    )


	def parse_hmmer_output(
	    file: Union[str, Path, "TextIO"],
	    fmt_type: Union[Literal["tblout", "domtblout"], Literal["auto"]] = "auto",
	) -> pd.DataFrame:
	    """Parse HMMER tblout or domtblout format files into a pandas DataFrame.

	    Lines starting with "#" are ignored, as are lines with fewer
	    whitespace-separated fields than the format defines. A "-" in any fixed
	    column is treated as missing. Whatever follows the last fixed column is
	    kept verbatim as that row's description.

	    Args:
	        file: Path to HMMER output file or file-like object yielding text or
	            bytes lines. A file-like object is rewound to position 0 after
	            reading when it supports seeking.
	        fmt_type: Format type, either "tblout", "domtblout" or "auto" (default)

	    Returns:
	        DataFrame containing the parsed HMMER results. Numeric columns are
	        cast to float64 or nullable Int64 as declared in the column
	        specification. A "description" column is present when at least one
	        row carries description text; rows without one hold a missing value.

	    Raises:
	        ValueError: If fmt_type is not a supported value, or if
	            auto-detection cannot identify the format.
	    """
	    if fmt_type == "auto":
	        fmt_type = detect_hmmer_format(file)
	        logging.info(f"Auto-detected format: {fmt_type}")

	    # Column specifications for different formats
	    tblout_dtypes: Dict[str, Any] = {
	        "domain_name": str,
	        "domain_accession": str,
	        "query_name": str,
	        "query_accession": str,
	        "sequence_evalue": float,
	        "sequence_score": float,
	        "sequence_bias": float,
	        "best_domain_evalue": float,
	        "best_domain_score": float,
	        # NOTE(review): "bis" looks like a typo for "bias"; the key is kept
	        # unchanged because it is a public column name of the result.
	        "best_domain_bis": float,
	        "domain_number_exp": float,
	        "domain_number_reg": "Int64",
	        "domain_number_clu": "Int64",
	        "domain_number_ov": "Int64",
	        "domain_number_env": "Int64",
	        "domain_number_dom": "Int64",
	        "domain_number_rep": "Int64",
	        "domain_number_inc": str,
	    }

	    domtblout_dtypes: Dict[str, Any] = {
	        "domain_name": str,
	        "domain_accession": str,
	        "domain_len": "Int64",
	        "query_name": str,
	        "query_accession": str,
	        "qlen": "Int64",
	        "sequence_evalue": float,
	        "sequence_score": float,
	        "sequence_bias": float,
	        "domain_N": "Int64",
	        "domain_of": "Int64",
	        "domain_cevalue": float,
	        "domain_ievalue": float,
	        "domain_score": float,
	        "domain_bias": float,
	        "hmm_from": "Int64",
	        "hmm_to": "Int64",
	        "ali_from": "Int64",
	        "ali_to": "Int64",
	        "env_from": "Int64",
	        "env_to": "Int64",
	        "acc": float,
	    }

	    column_specs = {"tblout": tblout_dtypes, "domtblout": domtblout_dtypes}
	    if fmt_type not in column_specs:
	        # Previously any unknown value was silently parsed as domtblout.
	        raise ValueError(
	            f"Unknown fmt_type {fmt_type!r}; expected 'tblout', 'domtblout' or 'auto'"
	        )
	    dtypes = column_specs[fmt_type]
	    n_fields = len(dtypes)

	    # Read file and filter comment lines
	    if isinstance(file, (str, Path)):
	        with open(file) as f:
	            lines = [line for line in f if not line.startswith("#")]
	    else:
	        lines = []
	        for line in file:
	            # Convert bytes to str if needed
	            if isinstance(line, bytes):
	                line = line.decode()
	            if not line.startswith("#"):
	                lines.append(line)
	        # Reset file position for potential reuse; streams that cannot seek
	        # (pipes, plain iterables) are simply left where they are.
	        try:
	            file.seek(0)
	        except (AttributeError, OSError, ValueError):
	            pass

	    # Split each row into the fixed columns plus an optional free-text tail.
	    data = []
	    descriptions = []
	    has_description = False
	    for line in lines:
	        # maxsplit keeps the whole description (spaces included) as one item
	        fields = line.strip().split(maxsplit=n_fields)
	        if len(fields) < n_fields:
	            continue
	        data.append(fields[:n_fields])
	        # Always record one entry per row so the list stays aligned with
	        # `data` even when only some rows have a description.
	        if len(fields) > n_fields:
	            descriptions.append(fields[n_fields])
	            has_description = True
	        else:
	            descriptions.append(pd.NA)

	    # Create DataFrame
	    df = pd.DataFrame(data, columns=list(dtypes))

	    # Convert types
	    for col, dtype in dtypes.items():
	        values = df[col].replace("-", pd.NA)
	        if dtype is str:
	            df[col] = values
	            continue
	        numeric = pd.to_numeric(values, errors="coerce")
	        target = "Int64" if dtype == "Int64" else "float64"
	        try:
	            # Apply the declared dtype; to_numeric alone infers int64 or
	            # float64 depending on the data it happens to see.
	            numeric = numeric.astype(target)
	        except (TypeError, ValueError):
	            # E.g. a non-integral value in an integer column: keep the
	            # best-effort numeric result rather than failing the parse.
	            pass
	        df[col] = numeric

	    # Add descriptions (same handling for both formats)
	    if has_description:
	        df["description"] = descriptions

	    return df


	def read_tblout(file: Union[str, Path]) -> pd.DataFrame:
	    """Read a HMMER tblout format file.

	    Calls parse_hmmer_output with the format fixed to "tblout", so no
	    format auto-detection takes place.

	    Args:
	        file: Path to the tblout file

	    Returns:
	        DataFrame containing the parsed results
	    """
	    return parse_hmmer_output(file, "tblout")


	def read_domtblout(file: Union[str, Path]) -> pd.DataFrame:
	    """Read a HMMER domtblout format file.

	    Calls parse_hmmer_output with the format fixed to "domtblout", so no
	    format auto-detection takes place.

	    Args:
	        file: Path to the domtblout file

	    Returns:
	        DataFrame containing the parsed results
	    """
	    return parse_hmmer_output(file, "domtblout")


	def main():
	    """Command-line entry point: parse a HMMER table and write it out as TSV.

	    Reads the input path, the format and the output destination from the
	    command line. The table goes to stdout when the output is "-", otherwise
	    to the named file. Any exception raised while parsing or writing is
	    logged and the process exits with status 1.
	    """
	    parser = argparse.ArgumentParser(
	        description="Parse HMMER tblout/domtblout files to TSV"
	    )
	    parser.add_argument("file", help="Input HMMER output file")
	    parser.add_argument(
	        "--format",
	        "-f",
	        choices=["tblout", "domtblout", "auto"],
	        default="auto",
	        help="HMMER output format (default: auto-detect)",
	    )
	    parser.add_argument(
	        "--output", "-o", default="-", help="Output file (default: stdout)"
	    )
	    args = parser.parse_args()

	    # Setup logging on stderr so messages never mix with TSV on stdout
	    logging.basicConfig(
	        level=logging.INFO, format="%(levelname)s: %(message)s", stream=sys.stderr
	    )

	    try:
	        df = parse_hmmer_output(args.file, args.format)
	        if args.output == "-":
	            df.to_csv(sys.stdout, sep="\t", index=False)
	        else:
	            df.to_csv(args.output, sep="\t", index=False)
	    except Exception as e:
	        logging.error(f"Error processing file: {e}")
	        sys.exit(1)


	# Run the command-line interface only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()