kartben · March 6, 2025 09:26
diff --git a/cleanup-long-dts-bindings-descs.py b/cleanup-long-dts-bindings-descs.py
 #!/usr/bin/env python3

 import os
 import yaml
 import sys
 import argparse
 from pathlib import Path
 import statistics
 import re

 def get_first_sentence(text):
    """Extract the first sentence from a text block (typically a node description).

    The first paragraph is everything up to the first blank line. Within that
    paragraph, runs of whitespace (including line breaks) are collapsed to a
    single space, and the sentence ends at the first period that is followed
    by whitespace or the end of the paragraph.

    Args:
        text: The text to extract the first sentence from.

    Returns:
        The first sentence without its trailing period, or the whole first
        paragraph if it contains no sentence boundary. Returns "" for empty
        or whitespace-only input.
    """
    if not text:
        return ""

    # Paragraphs are separated by blank lines. Split on those explicitly
    # (after trimming leading/trailing blank lines) rather than on double
    # spaces, so trailing whitespace before a line break or a stray double
    # space inside a sentence does not cut the sentence short.
    paragraphs = re.split(r'\n\s*\n', text.strip())
    first_paragraph = ' '.join(paragraphs[0].split())

    # A period only ends the sentence when followed by whitespace or the end
    # of the paragraph, so tokens such as "v1.2" are not treated as boundaries.
    period_match = re.search(r'(.*?)\.(?:\s|$)', first_paragraph)
    if period_match:
        return period_match.group(1).strip()

    # No sentence boundary: the whole first paragraph is the "sentence".
    return first_paragraph

 def find_descriptions(bindings_dir, threshold=None):
    """
    Recursively search through binding files and collect description information.

    Every ``.yaml`` file under ``bindings_dir`` is parsed with
    ``yaml.safe_load``. Files whose top level is a mapping with a string
    ``description`` entry contribute one result tuple. Files that cannot be
    read or parsed are skipped with a warning on stderr.

    Args:
        bindings_dir: Path to the bindings directory
        threshold: Optional character count threshold for filtering "long"
            descriptions. When None, every description is returned; otherwise
            only those whose first sentence is strictly longer than threshold.

    Returns:
        List of tuples (file_path, description_length, first_line, full_description)
        where file_path is relative to bindings_dir and description_length is
        the length of the first sentence.
    """
    descriptions = []

    # Walk through all directories under bindings_dir
    for root, dirs, files in os.walk(bindings_dir):
        # Sorting in place makes the traversal (and therefore the order of
        # equal-length results) deterministic across platforms.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.yaml'):
                continue

            file_path = os.path.join(root, file_name)

            try:
                # Read as UTF-8 explicitly instead of relying on the locale.
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = yaml.safe_load(f)
            except (OSError, ValueError, yaml.YAMLError) as e:
                # Best effort: report the unreadable/invalid file and move on.
                # ValueError covers UnicodeDecodeError.
                print(f"Warning: skipping {file_path}: {e}", file=sys.stderr)
                continue

            # Only mapping documents can carry a top-level description key.
            if not isinstance(content, dict):
                continue

            description = content.get('description')
            if not isinstance(description, str):
                continue

            first_sentence = get_first_sentence(description)
            length = len(first_sentence)

            if threshold is None or length > threshold:
                rel_path = os.path.relpath(file_path, bindings_dir)
                descriptions.append((rel_path, length, first_sentence, description))

    return descriptions

 def main(argv=None):
    """Command-line entry point: report binding files with long descriptions.

    Parses the command line, collects descriptions via find_descriptions(),
    sorts them longest first and prints a report (or writes it to the file
    given with --output).

    Args:
        argv: Optional list of command-line arguments, excluding the program
            name. When None, argparse reads sys.argv[1:].

    Exits with status 1 if the bindings directory is not a directory.
    """
    parser = argparse.ArgumentParser(description='Find binding files with long descriptions')
    parser.add_argument('bindings_dir', help='Path to the bindings directory')
    parser.add_argument('-t', '--threshold', type=int, default=80,
                        help='Character count threshold for considering a description "long" (default: 80)')
    parser.add_argument('-a', '--all', action='store_true',
                        help='Show all descriptions, not just long ones')
    parser.add_argument('-f', '--full', action='store_true',
                        help='Show full descriptions, not just the first line')
    parser.add_argument('-o', '--output', help='Output file to write results to')
    parser.add_argument('-s', '--stats', action='store_true',
                        help='Show statistics about description lengths')

    args = parser.parse_args(argv)

    if not os.path.isdir(args.bindings_dir):
        # Diagnostics go to stderr so they never mix with the report output.
        print(f"Error: {args.bindings_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Get all descriptions if --all is specified, otherwise only those exceeding threshold
    threshold = None if args.all else args.threshold
    descriptions = find_descriptions(args.bindings_dir, threshold)

    # Sort by description length (longest first)
    descriptions.sort(key=lambda x: x[1], reverse=True)

    # Prepare output
    output_lines = []

    if not descriptions:
        # Distinguish "nothing has a description" from "nothing is too long".
        if args.all:
            message = "No binding files with descriptions found."
        else:
            message = f"No binding files with descriptions longer than {args.threshold} characters found."
        output_lines.append(message)
    else:
        if args.all:
            message = f"Found {len(descriptions)} binding files with descriptions:"
        else:
            message = f"Found {len(descriptions)} binding files with descriptions longer than {args.threshold} characters:"
        output_lines.append(message)

        # Add statistics if requested (computed over the listed entries only)
        if args.stats:
            lengths = [length for _, length, _, _ in descriptions]
            output_lines.append("\nStatistics:")
            output_lines.append(f"  Min length: {min(lengths)}")
            output_lines.append(f"  Max length: {max(lengths)}")
            output_lines.append(f"  Average length: {sum(lengths) / len(lengths):.2f}")
            output_lines.append(f"  Median length: {statistics.median(lengths):.2f}")
            # The sample standard deviation needs at least two data points.
            if len(lengths) > 1:
                output_lines.append(f"  Standard deviation: {statistics.stdev(lengths):.2f}")

        output_lines.append("\nFormat: LENGTH | FILE | DESCRIPTION")
        output_lines.append("-" * 100)

        for file_path, length, first_line, full_description in descriptions:
            # Determine which description to display
            if args.full:
                display_desc = full_description
            elif len(first_line) > 100:
                # Truncate very long descriptions for display
                display_desc = first_line[:100] + "..."
            else:
                display_desc = first_line

            output_lines.append(f"{length:4d} | {file_path:50s} | {display_desc}")

    # Output results
    output_text = "\n".join(output_lines)

    if args.output:
        # Explicit encoding keeps non-ASCII descriptions portable; the final
        # newline makes the result a well-formed text file.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output_text + "\n")
        print(f"Results written to {args.output}")
    else:
        print(output_text)

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	import os
	import yaml
	import sys
	import argparse
	from pathlib import Path
	import statistics
	import re

	def get_first_sentence(text):
	    """Extract the first sentence from a text block (typically a node description).

	    The first paragraph is everything up to the first blank line. Within that
	    paragraph, runs of whitespace (including line breaks) are collapsed to a
	    single space, and the sentence ends at the first period that is followed
	    by whitespace or the end of the paragraph.

	    Args:
	        text: The text to extract the first sentence from.

	    Returns:
	        The first sentence without its trailing period, or the whole first
	        paragraph if it contains no sentence boundary. Returns "" for empty
	        or whitespace-only input.
	    """
	    if not text:
	        return ""

	    # Paragraphs are separated by blank lines. Split on those explicitly
	    # (after trimming leading/trailing blank lines) rather than on spaces,
	    # so ordinary word spacing never cuts the sentence short.
	    paragraphs = re.split(r'\n\s*\n', text.strip())
	    first_paragraph = ' '.join(paragraphs[0].split())

	    # A period only ends the sentence when followed by whitespace or the end
	    # of the paragraph, so tokens such as "v1.2" are not treated as boundaries.
	    period_match = re.search(r'(.*?)\.(?:\s|$)', first_paragraph)
	    if period_match:
	        return period_match.group(1).strip()

	    # No sentence boundary: the whole first paragraph is the "sentence".
	    return first_paragraph

	def find_descriptions(bindings_dir, threshold=None):
	    """
	    Recursively search through binding files and collect description information.

	    Every ``.yaml`` file under ``bindings_dir`` is parsed with
	    ``yaml.safe_load``. Files whose top level is a mapping with a string
	    ``description`` entry contribute one result tuple. Files that cannot be
	    read or parsed are skipped with a warning on stderr.

	    Args:
	        bindings_dir: Path to the bindings directory
	        threshold: Optional character count threshold for filtering "long"
	            descriptions. When None, every description is returned; otherwise
	            only those whose first sentence is strictly longer than threshold.

	    Returns:
	        List of tuples (file_path, description_length, first_line, full_description)
	        where file_path is relative to bindings_dir and description_length is
	        the length of the first sentence.
	    """
	    descriptions = []

	    # Walk through all directories under bindings_dir
	    for root, dirs, files in os.walk(bindings_dir):
	        # Sorting in place makes the traversal (and therefore the order of
	        # equal-length results) deterministic across platforms.
	        dirs.sort()
	        for file_name in sorted(files):
	            if not file_name.endswith('.yaml'):
	                continue

	            file_path = os.path.join(root, file_name)

	            try:
	                # Read as UTF-8 explicitly instead of relying on the locale.
	                with open(file_path, 'r', encoding='utf-8') as f:
	                    content = yaml.safe_load(f)
	            except (OSError, ValueError, yaml.YAMLError) as e:
	                # Best effort: report the unreadable/invalid file and move on.
	                # ValueError covers UnicodeDecodeError.
	                print(f"Warning: skipping {file_path}: {e}", file=sys.stderr)
	                continue

	            # Only mapping documents can carry a top-level description key.
	            if not isinstance(content, dict):
	                continue

	            description = content.get('description')
	            if not isinstance(description, str):
	                continue

	            first_sentence = get_first_sentence(description)
	            length = len(first_sentence)

	            if threshold is None or length > threshold:
	                rel_path = os.path.relpath(file_path, bindings_dir)
	                descriptions.append((rel_path, length, first_sentence, description))

	    return descriptions

	def main(argv=None):
	    """Command-line entry point: report binding files with long descriptions.

	    Parses the command line, collects descriptions via find_descriptions(),
	    sorts them longest first and prints a report (or writes it to the file
	    given with --output).

	    Args:
	        argv: Optional list of command-line arguments, excluding the program
	            name. When None, argparse reads sys.argv[1:].

	    Exits with status 1 if the bindings directory is not a directory.
	    """
	    parser = argparse.ArgumentParser(description='Find binding files with long descriptions')
	    parser.add_argument('bindings_dir', help='Path to the bindings directory')
	    parser.add_argument('-t', '--threshold', type=int, default=80,
	                        help='Character count threshold for considering a description "long" (default: 80)')
	    parser.add_argument('-a', '--all', action='store_true',
	                        help='Show all descriptions, not just long ones')
	    parser.add_argument('-f', '--full', action='store_true',
	                        help='Show full descriptions, not just the first line')
	    parser.add_argument('-o', '--output', help='Output file to write results to')
	    parser.add_argument('-s', '--stats', action='store_true',
	                        help='Show statistics about description lengths')

	    args = parser.parse_args(argv)

	    if not os.path.isdir(args.bindings_dir):
	        # Diagnostics go to stderr so they never mix with the report output.
	        print(f"Error: {args.bindings_dir} is not a valid directory", file=sys.stderr)
	        sys.exit(1)

	    # Get all descriptions if --all is specified, otherwise only those exceeding threshold
	    threshold = None if args.all else args.threshold
	    descriptions = find_descriptions(args.bindings_dir, threshold)

	    # Sort by description length (longest first)
	    descriptions.sort(key=lambda x: x[1], reverse=True)

	    # Prepare output
	    output_lines = []

	    if not descriptions:
	        # Distinguish "nothing has a description" from "nothing is too long".
	        if args.all:
	            message = "No binding files with descriptions found."
	        else:
	            message = f"No binding files with descriptions longer than {args.threshold} characters found."
	        output_lines.append(message)
	    else:
	        if args.all:
	            message = f"Found {len(descriptions)} binding files with descriptions:"
	        else:
	            message = f"Found {len(descriptions)} binding files with descriptions longer than {args.threshold} characters:"
	        output_lines.append(message)

	        # Add statistics if requested (computed over the listed entries only)
	        if args.stats:
	            lengths = [length for _, length, _, _ in descriptions]
	            output_lines.append("\nStatistics:")
	            output_lines.append(f"  Min length: {min(lengths)}")
	            output_lines.append(f"  Max length: {max(lengths)}")
	            output_lines.append(f"  Average length: {sum(lengths) / len(lengths):.2f}")
	            output_lines.append(f"  Median length: {statistics.median(lengths):.2f}")
	            # The sample standard deviation needs at least two data points.
	            if len(lengths) > 1:
	                output_lines.append(f"  Standard deviation: {statistics.stdev(lengths):.2f}")

	        output_lines.append("\nFormat: LENGTH | FILE | DESCRIPTION")
	        output_lines.append("-" * 100)

	        for file_path, length, first_line, full_description in descriptions:
	            # Determine which description to display
	            if args.full:
	                display_desc = full_description
	            elif len(first_line) > 100:
	                # Truncate very long descriptions for display
	                display_desc = first_line[:100] + "..."
	            else:
	                display_desc = first_line

	            output_lines.append(f"{length:4d} | {file_path:50s} | {display_desc}")

	    # Output results
	    output_text = "\n".join(output_lines)

	    if args.output:
	        # Explicit encoding keeps non-ASCII descriptions portable; the final
	        # newline makes the result a well-formed text file.
	        with open(args.output, 'w', encoding='utf-8') as f:
	            f.write(output_text + "\n")
	        print(f"Results written to {args.output}")
	    else:
	        print(output_text)

	if __name__ == "__main__":
	    main()