freelze · February 1, 2025 15:16
diff --git a/compare_txt_using_diff_match_patch.py b/compare_txt_using_diff_match_patch.py
 import diff_match_patch
 import re
 from pathlib import Path
 from typing import List, Tuple

 # --- Configuration Variables (Text Colors) ---
 # Foreground colors interpolated into the inline styles emitted by
 # DiffHighlighter._generate_comparison_html.
 TEXT1_DELETION_COLOR = "#ff0000"  # Red: used for DIFF_DELETE segments
 TEXT2_INSERTION_COLOR = "#008000"  # Green: used for DIFF_INSERT segments


 def get_output_filename(*filenames: str) -> str:
    """Build the output HTML filename by joining the stems of the inputs with "_"."""
    joined_stems = "_".join(Path(name).stem for name in filenames)
    return joined_stems + ".html"

 class DiffHighlighter:
    """Turn diff_match_patch diffs between lines into inline-styled HTML."""

    def __init__(self):
        # A single diff engine instance is reused for every comparison.
        self.dmp = diff_match_patch.diff_match_patch()

    def process_line_diffs(self, standard_line: str, other_lines: List[Tuple[int, str]]) -> List[Tuple[int, str]]:
        """
        Diff a standard line against each of several other lines and
        render every comparison as HTML.

        Args:
            standard_line: The line from the standard file to compare against.
            other_lines: A list of tuples, each containing an index that is
                         passed through untouched and the line content to
                         compare with the standard line.

        Returns:
            List of tuples, each containing the index taken from the input
            tuple and the HTML string with differences highlighted.
        """
        htmls = []
        for other_index, other_line in other_lines:
            diffs = self.dmp.diff_main(standard_line, other_line)
            # Called for its effect on `diffs`; the return value is not used.
            self.dmp.diff_cleanupSemanticLossless(diffs)
            htmls.append((other_index, self._generate_comparison_html(diffs)))
        return htmls

    def _generate_comparison_html(self, diffs) -> str:
        """
        Render (op, text) diff tuples as one HTML fragment.

        Equal text is emitted as-is, deletions are wrapped in a red
        ``<span>`` and insertions in a green ``<ins>``. The text of every
        segment is HTML-escaped first so that ``&``, ``<`` and ``>`` in the
        compared files show up literally instead of being parsed as markup.
        Tuples with any other op are skipped.
        """
        # Imported here so the method does not rely on a module-level import.
        from html import escape

        dmp_cls = diff_match_patch.diff_match_patch
        fragments = []
        for op, data in diffs:
            text = escape(data, quote=False)
            if op == dmp_cls.DIFF_EQUAL:
                fragments.append(text)
            elif op == dmp_cls.DIFF_DELETE:
                fragments.append(f"<span style=\"background-color:#ffdddd;color:{TEXT1_DELETION_COLOR};\">{text}</span>")
            elif op == dmp_cls.DIFF_INSERT:
                fragments.append(f"<ins style=\"background-color:#e6ffe6;color:{TEXT2_INSERTION_COLOR};\">{text}</ins>")
        # Join once instead of growing a string with repeated +=.
        return "".join(fragments)

 def split_markdown_blocks(text: str) -> dict:
    """Map each ``###`` header line to the text that follows it.

    Any line starting with ``###`` is a header; its stripped text is the
    key and the lines up to the next header, joined with newlines, are the
    value. Lines before the first header are discarded. When a header
    repeats, the later block replaces the earlier one.
    """
    sections = {}
    header = None
    body = []

    for raw_line in text.splitlines():
        if not raw_line.startswith('###'):
            # Content only counts once a header has been seen.
            if header is not None:
                body.append(raw_line)
            continue

        # A new header closes the block that was being collected.
        if header is not None:
            sections[header] = '\n'.join(body)
        header = raw_line.strip()
        body = []

    # Flush the final block, which no later header closes.
    if header is not None:
        sections[header] = '\n'.join(body)

    return sections

 def highlight_differences_line_by_line(texts: List[str], filenames: List[str], standard_file_index: int = 0) -> List[str]:
    """
    Compare several markdown texts line by line within each ``###`` block
    and return the result as a flat list of HTML fragments.

    Each text is split into blocks with split_markdown_blocks. Within a
    block only lines of the form ``<digits>. <text>`` are considered; lines
    sharing the same number are grouped together, the one from the standard
    file is shown verbatim and every other file's line is shown as a diff
    against it. Every emitted line ends with the originating filename in
    parentheses.

    Header text, the standard line and the filenames are HTML-escaped
    before being placed in the output. The fragments returned by
    DiffHighlighter.process_line_diffs are inserted unchanged.

    Args:
        texts: List of markdown texts.
        filenames: List of filenames corresponding to the texts.
        standard_file_index: Index of the file to be used as the standard
                             for comparison.

    Returns:
        List of HTML strings: an ``<h4>`` per header (headers in sorted
        order), the highlighted lines of that block ordered by line number
        and then file index, and an ``<hr>`` after each block.

    Raises:
        IndexError: If standard_file_index or a file index has no matching
            entry in the per-file lists once a numbered line is processed.
    """
    # Imported here so the function does not rely on a module-level import.
    from html import escape

    highlighter = DiffHighlighter()

    # Compiled once instead of once per block and file.
    numbered_line_pattern = re.compile(r'^(\d+\.) (.*)$', re.MULTILINE)

    # Stand-in for a file that lacks a given numbered line. A real
    # non-breaking space is used: an ordinary space would match the space
    # inside "N. text" and produce a misleading partial diff.
    missing_line = "\u00a0"

    # str() keeps non-string path-like entries working, as f-string
    # formatting did before.
    safe_filenames = [escape(str(name), quote=False) for name in filenames]

    # Split all texts into blocks
    blocks_list = [split_markdown_blocks(text) for text in texts]

    # Get all unique headers
    all_headers = sorted(set().union(*blocks_list))

    combined_html_blocks = []

    # Process each block
    for header in all_headers:
        combined_html_blocks.append(f"<h4>{escape(header, quote=False)}</h4>")

        # Content of this block in every text ('' where the header is absent)
        contents = [blocks.get(header, '') for blocks in blocks_list]

        if any(contents):
            # Per file: number label ("12.") -> line text; a repeated label
            # keeps its last occurrence.
            lines_dicts = [dict(numbered_line_pattern.findall(content)) for content in contents]

            # All labels seen in any file, in numeric order
            all_numbers = sorted(set().union(*lines_dicts), key=lambda label: int(label.split('.')[0]))

            ordered_lines = []

            for num in all_numbers:
                line_number = int(num.split('.')[0])

                # (file index, full numbered line or placeholder) for every file
                indexed_lines = []
                for file_index, lines_dict in enumerate(lines_dicts):
                    line = lines_dict.get(num, '')
                    indexed_lines.append((file_index, f"{num} {line}" if line else missing_line))

                # Use the line from the standard file for comparison
                standard_line = indexed_lines[standard_file_index][1]
                other_lines = [(index, line) for index, line in indexed_lines if index != standard_file_index]
                compared_lines = highlighter.process_line_diffs(standard_line, other_lines)

                # The standard line is plain text and must be escaped here;
                # the compared lines already are HTML fragments.
                rendered = [(standard_file_index, escape(standard_line, quote=False))]
                rendered.extend(compared_lines)

                for file_index, fragment in rendered:
                    html_with_filename = f"{fragment} ({safe_filenames[file_index]})"
                    ordered_lines.append((line_number, file_index, f"<span style=\"display: block;\">{html_with_filename}</span>"))

            # Sort by line number, then by file index
            ordered_lines.sort()

            combined_html_blocks.extend(line_html for _, _, line_html in ordered_lines)

        # Add separator between blocks
        combined_html_blocks.append("<hr>")

    return combined_html_blocks

 def generate_diff_html(texts: List[str], output_path: str, filenames: List[str], standard_file_index: int, line_by_line: bool = True):
    """
    Write an HTML report showing the differences between several texts.

    Args:
        texts: List of texts to compare.
        output_path: Path of the HTML file to write (UTF-8).
        filenames: List of filenames corresponding to the texts; their
            stems are listed in the report heading.
        standard_file_index: Index of the file to be used as the standard for comparison.
        line_by_line: Must be True; line-by-line comparison is the only mode.

    Raises:
        ValueError: If line_by_line is False.
    """
    # Only the line-by-line mode exists, so reject anything else up front.
    if not line_by_line:
        raise ValueError("Block-by-block comparison is not supported for multiple files.")

    rendered_blocks = highlight_differences_line_by_line(texts, filenames, standard_file_index)

    # Doubled braces are literal CSS braces; {filenames} and {content} are
    # the two str.format placeholders.
    page_template = """<!DOCTYPE html>
 <html>
 <head>
    <title>Diff Comparison</title>
    <meta charset="UTF-8">
    <style>
        .container {{
            display: block;
        }}
        .section {{
            padding: 10px;
        }}
        hr {{
            border: 1px solid #ccc;
            margin: 20px 0;
        }}
        h4 {{
            background-color: #f0f0f0;
            padding: 5px;
            margin: 10px 0;
        }}
        .section-header {{
            padding: 10px;
            border-bottom: 2px solid #ccc;
        }}
    </style>
 </head>
 <body>
    <div class="container">
        <div class="section">
            <h3 class="section-header">Comparison: {filenames}</h3>
            {content}
        </div>
    </div>
 </body>
 </html>"""

    # Heading lists the stem of every input name.
    display_names = ", ".join(Path(name).stem for name in filenames)
    page = page_template.format(
        content="\n".join(rendered_blocks),
        filenames=display_names,
    )

    with open(output_path, "w", encoding="utf-8") as report:
        report.write(page)

 def get_ref_filename(output_path: str) -> str:
    """Return the part of the output path that precedes the first ``_compare_``."""
    # Expected shape: ref_compare_comparison.html
    reference_part, _, _ = output_path.partition('_compare_')
    return reference_part

 def get_comparison_filename(output_path: str) -> str:
    """Return the segment after the first ``_compare_`` with every ``.html`` removed."""
    # Expected shape: ref_compare_comparison.html
    segments = output_path.split('_compare_')
    comparison_part = segments[1]  # IndexError when the marker is absent
    return comparison_part.replace('.html', '')

 def try_read_file(file_path: str) -> str:
    """
    Read a text file, trying a fixed list of encodings in order.

    Args:
        file_path: Path to the file

    Returns:
        File contents decoded with the first encoding that succeeds.

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the file.
            The error is chained to the failure of the last encoding tried.
    """
    encodings = ['utf-8', 'shift-jis', 'gbk', 'big5', 'cp932']
    last_error = None

    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError as error:
            # Remember the failure and fall through to the next encoding.
            last_error = error

    # UnicodeDecodeError takes five positional arguments (encoding, object,
    # start, end, reason). Building it from a single message string raises
    # TypeError, so reuse the details of the last failure and put the
    # summary message in `reason`.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Failed to read {file_path} with encodings: {encodings}",
    ) from last_error

 def compare_multiple_files(folder_path: str, output_path: str, standard_file_index: int = 0, line_by_line: bool = True) -> None:
    """
    Compare multiple txt files in a folder and generate a combined HTML output.

    Progress and problems are reported with print(); any exception raised
    while processing is caught and printed as "Error: ..." rather than
    propagated.

    Args:
        folder_path: Path to the folder containing txt files.
        output_path: Path to save the output HTML file.
        standard_file_index: Index, into the name-sorted list of txt files,
            of the file to be used as the standard for comparison. Must
            satisfy 0 <= index < number of files.
        line_by_line: If True, use line-by-line comparison within blocks.
    """
    try:
        folder = Path(folder_path).resolve()
        print(f"Processing folder: {folder}")

        txt_files = sorted(folder.glob("*.txt"))  # Sort files for consistent ordering

        if not txt_files:
            print(f"No txt files found in {folder_path}")
            return

        print(f"\nFound {len(txt_files)} txt files to compare:")
        for f in txt_files:
            print(f"  - {f.name}")

        # Validate before doing any work. A too-large index would otherwise
        # surface as a bare "list index out of range"; a negative index
        # would select a file that the comparison step, which excludes the
        # standard file by equality on the index, then compares with itself.
        if not 0 <= standard_file_index < len(txt_files):
            print(f"Error: standard_file_index {standard_file_index} is out of range for {len(txt_files)} txt files")
            return

        texts = [try_read_file(str(txt_file)) for txt_file in txt_files]
        filenames = [txt_file.stem for txt_file in txt_files]  # Use stem to get filename without extension

        generate_diff_html(texts, output_path, filenames, standard_file_index, line_by_line)

        print(f"Generated comparison: {output_path}")

    except Exception as e:
        # Top-level boundary of the tool: report the problem instead of crashing.
        print(f"Error: {str(e)}")

 # Example usage for comparing multiple files in a folder
 if __name__ == "__main__":
    # Folder holding the .txt files; the report is written into the same folder.
    folder_path = r"YOUR_FOLDER_PATH"
    output_filename = "multiple_txt_comparison.html"
    output_path = Path(folder_path).joinpath(output_filename)

    # Index of the standard file (e.g., 0 for the first file)
    standard_file_index = 0

    compare_multiple_files(
        folder_path,
        str(output_path),
        standard_file_index,
        line_by_line=True,
    )
	import diff_match_patch
	import re
	from pathlib import Path
	from typing import List, Tuple

	# --- Configuration Variables (Text Colors) ---
	# Foreground colors interpolated into the inline styles emitted by
	# DiffHighlighter._generate_comparison_html.
	TEXT1_DELETION_COLOR = "#ff0000" # Red: used for DIFF_DELETE segments
	TEXT2_INSERTION_COLOR = "#008000" # Green: used for DIFF_INSERT segments


	def get_output_filename(*filenames: str) -> str:
	    """Build the output HTML filename by joining the stems of the inputs with "_"."""
	    # Body indentation restored; the flattened original was not valid Python.
	    joined_stems = "_".join(Path(name).stem for name in filenames)
	    return joined_stems + ".html"

	class DiffHighlighter:
	    """Turn diff_match_patch diffs between lines into inline-styled HTML."""

	    def __init__(self):
	        # A single diff engine instance is reused for every comparison.
	        self.dmp = diff_match_patch.diff_match_patch()

	    def process_line_diffs(self, standard_line: str, other_lines: List[Tuple[int, str]]) -> List[Tuple[int, str]]:
	        """
	        Diff a standard line against each of several other lines and
	        render every comparison as HTML.

	        Args:
	            standard_line: The line from the standard file to compare against.
	            other_lines: A list of tuples, each containing an index that is
	                         passed through untouched and the line content to
	                         compare with the standard line.

	        Returns:
	            List of tuples, each containing the index taken from the input
	            tuple and the HTML string with differences highlighted.
	        """
	        htmls = []
	        for other_index, other_line in other_lines:
	            diffs = self.dmp.diff_main(standard_line, other_line)
	            # Called for its effect on `diffs`; the return value is not used.
	            self.dmp.diff_cleanupSemanticLossless(diffs)
	            htmls.append((other_index, self._generate_comparison_html(diffs)))
	        return htmls

	    def _generate_comparison_html(self, diffs) -> str:
	        """
	        Render (op, text) diff tuples as one HTML fragment.

	        Equal text is emitted as-is, deletions are wrapped in a red
	        ``<span>`` and insertions in a green ``<ins>``. The text of every
	        segment is HTML-escaped first so that ``&``, ``<`` and ``>`` in the
	        compared files show up literally instead of being parsed as markup.
	        Tuples with any other op are skipped.
	        """
	        # Imported here so the method does not rely on a module-level import.
	        from html import escape

	        dmp_cls = diff_match_patch.diff_match_patch
	        fragments = []
	        for op, data in diffs:
	            text = escape(data, quote=False)
	            if op == dmp_cls.DIFF_EQUAL:
	                fragments.append(text)
	            elif op == dmp_cls.DIFF_DELETE:
	                fragments.append(f"<span style=\"background-color:#ffdddd;color:{TEXT1_DELETION_COLOR};\">{text}</span>")
	            elif op == dmp_cls.DIFF_INSERT:
	                fragments.append(f"<ins style=\"background-color:#e6ffe6;color:{TEXT2_INSERTION_COLOR};\">{text}</ins>")
	        # Join once instead of growing a string with repeated +=.
	        return "".join(fragments)

	def split_markdown_blocks(text: str) -> dict:
	    """Map each ``###`` header line to the text that follows it.

	    Any line starting with ``###`` is a header; its stripped text is the
	    key and the lines up to the next header, joined with newlines, are the
	    value. Lines before the first header are discarded. When a header
	    repeats, the later block replaces the earlier one.
	    """
	    # Block structure restored; the flattened original was not valid Python.
	    sections = {}
	    header = None
	    body = []

	    for raw_line in text.splitlines():
	        if not raw_line.startswith('###'):
	            # Content only counts once a header has been seen.
	            if header is not None:
	                body.append(raw_line)
	            continue

	        # A new header closes the block that was being collected.
	        if header is not None:
	            sections[header] = '\n'.join(body)
	        header = raw_line.strip()
	        body = []

	    # Flush the final block, which no later header closes.
	    if header is not None:
	        sections[header] = '\n'.join(body)

	    return sections

	def highlight_differences_line_by_line(texts: List[str], filenames: List[str], standard_file_index: int = 0) -> List[str]:
	    """
	    Compare several markdown texts line by line within each ``###`` block
	    and return the result as a flat list of HTML fragments.

	    Each text is split into blocks with split_markdown_blocks. Within a
	    block only lines of the form ``<digits>. <text>`` are considered; lines
	    sharing the same number are grouped together, the one from the standard
	    file is shown verbatim and every other file's line is shown as a diff
	    against it. Every emitted line ends with the originating filename in
	    parentheses.

	    Header text, the standard line and the filenames are HTML-escaped
	    before being placed in the output. The fragments returned by
	    DiffHighlighter.process_line_diffs are inserted unchanged.

	    Args:
	        texts: List of markdown texts.
	        filenames: List of filenames corresponding to the texts.
	        standard_file_index: Index of the file to be used as the standard
	                             for comparison.

	    Returns:
	        List of HTML strings: an ``<h4>`` per header (headers in sorted
	        order), the highlighted lines of that block ordered by line number
	        and then file index, and an ``<hr>`` after each block.

	    Raises:
	        IndexError: If standard_file_index or a file index has no matching
	            entry in the per-file lists once a numbered line is processed.
	    """
	    # Imported here so the function does not rely on a module-level import.
	    from html import escape

	    highlighter = DiffHighlighter()

	    # Compiled once instead of once per block and file.
	    numbered_line_pattern = re.compile(r'^(\d+\.) (.*)$', re.MULTILINE)

	    # Stand-in for a file that lacks a given numbered line. A real
	    # non-breaking space is used: an ordinary space would match the space
	    # inside "N. text" and produce a misleading partial diff.
	    missing_line = "\u00a0"

	    # str() keeps non-string path-like entries working, as f-string
	    # formatting did before.
	    safe_filenames = [escape(str(name), quote=False) for name in filenames]

	    # Split all texts into blocks
	    blocks_list = [split_markdown_blocks(text) for text in texts]

	    # Get all unique headers
	    all_headers = sorted(set().union(*blocks_list))

	    combined_html_blocks = []

	    # Process each block
	    for header in all_headers:
	        combined_html_blocks.append(f"<h4>{escape(header, quote=False)}</h4>")

	        # Content of this block in every text ('' where the header is absent)
	        contents = [blocks.get(header, '') for blocks in blocks_list]

	        if any(contents):
	            # Per file: number label ("12.") -> line text; a repeated label
	            # keeps its last occurrence.
	            lines_dicts = [dict(numbered_line_pattern.findall(content)) for content in contents]

	            # All labels seen in any file, in numeric order
	            all_numbers = sorted(set().union(*lines_dicts), key=lambda label: int(label.split('.')[0]))

	            ordered_lines = []

	            for num in all_numbers:
	                line_number = int(num.split('.')[0])

	                # (file index, full numbered line or placeholder) for every file
	                indexed_lines = []
	                for file_index, lines_dict in enumerate(lines_dicts):
	                    line = lines_dict.get(num, '')
	                    indexed_lines.append((file_index, f"{num} {line}" if line else missing_line))

	                # Use the line from the standard file for comparison
	                standard_line = indexed_lines[standard_file_index][1]
	                other_lines = [(index, line) for index, line in indexed_lines if index != standard_file_index]
	                compared_lines = highlighter.process_line_diffs(standard_line, other_lines)

	                # The standard line is plain text and must be escaped here;
	                # the compared lines already are HTML fragments.
	                rendered = [(standard_file_index, escape(standard_line, quote=False))]
	                rendered.extend(compared_lines)

	                for file_index, fragment in rendered:
	                    html_with_filename = f"{fragment} ({safe_filenames[file_index]})"
	                    ordered_lines.append((line_number, file_index, f"<span style=\"display: block;\">{html_with_filename}</span>"))

	            # Sort by line number, then by file index
	            ordered_lines.sort()

	            combined_html_blocks.extend(line_html for _, _, line_html in ordered_lines)

	        # Add separator between blocks
	        combined_html_blocks.append("<hr>")

	    return combined_html_blocks

	def generate_diff_html(texts: List[str], output_path: str, filenames: List[str], standard_file_index: int, line_by_line: bool = True):
	    """
	    Write an HTML report showing the differences between several texts.

	    Args:
	        texts: List of texts to compare.
	        output_path: Path of the HTML file to write (UTF-8).
	        filenames: List of filenames corresponding to the texts; their
	            stems are listed in the report heading.
	        standard_file_index: Index of the file to be used as the standard for comparison.
	        line_by_line: Must be True; line-by-line comparison is the only mode.

	    Raises:
	        ValueError: If line_by_line is False.
	    """
	    # Block structure restored; the flattened original was not valid Python.
	    # Only the line-by-line mode exists, so reject anything else up front.
	    if not line_by_line:
	        raise ValueError("Block-by-block comparison is not supported for multiple files.")

	    rendered_blocks = highlight_differences_line_by_line(texts, filenames, standard_file_index)

	    # Doubled braces are literal CSS braces; {filenames} and {content} are
	    # the two str.format placeholders. The template text is kept as found.
	    page_template = """<!DOCTYPE html>
	<html>
	<head>
	<title>Diff Comparison</title>
	<meta charset="UTF-8">
	<style>
	.container {{
	display: block;
	}}
	.section {{
	padding: 10px;
	}}
	hr {{
	border: 1px solid #ccc;
	margin: 20px 0;
	}}
	h4 {{
	background-color: #f0f0f0;
	padding: 5px;
	margin: 10px 0;
	}}
	.section-header {{
	padding: 10px;
	border-bottom: 2px solid #ccc;
	}}
	</style>
	</head>
	<body>
	<div class="container">
	<div class="section">
	<h3 class="section-header">Comparison: {filenames}</h3>
	{content}
	</div>
	</div>
	</body>
	</html>"""

	    # Heading lists the stem of every input name.
	    display_names = ", ".join(Path(name).stem for name in filenames)
	    page = page_template.format(
	        content="\n".join(rendered_blocks),
	        filenames=display_names,
	    )

	    with open(output_path, "w", encoding="utf-8") as report:
	        report.write(page)

	def get_ref_filename(output_path: str) -> str:
	    """Return the part of the output path that precedes the first ``_compare_``."""
	    # Body indentation restored; the flattened original was not valid Python.
	    # Expected shape: ref_compare_comparison.html
	    reference_part, _, _ = output_path.partition('_compare_')
	    return reference_part

	def get_comparison_filename(output_path: str) -> str:
	    """Return the segment after the first ``_compare_`` with every ``.html`` removed.

	    Raises:
	        IndexError: If output_path does not contain ``_compare_``.
	    """
	    # Body indentation restored; the flattened original was not valid Python.
	    # Expected shape: ref_compare_comparison.html
	    segments = output_path.split('_compare_')
	    comparison_part = segments[1]
	    return comparison_part.replace('.html', '')

	def try_read_file(file_path: str) -> str:
	    """
	    Read a text file, trying a fixed list of encodings in order.

	    Args:
	        file_path: Path to the file

	    Returns:
	        File contents decoded with the first encoding that succeeds.

	    Raises:
	        UnicodeDecodeError: If none of the encodings can decode the file.
	            The error is chained to the failure of the last encoding tried.
	    """
	    # Block structure restored; the flattened original was not valid Python.
	    encodings = ['utf-8', 'shift-jis', 'gbk', 'big5', 'cp932']
	    last_error = None

	    for encoding in encodings:
	        try:
	            with open(file_path, 'r', encoding=encoding) as f:
	                return f.read()
	        except UnicodeDecodeError as error:
	            # Remember the failure and fall through to the next encoding.
	            last_error = error

	    # UnicodeDecodeError takes five positional arguments (encoding, object,
	    # start, end, reason). Building it from a single message string raises
	    # TypeError, so reuse the details of the last failure and put the
	    # summary message in `reason`.
	    raise UnicodeDecodeError(
	        last_error.encoding,
	        last_error.object,
	        last_error.start,
	        last_error.end,
	        f"Failed to read {file_path} with encodings: {encodings}",
	    ) from last_error

	def compare_multiple_files(folder_path: str, output_path: str, standard_file_index: int = 0, line_by_line: bool = True) -> None:
	    """
	    Compare multiple txt files in a folder and generate a combined HTML output.

	    Progress and problems are reported with print(); any exception raised
	    while processing is caught and printed as "Error: ..." rather than
	    propagated.

	    Args:
	        folder_path: Path to the folder containing txt files.
	        output_path: Path to save the output HTML file.
	        standard_file_index: Index, into the name-sorted list of txt files,
	            of the file to be used as the standard for comparison. Must
	            satisfy 0 <= index < number of files.
	        line_by_line: If True, use line-by-line comparison within blocks.
	    """
	    # Block structure restored; the flattened original was not valid Python.
	    try:
	        folder = Path(folder_path).resolve()
	        print(f"Processing folder: {folder}")

	        txt_files = sorted(folder.glob("*.txt")) # Sort files for consistent ordering

	        if not txt_files:
	            print(f"No txt files found in {folder_path}")
	            return

	        print(f"\nFound {len(txt_files)} txt files to compare:")
	        for f in txt_files:
	            print(f" - {f.name}")

	        # Validate before doing any work. A too-large index would otherwise
	        # surface as a bare "list index out of range"; a negative index
	        # would select a file that the comparison step, which excludes the
	        # standard file by equality on the index, then compares with itself.
	        if not 0 <= standard_file_index < len(txt_files):
	            print(f"Error: standard_file_index {standard_file_index} is out of range for {len(txt_files)} txt files")
	            return

	        texts = [try_read_file(str(txt_file)) for txt_file in txt_files]
	        filenames = [txt_file.stem for txt_file in txt_files] # Use stem to get filename without extension

	        generate_diff_html(texts, output_path, filenames, standard_file_index, line_by_line)

	        print(f"Generated comparison: {output_path}")

	    except Exception as e:
	        # Top-level boundary of the tool: report the problem instead of crashing.
	        print(f"Error: {str(e)}")

	# Example usage for comparing multiple files in a folder
	if __name__ == "__main__":
	    # Body indentation restored; the flattened original was not valid Python.
	    # Folder holding the .txt files; the report is written into the same folder.
	    folder_path = r"YOUR_FOLDER_PATH"
	    output_filename = "multiple_txt_comparison.html"
	    output_path = Path(folder_path) / output_filename

	    # Specify the index of the standard file (e.g., 0 for the first file)
	    standard_file_index = 0

	    compare_multiple_files(folder_path, str(output_path), standard_file_index, line_by_line=True)