# Gist freelze/682ce976779adff19679be4d2e031e38 -- created February 1, 2025 15:16.
# Note from the hosting page: this file may contain hidden or bidirectional
# Unicode text that can be interpreted or compiled differently from how it is
# displayed; review it in an editor that reveals hidden Unicode characters.
import diff_match_patch | |
import re | |
from pathlib import Path | |
from typing import List, Tuple | |
# --- Configuration Variables (Text Colors) ---
# CSS colour values interpolated into the inline `style` attributes that
# DiffHighlectorless edits would break; keep them as plain CSS colour strings.
# Text colour for fragments present only in the standard line (deletions).
TEXT1_DELETION_COLOR = "#ff0000"  # Red
# Text colour for fragments present only in the compared line (insertions).
TEXT2_INSERTION_COLOR = "#008000"  # Green
def get_output_filename(*filenames: str) -> str:
    """Build the output HTML filename from the given input filenames.

    Each input is reduced to its stem (final path component without its
    last suffix); the stems are joined with underscores and ".html" is
    appended.
    """
    joined_stems = '_'.join(Path(name).stem for name in filenames)
    return joined_stems + ".html"
class DiffHighlighter:
    """Render character-level diffs between lines as inline-styled HTML."""

    def __init__(self):
        # One diff_match_patch engine is reused for every comparison.
        self.dmp = diff_match_patch.diff_match_patch()

    def process_line_diffs(self, standard_line: str, other_lines: List[Tuple[int, str]]) -> List[Tuple[int, str]]:
        """
        Diff a standard line against several other lines and render each
        comparison as HTML.

        Args:
            standard_line: The line from the standard file to compare against.
            other_lines: A list of tuples, each containing the original line
                index and the line content from other files.

        Returns:
            List of tuples, each containing the original line index and the
            corresponding HTML string with differences highlighted.
        """
        htmls = []
        for other_index, other_line in other_lines:
            diffs = self.dmp.diff_main(standard_line, other_line)
            # The cleanup rewrites `diffs` in place; its return value is unused.
            self.dmp.diff_cleanupSemanticLossless(diffs)
            htmls.append((other_index, self._generate_comparison_html(diffs)))
        return htmls

    def _generate_comparison_html(self, diffs) -> str:
        """
        Generate HTML for a diff, highlighting deletions in red and
        insertions in green.

        The diffed text is HTML-escaped before being wrapped in markup, so
        characters such as ``<``, ``>`` and ``&`` in the compared lines show
        up literally instead of being parsed as tags or entities.

        Args:
            diffs: Iterable of ``(op, text)`` pairs as produced by
                ``diff_main``.

        Returns:
            The concatenated HTML fragment for the whole diff.
        """
        # Imported locally so the name `html` stays free for local variables
        # used elsewhere in this file.
        from html import escape

        dmp_cls = diff_match_patch.diff_match_patch
        fragments = []
        for op, data in diffs:
            text = escape(data, quote=False)
            if op == dmp_cls.DIFF_EQUAL:
                fragments.append(text)
            elif op == dmp_cls.DIFF_DELETE:
                fragments.append(
                    f'<span style="background-color:#ffdddd;color:{TEXT1_DELETION_COLOR};">{text}</span>'
                )
            elif op == dmp_cls.DIFF_INSERT:
                fragments.append(
                    f'<ins style="background-color:#e6ffe6;color:{TEXT2_INSERTION_COLOR};">{text}</ins>'
                )
        return "".join(fragments)
def split_markdown_blocks(text: str) -> dict:
    """Split markdown text into blocks keyed by their '###' header lines.

    A line starting with '###' opens a new block; its stripped text is the
    key. The lines that follow, up to the next such header, are joined with
    newlines to form the value. Lines before the first header are dropped,
    and a repeated header replaces the earlier block of the same name.
    """
    sections = {}
    heading = None
    body = []

    for raw_line in text.splitlines():
        if not raw_line.startswith('###'):
            # Ordinary line: keep it only once a header has been seen.
            if heading:
                body.append(raw_line)
            continue

        # Header line: flush the block collected so far, then start afresh.
        if heading:
            sections[heading] = '\n'.join(body)
        heading = raw_line.strip()
        body = []

    # Flush the final block.
    if heading:
        sections[heading] = '\n'.join(body)
    return sections
def highlight_differences_line_by_line(texts: List[str], filenames: List[str], standard_file_index: int = 0) -> List[str]:
    """
    Compare multiple markdown texts line by line within each '###' block.

    Every text is split into blocks by header. Within a block, numbered
    lines ("N. text") sharing the same number are grouped; the line from
    the standard file is shown as-is and every other file's line is shown
    as a highlighted diff against it. Each output line is followed by the
    name of the file it came from. Headers are emitted in sorted order and
    lines are ordered by item number, then by file index.

    All text taken from the inputs (headers, lines, filenames) is
    HTML-escaped before being placed in the output.

    Args:
        texts: List of markdown texts.
        filenames: List of filenames corresponding to the texts.
        standard_file_index: Index of the file to be used as the standard
            for comparison. Negative values count from the end.

    Returns:
        List of HTML strings (headers, highlighted lines and separators)
        in output order.

    Raises:
        IndexError: If ``standard_file_index`` does not refer to one of
            ``texts``.
    """
    # Imported locally so the module's import block does not need to change.
    from html import escape

    highlighter = DiffHighlighter()

    # With nothing to compare there are no headers and therefore no output.
    if not texts:
        return []

    file_count = len(texts)
    if not -file_count <= standard_file_index < file_count:
        raise IndexError(
            f"standard_file_index {standard_file_index} is out of range for {file_count} file(s)"
        )
    # Normalise negative indices; otherwise the "index != standard" filter
    # below never matches and the standard file is also diffed against itself.
    standard_index = standard_file_index % file_count

    # Matches numbered list items such as "12. some text".
    numbered_line_pattern = re.compile(r'^(\d+\.) (.*)$', re.MULTILINE)
    # Placeholder shown for a file that has no line for a given number.
    missing_line = "\u00a0"  # non-breaking space

    # Split all texts into blocks and collect every header seen in any file.
    blocks_list = [split_markdown_blocks(text) for text in texts]
    all_headers = sorted(set().union(*(blocks.keys() for blocks in blocks_list)))

    combined_html_blocks = []
    for header in all_headers:
        combined_html_blocks.append(f"<h4>{escape(header, quote=False)}</h4>")

        # Content of this block in each file ('' where the file lacks it).
        contents = [blocks.get(header, '') for blocks in blocks_list]
        if any(contents):
            # Per file: map "N." -> line text (a repeated number keeps the last).
            lines_dicts = [dict(numbered_line_pattern.findall(content)) for content in contents]
            all_numbers = sorted(
                set().union(*lines_dicts),
                key=lambda num: int(num.split('.')[0]),
            )

            ordered_lines = []
            for num in all_numbers:
                item_number = int(num.split('.')[0])
                indexed_lines = [
                    (file_index, f"{num} {line}" if line else missing_line)
                    for file_index, line in enumerate(
                        lines_dict.get(num, '') for lines_dict in lines_dicts
                    )
                ]

                standard_line = indexed_lines[standard_index][1]
                other_lines = [
                    (index, line) for index, line in indexed_lines if index != standard_index
                ]
                # The diff runs on the raw text; escaping is a display concern.
                compared_lines = highlighter.process_line_diffs(standard_line, other_lines)

                standard_filename = escape(filenames[standard_index], quote=False)
                standard_html = f"{escape(standard_line, quote=False)} ({standard_filename})"
                ordered_lines.append(
                    (item_number, standard_index, f'<span style="display: block;">{standard_html}</span>')
                )

                for other_index, diff_html in compared_lines:
                    other_filename = escape(filenames[other_index], quote=False)
                    ordered_lines.append(
                        (
                            item_number,
                            other_index,
                            f'<span style="display: block;">{diff_html} ({other_filename})</span>',
                        )
                    )

            # Order by item number, then by file index.
            ordered_lines.sort()
            combined_html_blocks.extend(line_html for _, _, line_html in ordered_lines)

        # Separator between blocks.
        combined_html_blocks.append("<hr>")

    return combined_html_blocks
def generate_diff_html(texts: List[str], output_path: str, filenames: List[str], standard_file_index: int, line_by_line: bool = True):
    """
    Generate an HTML file showing differences between multiple texts.

    The comparison markup comes from highlight_differences_line_by_line and
    is wrapped in a small standalone HTML page whose heading lists the stems
    of the compared filenames (HTML-escaped).

    Args:
        texts: List of texts to compare.
        output_path: Path to save the output HTML file (written as UTF-8).
        filenames: List of filenames corresponding to the texts.
        standard_file_index: Index of the file to be used as the standard for comparison.
        line_by_line: If True, perform line-by-line comparison within blocks.

    Raises:
        ValueError: If ``line_by_line`` is False; no other mode is implemented.
    """
    # Imported locally so the module's import block does not need to change.
    from html import escape

    if line_by_line:
        combined_html_blocks = highlight_differences_line_by_line(texts, filenames, standard_file_index)
    else:
        raise ValueError("Block-by-block comparison is not supported for multiple files.")

    # Filenames for the page heading, reduced to their stems.
    file_names = [Path(filename).stem for filename in filenames]

    # Literal CSS braces are doubled because the template goes through str.format.
    html_template = """<!DOCTYPE html>
<html>
<head>
    <title>Diff Comparison</title>
    <meta charset="UTF-8">
    <style>
        .container {{
            display: block;
        }}
        .section {{
            padding: 10px;
        }}
        hr {{
            border: 1px solid #ccc;
            margin: 20px 0;
        }}
        h4 {{
            background-color: #f0f0f0;
            padding: 5px;
            margin: 10px 0;
        }}
        .section-header {{
            padding: 10px;
            border-bottom: 2px solid #ccc;
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="section">
            <h3 class="section-header">Comparison: {filenames}</h3>
            {content}
        </div>
    </div>
</body>
</html>"""

    html_content = html_template.format(
        content="\n".join(combined_html_blocks),
        # Escaped so a file name containing '<' or '&' cannot break the page.
        filenames=escape(", ".join(file_names), quote=False),
    )

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)
def get_ref_filename(output_path: str) -> str:
    """Return the reference part of an output path.

    The expected format is "ref_compare_comparison.html"; everything before
    the first "_compare_" is returned. A path without that marker is
    returned unchanged.
    """
    reference, _marker, _rest = output_path.partition('_compare_')
    return reference
def get_comparison_filename(output_path: str) -> str:
    """Return the comparison part of an output path.

    The expected format is "ref_compare_comparison.html": the segment
    following the first "_compare_" marker is taken and every ".html"
    occurrence in it is removed. A path without the marker raises
    IndexError.
    """
    segments = output_path.split('_compare_')
    comparison_segment = segments[1]
    return comparison_segment.replace('.html', '')
def try_read_file(file_path: str) -> str:
    """
    Read a text file, trying several encodings in turn.

    The encodings are attempted in a fixed order and the first one that
    decodes the whole file without error wins.

    Args:
        file_path: Path to the file

    Returns:
        File contents as string

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file. The error carries the details of the last failed
            attempt and names the file and the encodings tried.
    """
    encodings = ['utf-8', 'shift-jis', 'gbk', 'big5', 'cp932']
    last_error = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError as exc:
            last_error = exc

    # UnicodeDecodeError needs (encoding, object, start, end, reason); passing
    # a single message string would raise TypeError instead of this error.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Failed to read {file_path} with encodings: {encodings} (last error: {last_error.reason})",
    ) from last_error
def compare_multiple_files(folder_path: str, output_path: str, standard_file_index: int = 0, line_by_line: bool = True) -> None:
    """
    Compare all txt files in a folder and write one combined HTML report.

    Progress is reported on stdout. Any exception raised along the way is
    caught and printed as "Error: ..." rather than propagated.

    Args:
        folder_path: Path to the folder containing txt files.
        output_path: Path to save the output HTML file.
        standard_file_index: Index of the file to be used as the standard for comparison.
        line_by_line: If True, use line-by-line comparison within blocks.
    """
    try:
        folder = Path(folder_path).resolve()
        print(f"Processing folder: {folder}")

        # Sorted so the file order (and hence the standard index) is stable.
        txt_files = sorted(folder.glob("*.txt"))
        if len(txt_files) == 0:
            print(f"No txt files found in {folder_path}")
            return

        print(f"\nFound {len(txt_files)} txt files to compare:")
        for txt_file in txt_files:
            print(f" - {txt_file.name}")

        texts = []
        for txt_file in txt_files:
            texts.append(try_read_file(str(txt_file)))
        # Display names are the stems, i.e. without the extension.
        filenames = [txt_file.stem for txt_file in txt_files]

        generate_diff_html(texts, output_path, filenames, standard_file_index, line_by_line)
        print(f"Generated comparison: {output_path}")
    except Exception as error:
        print(f"Error: {error}")
# Example usage: compare every txt file in one folder and write the report
# into that same folder.
if __name__ == "__main__":
    folder_path = r"YOUR_FOLDER_PATH"
    output_filename = "multiple_txt_comparison.html"
    output_path = Path(folder_path).joinpath(output_filename)

    # Index (in sorted filename order) of the file the others are diffed
    # against, e.g. 0 for the first file.
    standard_file_index = 0

    compare_multiple_files(
        folder_path,
        str(output_path),
        standard_file_index,
        line_by_line=True,
    )
# End of gist. (The hosting page's sign-up / sign-in prompts for commenting
# followed here; they are not part of the program.)