Skip to content

Instantly share code, notes, and snippets.

@freelze
Created February 1, 2025 15:16
Show Gist options
  • Save freelze/682ce976779adff19679be4d2e031e38 to your computer and use it in GitHub Desktop.
Save freelze/682ce976779adff19679be4d2e031e38 to your computer and use it in GitHub Desktop.
import html
import re
from pathlib import Path
from typing import List, Tuple

import diff_match_patch
# --- Configuration Variables (Text Colors) ---
# Foreground colours interpolated into the inline styles that
# DiffHighlighter._generate_comparison_html writes around changed text.
# Text colour for DIFF_DELETE segments (text found only in the standard line).
TEXT1_DELETION_COLOR = "#ff0000" # Red
# Text colour for DIFF_INSERT segments (text found only in the compared line).
TEXT2_INSERTION_COLOR = "#008000" # Green
def get_output_filename(*filenames: str) -> str:
    """Build the output HTML filename from any number of input filenames.

    Each input is reduced to its stem (directory and final extension
    removed); the stems are joined with underscores and ``.html`` is
    appended. With no inputs the result is just ``".html"``.
    """
    joined_stems = "_".join(Path(name).stem for name in filenames)
    return joined_stems + ".html"
class DiffHighlighter:
    """Render character-level diffs between text lines as inline-styled HTML.

    Wraps a single ``diff_match_patch`` instance that is reused for every
    comparison.
    """

    def __init__(self):
        self.dmp = diff_match_patch.diff_match_patch()

    def process_line_diffs(self, standard_line: str, other_lines: List[Tuple[int, str]]) -> List[Tuple[int, str]]:
        """
        Process differences between a standard line and multiple other lines,
        generating HTML for each comparison.

        Args:
            standard_line: The line from the standard file to compare against.
            other_lines: A list of tuples, each containing the original line
                index and the line content from other files.

        Returns:
            List of tuples, each containing the original line index and the
            corresponding HTML string with differences highlighted. The
            order matches ``other_lines``.
        """
        htmls = []
        for other_index, other_line in other_lines:
            diffs = self.dmp.diff_main(standard_line, other_line)
            # Cleans the diff list in place before it is rendered.
            self.dmp.diff_cleanupSemanticLossless(diffs)
            htmls.append((other_index, self._generate_comparison_html(diffs)))
        return htmls

    def _generate_comparison_html(self, diffs) -> str:
        """Generate HTML for comparison, highlighting deletions in red and insertions in green.

        Args:
            diffs: Iterable of ``(op, text)`` pairs as produced by
                ``diff_main``.

        Returns:
            One HTML fragment. Every text segment is HTML-escaped before it
            is emitted, so ``<``, ``>`` and ``&`` in the compared text are
            shown literally instead of being parsed as markup.
        """
        dmp_cls = diff_match_patch.diff_match_patch
        fragments = []
        for op, data in diffs:
            # Text nodes only need <, > and & escaped; quotes stay readable.
            text = html.escape(data, quote=False)
            if op == dmp_cls.DIFF_EQUAL:
                fragments.append(text)
            elif op == dmp_cls.DIFF_DELETE:
                fragments.append(
                    f"<span style=\"background-color:#ffdddd;color:{TEXT1_DELETION_COLOR};\">{text}</span>"
                )
            elif op == dmp_cls.DIFF_INSERT:
                fragments.append(
                    f"<ins style=\"background-color:#e6ffe6;color:{TEXT2_INSERTION_COLOR};\">{text}</ins>"
                )
        return "".join(fragments)
def split_markdown_blocks(text: str) -> dict:
    """Split markdown text into blocks keyed by their ``###`` header line.

    A line that starts with ``###`` opens a new block; its stripped text is
    the dictionary key. The lines that follow, up to the next header, are
    joined with newlines to form the value. Lines before the first header
    are dropped, and a repeated header replaces the earlier block.
    """
    sections = {}
    active_header = None
    body_lines = []

    for raw_line in text.splitlines():
        if not raw_line.startswith('###'):
            # Ordinary line: keep it only once a header has been seen.
            if active_header:
                body_lines.append(raw_line)
            continue

        # Header line: close the block in progress, then open the new one.
        if active_header:
            sections[active_header] = '\n'.join(body_lines)
        active_header = raw_line.strip()
        body_lines = []

    # Flush the final block.
    if active_header:
        sections[active_header] = '\n'.join(body_lines)
    return sections
def highlight_differences_line_by_line(texts: List[str], filenames: List[str], standard_file_index: int = 0) -> List[str]:
    """
    Compares multiple markdown texts line by line within each block,
    generates HTML output, adds the filename after each line, and
    combines the output into a single list, grouping lines with the
    same number together. Allows specifying a standard file for comparison.

    Blocks are the ``###`` sections found by ``split_markdown_blocks`` and
    are emitted in sorted header order. Inside a block only numbered lines
    (``"<digits>. <text>"``) are compared; they are matched across files by
    their number and emitted in ascending number order, then by file index.

    Header text, the standard line and filenames are HTML-escaped before
    being placed in the output. The fragments returned by
    ``DiffHighlighter.process_line_diffs`` are already HTML and are
    inserted unchanged.

    Args:
        texts: List of markdown texts.
        filenames: List of filenames corresponding to the texts.
        standard_file_index: Index of the file to be used as the standard
            for comparison. Negative values count from the end.

    Returns:
        List of HTML strings with differences highlighted line by line
        and filenames appended. Empty when ``texts`` is empty.

    Raises:
        IndexError: If ``standard_file_index`` is out of range for ``texts``.
    """
    if not texts:
        return []

    file_count = len(texts)
    if not -file_count <= standard_file_index < file_count:
        raise IndexError(
            f"standard_file_index {standard_file_index} is out of range for {file_count} files"
        )
    # Normalise a negative index so the "is this the standard file?" checks
    # below compare like with like; otherwise the standard file would be
    # diffed against itself and listed twice.
    standard_file_index %= file_count

    highlighter = DiffHighlighter()
    numbered_line_re = re.compile(r'^(\d+\.) (.*)$', re.MULTILINE)

    # Split all texts into blocks
    blocks_list = [split_markdown_blocks(text) for text in texts]
    # Get all unique headers
    all_headers = sorted(set().union(*(blocks.keys() for blocks in blocks_list)))

    combined_html_blocks = []
    for header in all_headers:
        combined_html_blocks.append(f"<h4>{html.escape(header, quote=False)}</h4>")

        # Content of this block in every file ('' where the file lacks it)
        contents = [blocks.get(header, '') for blocks in blocks_list]
        if any(contents):
            # One {number: text} mapping per file
            lines_dicts = [dict(numbered_line_re.findall(content)) for content in contents]
            # Every line number seen in any file, in numeric order
            all_numbers = sorted(
                set().union(*lines_dicts),
                key=lambda x: int(x.split('.')[0]),
            )

            ordered_lines = []
            for num in all_numbers:
                line_number = int(num.split('.')[0])

                indexed_lines = []
                for file_index, lines_dict in enumerate(lines_dicts):
                    line = lines_dict.get(num, '')
                    # A non-breaking space stands in for a missing/empty line
                    numbered_line = f"{num} {line}" if line else "\u00a0"
                    indexed_lines.append((file_index, numbered_line))

                standard_line = indexed_lines[standard_file_index][1]
                other_lines = [
                    (index, line) for index, line in indexed_lines
                    if index != standard_file_index
                ]
                compared_lines = highlighter.process_line_diffs(standard_line, other_lines)

                # The standard line is plain text and must be escaped here;
                # the compared lines are already rendered HTML.
                rendered = [(standard_file_index, html.escape(standard_line, quote=False))]
                rendered.extend(compared_lines)
                for file_index, line_html in rendered:
                    label = html.escape(filenames[file_index], quote=False)
                    ordered_lines.append((
                        line_number,
                        file_index,
                        f"<span style=\"display: block;\">{line_html} ({label})</span>",
                    ))

            # Sort by line number, then by file index
            ordered_lines.sort()
            combined_html_blocks.extend(line_html for _, _, line_html in ordered_lines)

        # Add separator between blocks
        combined_html_blocks.append("<hr>")
    return combined_html_blocks
def generate_diff_html(texts: List[str], output_path: str, filenames: List[str], standard_file_index: int, line_by_line: bool = True):
    """
    Write an HTML page showing the differences between multiple texts.

    Args:
        texts: List of texts to compare.
        output_path: Path to save the output HTML file (written as UTF-8).
        filenames: List of filenames corresponding to the texts.
        standard_file_index: Index of the file to be used as the standard for comparison.
        line_by_line: If True, perform line-by-line comparison within blocks.

    Raises:
        ValueError: If ``line_by_line`` is false; only the line-by-line mode
            is implemented.
    """
    # Only one comparison mode exists, so reject the other one up front.
    if not line_by_line:
        raise ValueError("Block-by-block comparison is not supported for multiple files.")

    rendered_blocks = highlight_differences_line_by_line(texts, filenames, standard_file_index)

    # The page heading lists the file stems, comma separated.
    display_names = ", ".join(Path(name).stem for name in filenames)

    # Doubled braces are literal CSS braces; {filenames} and {content} are
    # the only placeholders filled in by str.format below.
    page_template = """<!DOCTYPE html>
<html>
<head>
<title>Diff Comparison</title>
<meta charset="UTF-8">
<style>
.container {{
display: block;
}}
.section {{
padding: 10px;
}}
hr {{
border: 1px solid #ccc;
margin: 20px 0;
}}
h4 {{
background-color: #f0f0f0;
padding: 5px;
margin: 10px 0;
}}
.section-header {{
padding: 10px;
border-bottom: 2px solid #ccc;
}}
</style>
</head>
<body>
<div class="container">
<div class="section">
<h3 class="section-header">Comparison: {filenames}</h3>
{content}
</div>
</div>
</body>
</html>"""

    page = page_template.format(
        content="\n".join(rendered_blocks),
        filenames=display_names,
    )
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(page)
def get_ref_filename(output_path: str) -> str:
    """Return the reference part of an output path.

    The expected format is ``ref_compare_comparison.html``; everything
    before the first ``_compare_`` marker is returned. A path without the
    marker is returned unchanged.
    """
    reference_part, _marker, _rest = output_path.partition('_compare_')
    return reference_part
def get_comparison_filename(output_path: str) -> str:
    """Return the comparison part of an output path.

    The expected format is ``ref_compare_comparison.html``. The text
    after the first ``_compare_`` marker (up to the next marker, if any)
    is returned with every ``.html`` occurrence removed. An ``IndexError``
    is raised when the marker is absent.
    """
    pieces = output_path.split('_compare_')
    comparison_part = pieces[1]
    return comparison_part.replace('.html', '')
def try_read_file(file_path: str) -> str:
    """
    Read a text file, trying several encodings in turn.

    The encodings are attempted in a fixed order and the first one that
    decodes the whole file without error wins. ``utf-8-sig`` is tried
    first: it reads plain UTF-8 and also strips a leading byte-order mark,
    which would otherwise stay glued to the first line.

    Args:
        file_path: Path to the file

    Returns:
        File contents as string

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the file.
            The error carries the position details of the last attempt and
            is chained to it.
        OSError: If the file cannot be opened.
    """
    encodings = ['utf-8-sig', 'shift-jis', 'gbk', 'big5', 'cp932']
    last_error = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError as exc:
            last_error = exc

    # UnicodeDecodeError takes five arguments (encoding, object, start, end,
    # reason); reuse the last failure's details and put the summary in the
    # reason so the exception can actually be constructed.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Failed to read {file_path} with encodings: {encodings}",
    ) from last_error
def compare_multiple_files(folder_path: str, output_path: str, standard_file_index: int = 0, line_by_line: bool = True) -> None:
    """
    Compare every ``*.txt`` file in a folder and write one combined HTML report.

    Progress is printed to stdout. Any exception raised along the way is
    caught and reported as an ``Error: ...`` line instead of propagating.

    Args:
        folder_path: Path to the folder containing txt files.
        output_path: Path to save the output HTML file.
        standard_file_index: Index of the file to be used as the standard for
            comparison, counted in the sorted order of the txt files.
        line_by_line: If True, use line-by-line comparison within blocks.
    """
    try:
        folder = Path(folder_path).resolve()
        print(f"Processing folder: {folder}")

        # Sorted so file indices are stable from run to run.
        txt_files = sorted(folder.glob("*.txt"))
        if not txt_files:
            print(f"No txt files found in {folder_path}")
            return

        listing = [f"\nFound {len(txt_files)} txt files to compare:"]
        listing.extend(f" - {txt_file.name}" for txt_file in txt_files)
        print("\n".join(listing))

        texts = []
        for txt_file in txt_files:
            texts.append(try_read_file(str(txt_file)))
        # Stems (no extension) are used as the labels shown in the report.
        filenames = [txt_file.stem for txt_file in txt_files]

        generate_diff_html(texts, output_path, filenames, standard_file_index, line_by_line)
        print(f"Generated comparison: {output_path}")
    except Exception as exc:
        print(f"Error: {exc!s}")
# Example usage: compare all txt files found in one folder
if __name__ == "__main__":
    # Placeholder; point this at the folder that holds the txt files.
    folder_path = r"YOUR_FOLDER_PATH"
    # Index of the standard file (e.g., 0 for the first file)
    standard_file_index = 0
    # The report is written into the same folder as the inputs.
    output_filename = "multiple_txt_comparison.html"
    output_path = Path(folder_path) / output_filename
    compare_multiple_files(
        folder_path,
        str(output_path),
        standard_file_index=standard_file_index,
        line_by_line=True,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment