GlenDC · September 8, 2025 12:51
diff --git a/discord_export.py b/discord_export.py
 #!/usr/bin/env python3
 """
 discord_html_to_markdown.py

 Convert a Discord HTML export to Markdown while correctly handling replies.

 Usage:
  python discord_html_to_markdown.py input.html -o out.md
  python discord_html_to_markdown.py input.html --stdout
  python discord_html_to_markdown.py input.html -o out.md --filenames-only
 """

 from __future__ import annotations

 import argparse
 import re
 from pathlib import Path
 from typing import Iterable, List, Tuple, Optional
 from urllib.parse import urlparse

 from bs4 import BeautifulSoup, Tag

 # ---------------- Selectors and constants ----------------

 # Class names that is_date_divider() looks for on <div> elements.
 DATE_DIVIDER_CLASSES = (
    "divider__5126c", "hasContent__5126c",   # newer style
    "chatlog__message-group",                # older grouped exports sometimes carry date headers
 )

 # Class names that mark an <li> as a message in is_message_node().
 MESSAGE_ITEM_CLASSES = (
    "messageListItem__5126c",  # newer style
    "chatlog__message",        # older style
 )

 # Patterns passed as class_ filters to find()/find_all(); each alternation
 # lists one newer-style and one or more older-style ("chatlog__*") class names.
 HEADER_CLASS_RE = re.compile(r"(header_|chatlog__header)")
 USERNAME_CLASS_RE = re.compile(r"(username_|chatlog__author)")
 CONTENT_CLASS_RE = re.compile(r"(markup__|chatlog__content)")
 DIVIDER_CONTENT_CLASS_RE = re.compile(r"(content__|chatlog__date|chatlog__timestamp)")
 # Case-insensitive: matches "reply", "replied" or "reference" in a class name.
 REPLY_PREVIEW_CLASS_RE = re.compile(r"(repl(y|ied)|reference)", re.IGNORECASE)

 # Lowercase suffixes; extract_attachments() lowercases the href (minus its
 # query string) before testing it against this tuple.
 ATTACH_EXTS = (".wav", ".mp3", ".flac", ".m4a", ".ogg", ".pdf", ".png", ".jpg", ".jpeg", ".webp", ".gif")


 # ---------------- Utilities ----------------

 def text_or_none(tag: Optional[Tag]) -> Optional[str]:
    """
    Return the whitespace-stripped text of *tag*, or None.

    None is returned when *tag* is falsy and also when its stripped text
    is the empty string.
    """
    if not tag:
        return None
    stripped = tag.get_text(strip=True)
    return stripped or None


 def filename_from_url(url: str) -> str:
    """
    Return the last path segment of *url*.

    When the URL path ends in a slash or is empty, the whole *url* is
    returned unchanged.
    """
    last_segment = urlparse(url).path.rpartition("/")[2]
    return last_segment if last_segment else url


 def is_message_node(node: Tag) -> bool:
    """
    Tell whether *node* is a message element.

    An <li> qualifies when it carries any class in MESSAGE_ITEM_CLASSES;
    a <div> qualifies when it carries the class "chatlog__message".
    """
    node_classes = node.get("class") or []
    if node.name == "li":
        return any(known in node_classes for known in MESSAGE_ITEM_CLASSES)
    if node.name == "div":
        return "chatlog__message" in node_classes
    return False


 def is_date_divider(node: Tag) -> bool:
    """
    Tell whether *node* is a <div> that may carry a date.

    A <div> qualifies when it has both "divider__5126c" and
    "hasContent__5126c", or when it has "chatlog__message-group".
    """
    if node.name != "div":
        return False
    node_classes = node.get("class") or []
    # Cheap pre-filter against the configured class list.
    if not any(known in node_classes for known in DATE_DIVIDER_CLASSES):
        return False
    new_style = "divider__5126c" in node_classes and "hasContent__5126c" in node_classes
    return new_style or "chatlog__message-group" in node_classes


 def extract_date_from_divider(node: Tag) -> Optional[str]:
    """
    Return the date text found inside a divider node, or None.

    The first <span> under *node* whose class matches
    DIVIDER_CONTENT_CLASS_RE is tried first. If it is missing or empty,
    the same lookup is repeated inside the first header <div>.
    """
    direct = text_or_none(node.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
    if direct:
        return direct

    header = node.find("div", class_=HEADER_CLASS_RE)
    if not header:
        return None
    return text_or_none(header.find("span", class_=DIVIDER_CONTENT_CLASS_RE))


 def remove_reply_previews(container: Tag) -> None:
    """
    Strip reply previews so we do not confuse them with the actual reply body.

    *container* is modified in place. Three kinds of element are removed:
      - any element whose class matches REPLY_PREVIEW_CLASS_RE
        ('reply', 'replied' or 'reference', case-insensitive)
      - every blockquote whose text contains 'reply', 'jump to message'
        or 'message reference'
      - every anchor whose text contains 'jump' or whose href contains
        '#message-'

    Elements are detached with extract() rather than decompose(): the
    caller may still hold references to them (for example in a list
    returned by an earlier find_all), and a decomposed Tag is no longer
    safe to use.
    """
    # Remove obvious reply or reference blocks by class
    for element in container.find_all(True, class_=REPLY_PREVIEW_CLASS_RE):
        element.extract()

    # Remove blockquote-based reply previews
    for bq in container.find_all("blockquote"):
        txt = bq.get_text(" ", strip=True).lower()
        if "reply" in txt or "jump to message" in txt or "message reference" in txt:
            bq.extract()

    # Remove jump links
    for a in container.find_all("a", href=True):
        href = a["href"].lower()
        label = a.get_text(strip=True).lower()
        if "jump" in label or "#message-" in href:
            a.extract()


 def clean_content(div: Tag) -> str:
    """
    Flatten a message content element into Markdown-flavoured text.

    *div* is modified in place. Images become their alt text, mention
    elements become plain text, anchors become Markdown links and
    unordered lists become "- " bullets. Inline wrappers are then removed
    and adjacent text nodes merged, so that a sentence containing a link,
    emoji or mention stays on one line; line breaks are only inserted at
    the remaining tag boundaries.

    Returns:
        The flattened text with trailing spaces before newlines removed
        and runs of three or more newlines collapsed to two.
    """
    # Replace emoji images with alt text
    for img in div.find_all("img"):
        img.replace_with(img.get("alt") or "")

    # Mentions to plain text
    for m in div.find_all(["span", "div"], class_=re.compile(r"(mention|roleMention)", re.IGNORECASE)):
        m.replace_with(m.get_text(strip=True))

    # Links to markdown
    for a in div.find_all("a"):
        href = a.get("href")
        txt = a.get_text(strip=True)
        a.replace_with(f"[{txt or href}]({href})" if href else txt)

    # Unordered lists to bullets. The newline padding keeps the bullets on
    # their own lines once neighbouring text nodes are merged below.
    for ul in div.find_all("ul"):
        items = [f"- {li.get_text(' ', strip=True)}" for li in ul.find_all("li", recursive=False)]
        ul.replace_with("\n" + "\n".join(items) + "\n" if items else "")

    # get_text with a newline separator puts a newline between every pair of
    # text nodes. Unwrap inline tags and merge adjacent strings first so that
    # inline runs are treated as a single piece of text.
    for inline in div.find_all(["span", "strong", "em", "b", "i", "u", "s", "code"]):
        inline.unwrap()
    div.smooth()

    # Keep line breaks
    text = div.get_text("\n", strip=True)

    # Tidy spacing
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text


 def extract_attachments(container: Tag) -> List[Tuple[str, str]]:
    """
    Collect (filename, href) pairs for attachment-like links in *container*.

    A link counts when its href, with any query string dropped and
    lowercased, ends in one of ATTACH_EXTS. Duplicate pairs are reported
    once, in first-seen order.
    """
    found: List[Tuple[str, str]] = []
    already = set()
    for anchor in container.find_all("a", href=True):
        url = anchor["href"]
        without_query = url.partition("?")[0].lower()
        if not without_query.endswith(ATTACH_EXTS):
            continue
        entry = (filename_from_url(url), url)
        if entry in already:
            continue
        already.add(entry)
        found.append(entry)
    return found


 def extract_message_fields(node: Tag) -> Tuple[Optional[str], Optional[str], str, str, List[Tuple[str, str]]]:
    """
    Pull the displayable fields out of one message node.

    Returns (date_if_in_header, timestamp, username, text, attachments).
    The timestamp is "" and the username "Unknown" when they cannot be
    found. Note that *node* is modified: reply previews are removed from
    every content block and the last content block is flattened in place.
    """
    header = node.find(["h3", "div"], class_=HEADER_CLASS_RE)

    timestamp = ""
    username = "Unknown"
    if header:
        # Prefer the aria-label of <time>; fall back to its visible text.
        time_tag = header.find("time")
        if time_tag:
            timestamp = time_tag.get("aria-label") or text_or_none(time_tag) or ""

        author = text_or_none(header.find(["span", "a"], class_=USERNAME_CLASS_RE))
        if author:
            username = author

    # Reply preview markup may precede the real body, so previews are removed
    # from every content block and the last block is taken as the message.
    text = ""
    content_blocks = node.find_all("div", class_=CONTENT_CLASS_RE)
    for block in content_blocks:
        remove_reply_previews(block)
    if content_blocks:
        text = clean_content(content_blocks[-1])

    # Attachments are gathered after the content has been rewritten.
    atts = extract_attachments(node)

    # Older exports may carry a date inside the header; accept it only when
    # it looks like "Month D, YYYY".
    date_from_header = None
    if header:
        raw = text_or_none(header.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
        if raw and re.search(r"[A-Za-z]+\s+\d{1,2},\s+\d{4}", raw):
            date_from_header = raw

    return date_from_header, timestamp, username, text, atts


 # ---------------- Conversion ----------------

 def convert_html_to_markdown(html: str, filenames_only: bool = False) -> str:
    """
    Convert a Discord HTML export into a Markdown transcript.

    Args:
        html: The exported HTML document.
        filenames_only: When true, attachments are listed as "[name]"
            without their URL.

    Returns:
        Markdown with a "### <date>" heading whenever the date changes and
        one "- <time> — **<user>**:" entry per message, followed by the
        indented message text and attachment lines.
    """
    soup = BeautifulSoup(html, "html.parser")
    root = soup.body if soup.body else soup

    # Classify nodes before any message is parsed. Parsing a message removes
    # elements from the tree, and a removed element must not be inspected
    # afterwards, so only dividers and messages are kept for the main loop.
    candidates: List[Tuple[Tag, bool]] = []
    for node in root.find_all(True, recursive=True):
        if is_date_divider(node):
            candidates.append((node, True))
        elif is_message_node(node):
            candidates.append((node, False))

    current_date: Optional[str] = None
    messages: List[Tuple[str, str, str, str, List[Tuple[str, str]]]] = []

    for node, is_divider in candidates:
        # A candidate nested inside a removed preview may have been destroyed.
        if getattr(node, "decomposed", False):
            continue

        if is_divider:
            d = extract_date_from_divider(node)
            if d:
                current_date = d
            continue

        date_from_header, ts, user, text, atts = extract_message_fields(node)
        msg_date = current_date or date_from_header or ""
        messages.append((msg_date, ts, user, text, atts))

    # Build markdown
    lines: List[str] = []
    on_date: Optional[str] = None

    for msg_date, ts, user, text, atts in messages:
        if msg_date and msg_date != on_date:
            lines.append(f"### {msg_date}")
            lines.append("")
            on_date = msg_date

        # Compact time like "July 29, 2025 at 7:26 PM" to "7:26 PM"
        display_time = ts
        m = re.search(r"\bat\s+([0-9]{1,2}:[0-9]{2}\s*[AP]M)\b", ts or "")
        if m:
            display_time = m.group(1)

        header_line = f"- {display_time} — **{user}**:" if display_time else f"- **{user}**:"
        lines.append(header_line)

        if text:
            lines.extend(f"  {ln}" for ln in text.splitlines())

        for label, href in atts:
            if filenames_only:
                lines.append(f"  Attachment: [{label}]")
            else:
                lines.append(f"  Attachment: [{label}]({href})")

        lines.append("")

    return "\n".join(lines).strip()


 def main():
    """
    Command-line entry point.

    Reads the input HTML as UTF-8, converts it, and either prints the
    Markdown (when --stdout is given or no output path is set) or writes
    it to the output path and prints a confirmation.
    """
    parser = argparse.ArgumentParser(description="Convert Discord HTML export to Markdown.")
    parser.add_argument("input_html", type=Path, help="Path to Discord HTML export")
    parser.add_argument("-o", "--output", type=Path, help="Path to write Markdown output")
    parser.add_argument("--stdout", action="store_true", help="Print Markdown to stdout")
    parser.add_argument("--filenames-only", action="store_true", help="List attachments by filename without full URLs")
    args = parser.parse_args()

    markdown = convert_html_to_markdown(
        args.input_html.read_text(encoding="utf-8"),
        filenames_only=args.filenames_only,
    )

    # --stdout wins over -o; printing is also the default without -o.
    write_to_file = bool(args.output) and not args.stdout
    if not write_to_file:
        print(markdown)
        return

    args.output.write_text(markdown, encoding="utf-8")
    print(f"Wrote Markdown to {args.output}")


 # Run the command-line interface only when executed as a script.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	discord_html_to_markdown.py

	Convert a Discord HTML export to Markdown while correctly handling replies.

	Usage:
	python discord_html_to_markdown.py input.html -o out.md
	python discord_html_to_markdown.py input.html --stdout
	python discord_html_to_markdown.py input.html -o out.md --filenames-only
	"""

	from __future__ import annotations

	import argparse
	import re
	from pathlib import Path
	from typing import Iterable, List, Tuple, Optional
	from urllib.parse import urlparse

	from bs4 import BeautifulSoup, Tag

	# ---------------- Selectors and constants ----------------

	# Class names that is_date_divider() looks for on <div> elements.
	DATE_DIVIDER_CLASSES = (
	    "divider__5126c", "hasContent__5126c",  # newer style
	    "chatlog__message-group",  # older grouped exports sometimes carry date headers
	)

	# Class names that mark an <li> as a message in is_message_node().
	MESSAGE_ITEM_CLASSES = (
	    "messageListItem__5126c",  # newer style
	    "chatlog__message",  # older style
	)

	# Patterns passed as class_ filters to find()/find_all(). The "|" must be
	# an unescaped alternation; an escaped "\|" would match a literal pipe.
	HEADER_CLASS_RE = re.compile(r"(header_|chatlog__header)")
	USERNAME_CLASS_RE = re.compile(r"(username_|chatlog__author)")
	CONTENT_CLASS_RE = re.compile(r"(markup__|chatlog__content)")
	DIVIDER_CONTENT_CLASS_RE = re.compile(r"(content__|chatlog__date|chatlog__timestamp)")
	REPLY_PREVIEW_CLASS_RE = re.compile(r"(repl(y|ied)|reference)", re.IGNORECASE)

	# Lowercase suffixes tested against the lowercased href in extract_attachments().
	ATTACH_EXTS = (".wav", ".mp3", ".flac", ".m4a", ".ogg", ".pdf", ".png", ".jpg", ".jpeg", ".webp", ".gif")


	# ---------------- Utilities ----------------

	def text_or_none(tag: Optional[Tag]) -> Optional[str]:
	    """
	    Return the whitespace-stripped text of *tag*, or None.

	    None is returned when *tag* is falsy and also when its stripped text
	    is the empty string.
	    """
	    if not tag:
	        return None
	    stripped = tag.get_text(strip=True)
	    return stripped or None


	def filename_from_url(url: str) -> str:
	    """
	    Return the last path segment of *url*.

	    When the URL path ends in a slash or is empty, the whole *url* is
	    returned unchanged.
	    """
	    last_segment = urlparse(url).path.rpartition("/")[2]
	    return last_segment if last_segment else url


	def is_message_node(node: Tag) -> bool:
	    """
	    Tell whether *node* is a message element.

	    An <li> qualifies when it carries any class in MESSAGE_ITEM_CLASSES;
	    a <div> qualifies when it carries the class "chatlog__message".
	    """
	    node_classes = node.get("class") or []
	    if node.name == "li":
	        return any(known in node_classes for known in MESSAGE_ITEM_CLASSES)
	    if node.name == "div":
	        return "chatlog__message" in node_classes
	    return False


	def is_date_divider(node: Tag) -> bool:
	    """
	    Tell whether *node* is a <div> that may carry a date.

	    A <div> qualifies when it has both "divider__5126c" and
	    "hasContent__5126c", or when it has "chatlog__message-group".
	    """
	    if node.name != "div":
	        return False
	    node_classes = node.get("class") or []
	    # Cheap pre-filter against the configured class list.
	    if not any(known in node_classes for known in DATE_DIVIDER_CLASSES):
	        return False
	    new_style = "divider__5126c" in node_classes and "hasContent__5126c" in node_classes
	    return new_style or "chatlog__message-group" in node_classes


	def extract_date_from_divider(node: Tag) -> Optional[str]:
	    """
	    Return the date text found inside a divider node, or None.

	    The first <span> under *node* whose class matches
	    DIVIDER_CONTENT_CLASS_RE is tried first. If it is missing or empty,
	    the same lookup is repeated inside the first header <div>.
	    """
	    direct = text_or_none(node.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
	    if direct:
	        return direct

	    header = node.find("div", class_=HEADER_CLASS_RE)
	    if not header:
	        return None
	    return text_or_none(header.find("span", class_=DIVIDER_CONTENT_CLASS_RE))


	def remove_reply_previews(container: Tag) -> None:
	    """
	    Strip reply previews so we do not confuse them with the actual reply body.

	    *container* is modified in place. Three kinds of element are removed:
	      - any element whose class matches REPLY_PREVIEW_CLASS_RE
	        ('reply', 'replied' or 'reference', case-insensitive)
	      - every blockquote whose text contains 'reply', 'jump to message'
	        or 'message reference'
	      - every anchor whose text contains 'jump' or whose href contains
	        '#message-'

	    Elements are detached with extract() rather than decompose(): the
	    caller may still hold references to them (for example in a list
	    returned by an earlier find_all), and a decomposed Tag is no longer
	    safe to use.
	    """
	    # Remove obvious reply or reference blocks by class
	    for element in container.find_all(True, class_=REPLY_PREVIEW_CLASS_RE):
	        element.extract()

	    # Remove blockquote-based reply previews
	    for bq in container.find_all("blockquote"):
	        txt = bq.get_text(" ", strip=True).lower()
	        if "reply" in txt or "jump to message" in txt or "message reference" in txt:
	            bq.extract()

	    # Remove jump links
	    for a in container.find_all("a", href=True):
	        href = a["href"].lower()
	        label = a.get_text(strip=True).lower()
	        if "jump" in label or "#message-" in href:
	            a.extract()


	def clean_content(div: Tag) -> str:
	    """
	    Flatten a message content element into Markdown-flavoured text.

	    *div* is modified in place. Images become their alt text, mention
	    elements become plain text, anchors become Markdown links and
	    unordered lists become "- " bullets. Inline wrappers are then removed
	    and adjacent text nodes merged, so that a sentence containing a link,
	    emoji or mention stays on one line; line breaks are only inserted at
	    the remaining tag boundaries.

	    Returns:
	        The flattened text with trailing spaces before newlines removed
	        and runs of three or more newlines collapsed to two.
	    """
	    # Replace emoji images with alt text
	    for img in div.find_all("img"):
	        img.replace_with(img.get("alt") or "")

	    # Mentions to plain text
	    for m in div.find_all(["span", "div"], class_=re.compile(r"(mention|roleMention)", re.IGNORECASE)):
	        m.replace_with(m.get_text(strip=True))

	    # Links to markdown
	    for a in div.find_all("a"):
	        href = a.get("href")
	        txt = a.get_text(strip=True)
	        a.replace_with(f"[{txt or href}]({href})" if href else txt)

	    # Unordered lists to bullets. The newline padding keeps the bullets on
	    # their own lines once neighbouring text nodes are merged below.
	    for ul in div.find_all("ul"):
	        items = [f"- {li.get_text(' ', strip=True)}" for li in ul.find_all("li", recursive=False)]
	        ul.replace_with("\n" + "\n".join(items) + "\n" if items else "")

	    # get_text with a newline separator puts a newline between every pair of
	    # text nodes. Unwrap inline tags and merge adjacent strings first so that
	    # inline runs are treated as a single piece of text.
	    for inline in div.find_all(["span", "strong", "em", "b", "i", "u", "s", "code"]):
	        inline.unwrap()
	    div.smooth()

	    # Keep line breaks
	    text = div.get_text("\n", strip=True)

	    # Tidy spacing
	    text = re.sub(r"[ \t]+\n", "\n", text)
	    text = re.sub(r"\n{3,}", "\n\n", text)
	    return text


	def extract_attachments(container: Tag) -> List[Tuple[str, str]]:
	    """
	    Collect (filename, href) pairs for attachment-like links in *container*.

	    A link counts when its href, with any query string dropped and
	    lowercased, ends in one of ATTACH_EXTS. Duplicate pairs are reported
	    once, in first-seen order.
	    """
	    found: List[Tuple[str, str]] = []
	    already = set()
	    for anchor in container.find_all("a", href=True):
	        url = anchor["href"]
	        without_query = url.partition("?")[0].lower()
	        if not without_query.endswith(ATTACH_EXTS):
	            continue
	        entry = (filename_from_url(url), url)
	        if entry in already:
	            continue
	        already.add(entry)
	        found.append(entry)
	    return found


	def extract_message_fields(node: Tag) -> Tuple[Optional[str], Optional[str], str, str, List[Tuple[str, str]]]:
	    """
	    Pull the displayable fields out of one message node.

	    Returns (date_if_in_header, timestamp, username, text, attachments).
	    The timestamp is "" and the username "Unknown" when they cannot be
	    found. Note that *node* is modified: reply previews are removed from
	    every content block and the last content block is flattened in place.
	    """
	    header = node.find(["h3", "div"], class_=HEADER_CLASS_RE)

	    timestamp = ""
	    username = "Unknown"
	    if header:
	        # Prefer the aria-label of <time>; fall back to its visible text.
	        time_tag = header.find("time")
	        if time_tag:
	            timestamp = time_tag.get("aria-label") or text_or_none(time_tag) or ""

	        author = text_or_none(header.find(["span", "a"], class_=USERNAME_CLASS_RE))
	        if author:
	            username = author

	    # Reply preview markup may precede the real body, so previews are removed
	    # from every content block and the last block is taken as the message.
	    text = ""
	    content_blocks = node.find_all("div", class_=CONTENT_CLASS_RE)
	    for block in content_blocks:
	        remove_reply_previews(block)
	    if content_blocks:
	        text = clean_content(content_blocks[-1])

	    # Attachments are gathered after the content has been rewritten.
	    atts = extract_attachments(node)

	    # Older exports may carry a date inside the header; accept it only when
	    # it looks like "Month D, YYYY".
	    date_from_header = None
	    if header:
	        raw = text_or_none(header.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
	        if raw and re.search(r"[A-Za-z]+\s+\d{1,2},\s+\d{4}", raw):
	            date_from_header = raw

	    return date_from_header, timestamp, username, text, atts


	# ---------------- Conversion ----------------

	def convert_html_to_markdown(html: str, filenames_only: bool = False) -> str:
	    """
	    Convert a Discord HTML export into a Markdown transcript.

	    Args:
	        html: The exported HTML document.
	        filenames_only: When true, attachments are listed as "[name]"
	            without their URL.

	    Returns:
	        Markdown with a "### <date>" heading whenever the date changes and
	        one "- <time> — **<user>**:" entry per message, followed by the
	        indented message text and attachment lines.
	    """
	    soup = BeautifulSoup(html, "html.parser")
	    root = soup.body if soup.body else soup

	    # Classify nodes before any message is parsed. Parsing a message removes
	    # elements from the tree, and a removed element must not be inspected
	    # afterwards, so only dividers and messages are kept for the main loop.
	    candidates: List[Tuple[Tag, bool]] = []
	    for node in root.find_all(True, recursive=True):
	        if is_date_divider(node):
	            candidates.append((node, True))
	        elif is_message_node(node):
	            candidates.append((node, False))

	    current_date: Optional[str] = None
	    messages: List[Tuple[str, str, str, str, List[Tuple[str, str]]]] = []

	    for node, is_divider in candidates:
	        # A candidate nested inside a removed preview may have been destroyed.
	        if getattr(node, "decomposed", False):
	            continue

	        if is_divider:
	            d = extract_date_from_divider(node)
	            if d:
	                current_date = d
	            continue

	        date_from_header, ts, user, text, atts = extract_message_fields(node)
	        msg_date = current_date or date_from_header or ""
	        messages.append((msg_date, ts, user, text, atts))

	    # Build markdown
	    lines: List[str] = []
	    on_date: Optional[str] = None

	    for msg_date, ts, user, text, atts in messages:
	        if msg_date and msg_date != on_date:
	            lines.append(f"### {msg_date}")
	            lines.append("")
	            on_date = msg_date

	        # Compact time like "July 29, 2025 at 7:26 PM" to "7:26 PM"
	        display_time = ts
	        m = re.search(r"\bat\s+([0-9]{1,2}:[0-9]{2}\s*[AP]M)\b", ts or "")
	        if m:
	            display_time = m.group(1)

	        header_line = f"- {display_time} — **{user}**:" if display_time else f"- **{user}**:"
	        lines.append(header_line)

	        if text:
	            lines.extend(f"  {ln}" for ln in text.splitlines())

	        for label, href in atts:
	            if filenames_only:
	                lines.append(f"  Attachment: [{label}]")
	            else:
	                lines.append(f"  Attachment: [{label}]({href})")

	        lines.append("")

	    return "\n".join(lines).strip()


	def main():
	    """
	    Command-line entry point.

	    Reads the input HTML as UTF-8, converts it, and either prints the
	    Markdown (when --stdout is given or no output path is set) or writes
	    it to the output path and prints a confirmation.
	    """
	    parser = argparse.ArgumentParser(description="Convert Discord HTML export to Markdown.")
	    parser.add_argument("input_html", type=Path, help="Path to Discord HTML export")
	    parser.add_argument("-o", "--output", type=Path, help="Path to write Markdown output")
	    parser.add_argument("--stdout", action="store_true", help="Print Markdown to stdout")
	    parser.add_argument("--filenames-only", action="store_true", help="List attachments by filename without full URLs")
	    args = parser.parse_args()

	    html = args.input_html.read_text(encoding="utf-8")
	    md = convert_html_to_markdown(html, filenames_only=args.filenames_only)

	    # --stdout wins over -o; printing is also the default without -o.
	    if args.stdout or not args.output:
	        print(md)
	    else:
	        args.output.write_text(md, encoding="utf-8")
	        print(f"Wrote Markdown to {args.output}")


	# Run the command-line interface only when executed as a script.
	if __name__ == "__main__":
	    main()