Created
September 8, 2025 12:51
-
-
Save GlenDC/f9b26d909631bcd7c4e45b518fc19783 to your computer and use it in GitHub Desktop.
discord chat export of copied html input data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
discord_html_to_markdown.py

Convert a Discord HTML export to Markdown while correctly handling replies.

Usage:
    python discord_html_to_markdown.py input.html -o out.md
    python discord_html_to_markdown.py input.html --stdout
    python discord_html_to_markdown.py input.html -o out.md --filenames-only
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import re | |
| from pathlib import Path | |
| from typing import Iterable, List, Tuple, Optional | |
| from urllib.parse import urlparse | |
| from bs4 import BeautifulSoup, Tag | |
| # ---------------- Selectors and constants ---------------- | |
# CSS class names checked by is_date_divider().
DATE_DIVIDER_CLASSES = (
    # newer style
    "divider__5126c",
    "hasContent__5126c",
    # older grouped exports sometimes carry date headers
    "chatlog__message-group",
)

# CSS class names checked by is_message_node().
MESSAGE_ITEM_CLASSES = (
    # newer style
    "messageListItem__5126c",
    # older style
    "chatlog__message",
)

# Class-name patterns used to locate the parts of a message or divider.
# Each one lists a newer-style prefix first and the "chatlog__" name(s) after it.
HEADER_CLASS_RE = re.compile(r"(header_|chatlog__header)")
USERNAME_CLASS_RE = re.compile(r"(username_|chatlog__author)")
CONTENT_CLASS_RE = re.compile(r"(markup__|chatlog__content)")
DIVIDER_CONTENT_CLASS_RE = re.compile(r"(content__|chatlog__date|chatlog__timestamp)")

# Matches class names containing "reply", "replied" or "reference" (any case).
REPLY_PREVIEW_CLASS_RE = re.compile(r"(repl(y|ied)|reference)", re.IGNORECASE)

# Lower-case file extensions that make a link count as an attachment.
ATTACH_EXTS = (
    ".wav", ".mp3", ".flac", ".m4a", ".ogg",
    ".pdf",
    ".png", ".jpg", ".jpeg", ".webp", ".gif",
)
| # ---------------- Utilities ---------------- | |
def text_or_none(tag: Optional[Tag]) -> Optional[str]:
    """Return the stripped text of ``tag``, or None when the tag is missing or has no text."""
    if not tag:
        return None
    stripped = tag.get_text(strip=True)
    return stripped or None
def filename_from_url(url: str) -> str:
    """Return the last path segment of ``url``; fall back to ``url`` itself when that segment is empty."""
    # rpartition yields everything after the final "/" (or the whole path if there is none).
    last_segment = urlparse(url).path.rpartition("/")[2]
    if last_segment:
        return last_segment
    return url
def is_message_node(node: Tag) -> bool:
    """
    Tell whether ``node`` is a single chat message element.

    A node qualifies when it is an ``li`` carrying any class listed in
    MESSAGE_ITEM_CLASSES, or a ``div`` carrying the class ``chatlog__message``.
    """
    css = node.get("class") or []
    tag_name = node.name
    if tag_name == "li":
        return any(candidate in css for candidate in MESSAGE_ITEM_CLASSES)
    if tag_name == "div":
        return "chatlog__message" in css
    return False
def is_date_divider(node: Tag) -> bool:
    """
    Tell whether ``node`` is a ``div`` that may carry a date header.

    Two shapes are accepted: a div with both ``divider__5126c`` and
    ``hasContent__5126c``, or a div with ``chatlog__message-group``.
    """
    if node.name != "div":
        return False
    css = node.get("class") or []
    # Cheap pre-filter: at least one of the known divider classes must be present.
    if not any(candidate in css for candidate in DATE_DIVIDER_CLASSES):
        return False
    is_content_divider = "divider__5126c" in css and "hasContent__5126c" in css
    return is_content_divider or "chatlog__message-group" in css
def extract_date_from_divider(node: Tag) -> Optional[str]:
    """
    Return the date text found in a divider node, or None when there is none.

    The first ``span`` matching DIVIDER_CONTENT_CLASS_RE anywhere under ``node``
    is tried first. If it is missing or empty, the same lookup is repeated
    inside the first ``div`` matching HEADER_CLASS_RE.
    """
    direct_label = text_or_none(node.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
    if direct_label:
        return direct_label

    header = node.find("div", class_=HEADER_CLASS_RE)
    if not header:
        return None
    return text_or_none(header.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
def remove_reply_previews(container: Tag) -> None:
    """
    Detach reply-preview markup from ``container`` so only the reply body remains.

    ``container`` is modified in place. Three kinds of elements are removed,
    in this order:
    - any element whose class matches REPLY_PREVIEW_CLASS_RE
      ('reply', 'replied' or 'reference', case-insensitive)
    - every blockquote whose text mentions 'reply', 'jump to message'
      or 'message reference'
    - every link whose text contains 'jump' or whose href contains '#message-'

    Elements are detached with ``extract()`` instead of being destroyed with
    ``decompose()``. convert_html_to_markdown walks a list of tags that was
    collected before any message is processed; a destroyed tag left in that
    list can no longer answer ``.get("class")``, whereas a detached tag can.
    """
    # Remove obvious reply or reference blocks by class.
    for preview in container.find_all(True, class_=REPLY_PREVIEW_CLASS_RE):
        preview.extract()

    # Remove blockquote-based reply previews.
    quote_markers = ("reply", "jump to message", "message reference")
    for quote in container.find_all("blockquote"):
        quote_text = quote.get_text(" ", strip=True).lower()
        if any(marker in quote_text for marker in quote_markers):
            quote.extract()

    # Remove jump links left over from preview headers.
    for link in container.find_all("a", href=True):
        target = link["href"].lower()
        label = link.get_text(strip=True).lower()
        if "jump" in label or "#message-" in target:
            link.extract()
# Class-name pattern for user/role mention wrappers.
_MENTION_CLASS_RE = re.compile(r"(mention|roleMention)", re.IGNORECASE)

# Elements that start and end on their own line when flattened to text.
_BLOCK_LEVEL_TAGS = ["div", "p", "pre", "blockquote", "ol", "li", "h1", "h2", "h3", "h4", "h5", "h6"]


def clean_content(div: Tag) -> str:
    """
    Flatten a message content element into Markdown-flavoured plain text.

    ``div`` is modified in place: images become their alt text, mentions become
    plain text, links become ``[text](href)``, and unordered lists become
    ``- item`` lines.

    Text nodes are joined without a separator, so inline pieces (links,
    mentions, emoji, bold/italic spans) stay on the same line as the text
    around them. Line breaks come only from newlines already present in the
    text, from ``<br>`` elements, and from block-level element boundaries.

    Returns the text with trailing spaces removed from each line, runs of
    blank lines collapsed to one, and surrounding whitespace stripped.
    """
    # Whitespace-only text nodes containing a newline (e.g. markup indentation)
    # are reduced to their newlines so they do not indent the following line.
    # Text inside <pre> is left alone.
    for node in div.find_all(string=True):
        if "\n" in node and not node.strip() and node.find_parent("pre") is None:
            node.replace_with(re.sub(r"[ \t\r]+", "", str(node)))

    # Replace emoji images with alt text.
    for img in div.find_all("img"):
        img.replace_with(img.get("alt") or "")

    # Mentions to plain text.
    for mention in div.find_all(["span", "div"], class_=_MENTION_CLASS_RE):
        mention.replace_with(mention.get_text(strip=True))

    # Links to markdown.
    for link in div.find_all("a"):
        href = link.get("href")
        label = link.get_text(strip=True)
        link.replace_with(f"[{label or href}]({href})" if href else label)

    # Unordered lists to bullets, set off on their own lines.
    for ul in div.find_all("ul"):
        bullets = [f"- {li.get_text(' ', strip=True)}" for li in ul.find_all("li", recursive=False)]
        ul.replace_with("\n" + "\n".join(bullets) + "\n")

    # Explicit line breaks.
    for br in div.find_all("br"):
        br.replace_with("\n")

    # Block-level children begin and end on their own line.
    for block in div.find_all(_BLOCK_LEVEL_TAGS):
        block.insert_before("\n")
        block.insert_after("\n")

    text = div.get_text()
    # Tidy spacing.
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def extract_attachments(container: Tag) -> List[Tuple[str, str]]:
    """
    Collect attachment links found under ``container``.

    A link counts as an attachment when its href, with any ``?query`` removed
    and lower-cased, ends with one of ATTACH_EXTS. Returns ``(filename, href)``
    pairs in document order, without duplicates.
    """
    found: List[Tuple[str, str]] = []
    already_listed = set()
    for link in container.find_all("a", href=True):
        url = link["href"]
        without_query = url.partition("?")[0].lower()
        if not without_query.endswith(ATTACH_EXTS):
            continue
        entry = (filename_from_url(url), url)
        if entry in already_listed:
            continue
        already_listed.add(entry)
        found.append(entry)
    return found
def extract_message_fields(node: Tag) -> Tuple[Optional[str], Optional[str], str, str, List[Tuple[str, str]]]:
    """
    Pull the displayable fields out of one message node.

    Returns ``(date_if_in_header, timestamp, username, text, attachments)``.
    ``timestamp`` is "" and ``username`` is "Unknown" when the message has no
    header or the header lacks them. The reply body is preferred over the
    reply preview. ``node`` is modified in place while the text is extracted.
    """
    header = node.find(["h3", "div"], class_=HEADER_CLASS_RE)

    timestamp = ""
    username = "Unknown"
    if header:
        # Timestamp: the aria-label wins over the visible text of <time>.
        time_tag = header.find("time")
        if time_tag:
            aria_label = time_tag.get("aria-label")
            if aria_label:
                timestamp = aria_label
            elif text_or_none(time_tag):
                timestamp = time_tag.get_text(strip=True)

        # Username
        author = header.find(["span", "a"], class_=USERNAME_CLASS_RE)
        if author and text_or_none(author):
            username = author.get_text(strip=True)

    # Content:
    # Discord often places reply preview markup before the actual reply message markup.
    # Strategy: strip preview elements from every content block, then use the last block.
    text = ""
    bodies = node.find_all("div", class_=CONTENT_CLASS_RE)
    if bodies:
        for body in bodies:
            remove_reply_previews(body)
        text = clean_content(bodies[-1])

    # Attachments are collected after the content step has rewritten the tree,
    # so links already inlined into ``text`` by clean_content are not listed again.
    attachments = extract_attachments(node)

    # Optional date inside header for older exports; accepted only when it
    # looks like "Month D, YYYY".
    date_from_header = None
    if header:
        date_span = header.find("span", class_=DIVIDER_CONTENT_CLASS_RE)
        if date_span and text_or_none(date_span):
            candidate = date_span.get_text(strip=True)
            if re.search(r"[A-Za-z]+\s+\d{1,2},\s+\d{4}", candidate):
                date_from_header = candidate

    return date_from_header, timestamp, username, text, attachments
| # ---------------- Conversion ---------------- | |
def convert_html_to_markdown(html: str, filenames_only: bool = False) -> str:
    """
    Convert the HTML of a Discord export into a Markdown transcript.

    Args:
        html: The export's HTML source.
        filenames_only: When True, attachments are listed as ``[filename]``
            without the URL; otherwise as ``[filename](url)``.

    Returns:
        The Markdown text: a ``### date`` heading whenever the date changes,
        then one ``- time — **user**:`` entry per message followed by its text
        lines and attachment lines. Surrounding whitespace is stripped.
    """
    soup = BeautifulSoup(html, "html.parser")
    scope = soup.body if soup.body else soup

    # Pass 1: walk every tag in document order, tracking the current date.
    current_date: Optional[str] = None
    messages: List[Tuple[str, str, str, str, List[Tuple[str, str]]]] = []
    for node in scope.find_all(True, recursive=True):
        # The tag list is built up front, but processing a message can destroy
        # tags that appear later in it. A decomposed tag cannot be queried, so
        # skip it. NOTE(review): relies on Tag.decomposed (Beautiful Soup
        # 4.9.0+); on older versions the getattr is falsy and nothing is skipped.
        if getattr(node, "decomposed", False):
            continue
        if is_date_divider(node):
            found_date = extract_date_from_divider(node)
            if found_date:
                current_date = found_date
            continue
        if is_message_node(node):
            date_from_header, ts, user, text, atts = extract_message_fields(node)
            messages.append((current_date or date_from_header or "", ts, user, text, atts))

    # Pass 2: build markdown.
    lines: List[str] = []
    shown_date: Optional[str] = None
    for msg_date, ts, user, text, atts in messages:
        if msg_date and msg_date != shown_date:
            lines.extend((f"### {msg_date}", ""))
            shown_date = msg_date

        # Compact time like "July 29, 2025 at 7:26 PM" to "7:26 PM".
        time_match = re.search(r"\bat\s+([0-9]{1,2}:[0-9]{2}\s*[AP]M)\b", ts or "")
        display_time = time_match.group(1) if time_match else ts

        lines.append(f"- {display_time} — **{user}**:" if display_time else f"- **{user}**:")
        if text:
            lines.extend(f" {ln}" for ln in text.splitlines())
        for label, href in atts:
            if filenames_only:
                lines.append(f" Attachment: [{label}]")
            else:
                lines.append(f" Attachment: [{label}]({href})")
        lines.append("")

    return "\n".join(lines).strip()
def main():
    """Command-line entry point: read the HTML export, convert it, then print or save the Markdown."""
    cli = argparse.ArgumentParser(description="Convert Discord HTML export to Markdown.")
    cli.add_argument("input_html", type=Path, help="Path to Discord HTML export")
    cli.add_argument("-o", "--output", type=Path, help="Path to write Markdown output")
    cli.add_argument("--stdout", action="store_true", help="Print Markdown to stdout")
    cli.add_argument("--filenames-only", action="store_true", help="List attachments by filename without full URLs")
    options = cli.parse_args()

    source_html = options.input_html.read_text(encoding="utf-8")
    markdown = convert_html_to_markdown(source_html, filenames_only=options.filenames_only)

    # The file is written only when an output path is given and --stdout is not set;
    # in every other case the Markdown goes to stdout.
    write_to_file = bool(options.output) and not options.stdout
    if not write_to_file:
        print(markdown)
        return
    options.output.write_text(markdown, encoding="utf-8")
    print(f"Wrote Markdown to {options.output}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment