Skip to content

Instantly share code, notes, and snippets.

@GlenDC
Created September 8, 2025 12:51
Show Gist options
  • Save GlenDC/f9b26d909631bcd7c4e45b518fc19783 to your computer and use it in GitHub Desktop.
Save GlenDC/f9b26d909631bcd7c4e45b518fc19783 to your computer and use it in GitHub Desktop.
Discord chat export: convert copied HTML input data to Markdown
#!/usr/bin/env python3
"""
discord_html_to_markdown.py
Convert a Discord HTML export to Markdown while correctly handling replies.
Usage:
python discord_html_to_markdown.py input.html -o out.md
python discord_html_to_markdown.py input.html --stdout
python discord_html_to_markdown.py input.html -o out.md --filenames-only
"""
from __future__ import annotations

import argparse
import re
from pathlib import Path
from typing import Iterable, List, Tuple, Optional
from urllib.parse import unquote, urlparse

from bs4 import BeautifulSoup, Tag
# ---------------- Selectors and constants ----------------
# CSS class names checked by is_date_divider().
DATE_DIVIDER_CLASSES = (
    # newer style
    "divider__5126c",
    "hasContent__5126c",
    # older grouped exports sometimes carry date headers
    "chatlog__message-group",
)

# CSS class names checked by is_message_node() on <li> elements.
MESSAGE_ITEM_CLASSES = (
    # newer style
    "messageListItem__5126c",
    # older style
    "chatlog__message",
)

# Class-name patterns handed to BeautifulSoup's ``class_=`` filter.
HEADER_CLASS_RE = re.compile(
    r"(header_|chatlog__header)"
)
USERNAME_CLASS_RE = re.compile(
    r"(username_|chatlog__author)"
)
CONTENT_CLASS_RE = re.compile(
    r"(markup__|chatlog__content)"
)
DIVIDER_CONTENT_CLASS_RE = re.compile(
    r"(content__|chatlog__date|chatlog__timestamp)"
)
# Case-insensitive: matches any class containing "reply", "replied" or "reference".
REPLY_PREVIEW_CLASS_RE = re.compile(
    r"(repl(y|ied)|reference)",
    re.IGNORECASE,
)

# Lower-case URL suffixes that make a link count as an attachment.
ATTACH_EXTS = (
    ".wav", ".mp3", ".flac", ".m4a", ".ogg",
    ".pdf",
    ".png", ".jpg", ".jpeg", ".webp", ".gif",
)
# ---------------- Utilities ----------------
def text_or_none(tag: Optional[Tag]) -> Optional[str]:
    """Return the whitespace-stripped text of ``tag``, or ``None``.

    ``None`` is returned when ``tag`` is missing (falsy) and also when its
    stripped text is the empty string, so callers can use the result
    directly in a truth test.
    """
    if not tag:
        return None
    stripped = tag.get_text(strip=True)
    return stripped or None
def filename_from_url(url: str) -> str:
    """Return a human-readable file name for ``url``.

    The name is the last ``/``-separated segment of the URL *path* (query
    string and fragment are not part of the path), with percent-escapes
    decoded so that e.g. ``my%20file.png`` is shown as ``my file.png``.

    When the path has no final segment (empty path or trailing slash) the
    original ``url`` is returned unchanged.
    """
    path = urlparse(url).path
    # rsplit with maxsplit=1 only looks at the final segment.
    name = unquote(path.rsplit("/", 1)[-1])
    return name or url
def is_message_node(node: Tag) -> bool:
    """Tell whether ``node`` is an element that holds one chat message.

    Two shapes are recognised:
    - an ``<li>`` carrying any class listed in ``MESSAGE_ITEM_CLASSES``
    - a ``<div>`` carrying the class ``chatlog__message``
    """
    css = node.get("class") or []
    tag_name = node.name
    if tag_name == "li":
        return any(cls in css for cls in MESSAGE_ITEM_CLASSES)
    if tag_name == "div":
        return "chatlog__message" in css
    return False
def is_date_divider(node: Tag) -> bool:
    """Tell whether ``node`` is a ``<div>`` that may carry a date heading.

    A ``<div>`` qualifies when it has both ``divider__5126c`` and
    ``hasContent__5126c``, or when it has ``chatlog__message-group``.
    Having only one of the two ``__5126c`` classes is not enough.
    """
    css = node.get("class") or []
    if node.name != "div":
        return False
    if not any(cls in css for cls in DATE_DIVIDER_CLASSES):
        return False
    new_style = "divider__5126c" in css and "hasContent__5126c" in css
    return new_style or "chatlog__message-group" in css
def extract_date_from_divider(node: Tag) -> Optional[str]:
    """Return the date text found inside a divider element, or ``None``.

    Lookup order:
    1. the first ``<span>`` anywhere under ``node`` whose class matches
       ``DIVIDER_CONTENT_CLASS_RE``;
    2. otherwise, the first such ``<span>`` under the first ``<div>`` whose
       class matches ``HEADER_CLASS_RE``.

    A candidate only counts when its stripped text is non-empty.
    """
    direct = text_or_none(node.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
    if direct:
        return direct

    header = node.find("div", class_=HEADER_CLASS_RE)
    if not header:
        return None
    # text_or_none already maps "missing" and "empty" to None.
    return text_or_none(header.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
def remove_reply_previews(container: Tag) -> None:
    """
    Delete reply-preview markup from ``container`` in place.

    Three passes run in order, each searching the descendants of
    ``container`` afresh:
    - every element whose class matches ``REPLY_PREVIEW_CLASS_RE``
    - every blockquote whose text contains 'reply', 'jump to message'
      or 'message reference' (case-insensitive)
    - every anchor with an href whose text contains 'jump' or whose href
      contains '#message-' (case-insensitive)

    These are substring heuristics: a genuine quote or link that happens to
    contain one of the marker strings is removed as well.
    """
    # Pass 1: elements flagged as reply/reference by class name
    for preview in container.find_all(True, class_=REPLY_PREVIEW_CLASS_RE):
        preview.decompose()

    # Pass 2: blockquotes that look like a reply quote
    quote_markers = ("reply", "jump to message", "message reference")
    for quote in container.find_all("blockquote"):
        lowered = quote.get_text(" ", strip=True).lower()
        if any(marker in lowered for marker in quote_markers):
            quote.decompose()

    # Pass 3: jump links
    for link in container.find_all("a", href=True):
        target = link["href"].lower()
        label = link.get_text(strip=True).lower()
        if "jump" in label or "#message-" in target:
            link.decompose()
def clean_content(div: Tag) -> str:
    """Flatten a message content element into Markdown-flavoured text.

    ``div`` is modified in place (elements are replaced or unwrapped).

    Conversions applied:
    - ``<img>`` becomes its ``alt`` text (empty string if absent)
    - ``<span>``/``<div>`` whose class contains "mention" becomes plain text
    - ``<a href>`` becomes ``[text](href)``; an anchor without href becomes
      its text
    - ``<br>`` becomes a newline
    - ``<ul>`` becomes one ``- item`` line per direct ``<li>`` child

    Returns the resulting text with trailing spaces before newlines removed
    and runs of three or more newlines collapsed to two.
    """
    # Tags that only style a run of text and must not introduce a line break.
    inline_tags = ["span", "strong", "b", "em", "i", "u", "s", "del", "code"]

    # Replace emoji images with alt text
    for img in div.find_all("img"):
        img.replace_with(img.get("alt") or "")

    # Mentions to plain text
    mention_re = re.compile(r"(mention|roleMention)", re.IGNORECASE)
    for m in div.find_all(["span", "div"], class_=mention_re):
        m.replace_with(m.get_text(strip=True))

    # Links to markdown
    for a in div.find_all("a"):
        href = a.get("href")
        txt = a.get_text(strip=True)
        a.replace_with(f"[{txt or href}]({href})" if href else txt)

    # Explicit line breaks become real newlines inside the text run.
    for br in div.find_all("br"):
        br.replace_with("\n")

    # get_text("\n") puts the separator between *every* text node, so
    # "see <a>x</a> here" or "Hello <strong>world</strong>!" would be split
    # over three lines. Dissolve inline wrappers and merge the resulting
    # adjacent strings so that one visual line stays one text node.
    for inline in div.find_all(inline_tags):
        inline.unwrap()
    div.smooth()

    # Unordered lists to bullets. The surrounding newlines keep the list
    # from fusing with neighbouring text when strings are merged below.
    for ul in div.find_all("ul"):
        items = [
            f"- {li.get_text(' ', strip=True)}"
            for li in ul.find_all("li", recursive=False)
        ]
        ul.replace_with("\n" + "\n".join(items) + "\n")
    div.smooth()

    # Remaining (block-level) elements are separated by newlines.
    text = div.get_text("\n", strip=True)

    # Tidy spacing
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text
def extract_attachments(container: Tag) -> List[Tuple[str, str]]:
    """Collect attachment links found under ``container``.

    A link counts as an attachment when its href, cut at the first ``?``
    and lower-cased, ends with one of ``ATTACH_EXTS``.

    Returns ``(label, href)`` pairs in document order without duplicates,
    where ``label`` comes from ``filename_from_url(href)``.
    """
    # A dict keeps insertion order and ignores repeated keys.
    unique = {}
    for link in container.find_all("a", href=True):
        url = link["href"]
        without_query = url.split("?", 1)[0].lower()
        if not without_query.endswith(ATTACH_EXTS):
            continue
        unique.setdefault((filename_from_url(url), url), None)
    return list(unique)
def extract_message_fields(node: Tag) -> Tuple[Optional[str], Optional[str], str, str, List[Tuple[str, str]]]:
    """
    Pull the displayable fields out of one message element.

    Returns (date_if_in_header, timestamp, username, text, attachments):
    - date_if_in_header: header text shaped like "Month D, YYYY", else None
    - timestamp: the header <time>'s aria-label, else its text, else ""
    - username: text of the header's username element, else "Unknown"
    - text: cleaned text of the LAST content block under ``node``, else ""
    - attachments: result of extract_attachments(node)

    ``node`` is modified in place: reply previews are removed from every
    content block and the last block is rewritten by clean_content().
    Attachments are collected after that rewrite.
    """
    header = node.find(["h3", "div"], class_=HEADER_CLASS_RE)

    timestamp, username = "", "Unknown"
    if header:
        # Timestamp: prefer the aria-label, fall back to the visible text
        time_tag = header.find("time")
        if time_tag:
            timestamp = time_tag.get("aria-label") or text_or_none(time_tag) or ""
        # Username
        author = text_or_none(header.find(["span", "a"], class_=USERNAME_CLASS_RE))
        if author:
            username = author

    # Content: a reply preview may precede the real body, so previews are
    # stripped from every block and the last block is taken as the message.
    bodies = node.find_all("div", class_=CONTENT_CLASS_RE)
    for body in bodies:
        remove_reply_previews(body)
    text = clean_content(bodies[-1]) if bodies else ""

    # Attachments
    atts = extract_attachments(node)

    # Optional date inside header for older exports
    date_from_header = None
    if header:
        raw = text_or_none(header.find("span", class_=DIVIDER_CONTENT_CLASS_RE))
        if raw and re.search(r"[A-Za-z]+\s+\d{1,2},\s+\d{4}", raw):
            date_from_header = raw

    return date_from_header, timestamp, username, text, atts
# ---------------- Conversion ----------------
def convert_html_to_markdown(html: str, filenames_only: bool = False) -> str:
    """Convert a Discord HTML export into a Markdown transcript.

    Args:
        html: the HTML document as text.
        filenames_only: when true, attachments are listed as ``[name]``
            without the URL; otherwise as ``[name](url)``.

    Returns:
        Markdown with a ``### <date>`` heading whenever the date changes and
        one ``- <time> — **<user>**:`` entry per message, followed by the
        message text and attachment lines. Leading/trailing whitespace of
        the whole document is stripped.
    """
    soup = BeautifulSoup(html, "html.parser")
    root = soup.body if soup.body else soup

    # Pass 1: classify every element BEFORE anything is modified.
    # extract_message_fields() decomposes reply-preview elements, and a
    # decomposed tag can no longer be inspected (its attributes are gone),
    # so classification must not be interleaved with extraction.
    current_date: Optional[str] = None
    pending: List[Tuple[Optional[str], Tag]] = []
    for node in root.find_all(True, recursive=True):
        if is_date_divider(node):
            d = extract_date_from_divider(node)
            if d:
                current_date = d
            continue
        if is_message_node(node):
            # Remember the date that was in effect at this point.
            pending.append((current_date, node))

    # Pass 2: extract fields (this mutates the tree).
    messages: List[Tuple[str, str, str, str, List[Tuple[str, str]]]] = []
    for date_at_node, node in pending:
        # A message nested inside markup that an earlier message's cleanup
        # destroyed has nothing left to extract.
        if getattr(node, "decomposed", False):
            continue
        date_from_header, ts, user, text, atts = extract_message_fields(node)
        msg_date = date_at_node or date_from_header or ""
        messages.append((msg_date, ts, user, text, atts))

    # Build markdown
    lines: List[str] = []
    on_date: Optional[str] = None
    for msg_date, ts, user, text, atts in messages:
        if msg_date and msg_date != on_date:
            lines.append(f"### {msg_date}")
            lines.append("")
            on_date = msg_date
        # Compact time like "July 29, 2025 at 7:26 PM" to "7:26 PM"
        display_time = ts
        m = re.search(r"\bat\s+([0-9]{1,2}:[0-9]{2}\s*[AP]M)\b", ts or "")
        if m:
            display_time = m.group(1)
        header_line = f"- {display_time} — **{user}**:" if display_time else f"- **{user}**:"
        lines.append(header_line)
        if text:
            for ln in text.splitlines():
                lines.append(f" {ln}")
        for label, href in atts:
            if filenames_only:
                lines.append(f" Attachment: [{label}]")
            else:
                lines.append(f" Attachment: [{label}]({href})")
        lines.append("")
    return "\n".join(lines).strip()
def main() -> None:
    """Command-line entry point: read the HTML export and emit Markdown.

    - ``-o PATH`` writes the Markdown to PATH.
    - ``--stdout`` prints the Markdown to standard output.
    - With neither option the Markdown is printed to standard output.
    - With both options the file is written AND the Markdown is printed
      (previously ``-o`` was silently ignored in that case).
    """
    parser = argparse.ArgumentParser(description="Convert Discord HTML export to Markdown.")
    parser.add_argument("input_html", type=Path, help="Path to Discord HTML export")
    parser.add_argument("-o", "--output", type=Path, help="Path to write Markdown output")
    parser.add_argument("--stdout", action="store_true", help="Print Markdown to stdout")
    parser.add_argument("--filenames-only", action="store_true", help="List attachments by filename without full URLs")
    args = parser.parse_args()

    html = args.input_html.read_text(encoding="utf-8")
    md = convert_html_to_markdown(html, filenames_only=args.filenames_only)

    if args.output:
        args.output.write_text(md, encoding="utf-8")
    if args.stdout or not args.output:
        print(md)
    else:
        # Only confirm on stdout when stdout is not carrying the Markdown.
        print(f"Wrote Markdown to {args.output}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment