Skip to content

Instantly share code, notes, and snippets.

@JeremyMcCormick
Last active September 2, 2025 23:30
Show Gist options
  • Save JeremyMcCormick/8063121b7f4182027af2223d78858040 to your computer and use it in GitHub Desktop.
Save JeremyMcCormick/8063121b7f4182027af2223d78858040 to your computer and use it in GitHub Desktop.
Reads in exported ChatGPT conversation data (JSON format), filters based on keywords, and then writes out the filtered conversations to JSON, HTML, or a directory with HTML files (Generated by ChatGPT-5)
#!/usr/bin/env python3
"""
Filter ChatGPT export conversations by keyword and optionally write results to
JSON, a single HTML file, or a directory with one HTML file per conversation.
Usage:
python filter_conv.py --json /path/to/conversations.json --keyword "ppdb" \
--out-json filtered_conversations.json \
--out-html filtered_conversations.html \
--out-dir filtered_conversations_dir
"""
from __future__ import annotations
import argparse
import html
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable
def _flatten_parts(parts: Iterable[Any]) -> list[str]:
"""Extract text fragments from a heterogeneous `parts` structure.
Parameters
----------
parts
Iterable of items that may be strings or dicts (possibly nested).
Returns
-------
list[str]
Extracted plain-text fragments.
"""
out: list[str] = []
for p in parts or []:
if isinstance(p, str):
out.append(p)
elif isinstance(p, dict):
if "text" in p and isinstance(p["text"], str):
out.append(p["text"])
elif "content" in p and isinstance(p["content"], str):
out.append(p["content"])
elif "parts" in p and isinstance(p["parts"], list):
out.extend(_flatten_parts(p["parts"]))
elif "content" in p and isinstance(p["content"], list):
out.extend(_flatten_parts(p["content"]))
return out
def _message_to_text(message: dict[str, Any]) -> str:
    """Produce best-effort plain text for one ChatGPT export `message`.

    Parameters
    ----------
    message
        The message dict found under `node["message"]`.

    Returns
    -------
    str
        Plain text for the message; fragments from list-shaped content are
        joined with single spaces. Empty when no text can be located.
    """
    if not message:
        return ""

    body = message.get("content")
    if isinstance(body, list):
        return " ".join(_flatten_parts(body))

    if isinstance(body, dict):
        # Probe the known layouts in priority order: parts list, text string,
        # content list, content string.
        inner_parts = body.get("parts")
        if isinstance(inner_parts, list):
            return " ".join(_flatten_parts(inner_parts))
        inner_text = body.get("text")
        if isinstance(inner_text, str):
            return inner_text
        inner_content = body.get("content")
        if isinstance(inner_content, list):
            return " ".join(_flatten_parts(inner_content))
        if isinstance(inner_content, str):
            return inner_content

    # Fall back to a `parts` list stored directly on the message.
    top_level_parts = message.get("parts")
    if not isinstance(top_level_parts, list):
        return ""
    return " ".join(_flatten_parts(top_level_parts))
def _collect_messages(convo: dict[str, Any]) -> list[dict[str, Any]]:
    """Gather a conversation's messages, ordered by timestamp.

    Parameters
    ----------
    convo
        A single conversation object from `conversations.json`.

    Returns
    -------
    list[dict[str, Any]]
        One dict per message with keys `role` (str), `text` (str) and
        `create_time` (the message's own value, or the conversation's
        `create_time` when the message has none). Entries whose timestamp
        is still None sort after all timestamped entries.
    """
    mapping = convo.get("mapping", {})
    if not isinstance(mapping, dict):
        return []

    fallback_ts = convo.get("create_time")
    collected: list[dict[str, Any]] = []
    for node in mapping.values():
        message = node.get("message") if isinstance(node, dict) else None
        if not isinstance(message, dict):
            continue
        author = message.get("author") or {}
        stamp = message.get("create_time")
        collected.append(
            {
                "role": str(author.get("role") or ""),
                "text": _message_to_text(message),
                "create_time": fallback_ts if stamp is None else stamp,
            }
        )

    # The leading boolean pushes None timestamps to the end without ever
    # comparing None against a number.
    return sorted(collected, key=lambda m: (m["create_time"] is None, m["create_time"]))
def filter_conversations_by_project(json_path: str | Path, project_keyword: str) -> list[dict[str, Any]]:
    """Return the conversations whose message text mentions a keyword.

    Matching is a case-insensitive substring test against all non-empty
    message texts of a conversation joined with single spaces.

    Parameters
    ----------
    json_path
        Path to the `conversations.json` file from a ChatGPT export.
    project_keyword
        Keyword or phrase to look for.

    Returns
    -------
    list[dict[str, Any]]
        The matching conversation dicts, in file order.
    """
    with Path(json_path).open(encoding="utf-8") as handle:
        conversations: list[dict[str, Any]] = json.load(handle)

    needle = project_keyword.lower()

    def _mentions(convo: dict[str, Any]) -> bool:
        haystack = " ".join(m["text"] for m in _collect_messages(convo) if m["text"])
        return needle in haystack.lower()

    return [convo for convo in conversations if _mentions(convo)]
def _fmt_dt(ts: float | int | None) -> str:
"""Format a UNIX timestamp as ISO 8601 UTC string.
Parameters
----------
ts
UNIX epoch seconds, or None.
Returns
-------
str
ISO 8601 string in UTC, or empty string if None/invalid.
"""
if ts is None:
return ""
try:
return datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
except Exception:
return ""
def _sanitize_filename(name: str, max_len: int = 80) -> str:
"""Sanitize a string for safe filesystem usage.
Parameters
----------
name
Candidate filename (e.g., conversation title).
max_len
Maximum length of the resulting filename stem (before extension).
Returns
-------
str
Safe filename stem without extension.
"""
# Collapse whitespace and strip
s = " ".join(name.split()).strip()
# Replace unsafe characters with underscores
s = re.sub(r'[^\w\-\. ]+', "_", s, flags=re.UNICODE)
# Replace spaces with underscores
s = s.replace(" ", "_")
# Trim length
if len(s) > max_len:
s = s[:max_len].rstrip("_-.")
# Ensure non-empty
return s or "conversation"
def render_html(conversations: list[dict[str, Any]], keyword: str) -> str:
    """Build one standalone HTML page containing all given conversations.

    Parameters
    ----------
    conversations
        Conversation dicts (from the export), already filtered.
    keyword
        The keyword used for filtering; shown in the page title and header.

    Returns
    -------
    str
        Complete HTML document, lines joined with newlines. All
        conversation-derived text is HTML-escaped.
    """
    heading = html.escape(f"ChatGPT Conversations matching: {keyword}")
    stamp = html.escape(datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"))
    stylesheet = (
        "body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif;line-height:1.5;margin:2rem;}",
        "h1{font-size:1.6rem;margin-bottom:0.25rem}",
        "h2{font-size:1.2rem;margin:1.5rem 0 0.5rem}",
        ".meta{color:#555;font-size:0.9rem;margin-bottom:1rem}",
        ".convo{border:1px solid #ddd;border-radius:8px;padding:1rem;margin-bottom:1.5rem;background:#fafafa}",
        ".msg{padding:0.5rem 0;border-top:1px solid #eee}",
        ".msg:first-child{border-top:none}",
        ".role{font-weight:600}",
        ".time{color:#555;font-size:0.85rem;margin-left:0.5rem}",
        "pre{white-space:pre-wrap;word-wrap:break-word;background:#fff;border:1px solid #eee;border-radius:6px;padding:0.5rem;margin:0.25rem 0}",
        "a{color:inherit}",
    )
    lines: list[str] = [
        "<!DOCTYPE html>",
        "<html lang='en'>",
        "<head>",
        f"<meta charset='utf-8'><title>{heading}</title>",
        "<meta name='viewport' content='width=device-width,initial-scale=1'>",
        "<style>",
        *stylesheet,
        "</style>",
        "</head>",
        "<body>",
        f"<h1>{heading}</h1>",
        f"<div class='meta'>Generated: {stamp} • Total conversations: {len(conversations)}</div>",
    ]

    for number, convo in enumerate(conversations, start=1):
        convo_title = convo.get("title") or "(untitled)"
        created = _fmt_dt(convo.get("create_time"))
        lines.append("<div class='convo'>")
        lines.append(f"<h2>{number}. {html.escape(convo_title)}</h2>")
        if created:
            lines.append(f"<div class='meta'>Created: {html.escape(created)}</div>")

        for msg in _collect_messages(convo):
            role = msg.get("role") or ""
            body = msg.get("text") or ""
            when = _fmt_dt(msg.get("create_time"))
            header = f"<span class='role'>{html.escape(role)}</span>"
            if when:
                header += f"<span class='time'>{html.escape(when)}</span>"
            lines.extend(
                [
                    "<div class='msg'>",
                    header,
                    f"<pre>{html.escape(body)}</pre>",
                    "</div>",
                ]
            )
        lines.append("</div>")  # closes .convo

    lines.append("</body></html>")
    return "\n".join(lines)
def render_single_conversation_html(convo: dict[str, Any], idx: int) -> str:
    """Build a standalone HTML page for exactly one conversation.

    Parameters
    ----------
    convo
        Conversation dict (from the export).
    idx
        1-based index, prefixed to the title in the page heading.

    Returns
    -------
    str
        Complete HTML document, lines joined with newlines. All
        conversation-derived text is HTML-escaped.
    """
    convo_title = convo.get("title") or "(untitled)"
    created = _fmt_dt(convo.get("create_time"))
    created_html = html.escape(created) if created else ""
    heading = html.escape(f"{idx}. {convo_title}")
    exported = html.escape(datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"))
    stylesheet = (
        "body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif;line-height:1.5;margin:2rem;}",
        "h1{font-size:1.6rem;margin-bottom:0.25rem}",
        ".meta{color:#555;font-size:0.9rem;margin-bottom:1rem}",
        ".msg{padding:0.75rem 0;border-top:1px solid #eee}",
        ".msg:first-child{border-top:none}",
        ".role{font-weight:600}",
        ".time{color:#555;font-size:0.85rem;margin-left:0.5rem}",
        "pre{white-space:pre-wrap;word-wrap:break-word;background:#fff;border:1px solid #eee;border-radius:6px;padding:0.5rem;margin:0.25rem 0}",
    )
    lines: list[str] = [
        "<!DOCTYPE html>",
        "<html lang='en'>",
        "<head>",
        f"<meta charset='utf-8'><title>{heading}</title>",
        "<meta name='viewport' content='width=device-width,initial-scale=1'>",
        "<style>",
        *stylesheet,
        "</style>",
        "</head>",
        "<body>",
        f"<h1>{heading}</h1>",
        f"<div class='meta'>Created: {created_html} • Exported: {exported}</div>",
    ]

    for msg in _collect_messages(convo):
        role = msg.get("role") or ""
        body = msg.get("text") or ""
        when = _fmt_dt(msg.get("create_time"))
        header = f"<span class='role'>{html.escape(role)}</span>"
        if when:
            header += f"<span class='time'>{html.escape(when)}</span>"
        lines.extend(
            [
                "<div class='msg'>",
                header,
                f"<pre>{html.escape(body)}</pre>",
                "</div>",
            ]
        )

    lines.append("</body></html>")
    return "\n".join(lines)
def write_per_conversation_html(conversations: list[dict[str, Any]], out_dir: Path, make_index: bool = True) -> None:
    """Write each conversation to its own HTML file, plus an optional index.

    Files are named ``NNNN_<sanitized title>.html`` where ``NNNN`` is the
    zero-padded 1-based position of the conversation in `conversations`.

    Parameters
    ----------
    conversations
        Filtered list of conversations to write.
    out_dir
        Target directory; created (with parents) if it does not exist.
    make_index
        Whether to also write an `index.html` table linking to every file.

    Returns
    -------
    None
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    entries: list[tuple[str, str, str]] = []  # (filename, title, created)
    for number, convo in enumerate(conversations, start=1):
        label = convo.get("title") or "(untitled)"
        page_name = f"{number:04d}_{_sanitize_filename(label)}.html"
        document = render_single_conversation_html(convo, number)
        (out_dir / page_name).write_text(document, encoding="utf-8")
        entries.append((page_name, label, _fmt_dt(convo.get("create_time"))))

    if not make_index:
        return

    heading = html.escape(f"{len(conversations)} conversations")
    stamp = html.escape(datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"))
    preamble: list[str] = [
        "<!DOCTYPE html>",
        "<html lang='en'>",
        "<head>",
        f"<meta charset='utf-8'><title>{heading}</title>",
        "<meta name='viewport' content='width=device-width,initial-scale=1'>",
        "<style>",
        "body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif;line-height:1.5;margin:2rem;}",
        "h1{font-size:1.6rem;margin-bottom:0.25rem}",
        ".meta{color:#555;font-size:0.9rem;margin-bottom:1rem}",
        "table{border-collapse:collapse;width:100%}",
        "th,td{border:1px solid #ddd;padding:0.5rem;text-align:left}",
        "tr:nth-child(even){background:#fafafa}",
        "a{color:inherit}",
        "</style>",
        "</head>",
        "<body>",
        f"<h1>{heading}</h1>",
        f"<div class='meta'>Generated: {stamp}</div>",
        "<table>",
        "<thead><tr><th>#</th><th>Title</th><th>Created (UTC)</th></tr></thead>",
        "<tbody>",
    ]
    rows = [
        "<tr>"
        f"<td>{row_no}</td>"
        f"<td><a href='{html.escape(page_name)}'>{html.escape(label)}</a></td>"
        f"<td>{html.escape(created)}</td>"
        "</tr>"
        for row_no, (page_name, label, created) in enumerate(entries, start=1)
    ]
    index_doc = [*preamble, *rows, "</tbody></table>", "</body></html>"]
    (out_dir / "index.html").write_text("\n".join(index_doc), encoding="utf-8")
def main() -> None:
    """Run the command-line tool.

    Parses arguments, filters the export by keyword, prints a summary of the
    matches to stdout, and writes whichever of the JSON / combined HTML /
    per-conversation HTML outputs were requested.
    """
    cli = argparse.ArgumentParser(description="Filter ChatGPT conversations by keyword.")
    cli.add_argument("--json", required=True, help="Path to conversations.json")
    cli.add_argument("--keyword", required=True, help="Keyword or phrase to match (case-insensitive)")
    optional_outputs = (
        ("--out-json", "Optional path to write filtered JSON"),
        ("--out-html", "Optional path to write a single combined HTML"),
        ("--out-dir", "Optional directory to write one HTML per conversation (plus index.html)"),
    )
    for flag, help_text in optional_outputs:
        cli.add_argument(flag, help=help_text)
    opts = cli.parse_args()

    found = filter_conversations_by_project(opts.json, opts.keyword)
    print(f"Found {len(found)} conversations mentioning '{opts.keyword}'")
    for convo in found:
        label = convo.get("title") or "(untitled)"
        when = _fmt_dt(convo.get("create_time"))
        print(f"- {label} [{when}]")

    if opts.out_json:
        payload = json.dumps(found, indent=2, ensure_ascii=False)
        Path(opts.out_json).write_text(payload, encoding="utf-8")
        print(f"Wrote JSON: {opts.out_json}")

    if opts.out_html:
        Path(opts.out_html).write_text(render_html(found, opts.keyword), encoding="utf-8")
        print(f"Wrote HTML: {opts.out_html}")

    if opts.out_dir:
        target_dir = Path(opts.out_dir)
        write_per_conversation_html(found, target_dir, make_index=True)
        print(f"Wrote {len(found)} conversation files to: {target_dir.resolve()}")


if __name__ == "__main__":
    main()
@JeremyMcCormick
Copy link
Author

This will write an index file to {output_dir}/index.html listing all of the conversations.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment