Last active
September 2, 2025 23:30
-
-
Save JeremyMcCormick/8063121b7f4182027af2223d78858040 to your computer and use it in GitHub Desktop.
Reads in exported ChatGPT conversation data (JSON format), filters it based on keywords, and then writes out the filtered conversations to JSON, HTML, or a directory of HTML files (Generated by ChatGPT-5)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Filter ChatGPT export conversations by keyword and optionally write results to | |
| JSON, a single HTML file, or a directory with one HTML file per conversation. | |
| Usage: | |
| python filter_conv.py --json /path/to/conversations.json --keyword "ppdb" \ | |
| --out-json filtered_conversations.json \ | |
| --out-html filtered_conversations.html \ | |
| --out-dir filtered_conversations_dir | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import html | |
| import json | |
| import re | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Any, Iterable | |
def _flatten_parts(parts: Iterable[Any]) -> list[str]:
    """Gather the plain-text fragments found in a mixed `parts` sequence.

    Parameters
    ----------
    parts
        Iterable whose items may be strings or dicts; a falsy value
        (e.g. ``None``) is treated as empty. Items of any other type
        are ignored.

    Returns
    -------
    list[str]
        Text fragments in encounter order.
    """
    fragments: list[str] = []
    for item in parts or []:
        if isinstance(item, str):
            fragments.append(item)
            continue
        if not isinstance(item, dict):
            continue

        text = item.get("text")
        content = item.get("content")
        nested = item.get("parts")

        # Only the first matching shape is used, in this order of precedence:
        # string "text", string "content", list "parts", list "content".
        if isinstance(text, str):
            fragments.append(text)
        elif isinstance(content, str):
            fragments.append(content)
        elif isinstance(nested, list):
            fragments.extend(_flatten_parts(nested))
        elif isinstance(content, list):
            fragments.extend(_flatten_parts(content))
    return fragments
def _message_to_text(message: dict[str, Any]) -> str:
    """Return the best-effort plain text of one export `message` object.

    Parameters
    ----------
    message
        The message dict under `node["message"]`; a falsy value yields "".

    Returns
    -------
    str
        The message text, with multiple fragments joined by single spaces
        (may be empty).
    """
    if not message:
        return ""

    content = message.get("content")
    if isinstance(content, list):
        return " ".join(_flatten_parts(content))

    if isinstance(content, dict):
        # Precedence inside a content dict: list "parts", string "text",
        # list "content", string "content".
        inner_parts = content.get("parts")
        if isinstance(inner_parts, list):
            return " ".join(_flatten_parts(inner_parts))
        inner_text = content.get("text")
        if isinstance(inner_text, str):
            return inner_text
        inner_content = content.get("content")
        if isinstance(inner_content, list):
            return " ".join(_flatten_parts(inner_content))
        if isinstance(inner_content, str):
            return inner_content

    # Nothing usable under "content": fall back to a "parts" list stored
    # directly on the message.
    own_parts = message.get("parts")
    if isinstance(own_parts, list):
        return " ".join(_flatten_parts(own_parts))
    return ""
def _collect_messages(convo: dict[str, Any]) -> list[dict[str, Any]]:
    """Build a time-ordered list of the messages stored in a conversation.

    Parameters
    ----------
    convo
        A single conversation object from `conversations.json`.

    Returns
    -------
    list[dict[str, Any]]
        One entry per mapping node that carries a message dict, with keys
        `role` (str), `text` (str) and `create_time`. Entries are sorted by
        `create_time`, with entries lacking a timestamp placed last.
    """
    mapping = convo.get("mapping", {})
    if not isinstance(mapping, dict):
        return []

    # Messages without their own timestamp inherit the conversation's.
    fallback_ts = convo.get("create_time")

    collected: list[dict[str, Any]] = []
    for node in mapping.values():
        message = node.get("message") if isinstance(node, dict) else None
        if not isinstance(message, dict):
            continue
        author = message.get("author") or {}
        stamp = message.get("create_time")
        collected.append(
            {
                "role": str(author.get("role") or ""),
                "text": _message_to_text(message),
                "create_time": fallback_ts if stamp is None else stamp,
            }
        )

    # The leading boolean keeps None timestamps from being compared with
    # numbers and pushes them to the end.
    return sorted(collected, key=lambda m: (m["create_time"] is None, m["create_time"]))
def filter_conversations_by_project(json_path: str | Path, project_keyword: str) -> list[dict[str, Any]]:
    """Filter conversations whose messages contain the keyword (case-insensitive).

    The keyword is matched against each message separately, so a multi-word
    phrase cannot match by straddling the end of one message and the start of
    the next.

    Parameters
    ----------
    json_path
        Path to the `conversations.json` file from ChatGPT export.
    project_keyword
        Keyword or phrase to match. An empty keyword matches every
        conversation.

    Returns
    -------
    list[dict[str, Any]]
        Conversation dicts, in file order, that contain the keyword in at
        least one message. Entries of the file that are not dicts are skipped.

    Raises
    ------
    ValueError
        If the top-level JSON value is not a list.
    """
    with Path(json_path).open(encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON list of conversations in {json_path}, got {type(data).__name__}"
        )

    needle = project_keyword.lower()
    results: list[dict[str, Any]] = []
    for convo in data:
        if not isinstance(convo, dict):
            continue
        # `any` stops at the first matching message instead of lower-casing
        # the whole conversation up front.
        if not needle or any(needle in m["text"].lower() for m in _collect_messages(convo)):
            results.append(convo)
    return results
def _fmt_dt(ts: float | int | None) -> str:
    """Format a UNIX timestamp as a `YYYY-MM-DD HH:MM:SS UTC` string.

    Parameters
    ----------
    ts
        UNIX epoch seconds, or None. Anything `float()` accepts is allowed.

    Returns
    -------
    str
        The formatted UTC time, or an empty string if `ts` is None, cannot be
        converted to a float, or is outside the representable range.
    """
    if ts is None:
        return ""
    try:
        return datetime.fromtimestamp(float(ts), tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
    except (TypeError, ValueError, OverflowError, OSError):
        # Only the errors that bad timestamp input can produce are treated as
        # "no date": TypeError/ValueError from float() or NaN, and
        # OverflowError/OSError/ValueError for out-of-range values.
        return ""
def _sanitize_filename(name: str, max_len: int = 80) -> str:
    """Turn an arbitrary string into a filesystem-friendly filename stem.

    Parameters
    ----------
    name
        Candidate filename (e.g., conversation title).
    max_len
        Maximum length of the resulting filename stem (before extension).

    Returns
    -------
    str
        Stem made of word characters, `-`, `.` and `_` only, or
        "conversation" when nothing usable remains.
    """
    # Splitting on whitespace both trims the ends and collapses inner runs.
    # The pattern never consumes spaces, so cleaning word by word and joining
    # with "_" gives the same result as cleaning the whole string.
    cleaned_words = [
        re.sub(r"[^\w\-\. ]+", "_", word, flags=re.UNICODE)
        for word in name.split()
    ]
    stem = "_".join(cleaned_words)

    if len(stem) > max_len:
        # Drop separator characters left dangling by the cut.
        stem = stem[:max_len].rstrip("_-.")

    return stem if stem else "conversation"
def render_html(conversations: list[dict[str, Any]], keyword: str) -> str:
    """Build one standalone HTML page showing every given conversation.

    Parameters
    ----------
    conversations
        List of conversation dicts (from export) already filtered.
    keyword
        The keyword used for filtering; shown in the page title and heading.

    Returns
    -------
    str
        Complete HTML document as a string, one element per line.
    """
    heading = html.escape(f"ChatGPT Conversations matching: {keyword}")
    built_at = html.escape(datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"))
    stylesheet = (
        "body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif;line-height:1.5;margin:2rem;}",
        "h1{font-size:1.6rem;margin-bottom:0.25rem}",
        "h2{font-size:1.2rem;margin:1.5rem 0 0.5rem}",
        ".meta{color:#555;font-size:0.9rem;margin-bottom:1rem}",
        ".convo{border:1px solid #ddd;border-radius:8px;padding:1rem;margin-bottom:1.5rem;background:#fafafa}",
        ".msg{padding:0.5rem 0;border-top:1px solid #eee}",
        ".msg:first-child{border-top:none}",
        ".role{font-weight:600}",
        ".time{color:#555;font-size:0.85rem;margin-left:0.5rem}",
        "pre{white-space:pre-wrap;word-wrap:break-word;background:#fff;border:1px solid #eee;border-radius:6px;padding:0.5rem;margin:0.25rem 0}",
        "a{color:inherit}",
    )

    doc: list[str] = [
        "<!DOCTYPE html>",
        "<html lang='en'>",
        "<head>",
        f"<meta charset='utf-8'><title>{heading}</title>",
        "<meta name='viewport' content='width=device-width,initial-scale=1'>",
        "<style>",
        *stylesheet,
        "</style>",
        "</head>",
        "<body>",
        f"<h1>{heading}</h1>",
        f"<div class='meta'>Generated: {built_at} • Total conversations: {len(conversations)}</div>",
    ]

    for number, convo in enumerate(conversations, start=1):
        convo_title = convo.get("title") or "(untitled)"
        created = _fmt_dt(convo.get("create_time"))

        doc.extend(["<div class='convo'>", f"<h2>{number}. {html.escape(convo_title)}</h2>"])
        # The creation line is omitted when no timestamp could be formatted.
        if created:
            doc.append(f"<div class='meta'>Created: {html.escape(created)}</div>")

        for entry in _collect_messages(convo):
            speaker = entry.get("role") or ""
            body = entry.get("text") or ""
            when = _fmt_dt(entry.get("create_time"))

            label = f"<span class='role'>{html.escape(speaker)}</span>"
            if when:
                label = f"{label}<span class='time'>{html.escape(when)}</span>"
            # All message text is escaped, so markup inside a message is shown
            # literally rather than interpreted by the browser.
            doc.extend(["<div class='msg'>", label, f"<pre>{html.escape(body)}</pre>", "</div>"])

        doc.append("</div>")  # .convo

    doc.append("</body></html>")
    return "\n".join(doc)
def render_single_conversation_html(convo: dict[str, Any], idx: int) -> str:
    """Build a standalone HTML page for one conversation.

    Parameters
    ----------
    convo
        Conversation dict (from export).
    idx
        1-based index used for labeling; it prefixes the title.

    Returns
    -------
    str
        Complete HTML document for the conversation, one element per line.
    """
    convo_title = convo.get("title") or "(untitled)"
    created = _fmt_dt(convo.get("create_time"))
    heading = html.escape(f"{idx}. {convo_title}")
    exported_at = html.escape(datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z"))
    # The "Created:" label is always emitted; its value is blank when the
    # conversation has no formattable timestamp.
    created_html = html.escape(created) if created else ""

    stylesheet = (
        "body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif;line-height:1.5;margin:2rem;}",
        "h1{font-size:1.6rem;margin-bottom:0.25rem}",
        ".meta{color:#555;font-size:0.9rem;margin-bottom:1rem}",
        ".msg{padding:0.75rem 0;border-top:1px solid #eee}",
        ".msg:first-child{border-top:none}",
        ".role{font-weight:600}",
        ".time{color:#555;font-size:0.85rem;margin-left:0.5rem}",
        "pre{white-space:pre-wrap;word-wrap:break-word;background:#fff;border:1px solid #eee;border-radius:6px;padding:0.5rem;margin:0.25rem 0}",
    )

    doc: list[str] = [
        "<!DOCTYPE html>",
        "<html lang='en'>",
        "<head>",
        f"<meta charset='utf-8'><title>{heading}</title>",
        "<meta name='viewport' content='width=device-width,initial-scale=1'>",
        "<style>",
        *stylesheet,
        "</style>",
        "</head>",
        "<body>",
        f"<h1>{heading}</h1>",
        f"<div class='meta'>Created: {created_html} • Exported: {exported_at}</div>",
    ]

    for entry in _collect_messages(convo):
        speaker = entry.get("role") or ""
        body = entry.get("text") or ""
        when = _fmt_dt(entry.get("create_time"))

        label = f"<span class='role'>{html.escape(speaker)}</span>"
        if when:
            label = f"{label}<span class='time'>{html.escape(when)}</span>"
        # Message text is escaped so embedded markup is displayed literally.
        doc.extend(["<div class='msg'>", label, f"<pre>{html.escape(body)}</pre>", "</div>"])

    doc.append("</body></html>")
    return "\n".join(doc)
| def write_per_conversation_html(conversations: list[dict[str, Any]], out_dir: Path, make_index: bool = True) -> None: | |
| """Write one HTML file per conversation to a directory, with an optional index. | |
| Parameters | |
| ---------- | |
| conversations | |
| Filtered list of conversations to write. | |
| out_dir | |
| Target directory to create/write files into. | |
| make_index | |
| Whether to create an `index.html` linking to all conversation files. | |
| Returns | |
| ------- | |
| None | |
| """ | |
| out_dir.mkdir(parents=True, exist_ok=True) | |
| index_rows: list[tuple[str, str, str]] = [] # (filename, title, created) | |
| for idx, convo in enumerate(conversations, start=1): | |
| title = convo.get("title") or "(untitled)" | |
| created = _fmt_dt(convo.get("create_time")) | |
| stem = f"{idx:04d}_{_sanitize_filename(title)}" | |
| filename = f"{stem}.html" | |
| file_path = out_dir / filename | |
| html_doc = render_single_conversation_html(convo, idx) | |
| file_path.write_text(html_doc, encoding="utf-8") | |
| index_rows.append((filename, title, created)) | |
| if make_index: | |
| page_title = f"{len(conversations)} conversations" | |
| generated = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z") | |
| parts: list[str] = [ | |
| "<!DOCTYPE html>", | |
| "<html lang='en'>", | |
| "<head>", | |
| f"<meta charset='utf-8'><title>{html.escape(page_title)}</title>", | |
| "<meta name='viewport' content='width=device-width,initial-scale=1'>", | |
| "<style>", | |
| "body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif;line-height:1.5;margin:2rem;}", | |
| "h1{font-size:1.6rem;margin-bottom:0.25rem}", | |
| ".meta{color:#555;font-size:0.9rem;margin-bottom:1rem}", | |
| "table{border-collapse:collapse;width:100%}", | |
| "th,td{border:1px solid #ddd;padding:0.5rem;text-align:left}", | |
| "tr:nth-child(even){background:#fafafa}", | |
| "a{color:inherit}", | |
| "</style>", | |
| "</head>", | |
| "<body>", | |
| f"<h1>{html.escape(page_title)}</h1>", | |
| f"<div class='meta'>Generated: {html.escape(generated)}</div>", | |
| "<table>", | |
| "<thead><tr><th>#</th><th>Title</th><th>Created (UTC)</th></tr></thead>", | |
| "<tbody>", | |
| ] | |
| for i, (fname, title, created) in enumerate(index_rows, start=1): | |
| parts.append( | |
| "<tr>" | |
| f"<td>{i}</td>" | |
| f"<td><a href='{html.escape(fname)}'>{html.escape(title)}</a></td>" | |
| f"<td>{html.escape(created)}</td>" | |
| "</tr>" | |
| ) | |
| parts.extend(["</tbody></table>", "</body></html>"]) | |
| (out_dir / "index.html").write_text("\n".join(parts), encoding="utf-8") | |
def main() -> None:
    """Run the command-line tool: filter by keyword, list matches, write outputs.

    The matching conversations are always listed on stdout. Each of
    `--out-json`, `--out-html` and `--out-dir` is optional and handled
    independently, in that order.
    """
    parser = argparse.ArgumentParser(description="Filter ChatGPT conversations by keyword.")
    option_table = (
        ("--json", {"required": True, "help": "Path to conversations.json"}),
        ("--keyword", {"required": True, "help": "Keyword or phrase to match (case-insensitive)"}),
        ("--out-json", {"help": "Optional path to write filtered JSON"}),
        ("--out-html", {"help": "Optional path to write a single combined HTML"}),
        ("--out-dir", {"help": "Optional directory to write one HTML per conversation (plus index.html)"}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    opts = parser.parse_args()

    found = filter_conversations_by_project(opts.json, opts.keyword)
    print(f"Found {len(found)} conversations mentioning '{opts.keyword}'")
    for convo in found:
        label = convo.get("title") or "(untitled)"
        when = _fmt_dt(convo.get("create_time"))
        print(f"- {label} [{when}]")

    if opts.out_json:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        serialized = json.dumps(found, indent=2, ensure_ascii=False)
        Path(opts.out_json).write_text(serialized, encoding="utf-8")
        print(f"Wrote JSON: {opts.out_json}")

    if opts.out_html:
        combined = render_html(found, opts.keyword)
        Path(opts.out_html).write_text(combined, encoding="utf-8")
        print(f"Wrote HTML: {opts.out_html}")

    if opts.out_dir:
        destination = Path(opts.out_dir)
        write_per_conversation_html(found, destination, make_index=True)
        print(f"Wrote {len(found)} conversation files to: {destination.resolve()}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This will write an index file to
{output_dir}/index.html listing all of the conversations.