Skip to content

Instantly share code, notes, and snippets.

@gitcnd
Last active November 9, 2024 23:09
Show Gist options
  • Save gitcnd/43069fa310ff0ed3f870b06a13ef94ea to your computer and use it in GitHub Desktop.
Save gitcnd/43069fa310ff0ed3f870b06a13ef94ea to your computer and use it in GitHub Desktop.
Program that reads the "conversations.json" data file from a ChatGPT export and saves each conversation thread into its own text file (suitable for uploading them all into a RAG AI system).
#!/usr/bin/env python3
__version__ = '1.20241110'
"""
chatgpt_to_text.py program reads "conversations.json" ChatGPT export files,
and turns them into text files suitable for uploading into a RAG AI system.
Usage:
mkdir txt
cd txt
../chatgpt_to_text.py ../conversations.json > everything_in_one_file.txt
# Creates lots of files
"""
import json
import sys
import os
import html
YELLOW = "\033[33m"
RESET = "\033[0m"
def load_conversations(filename):
    """Load and return the parsed JSON content of a ChatGPT export file.

    Args:
        filename: Path to the JSON file (e.g. "conversations.json").

    Returns:
        The object produced by ``json.load`` for the file.

    Exits the process with status 1 (after printing a message to stderr)
    when ``filename`` is empty or does not name an existing file.
    """
    # Diagnostics go to stderr: the documented usage redirects stdout into a
    # text file, so an error printed to stdout would never reach the user.
    if not filename:
        print("Error: No input file provided.", file=sys.stderr)
        sys.exit(1)
    if not os.path.isfile(filename):
        print(f"Error: File '{filename}' does not exist.", file=sys.stderr)
        sys.exit(1)
    # JSON is UTF-8; do not rely on the platform's locale default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
def unique_filename(base_name):
    """Return a ".txt" filename derived from ``base_name`` that does not exist yet.

    Every character outside ``A-Z a-z 0-9 _ -`` is replaced with "_". An
    empty (or ``None``) name becomes "new_chat". If the resulting file already
    exists in the current directory, "-1", "-2", ... is appended to the stem
    until an unused name is found.

    Args:
        base_name: Free-form title to turn into a filename.

    Returns:
        The first candidate filename for which ``os.path.exists`` is false.
    """
    # Build the allow-list once per call instead of once per character.
    allowed = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-"
    )
    # ``base_name or ""`` tolerates a None title (JSON null) instead of crashing.
    sanitized_title = "".join(
        char if char in allowed else "_" for char in (base_name or "")
    )
    if not sanitized_title:
        sanitized_title = "new_chat"
    filename = f"{sanitized_title}.txt"
    counter = 1
    while os.path.exists(filename):
        filename = f"{sanitized_title}-{counter}.txt"
        counter += 1
    return filename
def _format_message(message):
    """Return the transcript line for one message dict, or None to skip it.

    Only messages whose content has ``content_type == "text"`` and a non-empty
    ``parts`` list produce a line; anything malformed is skipped.
    """
    if not isinstance(message, dict):
        return None
    content = message.get("content")
    if not isinstance(content, dict) or content.get("content_type") != "text":
        return None
    parts = content.get("parts")
    if not parts or not isinstance(parts, (list, tuple)):
        return None

    # Keep every string part (joined by newlines) rather than only parts[0],
    # so no text is silently dropped; non-string parts are ignored.
    text = "\n".join(part for part in parts if isinstance(part, str)).strip()
    text = html.unescape(text)  # Convert HTML entities to plain text
    if not text:  # Only non-empty messages produce a line
        return None

    author_info = message.get("author")
    author = author_info.get("role") if isinstance(author_info, dict) else None
    if author is None:
        author = "unknown"
    metadata = message.get("metadata")
    circ = '🔴'  # red circle
    if author == "assistant":
        author = "ChatGPT"
        circ = '🔵'  # blue circle
    elif (
        author == "system"
        and isinstance(metadata, dict)
        and metadata.get("is_user_system_message")
    ):
        author = "Custom user info"
    return f"{circ} {author}: {text}\n"


def extract_messages(conversation):
    """Return the formatted text messages of one conversation, oldest first.

    Starts at ``conversation["current_node"]`` and follows each node's
    ``"parent"`` link through ``conversation["mapping"]``, collecting one
    formatted line per non-empty text message, then reverses the result.

    Args:
        conversation: A conversation dict with "current_node" and "mapping".

    Returns:
        A list of strings of the form "<circle> <author>: <text>\\n".
        Empty when the conversation has no usable mapping or messages.
    """
    messages = []
    mapping = conversation.get("mapping")
    if not isinstance(mapping, dict):
        return messages

    visited = set()  # guards against a cyclic parent chain looping forever
    current_node = conversation.get("current_node")
    while current_node and current_node not in visited:
        visited.add(current_node)
        node = mapping.get(current_node)
        # Stop if the node is missing or is not a dictionary
        if not node or not isinstance(node, dict):
            break
        line = _format_message(node.get("message"))
        if line is not None:
            messages.append(line)
        # Move to the parent node
        current_node = node.get("parent")
    return messages[::-1]  # Reverse to get chronological order
def print_and_save_conversations(conversations):
    """Print every conversation to stdout and save each one to its own file.

    For each conversation the title and its messages (from
    ``extract_messages``) are printed, then written to a new text file in the
    current directory whose name comes from ``unique_filename(title)``.

    Args:
        conversations: Iterable of conversation dicts.
    """
    for conversation in conversations:
        title = conversation.get("title")
        # The key may be present with a null value; .get()'s default would not
        # apply in that case and len(title) below would crash.
        if title is None:
            title = "Untitled Conversation"
        underline = "-" * (len("Title: ") + len(title))

        print(f"{YELLOW}Title: {title}{RESET}")
        print(underline)
        print()

        messages = extract_messages(conversation)

        # Print messages to the screen
        for message in messages:
            print(message)
        print("\n")  # Add extra space between conversations

        # Save messages to a file. The encoding is explicit because every
        # message line contains a non-ASCII circle emoji, which a non-UTF-8
        # locale default encoding cannot write.
        filename = unique_filename(title)
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title}\n")
            file.write(underline + "\n\n")
            file.writelines(message + "\n" for message in messages)
        print(f"Saved conversation to {filename}\n")
# Script entry point: convert the export file named by the first argument.
# Guarded so the functions above can be imported without running the script.
if __name__ == "__main__":
    # Ensure a filename argument is provided
    if len(sys.argv) < 2:
        # stderr, because stdout is normally redirected into a text file
        print(
            "Error: Please provide the path to the JSON file (e.g. conversations.json) as the first argument.",
            file=sys.stderr,
        )
        sys.exit(1)
    filename = sys.argv[1]
    conversations = load_conversations(filename)
    print_and_save_conversations(conversations)
# the end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment