Last active
November 9, 2024 23:09
-
-
Save gitcnd/43069fa310ff0ed3f870b06a13ef94ea to your computer and use it in GitHub Desktop.
Program that reads the "conversations.json" data file from a ChatGPT export and saves each conversation thread to its own text file (suitable for uploading them all into a RAG AI system).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
chatgpt_to_text.py program reads "conversations.json" ChatGPT export files,
and turns them into text files suitable for uploading into a RAG AI system.

Usage:
    mkdir txt
    cd txt
    ../chatgpt_to_text.py ../conversations.json > everything_in_one_file.txt
    # Creates lots of files
"""

# Kept below the docstring on purpose: a string literal only becomes the
# module docstring (__doc__) when it is the first statement in the file.
__version__ = '1.20241110'
import json | |
import sys | |
import os | |
import html | |
# ANSI escape sequences wrapped around the title line printed to the terminal.
YELLOW = "\033[33m"  # select a yellow foreground
RESET = "\033[0m"  # reset all text attributes
def load_conversations(filename):
    """Load and return the parsed contents of a JSON export file.

    Args:
        filename: Path to the JSON file (e.g. "conversations.json").

    Returns:
        The object produced by ``json.load`` for the file.

    Exits the process with status 1 when ``filename`` is empty or does not
    name an existing regular file.
    """
    # Diagnostics go to stderr: the documented usage redirects stdout into a
    # file, so an error printed to stdout would never be seen by the user.
    if not filename:
        print("Error: No input file provided.", file=sys.stderr)
        sys.exit(1)
    if not os.path.isfile(filename):
        print(f"Error: File '{filename}' does not exist.", file=sys.stderr)
        sys.exit(1)
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
# Characters kept verbatim in generated filenames; everything else becomes "_".
# Built once at import time instead of once per character of every title.
_FILENAME_SAFE_CHARS = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-"
)


def unique_filename(base_name):
    """Return a ".txt" filename derived from base_name that does not exist yet.

    Every character outside ``A-Z a-z 0-9 _ -`` is replaced with "_". An
    empty (or ``None``) name becomes "new_chat". If the resulting file
    already exists in the current directory, "-1", "-2", ... is appended
    to the stem until an unused name is found.

    Args:
        base_name: Proposed name (typically a conversation title).

    Returns:
        A filename, relative to the current working directory, for which
        ``os.path.exists`` was false at the time of the check.
    """
    # Tolerate None / non-string input rather than failing while iterating.
    raw_name = "" if base_name is None else str(base_name)
    sanitized_title = "".join(
        char if char in _FILENAME_SAFE_CHARS else "_" for char in raw_name
    )
    if not sanitized_title:
        sanitized_title = "new_chat"

    filename = f"{sanitized_title}.txt"
    counter = 1
    while os.path.exists(filename):
        filename = f"{sanitized_title}-{counter}.txt"
        counter += 1
    return filename
def _format_message(message):
    """Format one message dict as "<circle> <author>: <text>\\n".

    Returns None when the message is not a dict, is not of content type
    "text", has no parts, has a non-string first part, or is empty after
    stripping whitespace.
    """
    if not isinstance(message, dict):
        return None

    content = message.get("content", {})
    if not isinstance(content, dict) or content.get("content_type") != "text":
        return None

    parts = content.get("parts")
    if not parts or not isinstance(parts, (list, tuple)):
        return None
    first_part = parts[0]
    # Only plain strings can be rendered; skip anything else instead of
    # crashing on .strip().
    if not isinstance(first_part, str):
        return None

    author_info = message.get("author")
    role = author_info.get("role") if isinstance(author_info, dict) else None
    author = role if role else "unknown"
    metadata = message.get("metadata") or {}

    circ = '🔴'  # red circle: every author except the assistant
    if author == "assistant":
        author = "ChatGPT"
        circ = '🔵'  # blue circle
    elif author == "system" and isinstance(metadata, dict) and metadata.get("is_user_system_message"):
        author = "Custom user info"

    text = html.unescape(first_part.strip())  # Convert HTML entities to plain text
    if not text:  # Only keep non-empty messages
        return None
    return f"{circ} {author}: {text}\n"


def extract_messages(conversation):
    """Return the formatted text messages of a conversation, oldest first.

    Starts at ``conversation["current_node"]`` and follows each node's
    ``parent`` link through ``conversation["mapping"]``, collecting the
    first text part of every message on that chain.

    Args:
        conversation: One conversation dict from the export.

    Returns:
        A list of strings of the form "<circle> <author>: <text>\\n",
        ordered from the root of the chain to ``current_node``. Empty when
        the conversation has no usable ``mapping`` or ``current_node``.
    """
    mapping = conversation.get("mapping")
    if not isinstance(mapping, dict):
        return []

    messages = []
    visited = set()  # guards against a parent chain that loops back on itself
    current_node = conversation.get("current_node")
    while current_node and current_node not in visited:
        visited.add(current_node)
        node = mapping.get(current_node)
        # Stop when the chain points at a missing or malformed node
        if not node or not isinstance(node, dict):
            break
        formatted = _format_message(node.get("message"))
        if formatted is not None:
            messages.append(formatted)
        # Move to the parent node
        current_node = node.get("parent")
    return messages[::-1]  # Collected leaf-to-root; reverse to get correct order
def print_and_save_conversations(conversations):
    """Print every conversation to stdout and save each one to its own file.

    For each conversation the title (highlighted with YELLOW/RESET), an
    underline and all extracted messages are printed, then the same content
    is written to a new text file in the current directory whose name comes
    from ``unique_filename(title)``.

    Args:
        conversations: Iterable of conversation dicts from the export.
    """
    for conversation in conversations:
        title = conversation.get("title", "Untitled Conversation")
        # The .get default only covers a missing key; an explicit null title
        # arrives as None and would break len() below.
        if title is None:
            title = "Untitled Conversation"
        elif not isinstance(title, str):
            title = str(title)
        underline = "-" * (len("Title: ") + len(title))

        print(f"{YELLOW}Title: {title}{RESET}")
        print(underline)
        print()

        messages = extract_messages(conversation)

        # Print messages to the screen
        for message in messages:
            print(message)
        print("\n")  # Add extra space between conversations

        # Save messages to a file
        filename = unique_filename(title)
        # Messages contain emoji markers, so write UTF-8 explicitly rather
        # than depending on the platform default encoding.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title}\n")
            file.write(underline + "\n\n")
            for message in messages:
                file.write(message + "\n")
        print(f"Saved conversation to {filename}\n")
# Run the conversion only when executed as a script, so the functions above
# can be imported without reading sys.argv or exiting.
if __name__ == "__main__":
    # Ensure a filename argument is provided
    if len(sys.argv) < 2:
        # Report on stderr: stdout is typically redirected into the combined
        # output file (see Usage).
        print(
            "Error: Please provide the path to the JSON file (e.g. conversations.json) as the first argument.",
            file=sys.stderr,
        )
        sys.exit(1)
    filename = sys.argv[1]
    conversations = load_conversations(filename)
    print_and_save_conversations(conversations)
# the end. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment