Skip to content

Instantly share code, notes, and snippets.

@korakot
Last active July 14, 2023 15:25
Show Gist options
  • Select an option

  • Save korakot/9125a1e71e9371d2e3894c38b0f3d38e to your computer and use it in GitHub Desktop.

Select an option

Save korakot/9125a1e71e9371d2e3894c38b0f3d38e to your computer and use it in GitHub Desktop.
Convert chatgpt_conversations.zip to sqlite
import sqlite3
import pandas as pd
import json
import zipfile
import time
# Recreate the database with debug information
def create_database_debug(filename, db_name):
# Extract the JSON data from the zip file
with zipfile.ZipFile(filename, 'r') as zip_ref:
json_file = zip_ref.open('conversations.json')
data = json.load(json_file)
# Connect to the SQLite database (it will be created if it doesn't exist)
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
# Create the tables
cursor.execute("""CREATE TABLE IF NOT EXISTS conversations (
id TEXT PRIMARY KEY,
start_message_id TEXT,
end_message_id TEXT,
original_conversation_id TEXT,
original_start_message_id TEXT,
original_end_message_id TEXT
)""")
print("Created table: conversations")
cursor.execute("""CREATE TABLE IF NOT EXISTS messages (
id TEXT PRIMARY KEY,
role TEXT,
content TEXT,
timestamp TEXT,
conversation_id TEXT,
parent_id TEXT,
last_child_id TEXT
)""")
print("Created table: messages")
# Add the data to the tables
message_ids = set()
for conversation in data:
conv_id = conversation['id']
cursor.execute("INSERT INTO conversations (id) VALUES (?)", (conv_id,))
mapping = conversation['mapping']
messages = sorted([(msg_id, msg) for msg_id, msg in mapping.items()], key=lambda x: x[1].get('timestamp', ''))
start_message_id, end_message_id = messages[0][0], messages[-1][0]
original_start_message_id, original_end_message_id = None, None
for i, (msg_id, msg) in enumerate(messages):
# Skip messages that have already been added to the database
if msg_id in message_ids:
if original_start_message_id is None:
original_start_message_id = msg_id
original_end_message_id = msg_id
continue
role = msg.get('role', '')
content = msg.get('content', '')
timestamp = msg.get('timestamp', '')
parent_id = msg.get('parent', '')
last_child_id = msg.get('children', [None])[0] if 'children' in msg and msg['children'] else None
cursor.execute("""
INSERT INTO messages (id, role, content, timestamp, conversation_id, parent_id, last_child_id)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (msg_id, role, content, timestamp, conv_id, parent_id, last_child_id))
message_ids.add(msg_id)
if i == 0:
cursor.execute("UPDATE conversations SET start_message_id = ? WHERE id = ?", (msg_id, conv_id))
if i == len(messages) - 1:
cursor.execute("UPDATE conversations SET end_message_id = ? WHERE id = ?", (msg_id, conv_id))
cursor.execute("""
UPDATE conversations
SET original_start_message_id = ?, original_end_message_id = ?
WHERE id = ?
""", (original_start_message_id, original_end_message_id, conv_id))
# Commit the changes
conn.commit()
# Close the connection
conn.close()
print("Database creation complete.")
# Use the function to create and populate the database
filename = '/mnt/data/chatgpt_test.zip' # replace with your filename
db_name = f'/mnt/data/chatgpt_test_debug_{int(time.time())}.db'
create_database_debug(filename, db_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment