Last active
July 14, 2023 15:25
-
-
Save korakot/9125a1e71e9371d2e3894c38b0f3d38e to your computer and use it in GitHub Desktop.
Convert chatgpt_conversations.zip to sqlite
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sqlite3 | |
| import pandas as pd | |
| import json | |
| import zipfile | |
| import time | |
| # Recreate the database with debug information | |
| def create_database_debug(filename, db_name): | |
| # Extract the JSON data from the zip file | |
| with zipfile.ZipFile(filename, 'r') as zip_ref: | |
| json_file = zip_ref.open('conversations.json') | |
| data = json.load(json_file) | |
| # Connect to the SQLite database (it will be created if it doesn't exist) | |
| conn = sqlite3.connect(db_name) | |
| cursor = conn.cursor() | |
| # Create the tables | |
| cursor.execute("""CREATE TABLE IF NOT EXISTS conversations ( | |
| id TEXT PRIMARY KEY, | |
| start_message_id TEXT, | |
| end_message_id TEXT, | |
| original_conversation_id TEXT, | |
| original_start_message_id TEXT, | |
| original_end_message_id TEXT | |
| )""") | |
| print("Created table: conversations") | |
| cursor.execute("""CREATE TABLE IF NOT EXISTS messages ( | |
| id TEXT PRIMARY KEY, | |
| role TEXT, | |
| content TEXT, | |
| timestamp TEXT, | |
| conversation_id TEXT, | |
| parent_id TEXT, | |
| last_child_id TEXT | |
| )""") | |
| print("Created table: messages") | |
| # Add the data to the tables | |
| message_ids = set() | |
| for conversation in data: | |
| conv_id = conversation['id'] | |
| cursor.execute("INSERT INTO conversations (id) VALUES (?)", (conv_id,)) | |
| mapping = conversation['mapping'] | |
| messages = sorted([(msg_id, msg) for msg_id, msg in mapping.items()], key=lambda x: x[1].get('timestamp', '')) | |
| start_message_id, end_message_id = messages[0][0], messages[-1][0] | |
| original_start_message_id, original_end_message_id = None, None | |
| for i, (msg_id, msg) in enumerate(messages): | |
| # Skip messages that have already been added to the database | |
| if msg_id in message_ids: | |
| if original_start_message_id is None: | |
| original_start_message_id = msg_id | |
| original_end_message_id = msg_id | |
| continue | |
| role = msg.get('role', '') | |
| content = msg.get('content', '') | |
| timestamp = msg.get('timestamp', '') | |
| parent_id = msg.get('parent', '') | |
| last_child_id = msg.get('children', [None])[0] if 'children' in msg and msg['children'] else None | |
| cursor.execute(""" | |
| INSERT INTO messages (id, role, content, timestamp, conversation_id, parent_id, last_child_id) | |
| VALUES (?, ?, ?, ?, ?, ?, ?) | |
| """, (msg_id, role, content, timestamp, conv_id, parent_id, last_child_id)) | |
| message_ids.add(msg_id) | |
| if i == 0: | |
| cursor.execute("UPDATE conversations SET start_message_id = ? WHERE id = ?", (msg_id, conv_id)) | |
| if i == len(messages) - 1: | |
| cursor.execute("UPDATE conversations SET end_message_id = ? WHERE id = ?", (msg_id, conv_id)) | |
| cursor.execute(""" | |
| UPDATE conversations | |
| SET original_start_message_id = ?, original_end_message_id = ? | |
| WHERE id = ? | |
| """, (original_start_message_id, original_end_message_id, conv_id)) | |
| # Commit the changes | |
| conn.commit() | |
| # Close the connection | |
| conn.close() | |
| print("Database creation complete.") | |
| # Use the function to create and populate the database | |
| filename = '/mnt/data/chatgpt_test.zip' # replace with your filename | |
| db_name = f'/mnt/data/chatgpt_test_debug_{int(time.time())}.db' | |
| create_database_debug(filename, db_name) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment