Created
February 14, 2024 00:31
-
-
Save lukestanley/eb1037478b1129a5ca0560eea761967e to your computer and use it in GitHub Desktop.
A method to get easily parsable conversations from a ChatGPT data export of Feb 2024 vintage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_messages(file_path="conversations.json", conversation_limit=None, message_limit=None):
    """Extract role/text pairs from a ChatGPT ``conversations.json`` export.

    The file is expected to hold a JSON list of conversation objects, each
    with an optional ``"title"`` and a ``"mapping"`` dict whose values may
    carry a ``"message"`` with ``author.role`` and ``content.parts``.
    Only the first entry of ``parts`` is used; nodes without a message,
    without parts, or with an empty first part are skipped.

    Args:
        file_path: Path to the exported JSON file (read as UTF-8).
        conversation_limit: Maximum number of conversations to read from the
            start of the file. A falsy value (``None`` or ``0``) means all.
        message_limit: Maximum number of messages kept per conversation.
            A falsy value (``None`` or ``0``) means no limit.

    Returns:
        A tuple ``(extracted_conversations, conv_titles)``. The first item is
        a list with one entry per conversation, each a list of
        ``{"role": ..., "message": ...}`` dicts in mapping iteration order.
        The second is the parallel list of titles (``""`` when the key is
        absent).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Local import: the file has no module-level import block to rely on.
    import json

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Use the specified limit, or every conversation if none was given.
    conversation_count = conversation_limit if conversation_limit else len(data)

    extracted_conversations = []
    conv_titles = []
    for conversation in data[:conversation_count]:
        conv_titles.append(conversation.get("title", ""))
        extracted_messages = []

        # Tolerate a missing or null "mapping" rather than failing the run.
        for message_info in (conversation.get("mapping") or {}).values():
            # The limit counts messages actually kept, not nodes visited.
            if message_limit and len(extracted_messages) >= message_limit:
                break

            message = message_info.get("message")
            if not message:  # Node carries no message (key absent or null).
                continue

            parts = (message.get("content") or {}).get("parts")
            if not parts:  # No parts list, or an empty one.
                continue

            message_string = parts[0]
            if not message_string:  # Empty (or null) first part.
                continue

            extracted_messages.append(
                {"role": message["author"]["role"], "message": message_string}
            )

        extracted_conversations.append(extracted_messages)

    return extracted_conversations, conv_titles
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment