Created
December 19, 2023 12:00
-
-
Save 14790897/29533887b0b5aba7c8cecb419c4be72d to your computer and use it in GitHub Desktop.
提取ChatGPT官方json数据中的完整对话
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
def find_bottom_most_node(conversation_data):
    """
    Locate a leaf node by scanning the mapping from its last entry backwards.

    The first node (in reverse insertion order) whose "children" value is
    empty is reported. Malformed nodes met during the scan are not fatal:
    the problem is printed and the search gives up with ``None``.

    :param conversation_data: Dictionary mapping node IDs to node dictionaries.
    :return: The ID of the last-inserted node without children, or None if
        there is no such node or a malformed node was encountered first.
    :raises TypeError: If ``conversation_data`` itself is not a dictionary.
    """
    if not isinstance(conversation_data, dict):
        raise TypeError("Conversation data must be a dictionary.")

    leaf_id = None
    try:
        # Walk the keys newest-first; dicts preserve insertion order.
        for key in reversed(conversation_data):
            entry = conversation_data[key]
            if not isinstance(entry, dict):
                raise TypeError(f"Node {key} is not a dictionary.")
            if "children" not in entry:
                raise KeyError(f"'children' key not found in node {key}.")
            if not entry["children"]:
                # An empty children collection marks a leaf.
                leaf_id = key
                break
    except Exception as err:
        # Validation failures above land here on purpose: report, then
        # fall through to return None.
        print(f"Error finding bottom-most node: {err}")
    return leaf_id
def extract_conversation_path(conversation_data, start_node_id):
    """
    Collect the chain of nodes from the given node up to the root.

    The walk follows each node's "parent" value and stops when the parent
    is falsy, when an ID is not present in ``conversation_data``, or when
    an ID is reached a second time (a cyclic parent chain).

    :param conversation_data: Dictionary mapping node IDs to node dictionaries.
    :param start_node_id: The ID of the node to start from (normally a leaf).
    :return: A list of node dictionaries ordered from the top-most node
        reached down to the start node. Empty if the start node is unknown.
    """
    path = []
    visited = set()  # IDs already walked; guards against cyclic parent links
    current_node_id = start_node_id
    while current_node_id and current_node_id not in visited:
        node = conversation_data.get(current_node_id)
        if node is None:
            break  # Dangling reference: the ID is not in the mapping
        visited.add(current_node_id)
        path.append(node)
        # Move to the parent node
        current_node_id = node.get("parent")
    path.reverse()  # Collected leaf-to-root; callers expect root-to-leaf
    return path
def extract_full_conversation(conversation_data, start_node_id):
    """
    Collect (node ID, message text) pairs from the given node up to the root.

    For every node on the parent chain, the text is built by joining the
    string entries of ``node["message"]["content"]["parts"]`` with a single
    space. A node whose message, content or parts is missing or null
    contributes an empty string, and non-string parts are skipped. Each
    extracted text is also printed.

    The walk stops when the parent is falsy, when an ID is not present in
    ``conversation_data``, or when an ID is reached a second time (a cyclic
    parent chain).

    :param conversation_data: Dictionary mapping node IDs to node dictionaries.
    :param start_node_id: The ID of the node to start from (normally a leaf).
    :return: A list of ``(node_id, message_text)`` tuples ordered from the
        top-most node reached down to the start node.
    """
    full_conversation = []
    visited = set()  # IDs already walked; guards against cyclic parent links
    current_node_id = start_node_id
    while current_node_id and current_node_id not in visited:
        node = conversation_data.get(current_node_id)
        if node is None:
            break  # Dangling reference: the ID is not in the mapping
        visited.add(current_node_id)

        # `.get(key, {})` only helps when the key is absent; a key that is
        # present with a null value returns None, so fall back with `or`.
        message = node.get("message") or {}
        content = message.get("content") or {}
        parts = content.get("parts") or []

        # str.join rejects non-string items, so keep only the text parts.
        message_text = " ".join(
            part for part in parts if isinstance(part, str)
        )
        print("message_text:", message_text)

        full_conversation.append((current_node_id, message_text))
        # Move to the parent node
        current_node_id = node.get("parent")
    return full_conversation[::-1]  # Reverse to start from the root
# Script section: load one conversation from a JSON file, follow the parent
# chain from its bottom-most node, and write the messages to a text file.
file_path = "./example.json"  # Replace with the actual file path

try:
    # Load JSON data from the file
    with open(file_path, "r", encoding="utf-8") as file:
        conversation_data = json.load(file)
except (OSError, ValueError) as e:
    # OSError covers a missing/unreadable file; ValueError covers invalid
    # JSON and undecodable bytes. Without the data nothing below can run,
    # so stop here instead of failing later on an undefined name.
    print(f"Error reading or processing the file: {e}")
    raise SystemExit(1) from e

# Keep only the node table stored under the "mapping" key
conversation_data = conversation_data["mapping"]

# Get the ID of the bottom-most node
bottom_most_node_id = find_bottom_most_node(conversation_data)
print("find bottom_most_node_id:", bottom_most_node_id)

# Then use that ID to extract the conversation path
if bottom_most_node_id:
    conversation_text = extract_full_conversation(
        conversation_data, bottom_most_node_id
    )
    # Name of the file to save to
    file_name = "conversation_output.txt"
    # Open with a `with` statement so the file is closed properly
    with open(file_name, "w", encoding="utf-8") as file:
        # Go through every node of the conversation
        for node_id, message_content in conversation_text:
            # Write the node ID and the message content to the file
            file.write(f"Node ID: {node_id}\nMessage Content:\n{message_content}\n\n")
else:
    print("No bottom-most node found.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment