Last active
April 7, 2024 03:49
-
-
Save xzuyn/765157fa27738a9888dcb4e0aa3f5118 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
def load_json_or_jsonl(file_path):
    """Load a file containing either a single JSON document or JSON Lines.

    The whole file is first parsed as one JSON document. If that fails, the
    file is re-read line by line and every non-blank line is parsed as its
    own JSON value.

    Args:
        file_path: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        The decoded JSON document when the file is valid JSON; otherwise a
        list with one decoded value per parseable line. Returns ``None``
        (after printing a message) when the file does not exist.
    """
    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            try:
                # Try loading the entire file as JSON
                return json.load(file)
            except json.JSONDecodeError:
                # If loading as JSON fails, try loading as JSON Lines
                file.seek(0)  # Reset file pointer to the beginning
                json_lines_data = []
                for line in file:
                    stripped = line.strip()
                    if not stripped:
                        # Blank separator/trailing lines are not records and
                        # should not be reported as decode errors.
                        continue
                    try:
                        json_lines_data.append(json.loads(stripped))
                    except json.JSONDecodeError as e:
                        # Best effort: report the bad line and keep going.
                        print(f"Error decoding JSON in line: {e}")
                return json_lines_data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
def convert_to_desired_format(data):
    """Convert preference records into prompt/chosen/rejected dialogues.

    Each input record is read through its ``"instruction"``,
    ``"chosen_response"`` and ``"rejected_response"`` keys. The three values
    are stripped of surrounding whitespace, and a record is kept only when
    all three are non-empty afterwards.

    Args:
        data: Iterable of mapping records, or ``None`` (treated as no
            records, so a failed load does not crash the conversion).

    Returns:
        A list of dicts with keys ``"prompt"`` (the stripped instruction),
        ``"chosen"`` and ``"rejected"``; the latter two are two-message
        dialogues: a ``"user"`` turn with the prompt followed by an
        ``"assistant"`` turn with the respective response.

    Raises:
        KeyError: If a record lacks one of the three required keys.
    """
    if data is None:
        return []

    converted_data = []
    for item in data:
        # A JSON null is treated like an empty string so the record is
        # skipped by the emptiness check below instead of raising on .strip().
        prompt = (item["instruction"] or "").strip()
        chosen_response = (item["chosen_response"] or "").strip()
        rejected_response = (item["rejected_response"] or "").strip()

        if not (prompt and chosen_response and rejected_response):
            continue

        converted_data.append(
            {
                "prompt": prompt,
                "chosen": [
                    {"content": prompt, "role": "user"},
                    {"content": chosen_response, "role": "assistant"},
                ],
                "rejected": [
                    {"content": prompt, "role": "user"},
                    {"content": rejected_response, "role": "assistant"},
                ],
            }
        )
    return converted_data
# Load the source preference dataset (JSON or JSON Lines).
json_data = load_json_or_jsonl(
    "./downloaded_datasets/argilla_distilabel-math-preference-dpo-train.jsonl"
)
if json_data is None:
    # The loader already printed why the load failed; stop here rather than
    # failing later or writing an output file from nothing.
    raise SystemExit("No input data loaded; nothing to convert.")

# Convert data to desired format
converted_data = convert_to_desired_format(json_data)

# Write converted data to a new JSON file
output_path = './downloaded_datasets/argilla/argilla_distilabel-math-preference-dpo-train.json'
# The target sub-directory may not exist yet on a first run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding="utf-8") as f:
    json.dump(converted_data, f, indent=2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment