# The goal of this program is to transform the poems.txt file into a well-structured JSON file
from dotenv import load_dotenv
import json
import re
from anonLLM.llm import OpenaiLanguageModel

load_dotenv()
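
# load_dotenv() reads a local .env file; anonLLM's OpenaiLanguageModel is assumed
# to pick up the OpenAI credentials from the environment, e.g. a line such as
# OPENAI_API_KEY=sk-...  (illustrative; check anonLLM's documentation for the exact variable name)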

SEP = '---'

# Define the system message content
SYSTEM_CONTENT = "I wrote a very personal system prompt here"

llm = OpenaiLanguageModel(model="gpt-4", anonymize=False)

# Function to clean up the poem content
def clean_poem(poem_content):
    # Replace multiple newlines with a single newline
    poem_content = re.sub(r'\n+', '\n', poem_content).strip()
    # Remove the newlines from the end of the poem content
    poem_content = poem_content.rstrip('\n')
    return poem_content
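
# Illustrative example (not from the original gist):
# clean_poem("Ligne 1\n\n\nLigne 2\n") returns "Ligne 1\nLigne 2"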

def get_thematic_content(poem):
    # The prompt asks GPT-4 to summarize the thematic content of the poem
    # (in French: "Analyze this poem: ... Summarize it and explain what it evokes in 200 words.")
    prompt = f"Analyse ce poème :\n\n{poem}\n\nRésume-le et explique ce qu'il évoque en 200 mots."
    # Call the OpenAI API through anonLLM
    content_summary = llm.generate(
        prompt=prompt,
        max_tokens=400
    )
    return content_summary
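
# Assumed layout of poems.txt (illustrative, inferred from the parsing below;
# the real file may differ): each poem starts with its title on its own line,
# followed by the poem body, and poems are separated by the '---' marker (SEP):
#
#   Titre du poème
#   Première ligne...
#   Deuxième ligne...
#   ---
#   Titre suivant
#   ...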

# Function to process the text file and return the JSON structure
def process_poems(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        poems_data = file.read()
    # Split the file content by the poem separator
    poems = poems_data.split(SEP)
    json_structure = []
    for poem in poems:
        # Split on the first newline to separate the title from the content
        parts = poem.strip().split('\n', 1)
        if len(parts) < 2:  # Skip any empty sections
            continue
        title, content = parts
        title = title.strip()
        content = clean_poem(content)
        # Summarize the thematic content of the poem
        thematic_content = get_thematic_content(content)
        # Create a chat-message structure for the current poem
        # (the user message is in French: "Write a poem titled {title}.")
        json_structure.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONTENT},
                {"role": "user", "content": f"Ecris un poème intitulé {title}. {thematic_content}"},
                # json.dump escapes newlines itself, so the raw content is kept;
                # a manual replace('\n', '\\n') would double-escape them
                {"role": "assistant", "content": content}
            ]
        })
    return json_structure
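
# Example of one resulting entry (values are illustrative placeholders):
# {
#   "messages": [
#     {"role": "system", "content": "I wrote a very personal system prompt here"},
#     {"role": "user", "content": "Ecris un poème intitulé <title>. <thematic summary>"},
#     {"role": "assistant", "content": "<poem text>"}
#   ]
# }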

# Convert to JSON and save to file
def save_json(json_structure, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(json_structure, file, ensure_ascii=False, indent=2)

# Process the poems and save to a JSON file
poems_json = process_poems('poems.txt')
save_json(poems_json, 'poems.json')

# Convert the JSON structure to JSONL and save to file (one record per line)
with open('poems.jsonl', 'w', encoding='utf-8') as outfile:
    for entry in poems_json:
        json_record = json.dumps(entry, ensure_ascii=False)
        outfile.write(json_record + '\n')

print("The poems have been processed and saved to 'poems.json' and 'poems.jsonl'.")