Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Last active March 5, 2024 20:30
Show Gist options
  • Save fsndzomga/237e55a6ae9a3f6fa7f7f1cc6547e269 to your computer and use it in GitHub Desktop.
Save fsndzomga/237e55a6ae9a3f6fa7f7f1cc6547e269 to your computer and use it in GitHub Desktop.
# The goal of this program is to transform the poems.txt file into well-structured JSON and JSONL files
from dotenv import load_dotenv
import json
import re
from anonLLM.llm import OpenaiLanguageModel
# Load environment variables from a .env file before the LLM client is built
# (presumably this supplies the API key the client needs — TODO confirm).
load_dotenv()
# Marker that separates one poem from the next in the input text file
SEP = '---'
# System message placed at the start of every generated conversation record
SYSTEM_CONTENT = "I wrote a very personal system prompt here"
# Shared language-model client used to summarise each poem
llm = OpenaiLanguageModel(model="gpt-4", anonymize=False)
# Normalise the raw body of a poem
def clean_poem(poem_content):
    """Return *poem_content* with blank lines removed and outer whitespace trimmed.

    Every run of consecutive newline characters is squeezed down to a single
    newline, then leading and trailing whitespace is stripped.
    """
    squeezed = re.sub(r'\n+', '\n', poem_content)
    # strip() also removes any trailing newlines, so no separate rstrip is needed
    return squeezed.strip()
def get_thematic_content(poem):
    """Ask the module-level ``llm`` client for a short analysis of *poem*.

    The (French) prompt requests a summary of the poem and of what it evokes
    in about 200 words; the reply is capped at 400 tokens. Whatever
    ``llm.generate`` returns is handed back unchanged.
    """
    request_text = f"Analyse ce poème :\n\n{poem}\n\nRésume-le et explique ce qu'il évoque en 200 mots."
    return llm.generate(prompt=request_text, max_tokens=400)
# Function to process the text file and return JSON structure
def process_poems(file_path):
    """Parse a poems text file into chat-style conversation records.

    The file is read as UTF-8 and split on ``SEP``. Within each section the
    first line is taken as the title and the remainder as the poem body.
    Sections without both a title line and a body are skipped.

    For every poem, ``get_thematic_content`` is called once (one LLM request
    per poem) and its result is appended to the user message.

    Args:
        file_path: Path of the text file containing the poems.

    Returns:
        A list of dicts, each with a ``"messages"`` list holding a system,
        a user and an assistant message. The assistant message is the cleaned
        poem text with its real line breaks.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        poems_data = file.read()

    json_structure = []
    # Each chunk between separators is expected to hold one poem
    for poem in poems_data.split(SEP):
        # First line is the title, everything after it is the body
        parts = poem.strip().split('\n', 1)
        if len(parts) < 2:  # Skip empty sections and title-only sections
            continue
        title, content = parts
        title = title.strip()
        content = clean_poem(content)
        thematic_content = get_thematic_content(content)
        json_structure.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONTENT},
                {"role": "user", "content": f"Ecris un poème intitulé {title}. {thematic_content}"},
                # Keep real newlines here: the JSON encoder escapes them on
                # output. Pre-escaping them would be escaped a second time and
                # leave a literal backslash + "n" in the stored poem text.
                {"role": "assistant", "content": content},
            ]
        })
    return json_structure
# Serialise the records to disk as pretty-printed JSON
def save_json(json_structure, output_file):
    """Write *json_structure* to *output_file* as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes. An existing file is overwritten.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(json_structure, handle, indent=2, ensure_ascii=False)
# Build the records from the source text file and store them as pretty-printed JSON
poems_json = process_poems('poems.txt')
save_json(poems_json, 'poems.json')
# Also write the same records as JSONL: one compact JSON object per line
with open('poems.jsonl', 'w', encoding='utf-8') as outfile:
    outfile.writelines(
        json.dumps(entry, ensure_ascii=False) + '\n' for entry in poems_json
    )
print("The poems have been processed and saved to 'poems.json' and 'poems.jsonl'.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment