# The goal of this program is to transform the poems.txt file into a well-structured JSON file
from dotenv import load_dotenv
import json
import re
from anonLLM.llm import OpenaiLanguageModel

load_dotenv()
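
# load_dotenv() reads a local .env file; anonLLM's OpenaiLanguageModel is assumed
# to pick up the OpenAI credentials from the environment, e.g. a line such as
# OPENAI_API_KEY=sk-...  (illustrative; check anonLLM's documentation for the exact variable name)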

SEP = '---'

# Define the system message content
SYSTEM_CONTENT = "I wrote a very personal system prompt here"

llm = OpenaiLanguageModel(model="gpt-4", anonymize=False)

# Function to clean up the poem content
def clean_poem(poem_content):
    # Replace multiple newlines with a single newline
    poem_content = re.sub(r'\n+', '\n', poem_content).strip()
    # Remove the newlines from the end of the poem content
    poem_content = poem_content.rstrip('\n')
    return poem_content
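
# Illustrative example (not from the original gist):
# clean_poem("Ligne 1\n\n\nLigne 2\n") returns "Ligne 1\nLigne 2"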

def get_thematic_content(poem):
    # The prompt asks GPT-4 to summarize the thematic content of the poem
    # (in French: "Analyze this poem: ... Summarize it and explain what it evokes in 200 words.")
    prompt = f"Analyse ce poème :\n\n{poem}\n\nRésume-le et explique ce qu'il évoque en 200 mots."
    # Call the OpenAI API through anonLLM
    content_summary = llm.generate(
        prompt=prompt,
        max_tokens=400
    )
    return content_summary
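
# Assumed layout of poems.txt (illustrative, inferred from the parsing below;
# the real file may differ): each poem starts with its title on its own line,
# followed by the poem body, and poems are separated by the '---' marker (SEP):
#
#   Titre du poème
#   Première ligne...
#   Deuxième ligne...
#   ---
#   Titre suivant
#   ...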

# Function to process the text file and return the JSON structure
def process_poems(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        poems_data = file.read()
    # Split the file content by the poem separator
    poems = poems_data.split(SEP)
    json_structure = []
    for poem in poems:
        # Split on the first newline to separate the title from the content
        parts = poem.strip().split('\n', 1)
        if len(parts) < 2:  # Skip any empty sections
            continue
        title, content = parts
        title = title.strip()
        content = clean_poem(content)
        # Summarize the thematic content of the poem
        thematic_content = get_thematic_content(content)
        # Create a chat-message structure for the current poem
        # (the user message is in French: "Write a poem titled {title}.")
        json_structure.append({
            "messages": [
                {"role": "system", "content": SYSTEM_CONTENT},
                {"role": "user", "content": f"Ecris un poème intitulé {title}. {thematic_content}"},
                # json.dump escapes newlines itself, so the raw content is kept;
                # a manual replace('\n', '\\n') would double-escape them
                {"role": "assistant", "content": content}
            ]
        })
    return json_structure
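
# Example of one resulting entry (values are illustrative placeholders):
# {
#   "messages": [
#     {"role": "system", "content": "I wrote a very personal system prompt here"},
#     {"role": "user", "content": "Ecris un poème intitulé <title>. <thematic summary>"},
#     {"role": "assistant", "content": "<poem text>"}
#   ]
# }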

# Convert to JSON and save to file
def save_json(json_structure, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(json_structure, file, ensure_ascii=False, indent=2)

# Process the poems and save to a JSON file
poems_json = process_poems('poems.txt')
save_json(poems_json, 'poems.json')

# Convert the JSON structure to JSONL and save to file (one record per line)
with open('poems.jsonl', 'w', encoding='utf-8') as outfile:
    for entry in poems_json:
        json_record = json.dumps(entry, ensure_ascii=False)
        outfile.write(json_record + '\n')

print("The poems have been processed and saved to 'poems.json' and 'poems.jsonl'.")