Created
June 29, 2023 00:44
-
-
Save chrisclark/612ab8fa9c4c6dd5a85c1529162e0efd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json, os, requests, openai | |
# Working directories: chunk files are written under chunks_dir, and the
# per-chunk edit files under edits_dir.
chunks_dir = "novel/chunks/"
edits_dir = "novel/edits/"

# Read the API key from the OPENAI_API_KEY environment variable (None when it
# is unset) and hand it to the openai module.
openai_api_key = os.environ.get('OPENAI_API_KEY')
openai.api_key = openai_api_key
# Helper function to process files in a directory.
def process_directory(d, func):
    """Call ``func(name, file)`` for each entry of directory ``d``.

    Entries are visited in sorted filename order. ``name`` is the filename
    with its extension removed and ``file`` is the entry opened for reading
    in text mode; the file is closed as soon as ``func`` returns.

    Args:
        d: Path of the directory whose entries are processed.
        func: Callable taking ``(name, file)``.
    """
    # The directory to scan is the one the caller passed in, not a fixed
    # module-level directory, so the helper works for any pipeline stage.
    for filename in sorted(os.listdir(d)):
        name, _ = os.path.splitext(filename)
        # splitext leaves dotfiles whole, so the macOS metadata file shows up
        # here as '.DS_Store'; skip it.
        if name == '.DS_Store':
            continue
        with open(os.path.join(d, filename), 'r') as f:
            func(name, f)
def write_chunk(chunk, line_index):
    """Write ``chunk`` into ``chunks_dir`` as ``chunk_NNNNN.txt``.

    ``NNNNN`` is ``line_index`` zero-padded to five digits. An existing file
    with the same name is overwritten.
    """
    target_path = os.path.join(chunks_dir, f"chunk_{line_index:05d}.txt")
    with open(target_path, "w") as out:
        out.write(chunk)
def chunkify(file_path, max_words=1000):
    """Split a text file into chunk files of at most ``max_words`` words.

    Lines are never split: whole lines are accumulated until adding the next
    one would exceed ``max_words``, at which point the accumulated text is
    written with ``write_chunk`` and a new chunk starts with that line. A
    single line longer than ``max_words`` therefore becomes a chunk of its
    own. Each chunk is labelled with the 0-based index of its first line.

    Args:
        file_path: Path of the text file to split.
        max_words: Word budget per chunk (whitespace-separated words).
    """
    chunk_lines = []
    word_count = 0
    chunk_index = 0
    with open(file_path, 'r') as file:
        for i, line in enumerate(file):
            line_word_count = len(line.split())
            if word_count + line_word_count <= max_words:
                chunk_lines.append(line)
                word_count += line_word_count
                continue
            # The budget would be exceeded: flush what we have. Nothing is
            # accumulated yet when the very first line is already over the
            # limit, so there is nothing to flush in that case.
            if chunk_lines:
                write_chunk("".join(chunk_lines), chunk_index)
            chunk_lines = [line]
            word_count = line_word_count
            chunk_index = i
    # Write the last chunk; an empty input file produces no chunk at all.
    if chunk_lines:
        write_chunk("".join(chunk_lines), chunk_index)
# Step 1: split the manuscript into chunk files.
chunkify(os.path.join("novel/", "novel.txt"))
# System prompt sent with every chunk: restricts the model to copy errors and
# asks for one bullet per edit in the form '- "quoted text" -> "correction"'.
prompt = """
You are a copy editor looking for issues in a novel before it is submitted to publishers.
You are looking for obvious grammar and spelling issues and any information that is obviously incorrect. Do not make suggestions related to style, or edits for clarity. Just focus on copy errors.
Please format your responses as a series of bullet points. Start with a quote of a few words from the novel that you are copy-editing (so it's easy to find in the novel), then follow with your comments/corrections.
Here are some examples of good copy edits:
- "an two hundred year" -> "a two hundred year"
- "on to the veranda" -> "onto the veranda"
- "full memory of the night" -> "full memories of the night"
- "Her and Luis's dog" -> "Her and Luis' dog"
- "Felicia stiffened almost indecipherably." -> "Felicia stiffened almost imperceptibly."
- "The is the family kitchen." -> "This is the family kitchen."
Do not suggest substitutions of one type of punctuation mark for another. For example, do not suggest replacing ` with ', or “ with ".
"""
def copy_edit(p):
    """Ask the chat model to copy edit the text ``p`` and return its reply.

    The module-level ``prompt`` is sent as the system message and ``p`` is
    embedded in the user message. The request uses model "gpt-4" with
    temperature 0; the content of the first returned choice is returned.
    """
    system_message = {"role": "system", "content": prompt}
    user_message = {
        "role": "user",
        "content": f'Here is a chunk of the novel to copy edit: {p}',
    }
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[system_message, user_message],
        temperature=0,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
def copy_edit_file(name, f):
    """Copy edit the contents of open file ``f`` and save the result.

    The edits are written to ``<name>_edits.txt`` inside ``edits_dir``,
    replacing any existing file. Progress is reported on stdout.
    """
    print(f"Processing file: {name}...")
    output_path = os.path.join(edits_dir, f"{name}_edits.txt")
    suggestions = copy_edit(f.read())
    with open(output_path, "w") as out:
        out.write(suggestions)
    print("Done.")
# Step 2: copy edit every chunk file, producing one edits file per chunk.
process_directory(chunks_dir, copy_edit_file)
def is_real_correction(input_str):
    """Return a normalized ``- left -> right`` edit, or None if it is noise.

    ``input_str`` is one line of model output, expected to look like
    ``- "original" -> "correction"``. The line is rejected (None) when:

    * it contains no ``->`` separator (headers, blank lines, free-form
      comments),
    * both sides are identical once typographic quotes are converted to
      their ASCII equivalents, i.e. the edit only swaps punctuation styles,
    * the right-hand side contains the phrase 'remove extra space'.

    Only the text between the first and second ``->`` is used as the
    correction; anything after a second ``->`` is ignored.
    """

    def _normalize(side):
        # Trim bullet dashes, spaces and straight quotes first, then map
        # curly quotes to ASCII so both sides compare on equal terms.
        return (
            side.strip(' -"')
            .replace('’', "'")
            .replace('“', '"')
            .replace('”', '"')
        )

    parts = input_str.split('->')
    if len(parts) < 2:
        # Not an edit line at all; nothing to normalize.
        return None

    left_side = _normalize(parts[0])
    right_side = _normalize(parts[1])

    # A converted curly quote can leave a dangling '"' on the left only;
    # drop it so it does not make otherwise-equal sides look different.
    if left_side.endswith('"') and not right_side.endswith('"'):
        left_side = left_side[:-1]

    if left_side != right_side and 'remove extra space' not in right_side:
        return f'- {left_side} -> {right_side}'
    return None
def post_process(name, f):
    """Append the real corrections found in edits file ``f`` to the final list.

    Every line of ``f`` is stripped and passed through ``is_real_correction``;
    each truthy result is appended, one per line, to ``novel/final_edits.txt``.
    ``name`` is accepted to fit the ``process_directory`` callback signature
    and is not used.
    """
    raw_lines = f.readlines()
    output_path = os.path.join('novel/', "final_edits.txt")
    # Append mode: this function runs once per edits file and all results
    # accumulate in the same output file.
    with open(output_path, "a") as edits_file:
        for raw in raw_lines:
            correction = is_real_correction(raw.strip())
            if not correction:
                continue
            edits_file.write(correction + '\n')
# Step 3: filter every edits file down to real corrections.
process_directory(edits_dir, post_process)
def find_hallucinations(
    novel_path=os.path.join('novel/', "novel.txt"),
    edits_path=os.path.join('novel/', "consolidated_edits.txt"),
):
    """Return the quoted originals from the edits file that are not in the novel.

    Each line of the edits file is expected to look like ``- left -> right``.
    The left side (trimmed of spaces, dashes and straight quotes) is looked up
    verbatim in the novel text; if neither it nor the same text minus its
    last character occurs there, the model invented the quote and it is
    reported.

    NOTE(review): post_process in this file writes "final_edits.txt", while
    the default here reads "consolidated_edits.txt" - presumably a manually
    consolidated copy; confirm which file is intended.

    Args:
        novel_path: Path of the full novel text.
        edits_path: Path of the edits list to verify.

    Returns:
        List of left-hand-side strings that could not be found in the novel.
    """
    with open(novel_path, "r") as f:
        novel = f.read()

    hallucinations = []
    with open(edits_path, "r") as edits_file:
        for line in edits_file:
            left_side = line.split('->')[0].strip(' -"')
            # Deal with an extra trailing " character: also accept the quote
            # with its last character removed.
            if left_side not in novel and left_side[:-1] not in novel:
                hallucinations.append(left_side)
    return hallucinations
# Step 4: report quoted passages that do not actually occur in the novel.
print(find_hallucinations())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment