LimaRP-ShareGPT
import glob
import re
import statistics
import random
import pandas
import yaml
from transformers import LlamaTokenizer

# Training examples designed for 4k context size, used for the initial ~1000
# samples release of LimaRP.
files = glob.glob('./data/**/*.yaml', recursive=True)
outfile = 'out/train4k.jsonl'

# Training examples designed for 8k context size. Avoid using with a 4k context
# size (in other words, limiting data length to 4096 tokens using the option below),
# as character personas and scenario may end up not being accurate to the context.
# files = glob.glob('./data-long/**/*.yaml', recursive=True)
# outfile = 'out/train8k.jsonl'

# Old evals. These are generally lower-quality training examples removed from the
# initially-made 4k dataset, or training examples with unwanted issues.
# files = glob.glob('./data-evals/**/*.yaml', recursive=True)
# outfile = '/home/anon/bin/qlora/piper/eval.jsonl'

# The pretrained model path is needed to load its tokenizer
pretrained_model_path = r'F:\AI\models\meta-llama_Llama-2-13b-hf'

# Try to limit training example length by removing early messages, without clipping
# them. This works better if the RP messages in the conversations aren't too long,
# otherwise training examples can end up being significantly shorter than the limit.
limit_data = False
limit_data_length = 4096

# This changes the way the data is arranged in the output json files, affecting
# model prediction during training in subtle ways. Supported formats:
#
# 'output_only'
#   everything on the output, like Guanaco (similar to unsupervised tuning).
# 'bot_output'
#   system+conversation on the input, last bot response on the output (same as Pygmalion).
# 'system_input'
#   system on the input, entire conversation on the output.
train_format = 'output_only'

# Use original character names instead of replacing them with alternative labels.
use_original_names = True

# Alternative labels to prepend at the start of the utterance, e.g. `USER:`
label_user = 'USER'
label_bot = 'CHAR'
# Alternative labels to use inside the utterance
placeholder_user = 'USER'
placeholder_bot = 'CHAR'

# Various instruct and model sequences. Original LimaRP format.
seq_system = '<<SYSTEM>>'
seq_human = '<<HUMAN>>'
seq_aibot = '<<AIBOT>>'

# Same, but in Alpaca format. Doesn't appear to work as well as the LimaRP
# format. Uncomment to override the LimaRP sequences above.
# seq_system = '### Instruction:'
# seq_human = '### Input:'
# seq_aibot = '### Response:'

# Character placeholders baked in the files; **DO NOT CHANGE**
placeholder_user_old = '<SECOND>'
placeholder_bot_old = '<FIRST>'

# System prompt options
two_char_system_prompts = [
    """Enter roleplay mode. You are currently %{having a conversation|in conversation|in a roleplay chat} with <SECOND>, whose %{traits are|persona is|characteristics are}:
<SECOND PERSONA>
%{You are|Play the role of|Take the role of} <FIRST> with the following %{persona|definitions|character sheet|traits}:
<FIRST PERSONA>
%{In addition|Additionally|Also}, %{keep the following scenario in mind|remember this scenario|pay attention to this scenario}:
<SCENARIO>""",
    """<SECOND>'s Persona: <SECOND PERSONA>
<FIRST>'s Persona: <FIRST PERSONA>
You are <FIRST>. Using the above %{persona|traits|character sheet|character definitions} for <FIRST>, you must engage in %{a roleplay conversation|an RP chat} with <SECOND>. %{Keep the following scenario in mind|Remember this scenario|Pay attention to this scenario}:
<SCENARIO>""",
]
single_char_system_prompts = [
    """Enter roleplay mode. %{You are|Play the role of|Take the role of|Become the character} <FIRST> with the following %{persona|definitions|character sheet|traits}:
<FIRST PERSONA>
%{In addition|Additionally|Also}, %{keep the following scenario in mind|remember this scenario|pay attention to this scenario}:
<SCENARIO>""",
    """<FIRST>'s Persona: <FIRST PERSONA>
You are <FIRST>. Using the above %{persona|traits|character sheet|character definitions}, you must engage in %{a roleplay conversation|an RP chat}. %{Keep the following scenario in mind|Remember this scenario|Pay attention to this scenario}:
<SCENARIO>"""
]

# Use two-character or one-character system prompts
use_two_char_sys = True
# -1 = pick randomly between the prompts, 0 = system prompt A, 1 = system prompt B
system_prompt_index = 1

tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_path)

def substitute_participants(input_string):
    '''
    Replace placeholder usernames with different names.
    TODO: the implementation could be improved by *not* relying on external variables.
    '''
    input_string = input_string.replace("<FIRST PERSONA>", source['persona']['<FIRST>'])
    input_string = input_string.replace("<SECOND PERSONA>", source['persona']['<SECOND>'])
    input_string = input_string.replace("<SCENARIO>", source['scenario'])
    input_string = input_string.replace("<SECOND>", placeholder_user)
    input_string = input_string.replace("<FIRST>", placeholder_bot)
    return input_string
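# Illustrative sketch (the names here are hypothetical, not from the dataset):
# with placeholder_bot = 'Alice' and placeholder_user = 'Bob',
# substitute_participants('<FIRST> waves at <SECOND>.') returns 'Alice waves at Bob.'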

def fix_punctuation(input_string):
    '''
    Replace fancy/incorrect punctuation with simpler/correct equivalents.
    TODO: more effective regexes, options for controlling what should be changed.
    '''
    # Fix excessive horizontal whitespace. This should go before everything else.
    input_string = re.sub(r' {2,}', ' ', input_string)
    # General punctuation fixes
    input_string = input_string.replace(' !', '!')
    input_string = input_string.replace(' ?', '?')
    input_string = input_string.replace('’', "'")
    input_string = input_string.replace('‘', "'")
    input_string = input_string.replace('“', '"')
    input_string = input_string.replace('”', '"')
    input_string = input_string.replace('…', '...')
    # Replace em-dash surrogates `---` in the source files with actual
    # em-dashes, since some people apparently dislike them.
    input_string = input_string.replace('---', '—')
    # Fix incorrect ellipses. These should preferably be fixed in the
    # source files themselves.
    input_string = re.sub(r'(\w)\.{2,8}(\w)', r'\1... \2', input_string)
    input_string = re.sub(r'(\w)\.{3,8}', r'\1...', input_string)
    return input_string
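# Illustrative examples of the rules above: fix_punctuation('“Wait…” she said !')
# returns '"Wait..." she said!', and 'word..word' becomes 'word... word'.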

def detect_single_newlines(input_string):
    '''
    Single newlines are most of the time unwanted.
    '''
    # Remove double newlines first so that only single newlines remain
    input_string = input_string.replace('\n\n', '||')
    # Now detect single newline presence
    return input_string.find('\n') >= 0
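# For instance, 'line one\nline two' is flagged as containing a single newline,
# while 'paragraph one\n\nparagraph two' is not.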

def validate_placeholders(input_string, placeholder_list):
    '''
    Verify that the placeholders (baked in the files) have been used correctly:
    they must be properly closed and free of typos.
    '''
    search_placeholders = re.findall('<.*?>', input_string)
    if search_placeholders:
        # If the list is non-empty, there are strings detected as placeholders.
        for item in search_placeholders:
            # Compare the strings to the placeholders. They must be the same.
            if item not in placeholder_list:
                return False
    # A naive check is counting the number of '<' and '>'. They must match, although
    # this means that these symbols cannot be used for other things, which was already
    # the assumption anyway.
    str_lt = input_string.count('<')
    str_gt = input_string.count('>')
    if str_lt != str_gt:
        return False
    # All checks passed
    return True
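# For example, validate_placeholders('<FIRST> smiles', ['<FIRST>', '<SECOND>'])
# returns True, while '<FIRST smiles' (unclosed) and '<FRIST>' (typo) return False.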

def get_prompt(prompt_string):
    '''
    Expand the %{option A|option B|...} alternation syntax used in the system
    prompts by picking one option at random for each occurrence.
    '''
    pattern = re.compile(r'%{(.+?)}')
    for m in re.finditer(pattern, prompt_string):
        match = m.group(0)
        replace = random.choice(m.group(1).split("|"))
        prompt_string = prompt_string.replace(match, replace)
    return prompt_string
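# Illustrative only: get_prompt('%{Hello|Hi}, <FIRST>!') returns either
# 'Hello, <FIRST>!' or 'Hi, <FIRST>!', chosen at random.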

message_count = 0
entries = []
token_lengths = []
count = 0

for num, file in enumerate(files):
    with open(file, 'r', encoding='utf-8') as f:
        source = yaml.safe_load(f)

    if use_original_names:
        label_user = source['names']['<SECOND>']
        label_bot = source['names']['<FIRST>']
        placeholder_user = source['names']['<SECOND>']
        placeholder_bot = source['names']['<FIRST>']

    if label_user is None:
        raise NameError(f"{file}: USER must have a name.")
    elif label_bot is None:
        raise NameError(f"{file}: CHAR must have a name.")
    elif not (3 <= len(label_user) <= 20) or not (3 <= len(label_bot) <= 20):
        raise NameError(f"{file}: unusual character name length: possible issue.")
    elif ('SECOND' in source['names']['<SECOND>']) or ('FIRST' in source['names']['<FIRST>']):
        # No need to check for the exact string here, just if there's a clearly
        # defective name that likely resulted from user error while manually
        # processing the files.
        raise NameError(f"{file}: Incorrect character names.")
    elif (source['persona']['<FIRST>'] is None) or (len(source['persona']['<FIRST>']) < 20):
        raise SyntaxError(f'{file}: no persona defined for <FIRST>.')
    elif (source['persona']['<SECOND>'] is None) or (len(source['persona']['<SECOND>']) < 20):
        raise SyntaxError(f'{file}: no persona defined for <SECOND>.')
    elif source['scenario'] is None:
        raise SyntaxError(f'{file}: Scenario missing.')
    elif len(source['scenario']) < 100:
        raise SyntaxError(f'{file}: Probable error in scenario (too short).')

    # Store the lengths of the bot's messages in words, accumulated across
    # the whole conversation.
    message_lengths = []

    # Perform various per-message checks
    for message in source['conversation']:
        # Check if there are open (unpaired) quotation marks
        quotation_marks = message['text'].count('"')
        if (quotation_marks % 2) != 0:
            print(f'\n{file}: Open quotation marks\n{message["text"]}\n\n')
        # Check if there are unpaired asterisks (for inner thoughts, etc.)
        asterisks = message['text'].count('*')
        if (asterisks % 2) != 0:
            print(f'\n{file}: Unpaired asterisks\n{message["text"]}\n\n')
        # Check for single newlines
        if detect_single_newlines(message['text']):
            print(f'\n{file}: Single newline\n{message["text"]}\n\n')
        # Check if placeholders have been used correctly in the messages
        if not validate_placeholders(message['text'], [placeholder_user_old, placeholder_bot_old]):
            print(f'\n{file}: Incorrect placeholder\n{message["text"]}\n\n')
        if message['name'] == '<FIRST>':
            message_lengths.append(len(message['text'].split()))

    # Check if placeholders have been used correctly in personas and scenarios
    if not validate_placeholders(source['scenario'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in scenario (or: < > not allowed)\n{source["scenario"]}\n\n')
    if not validate_placeholders(source['persona']['<FIRST>'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in <FIRST> Persona (or: < > not allowed)\n{source["persona"]["<FIRST>"]}\n\n')
    if not validate_placeholders(source['persona']['<SECOND>'], [placeholder_user_old, placeholder_bot_old]):
        print(f'\n{file}: Incorrect placeholder in <SECOND> Persona (or: < > not allowed)\n{source["persona"]["<SECOND>"]}\n\n')

    # This section tries to limit the total number of tokens below a predefined limit.
    # It operates on a message basis. The current algorithm slows down parsing as
    # it repeats tokenization - it was made quickly to test the idea.
    if limit_data:
        # First = GPT, Second = Human
        tokens_persona_1 = len(tokenizer(source['persona']['<FIRST>'])['input_ids'])
        tokens_persona_2 = len(tokenizer(source['persona']['<SECOND>'])['input_ids'])
        tokens_scenario = len(tokenizer(source['scenario'])['input_ids'])
        tokens_extra = 20 + 45  # Extra tokens to account for newlines and preamble
        tokens_header = tokens_persona_1 + tokens_persona_2 + tokens_scenario + tokens_extra
        tokens_messages = []
        total_tokens = tokens_header
        for message in source['conversation']:
            message_tokens = 16  # Worst-case allowance for newlines, system sequences and character names
            message_tokens += len(tokenizer(message['text'])['input_ids'])
            tokens_messages.append(message_tokens)
        # Naive search for the earliest starting message such that the remaining
        # messages, summed with the header, fit under the token limit.
        for i_start in range(0, len(source['conversation']) - 1):
            total_tokens = tokens_header + sum(tokens_messages[i_start:])
            if total_tokens < limit_data_length:
                break
        if total_tokens > limit_data_length:
            # Drop the conversation entirely if it still exceeds the threshold
            print(f'[{num+1}]:DROP', end=' ')
            continue
    else:
        # To be used as a starting index for the next big loop
        i_start = 0
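    # Worked example with hypothetical numbers: with limit_data_length = 4096 and
    # a 1000-token header, i_start advances past early messages until the remaining
    # ones fit in roughly 3096 tokens; if the conversation still exceeds the limit
    # after trimming, it is dropped.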

    sharegpt_text = {
        "roles": [label_user, label_bot],
        "conversations": []
    }
    current_user = None
    previous_user = None

    if not limit_data:
        assert i_start == 0

    # Add the messages composing the conversation (prompt and chat history),
    # starting from i_start. Unless message length is being limited, i_start
    # *must* be zero.
    for message in source['conversation'][i_start:]:
        message_count += 1
        current_user = message['name']
        if previous_user == current_user:
            # Message usernames must alternate. Error out if they don't, as this indicates a problem
            raise ValueError(f'{file}: Consecutive messages from the same person or incorrect indentation.')
        if len(message['text']) < 25:
            # An arbitrarily low threshold that should include most copy/paste errors.
            raise ValueError(f'{file}: Unusually short message in the conversation.')
        message_text = message["text"].strip()
        message_text = substitute_participants(message_text)
        message_text = fix_punctuation(message_text)
        sharegpt_text["conversations"].append({
            "from": "gpt" if message["name"] == "<FIRST>" else "human",
            "value": re.sub(r'[\n]+', '', message_text)
        })
        previous_user = current_user

    average_message_length = statistics.mean(message_lengths)

    header = []
    header.append(seq_system)
    header.append(f"{placeholder_bot}'s Persona: {source['persona']['<FIRST>']}\n")
    header.append(f"{placeholder_user}'s Persona: {source['persona']['<SECOND>']}\n")
    header.append(f"Scenario: {source['scenario']}\n")

    selected_prompt_array = two_char_system_prompts if use_two_char_sys else single_char_system_prompts
    prompt_string = random.choice(selected_prompt_array) if system_prompt_index == -1 else selected_prompt_array[system_prompt_index]
    prompt_string = substitute_participants(prompt_string)
    prompt_string = fix_punctuation(prompt_string)
    sharegpt_text['conversations'].insert(0, {
        "from": "system",
        "value": get_prompt(prompt_string)
    })

    # XXX: this must be taken elsewhere as it repeats the tokenization performed
    # initially to limit message length.
    # len_total = len(tokenizer(header + conversation_text)['input_ids'])
    # Since the limiting process has some wiggle room, make sure that the
    # final data doesn't exceed the length limit
    # if len_total > limit_data_length:
    #     raise OverflowError(f"{file} exceeds the context length limit")
    # token_lengths.append(len_total)

    entry = sharegpt_text
    entries.append(entry)
    # break
    # print(f'[{num+1}]:{text_average_message_length}:{len_total}', end=' ')
print(f"\n\nTotal conversations: {len(entries)}\nTotal messages: {message_count}\n") | |
# f"Longest sequence length: {max(token_lengths)} tokens\n" | |
# f"Mean sequence length: {statistics.mean(token_lengths):.1f} tokens\n" | |
# f"Total training tokens: {sum(token_lengths):,} tokens") | |
# Create a dataframe and shuffle it, resetting the index. | |
df = pandas.DataFrame(entries) | |
df = df.sample(frac=1).reset_index(drop=True) | |
df.to_json(outfile, orient='records', lines=True) |
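# Each line of the output JSONL is one ShareGPT-style record. Illustrative shape
# (the names and text here are hypothetical):
# {"roles": ["Bob", "Alice"],
#  "conversations": [{"from": "system", "value": "Enter roleplay mode. ..."},
#                    {"from": "human", "value": "..."},
#                    {"from": "gpt", "value": "..."}]}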