Last active
September 23, 2023 08:53
-
-
Save billju/151fca95002f17e96a023cce5877d4f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
from glob import glob
# Build data/train.json from every *.json, *.jsonl and *.md file in the
# current working directory. Each source record becomes {'text': <prompt>}.

# Prompt template used when a record has a non-empty 'input' field.
prompt_input = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
)
# Prompt template used when 'input' is empty or missing.
prompt_no_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:"
)


def unwrap_entries(entries):
    """Return the records held in *entries* (the parsed content of a .json file).

    An entry that is itself a list contributes its first element; any other
    entry is used as-is. Empty list entries hold no record and are skipped
    instead of raising IndexError.
    """
    records = []
    for entry in entries:
        if isinstance(entry, list):
            if entry:
                records.append(entry[0])
        else:
            records.append(entry)
    return records


def parse_jsonl_lines(lines):
    """Parse an iterable of JSON Lines strings into a list of records.

    Blank or whitespace-only lines are ignored so that a trailing newline or
    an empty separator line does not abort the whole run.
    """
    return [json.loads(line) for line in lines if line.strip()]


def parse_markdown(text):
    """Split markdown *text* into records, one per blank-line-separated block.

    The first line of a block is the instruction and the remaining lines are
    the output; 'input' is always the empty string. Empty blocks (produced by
    a trailing blank line or by runs of three or more newlines) are skipped.
    """
    records = []
    for block in text.split('\n\n'):
        if not block.strip():
            continue
        # Drop stray newlines left over from an odd-length run of blank lines
        # so the instruction is the first real line of the block.
        lines = block.strip('\n').splitlines()
        records.append({'instruction': lines[0], 'input': '', 'output': '\n'.join(lines[1:])})
    return records


def format_row(row):
    """Render one record as {'text': prompt}.

    The template is chosen by whether the record has a truthy 'input'; a
    missing 'input' key is treated the same as an empty one. A record without
    'instruction' still raises KeyError.
    """
    # NOTE(review): the record's 'output' is never included in the emitted
    # text (the prompt stops at "### Response:"). Kept as in the original --
    # confirm this is intended.
    template = prompt_input if row.get('input') else prompt_no_input
    return {'text': template.format(**row)}


def main():
    """Collect records from the working directory and write data/train.json."""
    rows = []
    # sorted() makes the output order reproducible; glob order is OS-dependent.
    for path in sorted(glob('*.json')):
        with open(path, 'r', encoding='utf8') as fh:
            rows.extend(unwrap_entries(json.load(fh)))
    for path in sorted(glob('*.jsonl')):
        # Iterating the handle splits on newlines only. str.splitlines() would
        # also split on characters such as U+2028, which may legally appear
        # unescaped inside a JSON string and would cut a record in half.
        with open(path, 'r', encoding='utf8') as fh:
            rows.extend(parse_jsonl_lines(fh))
    for path in sorted(glob('*.md')):
        with open(path, 'r', encoding='utf8') as fh:
            rows.extend(parse_markdown(fh.read()))

    train_json = [format_row(row) for row in rows]

    os.makedirs('data', exist_ok=True)
    # The context manager guarantees the file is flushed and closed.
    with open('data/train.json', 'w', encoding='utf8') as fh:
        json.dump(train_json, fh, ensure_ascii=False)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment