Created
August 19, 2019 21:04
-
-
Save erickrf/b6d306779b08339d882b5e1aea410b33 to your computer and use it in GitHub Desktop.
Script to join text from JSON files for training GPT-2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Join the text of wiki JSON-lines files into one GPT-2 training file."""
import argparse
import json
import os

# Separator written after every document so that documents stay
# distinguishable in the joined training file.
END_OF_TEXT = '<|endoftext|>\n'


def join_wiki_text(input_dir, output):
    """Append the text of every ``wiki*`` file in *input_dir* to *output*.

    Each input file is read as JSON lines: one JSON object per line, each
    with a ``'text'`` key.  The text of every object is written to *output*
    followed by the ``<|endoftext|>`` marker and a newline.

    Args:
        input_dir: Directory holding the JSON-lines files.  Only entries
            whose name starts with ``'wiki'`` are read.
        output: Path of the text file to write.  It is opened in append
            mode, so existing content is kept.

    Returns:
        The number of documents written.

    Raises:
        OSError: If the directory or one of the files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a JSON object has no ``'text'`` key.
    """
    # os.listdir returns entries in arbitrary order; sort them so the
    # output is reproducible across runs and platforms.
    filenames = sorted(
        name for name in os.listdir(input_dir) if name.startswith('wiki')
    )
    if not filenames:
        # Nothing to join: do not create an empty output file.
        return 0

    written = 0
    # Open the output once rather than once per input file, and pin the
    # encoding so non-ASCII text does not depend on the locale default.
    with open(output, 'a', encoding='utf-8') as fout:
        for filename in filenames:
            path = os.path.join(input_dir, filename)
            with open(path, 'r', encoding='utf-8') as fin:
                for line in fin:
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    data = json.loads(line)
                    # remove non-processed templates
                    text = data['text'].replace('()', '')
                    fout.write(text)
                    fout.write(END_OF_TEXT)
                    written += 1
    return written


def main(argv=None):
    """Parse the command line and join the wiki files.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_dir', help='Directory with wiki json files')
    parser.add_argument('output', help='Txt file output')
    args = parser.parse_args(argv)
    join_wiki_text(args.input_dir, args.output)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment