Skip to content

Instantly share code, notes, and snippets.

@CoffeeVampir3
Last active July 8, 2023 21:52
Show Gist options
  • Save CoffeeVampir3/0f482f6681c06721da9fcd9990caf6b7 to your computer and use it in GitHub Desktop.
Save CoffeeVampir3/0f482f6681c06721da9fcd9990caf6b7 to your computer and use it in GitHub Desktop.
dataset_chonker
import json
import re
import string
import sys
def clean_text(text):
    """Return *text* normalised for use as a dataset record.

    The steps are:

    1. Every run of whitespace (including Unicode whitespace) becomes a
       single space.
    2. Runs of two or more underscores are deleted.
    3. Characters outside ``string.printable`` (i.e. anything non-ASCII or
       an ASCII control character) are deleted.
    4. Spaces are collapsed again and the result is stripped.

    Step 4 matters because steps 2 and 3 delete characters that may have
    been sitting between two spaces or at either end of the string, which
    would otherwise leave double spaces or stray leading/trailing spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if nothing printable remains.
    """
    # Convert whitespace to plain spaces *before* filtering, so that
    # non-ASCII whitespace acts as a word separator instead of being dropped.
    text = re.sub(r'\s+', ' ', text)
    # Remove long sequences of underscores (single underscores are kept).
    text = re.sub(r'_{2,}', '', text)
    # Set membership is O(1) per character, unlike a substring search.
    printable = set(string.printable)
    text = ''.join(ch for ch in text if ch in printable)
    # The removals above may have left adjacent or edge spaces behind.
    return re.sub(r' {2,}', ' ', text).strip()
def clean_json_file(input_file, output_file):
    """Clean the ``"text"`` field of every record in a JSON file.

    Loads ``input_file``, which must contain an iterable of objects that
    each have a ``"text"`` key, passes every ``"text"`` value through
    ``clean_text`` and writes the result to ``output_file`` as a JSON list
    of ``{"text": ...}`` objects. Keys other than ``"text"`` are not
    carried over to the output.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write (overwritten).

    Raises:
        KeyError: If a record has no ``"text"`` key.
        ValueError: If ``input_file`` is not valid JSON.
        OSError: If either file cannot be opened.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f_in:
        data = json.load(f_in)
    cleaned_data = [{"text": clean_text(item["text"])} for item in data]
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(cleaned_data, f_out)
if __name__ == "__main__":
    # Script entry point: expects the script name plus exactly two
    # arguments, the input JSON path and the output JSON path.
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_json_file> <output_json_file>")
        sys.exit(1)
    _, input_file, output_file = sys.argv
    clean_json_file(input_file, output_file)
import os
import sys
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import LlamaTokenizer
import json
def print_file_contents(target_folder,
                        tokenizer_path="/home/blackroot/Desktop/langchain/WizardLM",
                        output_path="out.json"):
    """Split every file under *target_folder* into chunks and dump them as JSON.

    Walks ``target_folder`` recursively, skips any file whose name contains
    "combined" (case-insensitive), splits the remaining files' text with a
    ``RecursiveCharacterTextSplitter`` built from a Llama tokenizer, and
    writes all chunks to ``output_path`` as a JSON list of
    ``{"text": chunk}`` objects.

    Files that cannot be read or split are reported on stdout and skipped,
    so one bad file does not abort the whole run.

    Args:
        target_folder: Root directory to walk.
        tokenizer_path: Location passed to ``LlamaTokenizer.from_pretrained``.
            Defaults to the path the script originally hard-coded.
        output_path: File the JSON list is written to. Defaults to
            ``"out.json"`` in the current working directory.
    """
    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=400,
        chunk_overlap=20,
        separators=['\n\n', '\n', ' ', ''],
    )
    all_splits = []
    for dirpath, _dirnames, filenames in os.walk(target_folder):
        for filename in filenames:
            # Skip previously merged outputs such as "combined.txt".
            if "combined" in filename.lower():
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                # Hold the file open only for the read, not the splitting.
                with open(filepath, 'r') as file:
                    contents = file.read()
                split = text_splitter.split_text(contents)
                all_splits.extend({"text": text} for text in split)
            except Exception as e:
                # Deliberately best-effort: report the file and carry on.
                print("Error: Cannot read file ", filepath)
                print("Exception type:", type(e))
                print("Exception message:", e)
    with open(output_path, 'w') as json_file:
        json.dump(all_splits, json_file)
if __name__ == "__main__":
    # Script entry point: exactly one argument is required, the folder
    # whose files should be chunked.
    if len(sys.argv) != 2:
        print("Usage: python script.py <target_folder>")
        sys.exit(1)
    target_folder = sys.argv[1]
    # Only proceed when the argument names an existing directory.
    if os.path.isdir(target_folder):
        print_file_contents(target_folder)
    else:
        print("Error: target folder does not exist")
        sys.exit(1)
import json
import sys
def is_valid_json(file_path):
    """Report whether the file at *file_path* parses as JSON.

    Returns ``True`` when ``json.load`` succeeds. When it raises
    ``ValueError`` the error is printed to stdout and ``False`` is
    returned. Errors from opening the file are not caught here.
    """
    try:
        with open(file_path, 'r') as handle:
            json.load(handle)
    except ValueError as err:
        print("Invalid JSON:", err)
        return False
    else:
        return True
if __name__ == "__main__":
    # Script entry point: exactly one argument is required, the path of
    # the JSON file to check.
    if len(sys.argv) != 2:
        print("Usage: python script.py <json_file>")
        sys.exit(1)
    json_file = sys.argv[1]
    # Report the outcome of the check in plain words.
    if not is_valid_json(json_file):
        print(f"{json_file} is not valid JSON.")
    else:
        print(f"{json_file} is valid JSON.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment