CoffeeVampir3 · July 8, 2023 21:52
diff --git a/clean_json.py b/clean_json.py
 import json
 import re
 import string
 import sys

 def clean_text(text):
    """Normalize a text snippet: tidy whitespace, drop filler and non-ASCII.

    Steps, in order:
      1. Collapse every run of whitespace into a single space.
      2. Delete runs of two or more underscores.
      3. Drop every character that is not in ``string.printable``. That
         constant holds ASCII characters only, so accented letters, emoji
         and other non-ASCII text are removed along with control characters.
      4. Collapse spaces again and strip both ends, because steps 2 and 3
         can leave doubled or dangling spaces behind ("a ____ b" -> "a b").

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing printable remains.
    """
    # Whitespace is folded first so that non-ASCII whitespace (for example a
    # non-breaking space) becomes a plain space instead of being deleted by
    # the printable filter below.
    text = re.sub(r'\s+', ' ', text)

    # Remove long sequences of underscores
    text = re.sub(r'_{2,}', '', text)

    # A set gives constant-time membership tests instead of scanning the
    # whole string.printable string for every character.
    printable = set(string.printable)
    text = ''.join(ch for ch in text if ch in printable)

    # The removals above may have produced "a  b" or leading/trailing spaces.
    return re.sub(r' {2,}', ' ', text).strip()

 def clean_json_file(input_file, output_file):
    """Clean the "text" field of every record in a JSON file.

    The input is iterated and each item is indexed with ``item["text"]``, so
    it is expected to be a JSON array of objects that each have a "text"
    key. Only the cleaned "text" value is written out; any other keys on a
    record are not carried over.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write (opened in 'w' mode, so
            an existing file is overwritten).

    Raises:
        KeyError: If a record has no "text" key.
        ValueError: If the input is not valid JSON or not valid UTF-8.
        OSError: If either file cannot be opened.
    """
    # JSON interchange text is UTF-8; do not depend on the locale default.
    with open(input_file, 'r', encoding='utf-8') as f_in:
        data = json.load(f_in)

    cleaned_data = [{"text": clean_text(item["text"])} for item in data]

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(cleaned_data, f_out)

 if __name__ == "__main__":
    # Script entry point: exactly two positional arguments are required,
    # the JSON file to read and the JSON file to write.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python script.py <input_json_file> <output_json_file>")
        sys.exit(1)

    input_file, output_file = cli_args
    clean_json_file(input_file, output_file)
diff --git a/dataset_chonker.py b/dataset_chonker.py
 import os
 import sys
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from transformers import LlamaTokenizer
 import json
      
 def print_file_contents(
    target_folder,
    *,
    tokenizer_path="/home/blackroot/Desktop/langchain/WizardLM",
    output_path="out.json",
    chunk_size=400,
    chunk_overlap=20,
 ):
    """Split every file under a folder into chunks and dump them as JSON.

    Walks ``target_folder`` recursively, skips files whose name contains
    "combined" (case-insensitive), splits each remaining file's text with a
    RecursiveCharacterTextSplitter built from a Llama tokenizer, and writes
    all chunks to ``output_path`` as a JSON list of ``{"text": chunk}``
    objects. Despite the name, the only thing printed is an error report
    for files that could not be processed.

    Args:
        target_folder: Root directory to walk.
        tokenizer_path: Location passed to ``LlamaTokenizer.from_pretrained``.
            The default is the path the script was originally written with.
        output_path: File the JSON list is written to (overwritten).
        chunk_size: Forwarded to the text splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the text splitter as ``chunk_overlap``.
    """
    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=['\n\n', '\n', ' ', ''],
    )
    all_splits = []
    for dirpath, _dirnames, filenames in os.walk(target_folder):
        for filename in filenames:
            if "combined" in filename.lower():
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                # NOTE: files are decoded with the platform default encoding.
                with open(filepath, 'r') as file:
                    split = text_splitter.split_text(file.read())
            except Exception as e:
                # Best effort: report the unreadable file and keep going so
                # one bad file does not abort the whole run.
                print("Error: Cannot read file ", filepath)
                print("Exception type:", type(e))
                print("Exception message:", e)
                continue
            all_splits.extend({"text": text} for text in split)

    with open(output_path, 'w') as json_file:
        json.dump(all_splits, json_file)


 if __name__ == "__main__":
    # Script entry point: exactly one positional argument (the folder to
    # scan) is required.
    script_args = sys.argv[1:]
    if len(script_args) != 1:
        print("Usage: python script.py <target_folder>")
        sys.exit(1)

    (target_folder,) = script_args

    # Only proceed when the argument names an existing directory.
    if os.path.isdir(target_folder):
        print_file_contents(target_folder)
    else:
        print("Error: target folder does not exist")
        sys.exit(1)
diff --git a/validate_json.py b/validate_json.py
 import json
 import sys

 def is_valid_json(file_path):
    """Report whether the file at ``file_path`` parses as JSON.

    On failure the parser's error message is printed, prefixed with
    "Invalid JSON:".

    Args:
        file_path: Path of the file to check.

    Returns:
        True if the file content is parsed by ``json.load`` without error,
        False if parsing or UTF-8 decoding fails.

    Raises:
        OSError: If the file cannot be opened (not treated as "invalid").
    """
    try:
        # JSON interchange text is UTF-8; decoding with the locale default
        # would make the verdict depend on the machine running the check.
        with open(file_path, 'r', encoding='utf-8') as file:
            json.load(file)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError both subclass
        # ValueError, so malformed syntax and bad bytes are both reported.
        print("Invalid JSON:", e)
        return False
    return True

 if __name__ == "__main__":
    # Script entry point: exactly one positional argument (the file to
    # check) is required.
    if len(sys.argv[1:]) != 1:
        print("Usage: python script.py <json_file>")
        sys.exit(1)

    json_file = sys.argv[1]

    # The check runs first (it may print a parser error), then the verdict
    # line is printed.
    verdict = (
        f"{json_file} is valid JSON."
        if is_valid_json(json_file)
        else f"{json_file} is not valid JSON."
    )
    print(verdict)
	import json
	import re
	import string
	import sys

	def clean_text(text):
	    """Normalize a text snippet: tidy whitespace, drop filler and non-ASCII.

	    Steps, in order:
	      1. Collapse every run of whitespace into a single space.
	      2. Delete runs of two or more underscores.
	      3. Drop every character that is not in ``string.printable``. That
	         constant holds ASCII characters only, so accented letters, emoji
	         and other non-ASCII text are removed along with control characters.
	      4. Collapse spaces again and strip both ends, because steps 2 and 3
	         can leave doubled or dangling spaces behind ("a ____ b" -> "a b").

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string; empty if nothing printable remains.
	    """
	    # Whitespace is folded first so that non-ASCII whitespace (for example a
	    # non-breaking space) becomes a plain space instead of being deleted by
	    # the printable filter below.
	    text = re.sub(r'\s+', ' ', text)

	    # Remove long sequences of underscores
	    text = re.sub(r'_{2,}', '', text)

	    # A set gives constant-time membership tests instead of scanning the
	    # whole string.printable string for every character.
	    printable = set(string.printable)
	    text = ''.join(ch for ch in text if ch in printable)

	    # The removals above may have produced "a  b" or leading/trailing spaces.
	    return re.sub(r' {2,}', ' ', text).strip()

	def clean_json_file(input_file, output_file):
	    """Clean the "text" field of every record in a JSON file.

	    The input is iterated and each item is indexed with ``item["text"]``, so
	    it is expected to be a JSON array of objects that each have a "text"
	    key. Only the cleaned "text" value is written out; any other keys on a
	    record are not carried over.

	    Args:
	        input_file: Path of the JSON file to read.
	        output_file: Path of the JSON file to write (opened in 'w' mode, so
	            an existing file is overwritten).

	    Raises:
	        KeyError: If a record has no "text" key.
	        ValueError: If the input is not valid JSON or not valid UTF-8.
	        OSError: If either file cannot be opened.
	    """
	    # JSON interchange text is UTF-8; do not depend on the locale default.
	    with open(input_file, 'r', encoding='utf-8') as f_in:
	        data = json.load(f_in)

	    cleaned_data = [{"text": clean_text(item["text"])} for item in data]

	    with open(output_file, 'w', encoding='utf-8') as f_out:
	        json.dump(cleaned_data, f_out)

	if __name__ == "__main__":
	    # Script entry point: exactly two positional arguments are required,
	    # the JSON file to read and the JSON file to write.
	    cli_args = sys.argv[1:]
	    if len(cli_args) != 2:
	        print("Usage: python script.py <input_json_file> <output_json_file>")
	        sys.exit(1)

	    input_file, output_file = cli_args
	    clean_json_file(input_file, output_file)
	import os
	import sys
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from transformers import LlamaTokenizer
	import json

	def print_file_contents(
	    target_folder,
	    *,
	    tokenizer_path="/home/blackroot/Desktop/langchain/WizardLM",
	    output_path="out.json",
	    chunk_size=400,
	    chunk_overlap=20,
	):
	    """Split every file under a folder into chunks and dump them as JSON.

	    Walks ``target_folder`` recursively, skips files whose name contains
	    "combined" (case-insensitive), splits each remaining file's text with a
	    RecursiveCharacterTextSplitter built from a Llama tokenizer, and writes
	    all chunks to ``output_path`` as a JSON list of ``{"text": chunk}``
	    objects. Despite the name, the only thing printed is an error report
	    for files that could not be processed.

	    Args:
	        target_folder: Root directory to walk.
	        tokenizer_path: Location passed to ``LlamaTokenizer.from_pretrained``.
	            The default is the path the script was originally written with.
	        output_path: File the JSON list is written to (overwritten).
	        chunk_size: Forwarded to the text splitter as ``chunk_size``.
	        chunk_overlap: Forwarded to the text splitter as ``chunk_overlap``.
	    """
	    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)
	    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
	        tokenizer,
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        separators=['\n\n', '\n', ' ', ''],
	    )
	    all_splits = []
	    for dirpath, _dirnames, filenames in os.walk(target_folder):
	        for filename in filenames:
	            if "combined" in filename.lower():
	                continue
	            filepath = os.path.join(dirpath, filename)
	            try:
	                # NOTE: files are decoded with the platform default encoding.
	                with open(filepath, 'r') as file:
	                    split = text_splitter.split_text(file.read())
	            except Exception as e:
	                # Best effort: report the unreadable file and keep going so
	                # one bad file does not abort the whole run.
	                print("Error: Cannot read file ", filepath)
	                print("Exception type:", type(e))
	                print("Exception message:", e)
	                continue
	            all_splits.extend({"text": text} for text in split)

	    with open(output_path, 'w') as json_file:
	        json.dump(all_splits, json_file)


	if __name__ == "__main__":
	    # Script entry point: exactly one positional argument (the folder to
	    # scan) is required.
	    script_args = sys.argv[1:]
	    if len(script_args) != 1:
	        print("Usage: python script.py <target_folder>")
	        sys.exit(1)

	    (target_folder,) = script_args

	    # Only proceed when the argument names an existing directory.
	    if os.path.isdir(target_folder):
	        print_file_contents(target_folder)
	    else:
	        print("Error: target folder does not exist")
	        sys.exit(1)
	import json
	import sys

	def is_valid_json(file_path):
	    """Report whether the file at ``file_path`` parses as JSON.

	    On failure the parser's error message is printed, prefixed with
	    "Invalid JSON:".

	    Args:
	        file_path: Path of the file to check.

	    Returns:
	        True if the file content is parsed by ``json.load`` without error,
	        False if parsing or UTF-8 decoding fails.

	    Raises:
	        OSError: If the file cannot be opened (not treated as "invalid").
	    """
	    try:
	        # JSON interchange text is UTF-8; decoding with the locale default
	        # would make the verdict depend on the machine running the check.
	        with open(file_path, 'r', encoding='utf-8') as file:
	            json.load(file)
	    except ValueError as e:
	        # json.JSONDecodeError and UnicodeDecodeError both subclass
	        # ValueError, so malformed syntax and bad bytes are both reported.
	        print("Invalid JSON:", e)
	        return False
	    return True

	if __name__ == "__main__":
	    # Script entry point: exactly one positional argument (the file to
	    # check) is required.
	    if len(sys.argv[1:]) != 1:
	        print("Usage: python script.py <json_file>")
	        sys.exit(1)

	    json_file = sys.argv[1]

	    # The check runs first (it may print a parser error), then the verdict
	    # line is printed.
	    verdict = (
	        f"{json_file} is valid JSON."
	        if is_valid_json(json_file)
	        else f"{json_file} is not valid JSON."
	    )
	    print(verdict)