Last active
July 8, 2023 21:52
-
-
Save CoffeeVampir3/0f482f6681c06721da9fcd9990caf6b7 to your computer and use it in GitHub Desktop.
dataset_chonker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import string | |
import sys | |
# Characters that survive cleaning; a frozenset gives O(1) membership tests.
_PRINTABLE_CHARS = frozenset(string.printable)


def clean_text(text):
    """Return *text* normalised for use as a dataset entry.

    The cleaning steps are:

    1. Collapse every run of whitespace (including Unicode whitespace such as
       non-breaking spaces) into a single ASCII space.
    2. Delete runs of two or more underscores (form-style blanks).
    3. Drop every character that is not in ``string.printable``; this removes
       control characters and all non-ASCII characters.
    4. Collapse the spaces again and strip the ends, because steps 2 and 3
       can leave doubled or dangling spaces behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with no leading/trailing whitespace and no
        consecutive spaces.
    """
    # Normalise whitespace first so that Unicode spaces become a plain ' '
    # instead of being discarded by the printable filter below.
    text = re.sub(r'\s+', ' ', text)
    # Remove long sequences of underscores
    text = re.sub(r'_{2,}', '', text)
    # Remove non-printable characters
    text = ''.join(ch for ch in text if ch in _PRINTABLE_CHARS)
    # The removals above may have glued separate spaces together or exposed
    # a space at either end, so normalise once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()
def clean_json_file(input_file, output_file):
    """Clean the ``"text"`` field of every record in a JSON file.

    The input file must contain a JSON array of objects that each have a
    ``"text"`` key. Each value is passed through ``clean_text`` and the result
    is written to *output_file* as a JSON array of ``{"text": ...}`` objects;
    any other keys in the input records are not carried over.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If the input is not valid JSON (``json.JSONDecodeError``)
            or is not valid UTF-8.
        KeyError: If a record has no ``"text"`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f_in:
        data = json.load(f_in)

    cleaned_data = [{"text": clean_text(item["text"])} for item in data]

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(cleaned_data, f_out)
if __name__ == "__main__":
    # Script entry point: requires exactly an input path and an output path.
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_json_file> <output_json_file>")
        sys.exit(1)

    input_file, output_file = sys.argv[1:]
    clean_json_file(input_file, output_file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from transformers import LlamaTokenizer | |
import json | |
def print_file_contents(
    target_folder,
    *,
    tokenizer_path="/home/blackroot/Desktop/langchain/WizardLM",
    output_path="out.json",
):
    """Split every text file under *target_folder* into chunks and dump them.

    The folder is walked recursively. Every file whose name does not contain
    ``"combined"`` (case-insensitive) is read, split with a
    ``RecursiveCharacterTextSplitter`` built from a Llama tokenizer, and each
    resulting chunk is collected as ``{"text": chunk}``. Files that cannot be
    read or split are reported on stdout and skipped. When the walk is
    finished, all collected chunks are written to *output_path* as one JSON
    array.

    Args:
        target_folder: Root directory to scan.
        tokenizer_path: Location passed to ``LlamaTokenizer.from_pretrained``.
            Defaults to the path that was previously hard-coded.
        output_path: File that receives the JSON array of chunks. Defaults to
            ``"out.json"`` in the current working directory.
    """
    tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)
    # NOTE(review): chunk_size/chunk_overlap are presumably measured in
    # tokens of the supplied tokenizer - confirm against the langchain docs.
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=400,
        chunk_overlap=20,
        separators=['\n\n', '\n', ' ', ''],
    )

    all_splits = []
    for dirpath, _dirnames, filenames in os.walk(target_folder):
        for filename in filenames:
            # Skip previously merged files so their text is not chunked twice.
            if "combined" in filename.lower():
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as file:
                    split = text_splitter.split_text(file.read())
                all_splits.extend({"text": text} for text in split)
            except Exception as e:
                # Deliberately best-effort: one unreadable file (binary data,
                # bad encoding, permissions) must not abort the whole run.
                print("Error: Cannot read file ", filepath)
                print("Exception type:", type(e))
                print("Exception message:", e)

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(all_splits, json_file)
if __name__ == "__main__":
    # Script entry point: requires exactly one argument, an existing folder.
    if len(sys.argv) != 2:
        print("Usage: python script.py <target_folder>")
        sys.exit(1)

    target_folder = sys.argv[1]
    if os.path.isdir(target_folder):
        print_file_contents(target_folder)
    else:
        print("Error: target folder does not exist")
        sys.exit(1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import sys | |
def is_valid_json(file_path):
    """Report whether the file at *file_path* parses as JSON.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, so a valid UTF-8 JSON document is not rejected on systems whose
    locale uses another code page.

    Args:
        file_path: Path of the file to check.

    Returns:
        ``True`` if the whole file parses as JSON. ``False`` if parsing fails
        or the bytes are not valid UTF-8; in that case the reason is printed.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json.load(file)
        return True
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError, so malformed JSON and bad encodings land here.
        print("Invalid JSON:", e)
        return False
if __name__ == "__main__":
    # Script entry point: requires exactly one argument, the file to check.
    if len(sys.argv) != 2:
        print("Usage: python script.py <json_file>")
        sys.exit(1)

    json_file = sys.argv[1]
    # The validity check runs first so any parser diagnostics it prints
    # appear before the summary line.
    message = (
        f"{json_file} is valid JSON."
        if is_valid_json(json_file)
        else f"{json_file} is not valid JSON."
    )
    print(message)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment