Created
July 22, 2023 23:12
-
-
Save twobob/e317fa5296858563b00e6302c01f4a68 to your computer and use it in GitHub Desktop.
Checks the character-set unions of two files and whether a file is valid JSONL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
def is_valid_jsonl(file_path) -> bool:
    """Report whether every line of a file parses as JSON.

    Opens ``file_path`` as UTF-8 text and passes each line to
    ``json.loads``. On the first line that fails to parse, prints a message
    containing the 1-based line number and the offending line, then stops
    reading.

    Args:
        file_path: Path of the file to check.

    Returns:
        True if every line parsed (an empty file therefore returns True);
        False as soon as one line raises ``json.JSONDecodeError``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            try:
                json.loads(line)
            except json.JSONDecodeError:
                # Blank lines end up here as well: an empty string is not a
                # JSON value, so they are reported as invalid.
                print(f'Invalid JSON on line {line_number}: {line}')
                return False
    return True
def compare_chars(file1, file2):
    """Compare the sets of distinct characters used by two text files.

    Both files are read in full as UTF-8 text and reduced to the set of
    characters they contain (newlines included).

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        A 3-tuple of sets ``(not_in_union_chars, set1_diff, set2_diff)``:

        * ``not_in_union_chars`` -- the union of both character sets minus
          the characters of ``file2``. This is always equal to
          ``set1_diff``; it is returned as a separate set only so that
          callers unpacking three values keep working.
        * ``set1_diff`` -- characters present in ``file1`` but not ``file2``.
        * ``set2_diff`` -- characters present in ``file2`` but not ``file1``.
    """
    with open(file1, 'r', encoding='utf-8') as f:
        set1 = set(f.read())
    with open(file2, 'r', encoding='utf-8') as f:
        set2 = set(f.read())

    # Characters in file1 not in file2 and vice versa
    set1_diff = set1 - set2
    set2_diff = set2 - set1

    # (set1 | set2) - set2 reduces to set1 - set2, so there is no need to
    # build the union; copy so the two returned sets stay independent.
    not_in_union_chars = set(set1_diff)

    return not_in_union_chars, set1_diff, set2_diff
# File to inspect; it is both compared against the reference file below and
# checked for JSONL validity.
file_path = 'robotics_stackexchange_com_QLORA_train.jsonl'  # (or whatever)

# Example usage: list the characters that appear in file_path but not in the
# reference file novel17_eval.jsonl.
not_in_union, only_in_file1, only_in_file2 = compare_chars('novel17_eval.jsonl', file_path)
#print("Characters not in union:", not_in_union)
#print("Characters only in novel17_eval.jsonl:", only_in_file1)
#print(not_in_union == only_in_file1) # this is true
# Label the output with the file that was actually compared rather than a
# hard-coded name that can drift out of sync with file_path.
print(f"Characters only in {file_path}:", only_in_file2)

# Example usage: validate the same file line by line as JSONL.
if is_valid_jsonl(file_path):
    print(f'{file_path} is a valid JSONL file')
else:
    print(f'{file_path} is not a valid JSONL file')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment