Skip to content

Instantly share code, notes, and snippets.

@twobob
Created July 22, 2023 23:12
Show Gist options
  • Save twobob/e317fa5296858563b00e6302c01f4a68 to your computer and use it in GitHub Desktop.
Save twobob/e317fa5296858563b00e6302c01f4a68 to your computer and use it in GitHub Desktop.
Checks the character-set differences between two files and whether a file is valid JSONL
import json
def is_valid_jsonl(file_path):
    """Check that every line of a UTF-8 text file parses as JSON.

    The file is read line by line. Validation stops at the first line that
    ``json.loads`` rejects; that line and its 1-based number are printed.
    A blank line is not a JSON document, so it is reported as invalid too.

    Args:
        file_path: Path of the file to validate.

    Returns:
        ``True`` if every line parsed (an empty file therefore counts as
        valid), ``False`` as soon as one line fails to parse.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            try:
                json.loads(line)
            except json.JSONDecodeError:
                print(f'Invalid JSON on line {line_number}: {line}')
                return False
    return True
def _read_char_set(path):
    """Return the set of distinct characters in the UTF-8 text file at *path*."""
    chars = set()
    with open(path, 'r', encoding='utf-8') as f:
        # Accumulate per line so the whole file is never held in memory.
        for line in f:
            chars.update(line)
    return chars


def compare_chars(file1, file2):
    """Compare the sets of distinct characters used by two UTF-8 text files.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        A 3-tuple ``(not_in_union_chars, set1_diff, set2_diff)``:

        * ``not_in_union_chars`` -- historically computed as
          ``(set1 | set2) - set2``, which is simply the characters of
          ``file1`` that do not occur in ``file2``. It always equals
          ``set1_diff`` and is kept only so existing callers that unpack
          three values keep working.
        * ``set1_diff`` -- characters that occur in ``file1`` but not in
          ``file2``.
        * ``set2_diff`` -- characters that occur in ``file2`` but not in
          ``file1``.
    """
    set1 = _read_char_set(file1)
    set2 = _read_char_set(file2)

    # Characters in file1 not in file2 and vice versa
    set1_diff = set1 - set2
    set2_diff = set2 - set1

    # (set1 | set2) - set2 == set1 - set2; return a separate set object so
    # mutating one result never affects the other.
    not_in_union_chars = set(set1_diff)
    return not_in_union_chars, set1_diff, set2_diff
file_path = 'robotics_stackexchange_com_QLORA_train.jsonl' #(or whatever)

# Example usage: compare the characters used by a reference file with those
# used by file_path.
not_in_union, only_in_file1, only_in_file2 = compare_chars('novel17_eval.jsonl', file_path)
# not_in_union and only_in_file1 hold the same characters (those found in
# novel17_eval.jsonl but not in file_path), so only the other direction is
# reported here.
# The label is built from file_path so it always names the file that was
# actually compared.
print(f"Characters only in {file_path}:", only_in_file2)

# Example usage: check that every line of file_path is valid JSON.
if is_valid_jsonl(file_path):
    print(f'{file_path} is a valid JSONL file')
else:
    print(f'{file_path} is not a valid JSONL file')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment