Last active
March 20, 2025 23:12
-
-
Save thewh1teagle/a0c337798ff9c168b8bc0eafba95b3f5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
wget https://huggingface.co/datasets/thewh1teagle/hebright/resolve/main/knesset.txt.zip | |
unzip knesset.txt.zip | |
uv run main.py | |
""" | |
from pathlib import Path | |
import time | |
import requests | |
# Key sent to the Nakdan service in the payload's "apiKey" field.
api_key = ""

# Input text, read line by line.
file = "knesset.txt"

# Results are appended to this file; its line count is used to resume a run.
output_file = "knesset_niqqud.txt"

# Character budget of a batch before it is sent to the API.
max_batch_size = 30_000

# Lines collected for the next request, joined with '$'.
cur_batch = ""
def get_niqqud(
    text,
    api_key,
    genre="modern",
    add_morph=True,
    match_partial=True,
    keep_metagim=False,
    keep_qq=False,
    timeout=120,
):
    """Post *text* to the Dicta Nakdan ``addnikud`` endpoint and return the decoded reply.

    Args:
        text: Text sent in the payload's ``data`` field.
        api_key: Value sent in the payload's ``apiKey`` field.
        genre: Value sent as ``genre``.
        add_morph: Value sent as ``addmorph``.
        match_partial: Value sent as ``matchpartial``.
        keep_metagim: Value sent as ``keepmetagim``.
        keep_qq: Value sent as ``keepqq``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, Requests waits indefinitely on a stalled connection.

    Returns:
        The JSON-decoded response body, or ``None`` when the request fails,
        the server answers with an HTTP error status, or the body is not
        valid JSON. The failure is printed, not raised.
    """
    url = "https://nakdan-5-3.loadbalancer.dicta.org.il/addnikud"
    # NOTE(review): the body is JSON but the header declares text/plain;
    # presumably this is what the service expects -- confirm before changing.
    headers = {'Content-Type': 'text/plain;charset=utf-8'}
    payload = {
        "task": "nakdan",
        "useTokenization": True,
        "genre": genre,
        "data": text,
        "addmorph": add_morph,
        "matchpartial": match_partial,
        "keepmetagim": keep_metagim,
        "keepqq": keep_qq,
        "apiKey": api_key,
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an undecodable body on Requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Error contacting Nakdan API: {e}")
        return None
# Number of times a batch is sent before the run gives up, and the base delay
# (seconds) between attempts; the delay grows linearly with the attempt number.
_MAX_ATTEMPTS = 3
_RETRY_DELAY_SECONDS = 5


def _request_batch(batch):
    """Return the API result for *batch*, retrying failures; None if all attempts fail."""
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        result = get_niqqud(batch, api_key)
        if result:
            return result
        if attempt < _MAX_ATTEMPTS:
            time.sleep(_RETRY_DELAY_SECONDS * attempt)
    return None


def _write_batch(out_fp, niqqud_batch):
    """Write every non-empty '$'-separated piece of *niqqud_batch* as one line, then flush.

    NOTE(review): this calls ``.split`` on the value returned by get_niqqud,
    which is ``response.json()``; that only works if the service replies with
    a JSON string -- confirm against the actual response shape.
    """
    for niqqud_line in niqqud_batch.split('$'):
        niqqud_line = niqqud_line.strip()
        if niqqud_line:
            out_fp.write(niqqud_line + '\n')
    out_fp.flush()


# Explicit UTF-8: the platform default encoding is not guaranteed to handle
# the Hebrew text in the input and output files.
with open(file, 'r', encoding='utf-8') as in_fp, \
        open(output_file, 'a+', encoding='utf-8') as out_fp:
    # 'a+' opens positioned at the end; rewind to count the lines already written.
    out_fp.seek(0)
    completed_lines = sum(1 for _ in out_fp)

    # Resume: skip as many non-blank input lines as there are output lines.
    # Blank input lines never produce an output line, so they are not counted.
    skipped = 0
    while skipped < completed_lines:
        done_line = next(in_fp, None)
        if done_line is None:
            break
        if done_line.strip():
            skipped += 1

    failed = False
    for line in in_fp:
        line = line.strip()
        if not line:
            continue  # nothing to vocalize, and it would yield no output line

        # +1 accounts for the '$' separator. The `cur_batch` guard avoids
        # sending an empty batch when a single line alone exceeds the limit;
        # such a line is sent as its own (oversized) batch instead of being lost.
        if cur_batch and len(cur_batch) + 1 + len(line) > max_batch_size:
            niqqud_batch = _request_batch(cur_batch)
            if not niqqud_batch:
                # Stop rather than skip: dropping lines would corrupt the
                # output and break the line-count-based resume.
                failed = True
                break
            _write_batch(out_fp, niqqud_batch)
            cur_batch = line  # the overflowing line starts the next batch
        else:
            cur_batch += ('$' if cur_batch else '') + line

    # Send whatever is left once the input is exhausted.
    if cur_batch and not failed:
        niqqud_batch = _request_batch(cur_batch)
        if niqqud_batch:
            _write_batch(out_fp, niqqud_batch)
        else:
            failed = True

    if failed:
        print(
            f"Stopping: Nakdan API request failed after {_MAX_ATTEMPTS} attempts; "
            "rerun to resume from the last written line."
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment