Skip to content

Instantly share code, notes, and snippets.

@thewh1teagle
Last active March 20, 2025 23:12
Show Gist options
  • Save thewh1teagle/a0c337798ff9c168b8bc0eafba95b3f5 to your computer and use it in GitHub Desktop.
Save thewh1teagle/a0c337798ff9c168b8bc0eafba95b3f5 to your computer and use it in GitHub Desktop.
"""
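Add niqqud to knesset.txt with the Dicta Nakdan API and append the result to knesset_niqqud.txt.

Set `api_key` below, then fetch the input and run:

wget https://huggingface.co/datasets/thewh1teagle/hebright/resolve/main/knesset.txt.zip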
unzip knesset.txt.zip
uv run main.py
"""
from pathlib import Path
import time
import requests
# Key sent to the Nakdan service as the "apiKey" field of every request; empty by default.
api_key = ''
# Input text file, read line by line.
file = 'knesset.txt'
# Output file, opened in append mode so an interrupted run can be resumed.
output_file = 'knesset_niqqud.txt'
# Upper bound on the size of one request batch.
max_batch_size = 30000 # 30,000 characters
# Accumulator for the batch being built; input lines are joined with '$'.
cur_batch = ''
def get_niqqud(text, api_key, genre="modern", add_morph=True, match_partial=True,
               keep_metagim=False, keep_qq=False, *, timeout=120):
    """Post ``text`` to the Dicta Nakdan ``addnikud`` endpoint and return the decoded reply.

    Args:
        text: Text sent as the ``data`` field of the request.
        api_key: Value sent as the ``apiKey`` field.
        genre: Value sent as the ``genre`` field.
        add_morph: Value sent as the ``addmorph`` field.
        match_partial: Value sent as the ``matchpartial`` field.
        keep_metagim: Value sent as the ``keepmetagim`` field.
        keep_qq: Value sent as the ``keepqq`` field.
        timeout: Seconds handed to ``requests.post`` so a stalled connection
            cannot block the caller forever. Pass ``None`` to wait
            indefinitely (the behaviour before this parameter existed).

    Returns:
        The JSON-decoded response body, or ``None`` when the request fails,
        the server answers with an error status, or the body is not valid
        JSON. The error is printed in those cases.
    """
    url = "https://nakdan-5-3.loadbalancer.dicta.org.il/addnikud"
    # NOTE(review): the header announces text/plain although the body is
    # JSON-encoded through ``json=`` -- presumably what the service expects; confirm.
    headers = {'Content-Type': 'text/plain;charset=utf-8'}
    payload = {
        "task": "nakdan",
        "useTokenization": True,
        "genre": genre,
        "data": text,
        "addmorph": add_morph,
        "matchpartial": match_partial,
        "keepmetagim": keep_metagim,
        "keepqq": keep_qq,
        "apiKey": api_key,
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Error contacting Nakdan API: {e}")
        return None
def _write_niqqud_batch(batch, out_fp, attempts=3, delay=5):
    """Send one '$'-joined batch to the API and append the returned lines to ``out_fp``.

    The request is retried up to ``attempts`` times, sleeping ``delay * attempt``
    seconds between tries. Returns True once the batch has been written and
    flushed, False when every attempt failed (nothing is written in that case).
    """
    niqqud_batch = None
    for attempt in range(1, attempts + 1):
        niqqud_batch = get_niqqud(batch, api_key)
        if niqqud_batch:
            break
        if attempt < attempts:
            time.sleep(delay * attempt)
    if not niqqud_batch:
        return False
    # NOTE(review): get_niqqud returns the JSON-decoded reply; ``.split`` below
    # assumes that value is a single '$'-delimited string -- confirm against the API.
    for niqqud_line in niqqud_batch.split('$'):
        niqqud_line = niqqud_line.strip()
        if niqqud_line:
            out_fp.write(niqqud_line + '\n')
    out_fp.flush()
    return True


# Explicit UTF-8 so the Hebrew text is read and written the same way on every platform.
with open(file, 'r', encoding='utf-8') as in_fp, \
        open(output_file, 'a+', encoding='utf-8') as out_fp:
    # Resume support: every line already in the output stands for one
    # non-blank input line, because blank lines never reach the output.
    out_fp.seek(0)
    completed_lines = sum(1 for _ in out_fp)
    skipped = 0
    while skipped < completed_lines:
        done_line = next(in_fp, None)
        if done_line is None:
            break  # input is shorter than the output; nothing left to process
        if done_line.strip():
            skipped += 1

    for line in in_fp:
        line = line.strip()
        if not line:
            continue  # blank lines produce no output, so keep them out of the batch
        # +1 accounts for the '$' separator that joining would add.
        if cur_batch and len(cur_batch) + 1 + len(line) > max_batch_size:
            if not _write_niqqud_batch(cur_batch, out_fp):
                # Stop instead of dropping lines: the output stays a clean
                # prefix of the input, so the next run resumes from here.
                raise SystemExit("Nakdan API kept failing; re-run the script to resume.")
            cur_batch = line  # start the next batch with the line that did not fit
        else:
            cur_batch += ('$' if cur_batch else '') + line

    # Send whatever is left after the last full batch.
    if cur_batch and not _write_niqqud_batch(cur_batch, out_fp):
        raise SystemExit("Nakdan API kept failing; re-run the script to resume.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment