Created
December 23, 2024 07:06
-
-
Save mim-Armand/fbd8e7fd1f771836f481f5f5e30e50c0 to your computer and use it in GitHub Desktop.
Convert a CSV file to a dictionary to be used with AI/ML (MFA)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
def create_hebrew_dictionary(
    input_csv="hebrew_frequency.csv",
    output_dict="hebrew_dictionary.txt",
    *,
    key_column="Transliteration",
    value_column="Transliteration",
):
    """Build a key -> value word list from a CSV file and save it as text.

    The CSV is read with ``csv.DictReader``; for every row the stripped
    content of ``key_column`` is mapped to the stripped content of
    ``value_column``. Rows whose key is empty (or missing because the row
    is shorter than the header) are skipped. When a key occurs more than
    once, the last row wins. The mapping is written to ``output_dict`` as
    one ``"<key> : <value>"`` line per entry, in first-seen key order.

    Args:
        input_csv: Path of the CSV file to read (UTF-8, with a header row).
        output_dict: Path of the text file to write (UTF-8).
        key_column: Header of the column used as dictionary key. The
            default, ``"Transliteration"``, keeps the behaviour this
            function has always had; pass ``"Hebrew"`` to get a
            Hebrew -> Transliteration mapping.
        value_column: Header of the column used as dictionary value.

    Raises:
        ValueError: If the CSV has no header row or lacks one of the two
            requested columns.
    """
    # newline="" lets the csv module do its own newline handling (needed
    # for quoted fields containing line breaks); "utf-8-sig" transparently
    # drops a leading BOM so the first header name is not corrupted.
    with open(input_csv, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)

        # fieldnames is None for an empty file; treat that as "no columns"
        # so the caller gets the ValueError below instead of a TypeError.
        headers = reader.fieldnames or []
        if key_column not in headers or value_column not in headers:
            raise ValueError(
                f"CSV missing required columns {key_column} or {value_column}"
            )

        mapping = {}
        for row in reader:
            # DictReader fills cells missing from short rows with None,
            # so normalise to "" before stripping.
            key = (row.get(key_column) or "").strip()
            if key:
                mapping[key] = (row.get(value_column) or "").strip()

    # The output is only opened once the whole input has been parsed, so a
    # malformed CSV never leaves a truncated dictionary file behind.
    with open(output_dict, "w", encoding="utf-8") as out_f:
        out_f.writelines(
            f"{key} : {value}\n" for key, value in mapping.items()
        )

    print(f"Dictionary file saved to {output_dict}")
if __name__ == "__main__":
    # Script entry point: convert the frequency CSV in the current working
    # directory into the plain-text dictionary file.
    create_hebrew_dictionary(
        input_csv="hebrew_frequency.csv",
        output_dict="hebrew_dictionary.txt",
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment