Created
December 23, 2024 07:01
-
-
Save mim-Armand/8a12672b55bd721a2151fb1143665f32 to your computer and use it in GitHub Desktop.
Scrape a web page for a Hebrew word-frequency dictionary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
def scrape_hebrew_frequency_list(
    output_csv="hebrew_frequency.csv",
    *,
    url="https://www.teachmehebrew.com/hebrew-frequency-list.html",
    timeout=30,
):
    """
    Scrape the Hebrew frequency table and save all of its columns to a CSV file.

    Downloads ``url``, locates the table with id="words", folds every row whose
    first cell (Rank) is empty into the preceding ranked row, and writes the
    header followed by the merged rows to ``output_csv``.

    Args:
        output_csv: Path of the CSV file to write (UTF-8 encoded).
        url: Page to scrape; defaults to the TeachMeHebrew frequency list.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        ValueError: If the table or its header row cannot be found.
    """
    # Always pass a timeout: without one a stalled server blocks forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error if request failed

    # Hand BeautifulSoup the raw bytes so it detects the charset itself;
    # response.text depends on the HTTP header guess, which can garble Hebrew.
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the table with id="words"
    table = soup.find("table", {"id": "words"})
    if table is None:
        raise ValueError("Could not find the frequency table on the page (id='words').")

    # Collect the <tr> rows once and reuse them below.
    rows = table.find_all("tr")
    print(f"Found table with {len(rows)} rows.")

    # The header is the first row that contains <th> elements.
    header_tr = next((row for row in rows if row.find("th")), None)
    if header_tr is None:
        raise ValueError("Could not find a header row (no <th> elements).")
    headers = [th.get_text(strip=True) for th in header_tr.find_all("th")]
    print(f"Headers: {headers}")

    # Skip the header row by identity rather than assuming it is row 0.
    cell_rows = [
        [cell.get_text(strip=True) for cell in row.find_all("td")]
        for row in rows
        if row is not header_tr
    ]
    data = _merge_continuation_rows(cell_rows)

    # Write to CSV; newline="" lets the csv module control line endings.
    with open(output_csv, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(data)
    print(f"Scraped data saved to {output_csv}")


def _merge_continuation_rows(cell_rows):
    """Fold rows whose first cell (Rank) is empty into the preceding ranked row."""
    merged = []
    for cols in cell_rows:
        if not cols:
            # Rows without any <td> cells carry no data.
            continue
        if cols[0].strip():
            # A ranked row starts a new entry; copy so the input is not mutated.
            merged.append(list(cols))
            continue
        if not merged:
            # Continuation row with no ranked row before it: nothing to extend.
            continue
        target = merged[-1]
        # Start from index 1 (skip the empty Rank column).
        for index, text in enumerate(cols[1:], start=1):
            if not text:
                continue  # Only merge non-empty cells
            if index >= len(target):
                # Continuation row is wider than its target: pad, don't crash.
                target.extend([""] * (index - len(target) + 1))
            # Avoid a leading space when the target cell was empty.
            target[index] = f"{target[index]} {text}" if target[index] else text
    return merged
if __name__ == "__main__":
    # Script entry point: scrape the list and write it to the default CSV path.
    scrape_hebrew_frequency_list(output_csv="hebrew_frequency.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment