@mim-Armand
Created December 23, 2024 07:01
Scrape a page for a Hebrew word-frequency dictionary
import csv

import requests
from bs4 import BeautifulSoup


def scrape_hebrew_frequency_list(output_csv="hebrew_frequency.csv"):
    """
    Scrapes the Hebrew frequency table from TeachMeHebrew, merges rows with a
    missing 'Rank' into the previous entry, and saves all table columns to a CSV file.
    """
    url = "https://www.teachmehebrew.com/hebrew-frequency-list.html"
    response = requests.get(url)
    response.raise_for_status()  # Raise an error if the request failed

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table with id="words"
    table = soup.find("table", {"id": "words"})
    if not table:
        raise ValueError("Could not find the frequency table on the page (id='words').")

    rows = table.find_all("tr")
    print(f"Found table with {len(rows)} rows.")

    # Look for the first row that contains <th> elements and remember its position
    header_tr = None
    header_index = None
    for i, row in enumerate(rows):
        if row.find("th"):
            header_tr = row
            header_index = i
            break
    if not header_tr:
        raise ValueError("Could not find a header row (no <th> elements).")

    headers = [th.get_text(strip=True) for th in header_tr.find_all("th")]
    print(f"Headers: {headers}")

    # Extract the data rows: everything after the header row
    data_rows = rows[header_index + 1:]

    data = []
    previous_row = None
    for row in data_rows:
        cols = [col.get_text(strip=True) for col in row.find_all("td")]

        # If the first column (Rank) is empty, merge this row into the previous one
        if not cols or not cols[0]:
            if previous_row:
                # Append non-empty cells to the matching columns of the previous row
                for i in range(1, min(len(cols), len(previous_row))):  # Skip the Rank column
                    if cols[i]:
                        previous_row[i] += f" {cols[i]}"
        else:
            # Add the current row as a new entry
            data.append(cols)
            previous_row = cols  # Keep track of the current row

    # Write to CSV
    with open(output_csv, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)  # Write the header row
        writer.writerows(data)    # Write the data rows

    print(f"Scraped data saved to {output_csv}")


if __name__ == "__main__":
    scrape_hebrew_frequency_list("hebrew_frequency.csv")
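
For reference, here is a minimal sketch of how the generated CSV could be read back for a quick check. It assumes the default output path "hebrew_frequency.csv" written by the script above; the column names are whatever headers the page's table exposes, so they are not hard-coded here.

import csv

# Minimal sketch: preview the CSV produced by scrape_hebrew_frequency_list().
# Assumes the default output filename "hebrew_frequency.csv"; column names
# depend on the headers scraped from the page.
with open("hebrew_frequency.csv", encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f)
    for row in list(reader)[:5]:  # Print the first few entries
        print(row)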