mim-Armand · December 23, 2024 07:01
diff --git a/scrape_hebrew_frequency.py b/scrape_hebrew_frequency.py
 import requests
 from bs4 import BeautifulSoup
 import csv

 def scrape_hebrew_frequency_list(output_csv="hebrew_frequency.csv", timeout=30):
    """
    Scrape the Hebrew frequency table from TeachMeHebrew and save it as CSV.

    The page's ``<table id="words">`` is read row by row. A row whose first
    cell (Rank) is empty is treated as a continuation of the most recent
    ranked row: each of its non-empty cells is appended, space-separated, to
    the cell in the same column of that row.

    Args:
        output_csv: Path of the CSV file to write (UTF-8 encoded).
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the table or its header row cannot be found.
    """
    url = "https://www.teachmehebrew.com/hebrew-frequency-list.html"
    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error if request failed

    # Parse the raw bytes rather than response.text: when the server sends no
    # charset, requests falls back to ISO-8859-1 and the Hebrew text would be
    # garbled. Only pass an encoding on when the server actually declared one;
    # otherwise let BeautifulSoup detect it from the document itself.
    content_type = response.headers.get("Content-Type", "")
    declared_encoding = None
    if "charset" in content_type.lower():
        declared_encoding = response.encoding
    soup = BeautifulSoup(
        response.content, "html.parser", from_encoding=declared_encoding
    )

    # Find the table with id="words"
    table = soup.find("table", {"id": "words"})
    if table is None:
        raise ValueError("Could not find the frequency table on the page (id='words').")

    # Collect the rows once and reuse them for the header and the data pass.
    rows = table.find_all("tr")
    print(f"Found table with {len(rows)} rows.")

    # The header is the first row that contains <th> elements.
    header_tr = next((row for row in rows if row.find("th")), None)
    if header_tr is None:
        raise ValueError("Could not find a header row (no <th> elements).")
    headers = [th.get_text(strip=True) for th in header_tr.find_all("th")]

    print(f"Headers: {headers}")

    data = []
    previous_row = None

    for row in rows:
        # Skip the header row that was actually located, not blindly rows[0].
        if row is header_tr:
            continue

        cols = [col.get_text(strip=True) for col in row.find_all("td")]
        if not cols:
            continue  # Row without <td> cells carries no data

        if cols[0]:
            # A ranked row starts a new entry.
            data.append(cols)
            previous_row = cols
            continue

        if previous_row is None:
            continue  # Continuation row with no ranked row to attach to

        # Pad the ranked row so a wider continuation row cannot index past it.
        previous_row.extend([""] * (len(cols) - len(previous_row)))
        for i, text in enumerate(cols):
            if i == 0 or not text:
                continue  # Skip the Rank column and empty cells
            if previous_row[i]:
                previous_row[i] = f"{previous_row[i]} {text}"
            else:
                previous_row[i] = text  # Avoid a leading space

    # Write to CSV
    with open(output_csv, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(data)

    print(f"Scraped data saved to {output_csv}")


 # Script entry point: scrape the table into the default CSV file.
 if __name__ == "__main__":
    scrape_hebrew_frequency_list(output_csv="hebrew_frequency.csv")
	import requests
	from bs4 import BeautifulSoup
	import csv

	def scrape_hebrew_frequency_list(output_csv="hebrew_frequency.csv", timeout=30):
	    """
	    Scrape the Hebrew frequency table from TeachMeHebrew and save it as CSV.

	    The page's ``<table id="words">`` is read row by row. A row whose first
	    cell (Rank) is empty is treated as a continuation of the most recent
	    ranked row: each of its non-empty cells is appended, space-separated, to
	    the cell in the same column of that row.

	    Args:
	        output_csv: Path of the CSV file to write (UTF-8 encoded).
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        ValueError: If the table or its header row cannot be found.
	    """
	    url = "https://www.teachmehebrew.com/hebrew-frequency-list.html"
	    # Without an explicit timeout a stalled connection would block forever.
	    response = requests.get(url, timeout=timeout)
	    response.raise_for_status()  # Raise an error if request failed

	    # Parse the raw bytes rather than response.text: when the server sends no
	    # charset, requests falls back to ISO-8859-1 and the Hebrew text would be
	    # garbled. Only pass an encoding on when the server actually declared one;
	    # otherwise let BeautifulSoup detect it from the document itself.
	    content_type = response.headers.get("Content-Type", "")
	    declared_encoding = None
	    if "charset" in content_type.lower():
	        declared_encoding = response.encoding
	    soup = BeautifulSoup(
	        response.content, "html.parser", from_encoding=declared_encoding
	    )

	    # Find the table with id="words"
	    table = soup.find("table", {"id": "words"})
	    if table is None:
	        raise ValueError("Could not find the frequency table on the page (id='words').")

	    # Collect the rows once and reuse them for the header and the data pass.
	    rows = table.find_all("tr")
	    print(f"Found table with {len(rows)} rows.")

	    # The header is the first row that contains <th> elements.
	    header_tr = next((row for row in rows if row.find("th")), None)
	    if header_tr is None:
	        raise ValueError("Could not find a header row (no <th> elements).")
	    headers = [th.get_text(strip=True) for th in header_tr.find_all("th")]

	    print(f"Headers: {headers}")

	    data = []
	    previous_row = None

	    for row in rows:
	        # Skip the header row that was actually located, not blindly rows[0].
	        if row is header_tr:
	            continue

	        cols = [col.get_text(strip=True) for col in row.find_all("td")]
	        if not cols:
	            continue  # Row without <td> cells carries no data

	        if cols[0]:
	            # A ranked row starts a new entry.
	            data.append(cols)
	            previous_row = cols
	            continue

	        if previous_row is None:
	            continue  # Continuation row with no ranked row to attach to

	        # Pad the ranked row so a wider continuation row cannot index past it.
	        previous_row.extend([""] * (len(cols) - len(previous_row)))
	        for i, text in enumerate(cols):
	            if i == 0 or not text:
	                continue  # Skip the Rank column and empty cells
	            if previous_row[i]:
	                previous_row[i] = f"{previous_row[i]} {text}"
	            else:
	                previous_row[i] = text  # Avoid a leading space

	    # Write to CSV
	    with open(output_csv, mode="w", encoding="utf-8", newline="") as f:
	        writer = csv.writer(f)
	        writer.writerow(headers)
	        writer.writerows(data)

	    print(f"Scraped data saved to {output_csv}")


	# Script entry point: scrape the table into the default CSV file.
	if __name__ == "__main__":
	    scrape_hebrew_frequency_list("hebrew_frequency.csv")