rneiss · January 13, 2025 23:11
diff --git a/klein_scrape.py b/klein_scrape.py
 import requests
 import json
 import time
 import csv
 import urllib.parse

 def fetch_data(url, retries=5, backoff_factor=1, timeout=30):
    """Fetch ``url`` with HTTP GET and return its decoded JSON body.

    Failed attempts are retried with exponential backoff.

    Args:
        url: Address to request.
        retries: Maximum number of attempts before giving up.
        backoff_factor: Multiplier for the delay between attempts; the wait
            after failed attempt ``n`` (1-based) is
            ``backoff_factor * 2 ** n`` seconds.
        timeout: Seconds to wait on the server before the attempt counts as
            failed. Without a timeout a stalled connection would block the
            whole scrape indefinitely.

    Returns:
        The parsed JSON payload of a 200 response, or ``None`` when the
        server answers with a status that is neither 200 nor 4xx/5xx, or
        when every attempt fails.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError is included because a malformed JSON body is
            # presumably reported that way by older requests releases rather
            # than as a RequestException.
            print(f"Request failed with error: {e}. Retrying...")
        else:
            if 400 <= response.status_code < 600:
                print(f"Received error {response.status_code} from {url}. Retrying...")
            else:
                print(f"Received unexpected status {response.status_code} from {url}.")
                return None

        # Back off only when another attempt will follow; sleeping after the
        # final failure would just delay the error report.
        if attempt < retries:
            sleep_time = backoff_factor * (2 ** attempt)  # Exponential backoff
            print(f"Retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)

    print(f"Failed to fetch data from {url} after {retries} attempts.")
    return None

 def main():
    """Walk the Klein Dictionary on Sefaria and append each entry to output.csv.

    Starts at the first entry and follows the ``next`` reference returned
    with each response until no further reference is returned or a fetch
    fails.
    """
    base_url = "https://www.sefaria.org/api/v3/texts/"
    initial_url = base_url + "Klein_Dictionary,_א.1"

    def write_to_csv(ref, text):
        """Append one ``(ref, text)`` row to output.csv."""
        # The file is reopened in append mode for every row, so rows already
        # scraped are closed out to disk before the next request is made.
        with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([ref.replace('Klein Dictionary, ', ''), text])

    def process_data(url):
        """Fetch ``url``, store its entry, and return the next ref (or None)."""
        data = fetch_data(url)
        if not data:
            return None
        try:
            ref = data['ref']
            text = data['versions'][0]['text'][0]
        except (KeyError, IndexError, TypeError):
            # A response without a ref, version or text must not abort the
            # whole walk; report it and move on to the next entry.
            print(f"No text found in response from {url}. Skipping.")
        else:
            write_to_csv(ref, text)
        return data.get('next')

    next_ref = process_data(initial_url)
    while next_ref:
        # Percent-encode the ref so characters such as '?', '#' or '%' are
        # sent as part of the path instead of being parsed as URL syntax.
        next_url = base_url + urllib.parse.quote(next_ref)

        print(f"Fetching {next_url}")
        next_ref = process_data(next_url)

 # Run the scraper only when this file is executed as a script, so importing
 # the module does not trigger any requests.
 if __name__ == "__main__":
    main()
	import requests
	import json
	import time
	import csv
	import urllib.parse

	def fetch_data(url, retries=5, backoff_factor=1, timeout=30):
	    """Fetch ``url`` with HTTP GET and return its decoded JSON body.

	    Failed attempts are retried with exponential backoff.

	    Args:
	        url: Address to request.
	        retries: Maximum number of attempts before giving up.
	        backoff_factor: Multiplier for the delay between attempts; the wait
	            after failed attempt ``n`` (1-based) is
	            ``backoff_factor * 2 ** n`` seconds.
	        timeout: Seconds to wait on the server before the attempt counts as
	            failed. Without a timeout a stalled connection would block the
	            whole scrape indefinitely.

	    Returns:
	        The parsed JSON payload of a 200 response, or ``None`` when the
	        server answers with a status that is neither 200 nor 4xx/5xx, or
	        when every attempt fails.
	    """
	    for attempt in range(1, retries + 1):
	        try:
	            response = requests.get(url, timeout=timeout)
	            if response.status_code == 200:
	                return response.json()
	        except (requests.RequestException, ValueError) as e:
	            # ValueError is included because a malformed JSON body is
	            # presumably reported that way by older requests releases rather
	            # than as a RequestException.
	            print(f"Request failed with error: {e}. Retrying...")
	        else:
	            if 400 <= response.status_code < 600:
	                print(f"Received error {response.status_code} from {url}. Retrying...")
	            else:
	                print(f"Received unexpected status {response.status_code} from {url}.")
	                return None

	        # Back off only when another attempt will follow; sleeping after the
	        # final failure would just delay the error report.
	        if attempt < retries:
	            sleep_time = backoff_factor * (2 ** attempt)  # Exponential backoff
	            print(f"Retrying in {sleep_time} seconds...")
	            time.sleep(sleep_time)

	    print(f"Failed to fetch data from {url} after {retries} attempts.")
	    return None

	def main():
	    """Walk the Klein Dictionary on Sefaria and append each entry to output.csv.

	    Starts at the first entry and follows the ``next`` reference returned
	    with each response until no further reference is returned or a fetch
	    fails.
	    """
	    base_url = "https://www.sefaria.org/api/v3/texts/"
	    initial_url = base_url + "Klein_Dictionary,_א.1"

	    def write_to_csv(ref, text):
	        """Append one ``(ref, text)`` row to output.csv."""
	        # The file is reopened in append mode for every row, so rows already
	        # scraped are closed out to disk before the next request is made.
	        with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
	            writer = csv.writer(csvfile)
	            writer.writerow([ref.replace('Klein Dictionary, ', ''), text])

	    def process_data(url):
	        """Fetch ``url``, store its entry, and return the next ref (or None)."""
	        data = fetch_data(url)
	        if not data:
	            return None
	        try:
	            ref = data['ref']
	            text = data['versions'][0]['text'][0]
	        except (KeyError, IndexError, TypeError):
	            # A response without a ref, version or text must not abort the
	            # whole walk; report it and move on to the next entry.
	            print(f"No text found in response from {url}. Skipping.")
	        else:
	            write_to_csv(ref, text)
	        return data.get('next')

	    next_ref = process_data(initial_url)
	    while next_ref:
	        # Percent-encode the ref so characters such as '?', '#' or '%' are
	        # sent as part of the path instead of being parsed as URL syntax.
	        next_url = base_url + urllib.parse.quote(next_ref)

	        print(f"Fetching {next_url}")
	        next_ref = process_data(next_url)

	# Run the scraper only when this file is executed as a script, so importing
	# the module does not trigger any requests.
	if __name__ == "__main__":
	    main()