Created
October 27, 2024 18:36
-
-
Save MarcoForte/e6f9719d3ae9e1592a4066d4fa64f6a9 to your computer and use it in GitHub Desktop.
UKC ticklist to CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from lxml import html | |
import pandas as pd | |
def scrape_table_with_links(url, timeout=30):
    """
    Scrape a UKC ticklist table from a webpage, extracting both text and
    URLs from the cells of each row.

    Args:
        url (str): The webpage URL to scrape.
        timeout (float): Seconds to wait for the HTTP response (default 30).

    Returns:
        pandas.DataFrame or None: DataFrame with one row per climb (plus a
        leading blank row), or None if fetching or parsing failed.
    """
    BASE_URL = 'https://www.ukclimbing.com'
    try:
        # Fetch the webpage. A timeout prevents hanging forever on a
        # dead or unresponsive host.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML
        tree = html.fromstring(response.content)

        data = []
        for row in tree.xpath('//tr'):
            tds = row.xpath('.//td')
            if len(tds) < 9:  # skip header/short rows lacking the full column set
                continue

            # Column 2 holds the climb link; rows without one aren't climbs.
            climb_link = tds[2].xpath('.//a')
            if not climb_link:
                continue

            # Column 8 holds the crag link.
            crag_link = tds[8].xpath('.//a')
            if not crag_link:
                continue

            # Column 3 mixes the grade with star-quality markers, e.g. "VS 4c **".
            grade_text = tds[3].text_content().strip()
            stars = grade_text.count('*')
            grade = grade_text.replace('*', '').strip()

            # Column 6 shows the pitch count with a bullet marker when present;
            # anything without the bullet is treated as "no pitch info".
            pitches = tds[6].text_content().strip()
            pitches = pitches.replace('•', '').strip() if '•' in pitches else ''

            data.append({
                'Add': tds[0].text_content().strip(),
                'Climb name': climb_link[0].text_content().strip(),
                'Climb URL': BASE_URL + climb_link[0].get('href', ''),
                'Grade': grade,
                'Stars': stars,
                'Ticks': tds[4].text_content().strip(),
                'Height': tds[5].text_content().strip(),
                'Pitches': pitches,
                'Crag name': crag_link[0].text_content().strip(),
                'Crag URL': BASE_URL + crag_link[0].get('href', ''),
            })

        # Guard the empty-scrape case: dropna(subset=...) on a columnless
        # DataFrame would raise KeyError and mask the real problem.
        if not data:
            print("Error processing the data: no ticklist rows found")
            return None

        df = pd.DataFrame(data)
        df = df.dropna(subset=['Climb name']).reset_index(drop=True)

        # Add a blank row at the start (kept for spreadsheet-import convenience).
        blank_row = pd.DataFrame([{col: '' for col in df.columns}])
        df = pd.concat([blank_row, df], ignore_index=True)
        return df
    except requests.RequestException as e:
        # Network-level failures (DNS, timeout, HTTP error status, ...).
        print(f"Error fetching the page: {e}")
        return None
    except Exception as e:
        # Unexpected parse/processing failures.
        print(f"Error processing the data: {e}")
        return None
def main():
    """Scrape an example UKC ticklist, print it, and save it as CSV."""
    target = "https://www.ukclimbing.com/logbook/ticklists/weird_rock_tour-8687"  # Replace with your target URL
    result = scrape_table_with_links(target)
    if result is None:
        return
    print("Scraped Data:")
    print(result)
    # Optionally save to CSV
    result.to_csv('scraped_data.csv', index=False)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment