Skip to content

Instantly share code, notes, and snippets.

@MarcoForte
Created October 27, 2024 18:36
Show Gist options
  • Save MarcoForte/e6f9719d3ae9e1592a4066d4fa64f6a9 to your computer and use it in GitHub Desktop.
Save MarcoForte/e6f9719d3ae9e1592a4066d4fa64f6a9 to your computer and use it in GitHub Desktop.
UKC ticklist to CSV
import requests
from lxml import html
import pandas as pd
def scrape_table_with_links(url):
    """
    Scrape a UKC ticklist table from a webpage, extracting both cell text
    and link URLs.

    Only rows with at least 9 <td> cells are considered; the climb link is
    taken from column 3 and the crag link from column 9 (1-based).

    Args:
        url (str): The webpage URL to scrape.

    Returns:
        pandas.DataFrame | None: The scraped data with a leading blank row,
        an empty (column-only) frame if no matching rows were found, or
        None if fetching/parsing failed.
    """
    BASE_URL = 'https://www.ukclimbing.com'
    COLUMNS = ['Add', 'Climb name', 'Climb URL', 'Grade', 'Stars', 'Ticks',
               'Height', 'Pitches', 'Crag name', 'Crag URL']
    try:
        # Fetch the webpage. The timeout prevents the script hanging
        # forever on an unresponsive server (the original had none).
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Parse the HTML and walk every table row.
        tree = html.fromstring(response.content)
        rows = tree.xpath('//tr')

        data = []
        for row in rows:
            tds = row.xpath('.//td')
            if len(tds) < 9:  # header rows / short rows lack data columns
                continue
            # Climb link and name (column 3); skip rows without one.
            climb_link = tds[2].xpath('.//a')
            if not climb_link:
                continue
            # Crag link and name (column 9); skip rows without one.
            crag_link = tds[8].xpath('.//a')
            if not crag_link:
                continue
            # The grade cell mixes the grade text with '*' quality stars:
            # count the stars, then strip them out of the grade.
            grade_text = tds[3].text_content().strip()
            stars = grade_text.count('*')
            grade = grade_text.replace('*', '').strip()
            # The pitch count is only meaningful when the cell carries a
            # bullet marker; otherwise treat it as absent.
            pitches = tds[6].text_content().strip()
            if '•' in pitches:
                pitches = pitches.replace('•', '').strip()
            else:
                pitches = ''
            data.append({
                'Add': tds[0].text_content().strip(),
                'Climb name': climb_link[0].text_content().strip(),
                'Climb URL': BASE_URL + climb_link[0].get('href', ''),
                'Grade': grade,
                'Stars': stars,
                'Ticks': tds[4].text_content().strip(),
                'Height': tds[5].text_content().strip(),
                'Pitches': pitches,
                'Crag name': crag_link[0].text_content().strip(),
                'Crag URL': BASE_URL + crag_link[0].get('href', ''),
            })

        if not data:
            # No matching rows: return an empty frame with the expected
            # columns instead of crashing on the dropna() lookup below
            # (an empty DataFrame has no 'Climb name' column).
            return pd.DataFrame(columns=COLUMNS)

        df = pd.DataFrame(data)
        # Defensive filter kept from the original; values here are always
        # strings so this is effectively a no-op.
        df = df.dropna(subset=['Climb name']).reset_index(drop=True)
        # Add a blank row at the start (preserved output format).
        blank_row = pd.DataFrame([{col: '' for col in df.columns}])
        return pd.concat([blank_row, df], ignore_index=True)
    except Exception as e:
        # Broad catch is deliberate for this best-effort script: report
        # the failure once (the original printed the same error twice).
        print(f"Error processing the data: {e}")
        return None
def main():
    """Scrape the example ticklist, print it, and save it to CSV."""
    # Example usage — replace with your target URL.
    url = "https://www.ukclimbing.com/logbook/ticklists/weird_rock_tour-8687"
    result = scrape_table_with_links(url)
    if result is None:
        return
    print("Scraped Data:")
    print(result)
    # Optionally save to CSV
    result.to_csv('scraped_data.csv', index=False)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment