Created
October 27, 2024 18:36
-
-
Save MarcoForte/e6f9719d3ae9e1592a4066d4fa64f6a9 to your computer and use it in GitHub Desktop.
UKC ticklist to CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from lxml import html | |
import pandas as pd | |
def scrape_table_with_links(url, timeout=30):
    """
    Scrape a UKC ticklist table from a webpage, extracting both text and
    URLs from the cells of each row.

    Args:
        url (str): The webpage URL to scrape.
        timeout (float): Seconds to wait for the HTTP response (default 30).

    Returns:
        pandas.DataFrame or None: DataFrame with one row per climb (plus a
        leading blank row), or None if fetching or parsing failed.
    """
    BASE_URL = 'https://www.ukclimbing.com'
    try:
        # Fetch the webpage. A timeout prevents hanging forever on a
        # dead or unresponsive host.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML
        tree = html.fromstring(response.content)

        data = []
        for row in tree.xpath('//tr'):
            tds = row.xpath('.//td')
            if len(tds) < 9:  # skip header/short rows lacking the full column set
                continue

            # Column 2 holds the climb link; rows without one aren't climbs.
            climb_link = tds[2].xpath('.//a')
            if not climb_link:
                continue

            # Column 8 holds the crag link.
            crag_link = tds[8].xpath('.//a')
            if not crag_link:
                continue

            # Column 3 mixes the grade with star-quality markers, e.g. "VS 4c **".
            grade_text = tds[3].text_content().strip()
            stars = grade_text.count('*')
            grade = grade_text.replace('*', '').strip()

            # Column 6 shows the pitch count with a bullet marker when present;
            # anything without the bullet is treated as "no pitch info".
            pitches = tds[6].text_content().strip()
            pitches = pitches.replace('•', '').strip() if '•' in pitches else ''

            data.append({
                'Add': tds[0].text_content().strip(),
                'Climb name': climb_link[0].text_content().strip(),
                'Climb URL': BASE_URL + climb_link[0].get('href', ''),
                'Grade': grade,
                'Stars': stars,
                'Ticks': tds[4].text_content().strip(),
                'Height': tds[5].text_content().strip(),
                'Pitches': pitches,
                'Crag name': crag_link[0].text_content().strip(),
                'Crag URL': BASE_URL + crag_link[0].get('href', ''),
            })

        # Guard the empty-scrape case: dropna(subset=...) on a columnless
        # DataFrame would raise KeyError and mask the real problem.
        if not data:
            print("Error processing the data: no ticklist rows found")
            return None

        df = pd.DataFrame(data)
        df = df.dropna(subset=['Climb name']).reset_index(drop=True)

        # Add a blank row at the start (kept for spreadsheet-import convenience).
        blank_row = pd.DataFrame([{col: '' for col in df.columns}])
        df = pd.concat([blank_row, df], ignore_index=True)
        return df
    except requests.RequestException as e:
        # Network-level failures (DNS, timeout, HTTP error status, ...).
        print(f"Error fetching the page: {e}")
        return None
    except Exception as e:
        # Unexpected parse/processing failures.
        print(f"Error processing the data: {e}")
        return None
def main():
    """Scrape an example UKC ticklist, print it, and save it as CSV."""
    target = "https://www.ukclimbing.com/logbook/ticklists/weird_rock_tour-8687"  # Replace with your target URL
    result = scrape_table_with_links(target)
    if result is None:
        return
    print("Scraped Data:")
    print(result)
    # Optionally save to CSV
    result.to_csv('scraped_data.csv', index=False)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment