'''
Example of web scraping using Python and BeautifulSoup.

Scraping ESPN College Football recruiting data:
http://www.espn.com/college-sports/football/recruiting/databaseresults/_/sportid/24/class/2006/sort/school/starsfilter/GT/ratingfilter/GT/statuscommit/Commitments/statusuncommit/Uncommited

The script loops through a defined number of pages to extract footballer data.
'''
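# The parsing below assumes the page structure ESPN used at the time: results
# are rendered as an HTML table whose data rows carry an "oddrow" or "evenrow"
# class, the recruit's name sits in a <div class="name"> link, and hometown,
# position, and grade occupy fixed column positions. If ESPN changes that
# markup, the selectors in getlistings() need to be updated accordingly.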
from bs4 import BeautifulSoup
import requests
import os
import csv
import time
def writerows(rows, filename):
    # newline='' stops the csv module from inserting blank lines on Windows
    with open(filename, 'a', encoding='utf-8', newline='') as toWrite:
        writer = csv.writer(toWrite)
        writer.writerows(rows)
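# A minimal usage sketch for writerows (the values below are invented
# placeholders, not real ESPN data): each inner list becomes one line of
# footballers.csv.
#
#     writerows([["John Doe", "Example High", "Sampletown, TX", "QB", "80"]],
#               "footballers.csv")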
def getlistings(listingurl):
    '''
    Scrape footballer data from the page and return it as a list of rows.
    '''
    # prepare headers so the request looks like a regular browser visit
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    # fetch the url, raising an error if the operation fails
    try:
        response = requests.get(listingurl, headers=headers)
    except requests.exceptions.RequestException as e:
        print(e)
        raise SystemExit  # exit() is intended for the interactive shell only

    soup = BeautifulSoup(response.text, "html.parser")
    listings = []
    # loop through the table rows and pull data out of the columns
    for row in soup.find_all("tr"):
        # data rows carry either the "oddrow" or "evenrow" class;
        # row.get() avoids a KeyError on rows with no class attribute
        if ("oddrow" in row.get("class", [])) or ("evenrow" in row.get("class", [])):
            name = row.find("div", class_="name").a.get_text()

            # the hometown column reads "City, ST School"; the comma plus the
            # two-letter state code marks where the city part ends
            hometown = row.find_all("td")[1].get_text()
            school = hometown[hometown.find(",") + 4:]
            city = hometown[:hometown.find(",") + 4]

            position = row.find_all("td")[2].get_text()
            grade = row.find_all("td")[4].get_text()

            # append data to the list
            listings.append([name, school, city, position, grade])

    return listings
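# Sketch of getlistings' return value for a page with two data rows (the
# names and values are invented placeholders, not real ESPN records):
#
#     [['John Doe', 'Example High', 'Sampletown, TX', 'QB', '80'],
#      ['Jane Roe', 'Sample Academy', 'Exampleville, FL', 'WR', '79']]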
if __name__ == "__main__":
    '''
    Set the CSV file name.
    Remove the file if it already exists to ensure a fresh start.
    '''
    filename = "footballers.csv"
    if os.path.exists(filename):
        os.remove(filename)
    '''
    The url to fetch consists of three parts:
    baseurl, page number, and the remaining url
    (the class year is embedded in the remaining url).
    '''
    baseurl = "http://www.espn.com/college-sports/football/recruiting/databaseresults/_/page/"
    page = 1
    parturl = "/sportid/24/class/2006/sort/school/starsfilter/GT/ratingfilter/GT/statuscommit/Commitments/statusuncommit/Uncommited"
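    # For page 1 the assembled url is therefore:
    # http://www.espn.com/college-sports/football/recruiting/databaseresults/_/page/1/sportid/24/class/2006/sort/school/starsfilter/GT/ratingfilter/GT/statuscommit/Commitments/statusuncommit/Uncommited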
    # scrape pages 1 through 258
    while page < 259:
        listingurl = baseurl + str(page) + parturl
        listings = getlistings(listingurl)

        # write to CSV
        writerows(listings, filename)

        # take a break between requests
        time.sleep(3)
        page += 1

    if page > 1:
        print("Listings fetched successfully.")