Skip to content

Instantly share code, notes, and snippets.

@ishashankverma
Created February 9, 2018 04:23
Show Gist options
  • Save ishashankverma/585b829341fd95d0da94ffe2086f83ad to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup as bs
import requests
import re
import csv
# Years to scrape; range's end is exclusive, so range(1987, 1990)
# would search 1987, 88, and 89. Current span: 1987 through 2017.
url_list = [
    f"http://nflcombineresults.com/nflcombinedata_expanded.php?year={num}&pos=&college="
    for num in range(1987, 2018)
]
# print(url_list)

# Destination file for the combined player data.
filename = "playercombinedata.csv"
def openURL(input_list):
    """Download and parse every page in input_list.

    Returns a list of BeautifulSoup documents, one per URL, in the
    same order as the input.
    """
    # NOTE(review): no error handling here — a failed request raises
    # and aborts the whole run, same as the original.
    return [bs(requests.get(url).content, 'html.parser') for url in input_list]
def getHeaderData(soup):
    """Extract the header row(s) from the first table on the page.

    Finds the table's <thead>, reads each <tr>, and returns a list of
    rows, each a list of stripped, non-empty cell strings (blank spacer
    cells are dropped — this is the header, so empties carry no data).

    Bug fix: the original returned `data` from inside a nested loop over
    the rows it had just collected, which silently returned None whenever
    the header was empty (and was dead code otherwise); the data is now
    returned unconditionally.
    """
    table = soup.find('table')
    table_header = table.find('thead')
    data = []
    for row in table_header.find_all('tr'):
        cols = [ele.text.strip() for ele in row.find_all('td')]
        # Keep only non-empty cells for the header.
        data.append([ele for ele in cols if ele])
    return data
def getPlayerData(soup):
    """Extract the player rows from the first table's <tbody>.

    Returns a list of rows, each a list of stripped cell strings.
    Empty cells are kept (unlike the header) so columns stay aligned
    across rows.

    Bug fix: the original returned `data` from inside a nested loop over
    the rows it had just collected, which silently returned None whenever
    the table body was empty; the data is now returned unconditionally.
    """
    table = soup.find('table')
    table_body = table.find('tbody')
    data = []
    for row in table_body.find_all('tr'):
        data.append([ele.text.strip() for ele in row.find_all('td')])
    return data
def ExportToCSV(data, header, out_file=None):
    """Write the header rows and every scraped page to a CSV file.

    Parameters:
        data: list of pages, each page a list of rows, each row a list
            of cell strings (the shape produced by getPlayerData()).
        header: list of header rows (lists of strings).
        out_file: destination path. Defaults to the module-level
            `filename`, preserving the original call signature.
    """
    if out_file is None:
        out_file = filename
    # newline='' stops the csv module from emitting blank lines on Windows.
    with open(out_file, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        # Header rows first, then every page's rows in order.
        writer.writerows(header)
        for page in data:
            writer.writerows(page)
    print('Player Data data variable has been saved to file', out_file)
# Fetch every year's page, grab the header once (all pages share the
# same header), collect each page's player rows, then export everything.
soup = openURL(url_list)
multi_pages = []
header = []
for page in soup:
    if not header:
        header = getHeaderData(page)
    multi_pages.append(getPlayerData(page))
# print(header)
ExportToCSV(multi_pages, header)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment