from bs4 import BeautifulSoup as bs
import requests
import csv
url_list = []
### Choose which years you want to scrape.
### range() excludes the stop value, so add one to the last year you want
### (e.g. range(1987, 1990) searches 1987, 1988, and 1989).
for num in range(1987, 2018):
    url_list.append(f"http://nflcombineresults.com/nflcombinedata_expanded.php?year={num}&pos=&college=")
# print(url_list)

filename = "playercombinedata.csv"
def openURL(input_list):
    # Fetch and parse each URL; returns a list with one parsed page per year
    new_list = []
    for url in input_list:
        r = requests.get(url)
        page_text = bs(r.content, 'html.parser')
        # Append the parsed page to the list
        new_list.append(page_text)
    # print(new_list)
    return new_list
def getHeaderData(soup):
    ### Locates the data table in a parsed page returned by openURL(),
    ### then loads the table header into the multidim. list data[]
    table = soup.find('table')
    table_header = table.find('thead')
    data = []
    rows = table_header.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Skip empty cells; the header row contains blank filler cells
    return data
def getPlayerData(soup):
    # Locates the data table in a parsed page returned by openURL(),
    # then loads the table body into the multidim. list data[]
    table = soup.find('table')
    table_body = table.find('tbody')
    data = []
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append(cols)
    return data
def ExportToCSV(data, header):
    # newline='' stops csv.writer from adding blank lines between rows on Windows
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        for h in header:
            # Write the header row(s) at the top of the file
            writer.writerow(h)
        for page in data:
            # Each page holds one year's player rows; write them row by row
            for row in page:
                writer.writerow(row)
    print('Player data has been saved to file', filename)
soup_list = openURL(url_list)
# List to store the data from multiple pages
multi_pages = []
header = []
for page in soup_list:
    if not header:
        header = getHeaderData(page)  # Grab the header from the first page; all pages share the same headers
    player_data = getPlayerData(page)
    multi_pages.append(player_data)
# print(header)
ExportToCSV(multi_pages, header)
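
# Optional sanity check, a minimal sketch: read the CSV back and print the
# header plus the first few player rows to verify the export worked
# (assumes the script above completed and wrote playercombinedata.csv).
with open(filename, newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        print(row)
        if i >= 5:  # The header plus five data rows is enough for a spot check
            break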