Last active
March 4, 2016 16:54
-
-
Save vinovator/fc8ac49fd81717db854d to your computer and use it in GitHub Desktop.
Scrap Workd T20 schedule from ICC website using BeautifulSoup & Requests and format the excel output
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# world_t20_itinerary.py | |
# Python 2.7.6 | |
""" | |
Scrap Workd T20 schedule from ICC website using BeautifulSoup & Requests | |
Load the schedule into an excel file using pandas | |
Format the excel file using openpyxl | |
- Apply border, wrap text and color headers | |
- Highlight India matches | |
""" | |
import requests | |
from BeautifulSoup import BeautifulSoup | |
import pandas as pd | |
import openpyxl as op | |
# Contstants | |
url = "http://www.espncricinfo.com/icc-world-twenty20-2016/content/series/901359.html?template=fixtures" | |
xl_name = "World T20 Schedule.xlsx" | |
sheet_name = "Itinerary" | |
def scrap_schedule(): | |
""" | |
Scrap WT20 schedule from cricinfo website | |
""" | |
resp = requests.get(url) | |
# Scrap schedule | |
soup = BeautifulSoup(resp.content) | |
# print soup.prettify() | |
schedule_ul = soup.findAll("ul", {"class": "large-20 columns"}) | |
match_list = list() | |
for section in schedule_ul: | |
schedule_li = section.findAll( | |
"li", {"class": "large-20 medium-20 columns"}) | |
for li in schedule_li: | |
match = dict() | |
divs = li.findAll("div") | |
for div in divs: | |
# Match date and time details | |
if (div["class"] == "large-5 medium-5 small-20 columns"): | |
fixture = div.findAll("span", {"class": "fixture_date"}) | |
match["date"] = fixture[0].getText() | |
match["time"] = fixture[1].getText().replace(" ", " ") | |
local_time = div.getText().split("(")[-1] | |
local_time = local_time.split(")")[0].replace( | |
" ", " ") | |
match["local time"] = local_time | |
# Teams and venue details | |
elif (div["class"] == "large-11 medium-11 small-20 columns"): | |
# print div.getText() | |
spans = div.findAll("span") | |
match["match details"] = spans[0].getText() | |
match["venue"] = spans[1].getText() | |
match["live"] = spans[2].getText() | |
if len(spans) > 3: | |
match["match type"] = spans[3].getText() | |
else: | |
match["match type"] = "D" | |
# This div is just there for empty space; so pass | |
elif (div["class"] == | |
"large-4 medium-4 columns hide-for-small"): | |
pass | |
match_list.append(match) | |
return match_list # List of all matches | |
def load_to_excel(match_list): | |
""" | |
Load the match list into excel file using pandas | |
""" | |
# Load list to dataframe | |
wt20_df = pd.DataFrame(match_list) | |
wt20_df.index += 1 # So that the excel column starts from 1 | |
# Write dataframe to excel | |
xlw = pd.ExcelWriter(xl_name) | |
wt20_df.to_excel(xlw, sheet_name=sheet_name, index_label="#") | |
xlw.save() | |
def format_excel(xl, sheet="Sheet1"): | |
""" | |
Get the excel file path and format the file | |
If no sheet name is passed, by default take Sheet1 | |
""" | |
# Open the excel file | |
wb = op.load_workbook(xl) | |
ws = wb.get_sheet_by_name(sheet) | |
# Freeze panes | |
ws.freeze_panes = "B2" | |
# Adjust column width | |
cols = ("A", "E", "H") | |
widths = (5, 50, 40) | |
for combo in zip(cols, widths): | |
ws.column_dimensions[combo[0]].width = combo[1] | |
# define color formmatting | |
blue_fill = op.styles.PatternFill(start_color="00aadd", | |
fill_type='solid') | |
green_fill = op.styles.PatternFill(start_color="00ff00", | |
fill_type='solid') | |
# define border style | |
thin_border = op.styles.borders.Border(left=op.styles.Side(style='thin'), | |
right=op.styles.Side(style='thin'), | |
top=op.styles.Side(style='thin'), | |
bottom=op.styles.Side(style='thin')) | |
# define Text wrap | |
text_wrap = op.styles.Alignment(wrap_text=True) | |
# Format the header row | |
for row in range(1, 2): # Loop only the 1st row | |
for col in range(1, ws.max_column + 1): # loop through all columns | |
ws.cell(row=row, column=col).fill = blue_fill | |
# Format all cells | |
for row in ws.iter_rows(): | |
for cell in row: | |
# Draw borders | |
cell.border = thin_border | |
# Wrap all columns | |
cell.alignment = text_wrap | |
# Highlight India matches | |
if (cell.col_idx == 5) and (cell.value.find("India") > 0): | |
cell.fill = green_fill | |
# Save back as same file name | |
wb.save(xl) | |
if __name__ == "__main__": | |
""" Starting block """ | |
# Scrap the schedule from cricinfo website | |
schedule = scrap_schedule() | |
# Load the match list to excel | |
load_to_excel(schedule) | |
# Format the excel file | |
format_excel(xl_name, sheet_name) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment