Created
March 29, 2017 09:20
-
-
Save vinovator/18c681a71c356ce42e9473c3d9ed579d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# uk_mba.py | |
# Python 2.7.6 | |
""" | |
Extract business schools in UK with AACSB, AMBA and/or EQUIS accredition only | |
Scapring from http://find-mba.com/ | |
""" | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import openpyxl as op | |
base_url = "http://find-mba.com" | |
start_url = "/schools/uk-ireland/uk?keyword=&rank=false&accredition=true&cities=&specs=&sort=popularity&numberperpage=50&page=1#list-school" | |
xl_name = "UK Bschools.xlsx" | |
sheet_name = "info" | |
# Declaring the list as public parameter to avoid being overwritten | |
# with each recursive call | |
school_list = list() | |
def get_basic_info(url):
    """
    Fetch basic business-school information from the listing pages.

    Walks the paginated listing iteratively (the original recursed once
    per page, which risks hitting Python's recursion limit on long
    listings) and appends one dict per school to the module-level
    ``school_list``.

    Args:
        url: absolute URL of the first listing page to scrape.

    Returns:
        The module-level ``school_list``: dicts with keys "Name", "URL",
        "Location", "Description" and "Programs offered".
    """
    while url:
        print("Fetching basic info from " + url)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "lxml")
        for school in soup.findAll("div", {"class": "row school-list-item"}):
            school_dict = dict()
            row = school.find("div", {"class": "row"})
            title_div = row.find("div", {"class": "col-xs-11 school-list-title"})
            school_dict["Name"] = title_div("a")[0].get("title").encode("utf-8")
            school_dict["URL"] = title_div("a")[0].get("href")
            detail = school.find("div", {"class": "school-list-details"})
            location_span = detail.find("span", {"class": "school-list-location"})
            school_dict["Location"] = location_span.getText().encode("utf-8")
            program = school.find("div", {"class": ""})
            desc_p = program("div")[4].find("p")
            if desc_p:  # Sometimes the description could be blank
                school_dict["Description"] = desc_p.getText().encode("utf-8")
            else:
                school_dict["Description"] = ""
            offer_p = program("div")[6]
            if offer_p:
                # Text looks like "Programs: MBA, Executive MBA"; keep
                # only the part after the colon.
                school_dict["Programs offered"] = offer_p.getText().split(
                    ":")[1].strip().encode("utf-8")
            else:
                school_dict["Programs offered"] = ""
            school_list.append(school_dict)
        # The mobile ("visible-xs") pager holds the next-page link; it
        # is absent on the last page, which ends the loop.
        next_page_div = soup.find("div", {"class": "visible-xs"})
        if next_page_div:
            url = base_url + next_page_div("a")[0].get("href")
        else:
            url = None
    return school_list
def get_detailed_info(df):
    """
    Fetch detailed information from the page of each school.

    Args:
        df: DataFrame with a "URL" column of school detail-page URLs.

    Returns:
        A list of dicts, one per URL, each carrying the "URL" plus the
        summary-profile statistics scraped from that school's page.
    """
    school_det_list = list()
    for url in df["URL"]:  # df["URL"] returns a pandas Series
        school_det_dict = dict()
        school_det_dict["URL"] = url
        print("Extracting info from " + url)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "lxml")
        # Extract summary profile information; some schools have no
        # details section at all, so guard before drilling in.
        full_div = soup.find("div", {"class": "row school-details pattern"})
        if full_div:
            stats_div = full_div.find(
                "div", {"class": "col-xs-6 school-details-stats"})
            # Each <p> holds a label span and a value span; use the
            # label text as the dict key.
            for p in stats_div.findAll("p"):
                spans = p.findAll("span")
                key = spans[0].getText().strip().replace("\n", " ").encode("utf-8")
                val = spans[1].getText().strip().replace("\n", " ").encode("utf-8")
                school_det_dict[key] = val
            other_details_div = full_div.findAll("div", {"class": "col-xs-6"})
            school_det_dict["Other Details"] = \
                other_details_div[0].getText().encode("utf-8")
        # NOTE: a dead, string-quoted per-program extraction block was
        # removed here; restore from history if per-program data is
        # ever needed again.
        school_det_list.append(school_det_dict)
    return school_det_list
def load_to_excel(df):
    """
    Write the school DataFrame to the configured excel workbook.

    Args:
        df: combined DataFrame of school information; it is NOT
            modified (the original shifted ``df.index`` in place,
            mutating the caller's frame).
    """
    # Work on a copy so the caller's DataFrame keeps its index.
    out = df.copy()
    out.index += 1  # So that the excel "#" column starts from 1
    # Write dataframe to excel
    xlw = pd.ExcelWriter(xl_name, engine="openpyxl")
    out.to_excel(xlw, sheet_name=sheet_name, index_label="#", encoding="utf-8")
    xlw.save()
def format_excel(xl, sheet="Sheet1"):
    """
    Open the excel file at path ``xl`` and format it in place.

    Args:
        xl: path of the workbook to format (saved back under the
            same name).
        sheet: worksheet name to format; defaults to "Sheet1".

    Applies: frozen panes at C2, fixed column widths, a blue header
    row, thin borders and text wrap on every cell, and a green
    highlight on column E cells mentioning "Executive".
    """
    wb = op.load_workbook(xl)
    # Subscript access replaces the deprecated get_sheet_by_name().
    ws = wb[sheet]
    # Freeze the header row and the first two columns.
    ws.freeze_panes = "C2"
    # Adjust column widths for the #/Name/Location/Description/... layout.
    for col, width in zip(("A", "B", "C", "D", "E", "F"),
                          (5, 20, 15, 50, 20, 30)):
        ws.column_dimensions[col].width = width
    # Colour fills for the header row and the highlight rule.
    blue_fill = op.styles.PatternFill(start_color="00aadd",
                                      fill_type='solid')
    green_fill = op.styles.PatternFill(start_color="00ff00",
                                       fill_type='solid')
    # Border and text-wrap styles shared by every cell.
    thin = op.styles.Side(style='thin')
    thin_border = op.styles.borders.Border(left=thin, right=thin,
                                           top=thin, bottom=thin)
    text_wrap = op.styles.Alignment(wrap_text=True)
    # Format the header row (row 1 only).
    for col in range(1, ws.max_column + 1):
        ws.cell(row=1, column=col).fill = blue_fill
    # Format all cells.
    for row in ws.iter_rows():
        for cell in row:
            cell.border = thin_border
            cell.alignment = text_wrap
            # Highlight schools offering an executive MBA. Guard
            # against empty (None) cells -- blank descriptions made
            # the original cell.value.find() raise AttributeError.
            if (cell.col_idx == 5 and cell.value is not None
                    and "Executive" in str(cell.value)):
                cell.fill = green_fill
    # Save back as same file name.
    wb.save(xl)
if __name__ == "__main__": | |
""" Starting block """ | |
# Fetch basic inforamtion for the 1st page | |
schools_basic = get_basic_info(base_url + start_url) | |
# Load basic info list to dataframe | |
basic_df = pd.DataFrame(schools_basic) | |
schools_det = get_detailed_info(basic_df) | |
# Load details info list to dataframe | |
det_df = pd.DataFrame(schools_det) | |
combined_df = pd.merge( | |
left=basic_df[ | |
["Name", "Location", "Description", "Programs offered", "URL"]], | |
right=det_df, how="inner", on=["URL"]) | |
# Load the list to an excel file | |
load_to_excel(combined_df) | |
# Format the excel file | |
format_excel(xl_name, sheet_name) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment