Created
March 29, 2017 09:20
-
-
Save vinovator/18c681a71c356ce42e9473c3d9ed579d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# uk_mba.py | |
# Python 2.7.6 | |
""" | |
Extract business schools in UK with AACSB, AMBA and/or EQUIS accredition only | |
Scapring from http://find-mba.com/ | |
""" | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import openpyxl as op | |
base_url = "http://find-mba.com" | |
start_url = "/schools/uk-ireland/uk?keyword=&rank=false&accredition=true&cities=&specs=&sort=popularity&numberperpage=50&page=1#list-school" | |
xl_name = "UK Bschools.xlsx" | |
sheet_name = "info" | |
# Declaring the list as public parameter to avoid being overwritten | |
# with each recursive call | |
school_list = list() | |
def get_basic_info(url):
    """
    Fetch basic business-school information from the listing pages.

    Walks the paginated listing iteratively (the original recursed once
    per page, which risks hitting Python's recursion limit on long
    listings) and appends one dict per school to the module-level
    ``school_list``.

    Args:
        url: absolute URL of the first listing page to scrape.

    Returns:
        The module-level ``school_list``: dicts with keys "Name", "URL",
        "Location", "Description" and "Programs offered".
    """
    while url:
        print("Fetching basic info from " + url)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "lxml")
        for school in soup.findAll("div", {"class": "row school-list-item"}):
            school_dict = dict()
            row = school.find("div", {"class": "row"})
            title_div = row.find("div", {"class": "col-xs-11 school-list-title"})
            school_dict["Name"] = title_div("a")[0].get("title").encode("utf-8")
            school_dict["URL"] = title_div("a")[0].get("href")
            detail = school.find("div", {"class": "school-list-details"})
            location_span = detail.find("span", {"class": "school-list-location"})
            school_dict["Location"] = location_span.getText().encode("utf-8")
            program = school.find("div", {"class": ""})
            desc_p = program("div")[4].find("p")
            if desc_p:  # Sometimes the description could be blank
                school_dict["Description"] = desc_p.getText().encode("utf-8")
            else:
                school_dict["Description"] = ""
            offer_p = program("div")[6]
            if offer_p:
                # Text looks like "Programs: MBA, Executive MBA"; keep
                # only the part after the colon.
                school_dict["Programs offered"] = offer_p.getText().split(
                    ":")[1].strip().encode("utf-8")
            else:
                school_dict["Programs offered"] = ""
            school_list.append(school_dict)
        # The mobile ("visible-xs") pager holds the next-page link; it
        # is absent on the last page, which ends the loop.
        next_page_div = soup.find("div", {"class": "visible-xs"})
        if next_page_div:
            url = base_url + next_page_div("a")[0].get("href")
        else:
            url = None
    return school_list
def get_detailed_info(df):
    """
    Fetch detailed information from the page of each school.

    Args:
        df: DataFrame with a "URL" column of school detail-page URLs.

    Returns:
        A list of dicts, one per URL, each carrying the "URL" plus the
        summary-profile statistics scraped from that school's page.
    """
    school_det_list = list()
    for url in df["URL"]:  # df["URL"] returns a pandas Series
        school_det_dict = dict()
        school_det_dict["URL"] = url
        print("Extracting info from " + url)
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, "lxml")
        # Extract summary profile information; some schools have no
        # details section at all, so guard before drilling in.
        full_div = soup.find("div", {"class": "row school-details pattern"})
        if full_div:
            stats_div = full_div.find(
                "div", {"class": "col-xs-6 school-details-stats"})
            # Each <p> holds a label span and a value span; use the
            # label text as the dict key.
            for p in stats_div.findAll("p"):
                spans = p.findAll("span")
                key = spans[0].getText().strip().replace("\n", " ").encode("utf-8")
                val = spans[1].getText().strip().replace("\n", " ").encode("utf-8")
                school_det_dict[key] = val
            other_details_div = full_div.findAll("div", {"class": "col-xs-6"})
            school_det_dict["Other Details"] = \
                other_details_div[0].getText().encode("utf-8")
        # NOTE: a dead, string-quoted per-program extraction block was
        # removed here; restore from history if per-program data is
        # ever needed again.
        school_det_list.append(school_det_dict)
    return school_det_list
def load_to_excel(df):
    """
    Write the school DataFrame to the configured excel workbook.

    Args:
        df: combined DataFrame of school information; it is NOT
            modified (the original shifted ``df.index`` in place,
            mutating the caller's frame).
    """
    # Work on a copy so the caller's DataFrame keeps its index.
    out = df.copy()
    out.index += 1  # So that the excel "#" column starts from 1
    # Write dataframe to excel
    xlw = pd.ExcelWriter(xl_name, engine="openpyxl")
    out.to_excel(xlw, sheet_name=sheet_name, index_label="#", encoding="utf-8")
    xlw.save()
def format_excel(xl, sheet="Sheet1"):
    """
    Open the excel file at path ``xl`` and format it in place.

    Args:
        xl: path of the workbook to format (saved back under the
            same name).
        sheet: worksheet name to format; defaults to "Sheet1".

    Applies: frozen panes at C2, fixed column widths, a blue header
    row, thin borders and text wrap on every cell, and a green
    highlight on column E cells mentioning "Executive".
    """
    wb = op.load_workbook(xl)
    # Subscript access replaces the deprecated get_sheet_by_name().
    ws = wb[sheet]
    # Freeze the header row and the first two columns.
    ws.freeze_panes = "C2"
    # Adjust column widths for the #/Name/Location/Description/... layout.
    for col, width in zip(("A", "B", "C", "D", "E", "F"),
                          (5, 20, 15, 50, 20, 30)):
        ws.column_dimensions[col].width = width
    # Colour fills for the header row and the highlight rule.
    blue_fill = op.styles.PatternFill(start_color="00aadd",
                                      fill_type='solid')
    green_fill = op.styles.PatternFill(start_color="00ff00",
                                       fill_type='solid')
    # Border and text-wrap styles shared by every cell.
    thin = op.styles.Side(style='thin')
    thin_border = op.styles.borders.Border(left=thin, right=thin,
                                           top=thin, bottom=thin)
    text_wrap = op.styles.Alignment(wrap_text=True)
    # Format the header row (row 1 only).
    for col in range(1, ws.max_column + 1):
        ws.cell(row=1, column=col).fill = blue_fill
    # Format all cells.
    for row in ws.iter_rows():
        for cell in row:
            cell.border = thin_border
            cell.alignment = text_wrap
            # Highlight schools offering an executive MBA. Guard
            # against empty (None) cells -- blank descriptions made
            # the original cell.value.find() raise AttributeError.
            if (cell.col_idx == 5 and cell.value is not None
                    and "Executive" in str(cell.value)):
                cell.fill = green_fill
    # Save back as same file name.
    wb.save(xl)
if __name__ == "__main__": | |
""" Starting block """ | |
# Fetch basic inforamtion for the 1st page | |
schools_basic = get_basic_info(base_url + start_url) | |
# Load basic info list to dataframe | |
basic_df = pd.DataFrame(schools_basic) | |
schools_det = get_detailed_info(basic_df) | |
# Load details info list to dataframe | |
det_df = pd.DataFrame(schools_det) | |
combined_df = pd.merge( | |
left=basic_df[ | |
["Name", "Location", "Description", "Programs offered", "URL"]], | |
right=det_df, how="inner", on=["URL"]) | |
# Load the list to an excel file | |
load_to_excel(combined_df) | |
# Format the excel file | |
format_excel(xl_name, sheet_name) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment