Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save panicpotatoe/02678ea10dbb3e7abc025213c3488860 to your computer and use it in GitHub Desktop.
Save panicpotatoe/02678ea10dbb3e7abc025213c3488860 to your computer and use it in GitHub Desktop.
Created on Skills Network Labs
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Peer-Graded Assignment : Segmenting and Clustering Neighborhoods in Toronto (Part 1)\n",
"- Build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name in Toronto.\n",
"***\n",
"### 1. Import libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Libraries imported.\n"
]
}
],
"source": [
"import numpy as np # library to handle data in a vectorized manner\n",
"\n",
"import pandas as pd # library for data analysis\n",
"pd.set_option(\"display.max_columns\", None)\n",
"pd.set_option(\"display.max_rows\", None)\n",
"\n",
"import json # library to handle JSON files\n",
"\n",
"from geopy.geocoders import Nominatim # convert an address into latitude and longitude values\n",
"\n",
"import requests # library to handle requests\n",
"from bs4 import BeautifulSoup # library to parse HTML and XML documents\n",
"\n",
"from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe\n",
"\n",
"# Matplotlib and associated plotting modules\n",
"import matplotlib.cm as cm\n",
"import matplotlib.colors as colors\n",
"\n",
"# import k-means from clustering stage\n",
"from sklearn.cluster import KMeans\n",
"\n",
"import folium # map rendering library\n",
"\n",
"print(\"Libraries imported.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Scrape data from the Wikipedia page into a DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# send the GET request; the timeout keeps a stalled connection from hanging the notebook\n",
"response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)\n",
"# raise on a 4xx/5xx status so an error page is never parsed as if it were the article\n",
"response.raise_for_status()\n",
"data = response.text"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# build a BeautifulSoup tree from the downloaded HTML text\n",
"soup = BeautifulSoup(markup=data, features='html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# create three lists to store table data\n",
"postalCodeList = []\n",
"boroughList = []\n",
"neighborhoodList = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Using BeautifulSoup**\n",
"\n",
"```python\n",
"# find the table\n",
"soup.find('table')\n",
"\n",
"# find all the rows of the table\n",
"soup.find('table').find_all('tr')\n",
"\n",
"# for each row of the table, find all the table data\n",
"for row in soup.find('table').find_all('tr'):\n",
" cells = row.find_all('td')\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# append the data into the respective lists\n",
"# start from empty lists so that re-running this cell does not duplicate rows\n",
"postalCodeList, boroughList, neighborhoodList = [], [], []\n",
"\n",
"# soup.find returns the first <table> in the document, or None when there is none\n",
"table = soup.find('table')\n",
"if table is None:\n",
"    raise ValueError('No <table> element found in the downloaded page')\n",
"\n",
"for row in table.find_all('tr'):\n",
"    cells = row.find_all('td')\n",
"    # rows without three <td> cells (e.g. a header row) carry no data; skip them\n",
"    if len(cells) < 3:\n",
"        continue\n",
"    # strip surrounding whitespace and new lines from every cell, not only the last one\n",
"    postalCodeList.append(cells[0].text.strip())\n",
"    boroughList.append(cells[1].text.strip())\n",
"    neighborhoodList.append(cells[2].text.strip())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M1A Not assigned Not assigned\n",
"1 M2A Not assigned Not assigned\n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Harbourfront"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# combine the three scraped lists into one DataFrame, one column per list\n",
"toronto_df = pd.DataFrame(dict(PostalCode=postalCodeList,\n",
"                               Borough=boroughList,\n",
"                               Neighborhood=neighborhoodList))\n",
"\n",
"toronto_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Drop rows with a borough that is \"Not assigned\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Heights</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M3A North York Parkwoods\n",
"1 M4A North York Victoria Village\n",
"2 M5A Downtown Toronto Harbourfront\n",
"3 M6A North York Lawrence Heights\n",
"4 M6A North York Lawrence Manor"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# keep only the rows whose borough is not \"Not assigned\", then renumber the index from 0\n",
"has_borough = toronto_df[\"Borough\"].ne(\"Not assigned\")\n",
"toronto_df_dropna = toronto_df.loc[has_borough].reset_index(drop=True)\n",
"toronto_df_dropna.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 4. Group neighborhoods that share the same postal code\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge, Malvern</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M1C</td>\n",
" <td>Scarborough</td>\n",
" <td>Highland Creek, Rouge Hill, Port Union</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M1E</td>\n",
" <td>Scarborough</td>\n",
" <td>Guildwood, Morningside, West Hill</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1G</td>\n",
" <td>Scarborough</td>\n",
" <td>Woburn</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M1H</td>\n",
" <td>Scarborough</td>\n",
" <td>Cedarbrae</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M1B Scarborough Rouge, Malvern\n",
"1 M1C Scarborough Highland Creek, Rouge Hill, Port Union\n",
"2 M1E Scarborough Guildwood, Morningside, West Hill\n",
"3 M1G Scarborough Woburn\n",
"4 M1H Scarborough Cedarbrae"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# join the neighborhoods that share a postal code (and borough) into one comma-separated string\n",
"postal_groups = toronto_df_dropna.groupby([\"PostalCode\", \"Borough\"], as_index=False)\n",
"toronto_df_grouped = postal_groups.agg(lambda names: \", \".join(names))\n",
"toronto_df_grouped.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5. For Neighborhood=\"Not assigned\", make the value the same as Borough"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge, Malvern</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M1C</td>\n",
" <td>Scarborough</td>\n",
" <td>Highland Creek, Rouge Hill, Port Union</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M1E</td>\n",
" <td>Scarborough</td>\n",
" <td>Guildwood, Morningside, West Hill</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1G</td>\n",
" <td>Scarborough</td>\n",
" <td>Woburn</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M1H</td>\n",
" <td>Scarborough</td>\n",
" <td>Cedarbrae</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough Neighborhood\n",
"0 M1B Scarborough Rouge, Malvern\n",
"1 M1C Scarborough Highland Creek, Rouge Hill, Port Union\n",
"2 M1E Scarborough Guildwood, Morningside, West Hill\n",
"3 M1G Scarborough Woburn\n",
"4 M1H Scarborough Cedarbrae"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# for Neighborhood=\"Not assigned\", make the value the same as Borough\n",
"# NOTE: writing to the row yielded by iterrows() is not guaranteed to change the\n",
"# DataFrame (the row can be a copy), so select the rows with a boolean mask and\n",
"# assign through .loc instead; re-running this cell leaves the result unchanged\n",
"neighborhood_missing = toronto_df_grouped[\"Neighborhood\"] == \"Not assigned\"\n",
"toronto_df_grouped.loc[neighborhood_missing, \"Neighborhood\"] = toronto_df_grouped.loc[neighborhood_missing, \"Borough\"]\n",
"\n",
"toronto_df_grouped.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6. Check whether it is the same as required by the question"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PostalCode</th>\n",
" <th>Borough</th>\n",
" <th>Neighborhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M5G</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Central Bay Street</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2H</td>\n",
" <td>North York</td>\n",
" <td>Hillcrest Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M4B</td>\n",
" <td>East York</td>\n",
" <td>Woodbine Gardens, Parkview Hill</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M1J</td>\n",
" <td>Scarborough</td>\n",
" <td>Scarborough Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M4G</td>\n",
" <td>East York</td>\n",
" <td>Leaside</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>M4M</td>\n",
" <td>East Toronto</td>\n",
" <td>Studio District</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>M1R</td>\n",
" <td>Scarborough</td>\n",
" <td>Maryvale, Wexford</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>M9V</td>\n",
" <td>Etobicoke</td>\n",
" <td>Albion Gardens, Beaumond Heights, Humbergate, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>M9L</td>\n",
" <td>North York</td>\n",
" <td>Humber Summit</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>M5V</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>CN Tower, Bathurst Quay, Island airport, Harbo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>M1B</td>\n",
" <td>Scarborough</td>\n",
" <td>Rouge, Malvern</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Harbourfront</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PostalCode Borough \\\n",
"0 M5G Downtown Toronto \n",
"1 M2H North York \n",
"2 M4B East York \n",
"3 M1J Scarborough \n",
"4 M4G East York \n",
"5 M4M East Toronto \n",
"6 M1R Scarborough \n",
"7 M9V Etobicoke \n",
"8 M9L North York \n",
"9 M5V Downtown Toronto \n",
"10 M1B Scarborough \n",
"11 M5A Downtown Toronto \n",
"\n",
" Neighborhood \n",
"0 Central Bay Street \n",
"1 Hillcrest Village \n",
"2 Woodbine Gardens, Parkview Hill \n",
"3 Scarborough Village \n",
"4 Leaside \n",
"5 Studio District \n",
"6 Maryvale, Wexford \n",
"7 Albion Gardens, Beaumond Heights, Humbergate, ... \n",
"8 Humber Summit \n",
"9 CN Tower, Bathurst Quay, Island airport, Harbo... \n",
"10 Rouge, Malvern \n",
"11 Harbourfront "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create a new test dataframe holding the rows for a fixed set of postal codes,\n",
"# in the order given in test_list\n",
"column_names = [\"PostalCode\", \"Borough\", \"Neighborhood\"]\n",
"\n",
"test_list = [\"M5G\", \"M2H\", \"M4B\", \"M1J\", \"M4G\", \"M4M\", \"M1R\", \"M9V\", \"M9L\", \"M5V\", \"M1B\", \"M5A\"]\n",
"\n",
"# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call;\n",
"# collect the matching rows first and concatenate them once\n",
"matches = [toronto_df_grouped.loc[toronto_df_grouped[\"PostalCode\"] == postcode, column_names]\n",
"           for postcode in test_list]\n",
"test_df = pd.concat(matches, ignore_index=True)\n",
"\n",
"test_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7. Finally, print the number of rows of the cleaned dataframe"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(103, 3)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show the shape of the cleaned dataframe as (number of rows, number of columns)\n",
"toronto_df_grouped.shape"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment