panicpotatoe · April 16, 2020 03:53
diff --git a/Lab - Part-1-Segmenting and Clustering Neighborhoods in Toronto.ipynb b/Lab - Part-1-Segmenting and Clustering Neighborhoods in Toronto.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Peer-Graded Assignment : Segmenting and Clustering Neighborhoods in Toronto (Part 1)\n",
    "- Build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name in Toronto.\n",
    "***\n",
    "### 1. Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Libraries imported.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np # library to handle data in a vectorized manner\n",
    "\n",
    "import pandas as pd # library for data analsysis\n",
    "pd.set_option(\"display.max_columns\", None)\n",
    "pd.set_option(\"display.max_rows\", None)\n",
    "\n",
    "import json # library to handle JSON files\n",
    "\n",
    "from geopy.geocoders import Nominatim # convert an address into latitude and longitude values\n",
    "\n",
    "import requests # library to handle requests\n",
    "from bs4 import BeautifulSoup # library to parse HTML and XML documents\n",
    "\n",
    "from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe\n",
    "\n",
    "# Matplotlib and associated plotting modules\n",
    "import matplotlib.cm as cm\n",
    "import matplotlib.colors as colors\n",
    "\n",
    "# import k-means from clustering stage\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "import folium # map rendering library\n",
    "\n",
    "print(\"Libraries imported.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Scrap data from Wikipedia page into a DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# send the GET request\n",
    "data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# parse data from the html into a beautifulsoup object\n",
    "soup = BeautifulSoup(data, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create three lists to store table data\n",
    "postalCodeList = []\n",
    "boroughList = []\n",
    "neighborhoodList = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Using BeautifulSoup**\n",
    "\n",
    "```python\n",
    "# find the table\n",
    "soup.find('table').find_all('tr')\n",
    "\n",
    "# find all the rows of the table\n",
    "soup.find('table').find_all('tr')\n",
    "\n",
    "# for each row of the table, find all the table data\n",
    "for row in soup.find('table').find_all('tr'):\n",
    "    cells = row.find_all('td')\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# append the data into the respective lists\n",
    "for row in soup.find('table').find_all('tr'):\n",
    "    cells = row.find_all('td')\n",
    "    if(len(cells) > 0):\n",
    "        postalCodeList.append(cells[0].text)\n",
    "        boroughList.append(cells[1].text)\n",
    "        neighborhoodList.append(cells[2].text.rstrip('\\n')) # avoid new lines in neighborhood cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighborhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M1A</td>\n",
       "      <td>Not assigned</td>\n",
       "      <td>Not assigned</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M2A</td>\n",
       "      <td>Not assigned</td>\n",
       "      <td>Not assigned</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M4A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Victoria Village</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M5A</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Harbourfront</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode           Borough      Neighborhood\n",
       "0        M1A      Not assigned      Not assigned\n",
       "1        M2A      Not assigned      Not assigned\n",
       "2        M3A        North York         Parkwoods\n",
       "3        M4A        North York  Victoria Village\n",
       "4        M5A  Downtown Toronto      Harbourfront"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a new DataFrame from the three lists\n",
    "toronto_df = pd.DataFrame({\"PostalCode\": postalCodeList,\n",
    "                           \"Borough\": boroughList,\n",
    "                           \"Neighborhood\": neighborhoodList})\n",
    "\n",
    "toronto_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Drop cells with a borough that is \"Not assigned\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighborhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M3A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Parkwoods</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M4A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Victoria Village</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M5A</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Harbourfront</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M6A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Lawrence Heights</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M6A</td>\n",
       "      <td>North York</td>\n",
       "      <td>Lawrence Manor</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode           Borough      Neighborhood\n",
       "0        M3A        North York         Parkwoods\n",
       "1        M4A        North York  Victoria Village\n",
       "2        M5A  Downtown Toronto      Harbourfront\n",
       "3        M6A        North York  Lawrence Heights\n",
       "4        M6A        North York    Lawrence Manor"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop cells with a borough that is Not assigned\n",
    "toronto_df_dropna = toronto_df[toronto_df.Borough != \"Not assigned\"].reset_index(drop=True)\n",
    "toronto_df_dropna.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Group neighborhoods in the same borough\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighborhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M1B</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Rouge, Malvern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1C</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Highland Creek, Rouge Hill, Port Union</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M1E</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Guildwood, Morningside, West Hill</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M1H</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Cedarbrae</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode      Borough                            Neighborhood\n",
       "0        M1B  Scarborough                          Rouge, Malvern\n",
       "1        M1C  Scarborough  Highland Creek, Rouge Hill, Port Union\n",
       "2        M1E  Scarborough       Guildwood, Morningside, West Hill\n",
       "3        M1G  Scarborough                                  Woburn\n",
       "4        M1H  Scarborough                               Cedarbrae"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# group neighborhoods in the same borough\n",
    "toronto_df_grouped = toronto_df_dropna.groupby([\"PostalCode\", \"Borough\"], as_index=False).agg(lambda x: \", \".join(x))\n",
    "toronto_df_grouped.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. For Neighborhood=\"Not assigned\", make the value the same as Borough"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighborhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M1B</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Rouge, Malvern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M1C</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Highland Creek, Rouge Hill, Port Union</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M1E</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Guildwood, Morningside, West Hill</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1G</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Woburn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M1H</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Cedarbrae</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  PostalCode      Borough                            Neighborhood\n",
       "0        M1B  Scarborough                          Rouge, Malvern\n",
       "1        M1C  Scarborough  Highland Creek, Rouge Hill, Port Union\n",
       "2        M1E  Scarborough       Guildwood, Morningside, West Hill\n",
       "3        M1G  Scarborough                                  Woburn\n",
       "4        M1H  Scarborough                               Cedarbrae"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# for Neighborhood=\"Not assigned\", make the value the same as Borough\n",
    "for index, row in toronto_df_grouped.iterrows():\n",
    "    if row[\"Neighborhood\"] == \"Not assigned\":\n",
    "        row[\"Neighborhood\"] = row[\"Borough\"]\n",
    "        \n",
    "toronto_df_grouped.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6. Check whether it is the same as required by the question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PostalCode</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Neighborhood</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M5G</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Central Bay Street</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M2H</td>\n",
       "      <td>North York</td>\n",
       "      <td>Hillcrest Village</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>M4B</td>\n",
       "      <td>East York</td>\n",
       "      <td>Woodbine Gardens, Parkview Hill</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M1J</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Scarborough Village</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>M4G</td>\n",
       "      <td>East York</td>\n",
       "      <td>Leaside</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>M4M</td>\n",
       "      <td>East Toronto</td>\n",
       "      <td>Studio District</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>M1R</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Maryvale, Wexford</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>M9V</td>\n",
       "      <td>Etobicoke</td>\n",
       "      <td>Albion Gardens, Beaumond Heights, Humbergate, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>M9L</td>\n",
       "      <td>North York</td>\n",
       "      <td>Humber Summit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>M5V</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>CN Tower, Bathurst Quay, Island airport, Harbo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>M1B</td>\n",
       "      <td>Scarborough</td>\n",
       "      <td>Rouge, Malvern</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>M5A</td>\n",
       "      <td>Downtown Toronto</td>\n",
       "      <td>Harbourfront</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PostalCode           Borough  \\\n",
       "0         M5G  Downtown Toronto   \n",
       "1         M2H        North York   \n",
       "2         M4B         East York   \n",
       "3         M1J       Scarborough   \n",
       "4         M4G         East York   \n",
       "5         M4M      East Toronto   \n",
       "6         M1R       Scarborough   \n",
       "7         M9V         Etobicoke   \n",
       "8         M9L        North York   \n",
       "9         M5V  Downtown Toronto   \n",
       "10        M1B       Scarborough   \n",
       "11        M5A  Downtown Toronto   \n",
       "\n",
       "                                         Neighborhood  \n",
       "0                                  Central Bay Street  \n",
       "1                                   Hillcrest Village  \n",
       "2                     Woodbine Gardens, Parkview Hill  \n",
       "3                                 Scarborough Village  \n",
       "4                                             Leaside  \n",
       "5                                     Studio District  \n",
       "6                                   Maryvale, Wexford  \n",
       "7   Albion Gardens, Beaumond Heights, Humbergate, ...  \n",
       "8                                       Humber Summit  \n",
       "9   CN Tower, Bathurst Quay, Island airport, Harbo...  \n",
       "10                                     Rouge, Malvern  \n",
       "11                                       Harbourfront  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a new test dataframe\n",
    "column_names = [\"PostalCode\", \"Borough\", \"Neighborhood\"]\n",
    "test_df = pd.DataFrame(columns=column_names)\n",
    "\n",
    "test_list = [\"M5G\", \"M2H\", \"M4B\", \"M1J\", \"M4G\", \"M4M\", \"M1R\", \"M9V\", \"M9L\", \"M5V\", \"M1B\", \"M5A\"]\n",
    "\n",
    "for postcode in test_list:\n",
    "    test_df = test_df.append(toronto_df_grouped[toronto_df_grouped[\"PostalCode\"]==postcode], ignore_index=True)\n",
    "    \n",
    "test_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7. Finally, print the number of rows of the cleaned dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(103, 3)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print the number of rows of the cleaned dataframe\n",
    "toronto_df_grouped.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Peer-Graded Assignment : Segmenting and Clustering Neighborhoods in Toronto (Part 1)\n",
	"- Build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name in Toronto.\n",
	"***\n",
	"### 1. Import libraries"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Libraries imported.\n"
	]
	}
	],
	"source": [
	"import numpy as np # library to handle data in a vectorized manner\n",
	"\n",
	"import pandas as pd # library for data analsysis\n",
	"pd.set_option(\"display.max_columns\", None)\n",
	"pd.set_option(\"display.max_rows\", None)\n",
	"\n",
	"import json # library to handle JSON files\n",
	"\n",
	"from geopy.geocoders import Nominatim # convert an address into latitude and longitude values\n",
	"\n",
	"import requests # library to handle requests\n",
	"from bs4 import BeautifulSoup # library to parse HTML and XML documents\n",
	"\n",
	"from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe\n",
	"\n",
	"# Matplotlib and associated plotting modules\n",
	"import matplotlib.cm as cm\n",
	"import matplotlib.colors as colors\n",
	"\n",
	"# import k-means from clustering stage\n",
	"from sklearn.cluster import KMeans\n",
	"\n",
	"import folium # map rendering library\n",
	"\n",
	"print(\"Libraries imported.\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 2. Scrap data from Wikipedia page into a DataFrame"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# send the GET request\n",
	"data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"# parse data from the html into a beautifulsoup object\n",
	"soup = BeautifulSoup(data, 'html.parser')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# create three lists to store table data\n",
	"postalCodeList = []\n",
	"boroughList = []\n",
	"neighborhoodList = []"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Using BeautifulSoup\n",
	"\n",
	"```python\n",
	"# find the table\n",
	"soup.find('table').find_all('tr')\n",
	"\n",
	"# find all the rows of the table\n",
	"soup.find('table').find_all('tr')\n",
	"\n",
	"# for each row of the table, find all the table data\n",
	"for row in soup.find('table').find_all('tr'):\n",
	" cells = row.find_all('td')\n",
	"```"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# append the data into the respective lists\n",
	"for row in soup.find('table').find_all('tr'):\n",
	" cells = row.find_all('td')\n",
	" if(len(cells) > 0):\n",
	" postalCodeList.append(cells[0].text)\n",
	" boroughList.append(cells[1].text)\n",
	" neighborhoodList.append(cells[2].text.rstrip('\\n')) # avoid new lines in neighborhood cell"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>PostalCode</th>\n",
	" <th>Borough</th>\n",
	" <th>Neighborhood</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>M1A</td>\n",
	" <td>Not assigned</td>\n",
	" <td>Not assigned</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>M2A</td>\n",
	" <td>Not assigned</td>\n",
	" <td>Not assigned</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>M3A</td>\n",
	" <td>North York</td>\n",
	" <td>Parkwoods</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>M4A</td>\n",
	" <td>North York</td>\n",
	" <td>Victoria Village</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>M5A</td>\n",
	" <td>Downtown Toronto</td>\n",
	" <td>Harbourfront</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" PostalCode Borough Neighborhood\n",
	"0 M1A Not assigned Not assigned\n",
	"1 M2A Not assigned Not assigned\n",
	"2 M3A North York Parkwoods\n",
	"3 M4A North York Victoria Village\n",
	"4 M5A Downtown Toronto Harbourfront"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# create a new DataFrame from the three lists\n",
	"toronto_df = pd.DataFrame({\"PostalCode\": postalCodeList,\n",
	" \"Borough\": boroughList,\n",
	" \"Neighborhood\": neighborhoodList})\n",
	"\n",
	"toronto_df.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 3. Drop cells with a borough that is \"Not assigned\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>PostalCode</th>\n",
	" <th>Borough</th>\n",
	" <th>Neighborhood</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>M3A</td>\n",
	" <td>North York</td>\n",
	" <td>Parkwoods</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>M4A</td>\n",
	" <td>North York</td>\n",
	" <td>Victoria Village</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>M5A</td>\n",
	" <td>Downtown Toronto</td>\n",
	" <td>Harbourfront</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>M6A</td>\n",
	" <td>North York</td>\n",
	" <td>Lawrence Heights</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>M6A</td>\n",
	" <td>North York</td>\n",
	" <td>Lawrence Manor</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" PostalCode Borough Neighborhood\n",
	"0 M3A North York Parkwoods\n",
	"1 M4A North York Victoria Village\n",
	"2 M5A Downtown Toronto Harbourfront\n",
	"3 M6A North York Lawrence Heights\n",
	"4 M6A North York Lawrence Manor"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# drop cells with a borough that is Not assigned\n",
	"toronto_df_dropna = toronto_df[toronto_df.Borough != \"Not assigned\"].reset_index(drop=True)\n",
	"toronto_df_dropna.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 4. Group neighborhoods in the same borough\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>PostalCode</th>\n",
	" <th>Borough</th>\n",
	" <th>Neighborhood</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>M1B</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Rouge, Malvern</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>M1C</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Highland Creek, Rouge Hill, Port Union</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>M1E</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Guildwood, Morningside, West Hill</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>M1G</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Woburn</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>M1H</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Cedarbrae</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" PostalCode Borough Neighborhood\n",
	"0 M1B Scarborough Rouge, Malvern\n",
	"1 M1C Scarborough Highland Creek, Rouge Hill, Port Union\n",
	"2 M1E Scarborough Guildwood, Morningside, West Hill\n",
	"3 M1G Scarborough Woburn\n",
	"4 M1H Scarborough Cedarbrae"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# group neighborhoods in the same borough\n",
	"toronto_df_grouped = toronto_df_dropna.groupby([\"PostalCode\", \"Borough\"], as_index=False).agg(lambda x: \", \".join(x))\n",
	"toronto_df_grouped.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 5. For Neighborhood=\"Not assigned\", make the value the same as Borough"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>PostalCode</th>\n",
	" <th>Borough</th>\n",
	" <th>Neighborhood</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>M1B</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Rouge, Malvern</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>M1C</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Highland Creek, Rouge Hill, Port Union</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>M1E</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Guildwood, Morningside, West Hill</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>M1G</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Woburn</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>M1H</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Cedarbrae</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" PostalCode Borough Neighborhood\n",
	"0 M1B Scarborough Rouge, Malvern\n",
	"1 M1C Scarborough Highland Creek, Rouge Hill, Port Union\n",
	"2 M1E Scarborough Guildwood, Morningside, West Hill\n",
	"3 M1G Scarborough Woburn\n",
	"4 M1H Scarborough Cedarbrae"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# for Neighborhood=\"Not assigned\", make the value the same as Borough\n",
	"for index, row in toronto_df_grouped.iterrows():\n",
	" if row[\"Neighborhood\"] == \"Not assigned\":\n",
	" row[\"Neighborhood\"] = row[\"Borough\"]\n",
	" \n",
	"toronto_df_grouped.head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 6. Check whether it is the same as required by the question"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>PostalCode</th>\n",
	" <th>Borough</th>\n",
	" <th>Neighborhood</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>M5G</td>\n",
	" <td>Downtown Toronto</td>\n",
	" <td>Central Bay Street</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>M2H</td>\n",
	" <td>North York</td>\n",
	" <td>Hillcrest Village</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>M4B</td>\n",
	" <td>East York</td>\n",
	" <td>Woodbine Gardens, Parkview Hill</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>M1J</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Scarborough Village</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>M4G</td>\n",
	" <td>East York</td>\n",
	" <td>Leaside</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>M4M</td>\n",
	" <td>East Toronto</td>\n",
	" <td>Studio District</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>M1R</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Maryvale, Wexford</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>M9V</td>\n",
	" <td>Etobicoke</td>\n",
	" <td>Albion Gardens, Beaumond Heights, Humbergate, ...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>M9L</td>\n",
	" <td>North York</td>\n",
	" <td>Humber Summit</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9</th>\n",
	" <td>M5V</td>\n",
	" <td>Downtown Toronto</td>\n",
	" <td>CN Tower, Bathurst Quay, Island airport, Harbo...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>10</th>\n",
	" <td>M1B</td>\n",
	" <td>Scarborough</td>\n",
	" <td>Rouge, Malvern</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11</th>\n",
	" <td>M5A</td>\n",
	" <td>Downtown Toronto</td>\n",
	" <td>Harbourfront</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" PostalCode Borough \\\n",
	"0 M5G Downtown Toronto \n",
	"1 M2H North York \n",
	"2 M4B East York \n",
	"3 M1J Scarborough \n",
	"4 M4G East York \n",
	"5 M4M East Toronto \n",
	"6 M1R Scarborough \n",
	"7 M9V Etobicoke \n",
	"8 M9L North York \n",
	"9 M5V Downtown Toronto \n",
	"10 M1B Scarborough \n",
	"11 M5A Downtown Toronto \n",
	"\n",
	" Neighborhood \n",
	"0 Central Bay Street \n",
	"1 Hillcrest Village \n",
	"2 Woodbine Gardens, Parkview Hill \n",
	"3 Scarborough Village \n",
	"4 Leaside \n",
	"5 Studio District \n",
	"6 Maryvale, Wexford \n",
	"7 Albion Gardens, Beaumond Heights, Humbergate, ... \n",
	"8 Humber Summit \n",
	"9 CN Tower, Bathurst Quay, Island airport, Harbo... \n",
	"10 Rouge, Malvern \n",
	"11 Harbourfront "
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# create a new test dataframe\n",
	"column_names = [\"PostalCode\", \"Borough\", \"Neighborhood\"]\n",
	"test_df = pd.DataFrame(columns=column_names)\n",
	"\n",
	"test_list = [\"M5G\", \"M2H\", \"M4B\", \"M1J\", \"M4G\", \"M4M\", \"M1R\", \"M9V\", \"M9L\", \"M5V\", \"M1B\", \"M5A\"]\n",
	"\n",
	"for postcode in test_list:\n",
	" test_df = test_df.append(toronto_df_grouped[toronto_df_grouped[\"PostalCode\"]==postcode], ignore_index=True)\n",
	" \n",
	"test_df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 7. Finally, print the number of rows of the cleaned dataframe"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(103, 3)"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# print the number of rows of the cleaned dataframe\n",
	"toronto_df_grouped.shape"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}