Skip to content

Instantly share code, notes, and snippets.

@alanzchen
Created July 20, 2020 18:55
Show Gist options
  • Save alanzchen/0dc0e6b8c48135e2354fb892934cc87c to your computer and use it in GitHub Desktop.
Save alanzchen/0dc0e6b8c48135e2354fb892934cc87c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "hk_immd_table_parser.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "_e70Gm0zf3hk",
"colab_type": "code",
"colab": {}
},
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "3_q1hYtugehM",
"colab_type": "code",
"colab": {}
},
"source": [
"url = \"https://www.immd.gov.hk/hkt/stat_20200625.html\"\n",
"# Bound the wait and fail loudly on 4xx/5xx instead of parsing an error page.\n",
"r = requests.get(url, timeout=30)\n",
"r.raise_for_status()\n",
"# Decode the body as UTF-8 regardless of what requests guessed from the headers.\n",
"r.encoding = \"utf-8\"\n",
"html = r.text\n",
"dom = BeautifulSoup(html, \"html.parser\")\n",
"# The first <table> in the document is the one parsed by the following cells.\n",
"table = dom.find_all(\"table\")[0]"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_GDpQ7bZgiFA",
"colab_type": "code",
"colab": {}
},
"source": [
"# Drop the first four <tr> elements; everything after them is parsed below.\n",
"all_rows = table.find_all(\"tr\")\n",
"rows = all_rows[4:]"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "zxjWvQg1ioeo",
"colab_type": "code",
"colab": {}
},
"source": [
"# One list of <td> texts per remaining row.\n",
"table_data = [\n",
"    [td.get_text() for td in tr.find_all(\"td\")]\n",
"    for tr in rows\n",
"]"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "jtaA7QNsixzC",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 306
},
"outputId": "c62c57de-0a56-46ea-a746-4b880cfce29f"
},
"source": [
"table_data"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[['香港居民', '內地訪客', '其他訪客', '總計', '香港居民', '內地訪客', '其他訪客', '總計'],\n",
" ['機場', '815 ', '2 ', '46 ', '863 ', '716 ', '11 ', '68 ', '795 '],\n",
" ['高鐵西九龍', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['紅磡', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['羅湖', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['落馬洲支線', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['港珠澳大橋', '49 ', '2 ', '1 ', '52 ', '57 ', '3 ', '3 ', '63 '],\n",
" ['落馬洲', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['文錦渡', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['沙頭角', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['深圳灣', '501 ', '93 ', '9 ', '603 ', '383 ', '56 ', '0 ', '439 '],\n",
" ['中國客運碼頭', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['港口管制', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['啓德郵輪碼頭', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['港澳客輪碼頭', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['屯門客運碼頭', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 ', '0 '],\n",
" ['總計', '1,365 ', '97 ', '56 ', '1,518 ', '1,156 ', '70 ', '71 ', '1,297 ']]"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "9tb1lPDnh-93",
"colab_type": "code",
"colab": {}
},
"source": [
"# The header row repeats the same four labels twice; tag positions 0-3 with\n",
"# \"(入)\" and the rest with \"(出)\" so every column label is unique.\n",
"# The endswith guard keeps this cell idempotent: re-running it no longer\n",
"# stacks a second suffix onto labels that were already tagged.\n",
"for i, label in enumerate(table_data[0]):\n",
"    suffix = \"(入)\" if i < 4 else \"(出)\"\n",
"    if not label.endswith(suffix):\n",
"        table_data[0][i] = label + suffix"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bnrnzZaEi-rH",
"colab_type": "code",
"colab": {}
},
"source": [
"# Key each data row by its first cell and pair the remaining cells with the\n",
"# column labels from the header row.\n",
"header = table_data[0]\n",
"d = {\n",
"    row[0]: dict(zip(header, row[1:]))\n",
"    for row in table_data[1:]\n",
"}"
],
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "sA4Nthdnj_3R",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 545
},
"outputId": "96d911c5-657d-4744-8a5a-f8284e81b310"
},
"source": [
"# The outer keys of d become columns, so transpose to get one row per key.\n",
"frame = pd.DataFrame(d)\n",
"frame.T"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>香港居民(入)</th>\n",
" <th>內地訪客(入)</th>\n",
" <th>其他訪客(入)</th>\n",
" <th>總計(入)</th>\n",
" <th>香港居民(出)</th>\n",
" <th>內地訪客(出)</th>\n",
" <th>其他訪客(出)</th>\n",
" <th>總計(出)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>機場</th>\n",
" <td>815</td>\n",
" <td>2</td>\n",
" <td>46</td>\n",
" <td>863</td>\n",
" <td>716</td>\n",
" <td>11</td>\n",
" <td>68</td>\n",
" <td>795</td>\n",
" </tr>\n",
" <tr>\n",
" <th>高鐵西九龍</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>紅磡</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>羅湖</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>落馬洲支線</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>港珠澳大橋</th>\n",
" <td>49</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>52</td>\n",
" <td>57</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>落馬洲</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>文錦渡</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>沙頭角</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>深圳灣</th>\n",
" <td>501</td>\n",
" <td>93</td>\n",
" <td>9</td>\n",
" <td>603</td>\n",
" <td>383</td>\n",
" <td>56</td>\n",
" <td>0</td>\n",
" <td>439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>中國客運碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>港口管制</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>啓德郵輪碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>港澳客輪碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>屯門客運碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>總計</th>\n",
" <td>1,365</td>\n",
" <td>97</td>\n",
" <td>56</td>\n",
" <td>1,518</td>\n",
" <td>1,156</td>\n",
" <td>70</td>\n",
" <td>71</td>\n",
" <td>1,297</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 香港居民(入) 內地訪客(入) 其他訪客(入) 總計(入) 香港居民(出) 內地訪客(出) 其他訪客(出) 總計(出)\n",
"機場 815 2 46 863 716 11 68 795 \n",
"高鐵西九龍 0 0 0 0 0 0 0 0 \n",
"紅磡 0 0 0 0 0 0 0 0 \n",
"羅湖 0 0 0 0 0 0 0 0 \n",
"落馬洲支線 0 0 0 0 0 0 0 0 \n",
"港珠澳大橋 49 2 1 52 57 3 3 63 \n",
"落馬洲 0 0 0 0 0 0 0 0 \n",
"文錦渡 0 0 0 0 0 0 0 0 \n",
"沙頭角 0 0 0 0 0 0 0 0 \n",
"深圳灣 501 93 9 603 383 56 0 439 \n",
"中國客運碼頭 0 0 0 0 0 0 0 0 \n",
"港口管制 0 0 0 0 0 0 0 0 \n",
"啓德郵輪碼頭 0 0 0 0 0 0 0 0 \n",
"港澳客輪碼頭 0 0 0 0 0 0 0 0 \n",
"屯門客運碼頭 0 0 0 0 0 0 0 0 \n",
"總計 1,365 97 56 1,518 1,156 70 71 1,297 "
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "AwJhIaFBf8aA",
"colab_type": "code",
"colab": {}
},
"source": [
"def get_table(date, timeout=30):\n",
"    \"\"\"Download one daily statistics page from immd.gov.hk and return its table.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    date : str\n",
"        Spliced verbatim into the page URL (the calls in this notebook pass\n",
"        an eight-digit string such as \"20200311\").\n",
"    timeout : float, optional\n",
"        Seconds to wait for the HTTP response before giving up (default 30).\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per data row of the page's first <table>, indexed by the\n",
"        row's first cell. Columns are the labels of the header row, with\n",
"        \"(入)\" appended to the first four and \"(出)\" to the rest. Values are\n",
"        the raw cell texts (strings), unchanged.\n",
"\n",
"    Raises\n",
"    ------\n",
"    requests.HTTPError\n",
"        If the server answers with a 4xx/5xx status (e.g. no page for `date`).\n",
"    IndexError\n",
"        If the page has no <table>, or the table has no rows after the first four.\n",
"    \"\"\"\n",
"    url = \"https://www.immd.gov.hk/hkt/stat_\" + date + \".html\"\n",
"    # Bound the wait and refuse to parse an error page as if it were data.\n",
"    r = requests.get(url, timeout=timeout)\n",
"    r.raise_for_status()\n",
"    r.encoding = \"utf-8\"\n",
"    dom = BeautifulSoup(r.text, \"html.parser\")\n",
"    table = dom.find_all(\"table\")[0]\n",
"    # Skip the first four <tr> elements; the next one holds the column labels.\n",
"    rows = table.find_all(\"tr\")[4:]\n",
"    table_data = [[cell.text for cell in row.find_all(\"td\")] for row in rows]\n",
"    # The labels repeat, so tag the first four and the remainder differently.\n",
"    header = [\n",
"        label + (\"(入)\" if i < 4 else \"(出)\")\n",
"        for i, label in enumerate(table_data[0])\n",
"    ]\n",
"    # Rows without any <td> cell have no key to index by; skip them rather\n",
"    # than crash on row[0].\n",
"    d = {\n",
"        row[0]: dict(zip(header, row[1:]))\n",
"        for row in table_data[1:]\n",
"        if row\n",
"    }\n",
"    return pd.DataFrame(d).transpose()"
],
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "OMtKPo5akaUr",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 545
},
"outputId": "e443fa94-c2f8-40b8-b165-71e32eb47e91"
},
"source": [
"get_table(\"20200311\")"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>香港居民(入)</th>\n",
" <th>內地訪客(入)</th>\n",
" <th>其他訪客(入)</th>\n",
" <th>總計(入)</th>\n",
" <th>香港居民(出)</th>\n",
" <th>內地訪客(出)</th>\n",
" <th>其他訪客(出)</th>\n",
" <th>總計(出)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>機場</th>\n",
" <td>7,610</td>\n",
" <td>838</td>\n",
" <td>1,539</td>\n",
" <td>9,987</td>\n",
" <td>3,443</td>\n",
" <td>205</td>\n",
" <td>841</td>\n",
" <td>4,489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>高鐵西九龍</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>紅磡</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>羅湖</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>落馬洲支線</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>港珠澳大橋</th>\n",
" <td>5,248</td>\n",
" <td>25</td>\n",
" <td>733</td>\n",
" <td>6,006</td>\n",
" <td>5,258</td>\n",
" <td>122</td>\n",
" <td>765</td>\n",
" <td>6,145</td>\n",
" </tr>\n",
" <tr>\n",
" <th>落馬洲</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>文錦渡</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>沙頭角</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>深圳灣</th>\n",
" <td>1,418</td>\n",
" <td>244</td>\n",
" <td>2</td>\n",
" <td>1,664</td>\n",
" <td>1,632</td>\n",
" <td>866</td>\n",
" <td>443</td>\n",
" <td>2,941</td>\n",
" </tr>\n",
" <tr>\n",
" <th>中國客運碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>港口管制</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>啓德郵輪碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>港澳客輪碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>屯門客運碼頭</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>總計</th>\n",
" <td>14,276</td>\n",
" <td>1,107</td>\n",
" <td>2,274</td>\n",
" <td>17,657</td>\n",
" <td>10,333</td>\n",
" <td>1,193</td>\n",
" <td>2,049</td>\n",
" <td>13,575</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 香港居民(入) 內地訪客(入) 其他訪客(入) 總計(入) 香港居民(出) 內地訪客(出) 其他訪客(出) 總計(出)\n",
"機場 7,610 838 1,539 9,987 3,443 205 841 4,489 \n",
"高鐵西九龍 0 0 0 0 0 0 0 0 \n",
"紅磡 0 0 0 0 0 0 0 0 \n",
"羅湖 0 0 0 0 0 0 0 0 \n",
"落馬洲支線 0 0 0 0 0 0 0 0 \n",
"港珠澳大橋 5,248 25 733 6,006 5,258 122 765 6,145 \n",
"落馬洲 0 0 0 0 0 0 0 0 \n",
"文錦渡 0 0 0 0 0 0 0 0 \n",
"沙頭角 0 0 0 0 0 0 0 0 \n",
"深圳灣 1,418 244 2 1,664 1,632 866 443 2,941 \n",
"中國客運碼頭 0 0 0 0 0 0 0 0 \n",
"港口管制 0 0 0 0 0 0 0 0 \n",
"啓德郵輪碼頭 0 0 0 0 0 0 0 0 \n",
"港澳客輪碼頭 0 0 0 0 0 0 0 0 \n",
"屯門客運碼頭 0 0 0 0 0 0 0 0 \n",
"總計 14,276 1,107 2,274 17,657 10,333 1,193 2,049 13,575 "
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Mkbi0PCFkdwD",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": 10,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment