Created
August 19, 2018 02:10
-
-
Save onelittlenightmusic/b3c2c2d3c2835e4df11bcf13caa570a7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# HTML scraping and GraphQL\n", | |
"\n", | |
"## Goal\n", | |
"\n", | |
"- Scrape Wikipedia HTML (ex. Cities in Japan) and generate pandas DataFrame\n", | |
"- Insert data of DataFrame into database via GraphQL" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Preparation\n", | |
"\n", | |
"Install libraries in terminal.\n", | |
"\n", | |
"```sh\n", | |
"pip install graphqlclient numpy pandas lxml\n", | |
"```\n", | |
"\n", | |
"In notebook,\n", | |
"\n", | |
"```py\n", | |
"!pip install graphqlclient numpy pandas lxml\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/opt/conda/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", | |
" return f(*args, **kwds)\n" | |
] | |
} | |
], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# HTML scraping" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"WIKI_URL = \"https://en.wikipedia.org/wiki/List_of_cities_in_Japan\"\n", | |
"dfs = pd.read_html(WIKI_URL, header=0) # set header with row #0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# get 3rd table\n", | |
"df = dfs[2]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# add country column\n", | |
"df['Country']='Japan'\n", | |
"df = df.drop('Homepage', axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>City (Special Ward)</th>\n", | |
" <th>Japanese</th>\n", | |
" <th>Prefecture</th>\n", | |
" <th>Population</th>\n", | |
" <th>Area</th>\n", | |
" <th>Density</th>\n", | |
" <th>Founded</th>\n", | |
" <th>Country</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>354</th>\n", | |
" <td>Yokohama</td>\n", | |
" <td>横浜市</td>\n", | |
" <td>Kanagawa</td>\n", | |
" <td>3697894</td>\n", | |
" <td>437.38</td>\n", | |
" <td>8500.0</td>\n", | |
" <td>1889-04-01</td>\n", | |
" <td>Japan</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" City (Special Ward) Japanese Prefecture Population Area Density \\\n", | |
"354 Yokohama 横浜市 Kanagawa 3697894 437.38 8500.0 \n", | |
"\n", | |
" Founded Country \n", | |
"354 1889-04-01 Japan " | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# check\n", | |
"df[df['City (Special Ward)']=='Yokohama']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# GraphQL\n", | |
"\n", | |
"## Preparation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from graphqlclient import GraphQLClient\n", | |
"import json\n", | |
"\n", | |
"client = GraphQLClient('http://10.0.2.2:4000')\n", | |
"\n", | |
"def execMutationCreateLocation(address, country):\n", | |
" location = {\n", | |
" 'address': address,\n", | |
" 'country': country\n", | |
" }\n", | |
" result = client.execute('''\n", | |
" mutation CreateLocationMutation($address: String!, $country: String!) {\n", | |
" createLocation(data: {\n", | |
" address: $address\n", | |
" country: $country\n", | |
" }) {\n", | |
" address\n", | |
" country\n", | |
" }\n", | |
" }\n", | |
" ''', variables=json.dumps(location))\n", | |
"\n", | |
"def iterateExecMutation(df):\n", | |
" for _, row in df.iterrows():\n", | |
" execMutationCreateLocation(address=row['City (Special Ward)'], country=row['Country'])\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## GraphQL Execution (a. Single thread input)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.37 s, sys: 831 ms, total: 2.2 s\n", | |
"Wall time: 28.6 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"iterateExecMutation(df)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## GraphQL Execution (b. Parallel input)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# split dataframe into 4 slices\n", | |
"dflist = []\n", | |
"SLICE_NUM = 4\n", | |
"for i in range(SLICE_NUM):\n", | |
" dflist.append(df.iloc[i::SLICE_NUM])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 78.4 ms, sys: 116 ms, total: 195 ms\n", | |
"Wall time: 40.2 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"from multiprocessing import Pool\n", | |
"p = Pool(4)\n", | |
"p.map(iterateExecMutation, dflist)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Appendix\n", | |
"\n", | |
"## Processing time (single thread input)\n", | |
"\n", | |
"creation rows #: 815\n", | |
"\n", | |
"### Batch \n", | |
"\n", | |
"CPU times: user 1.64 s, sys: 816 ms, total: 2.46 s\n", | |
"Wall time: 43.3 s\n", | |
"\n", | |
"CPU times: user 1.64 s, sys: 832 ms, total: 2.48 s\n", | |
"Wall time: 34.5 s\n", | |
"\n", | |
"CPU times: user 1.91 s, sys: 858 ms, total: 2.77 s\n", | |
"Wall time: 38.5 s\n", | |
"\n", | |
"### Single\n", | |
"\n", | |
"CPU times: user 1.28 s, sys: 555 ms, total: 1.84 s\n", | |
"Wall time: 27.6 s\n", | |
"\n", | |
"CPU times: user 1.45 s, sys: 681 ms, total: 2.13 s\n", | |
"Wall time: 30.2 s\n", | |
"\n", | |
"CPU times: user 1.63 s, sys: 784 ms, total: 2.42 s\n", | |
"Wall time: 29.4 s" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Processing time (parallel input)\n", | |
"\n", | |
"creation rows #: 815\n", | |
"\n", | |
"### Batch \n", | |
"\n", | |
"CPU times: user 77 ms, sys: 297 ms, total: 374 ms\n", | |
"Wall time: 20.4 s\n", | |
"\n", | |
"CPU times: user 106 ms, sys: 219 ms, total: 325 ms\n", | |
"Wall time: 16.9 s\n", | |
"\n", | |
"CPU times: user 142 ms, sys: 102 ms, total: 245 ms\n", | |
"Wall time: 15.4 s\n", | |
"\n", | |
"CPU times: user 150 ms, sys: 562 ms, total: 712 ms\n", | |
"Wall time: 18.8 s\n", | |
"\n", | |
"### Single\n", | |
"\n", | |
"CPU times: user 258 ms, sys: 182 ms, total: 440 ms\n", | |
"Wall time: 20.3 s\n", | |
"\n", | |
"CPU times: user 242 ms, sys: 425 ms, total: 667 ms\n", | |
"Wall time: 21.9 s\n", | |
"\n", | |
"CPU times: user 302 ms, sys: 427 ms, total: 730 ms\n", | |
"Wall time: 21.7 s\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Memo" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\"data\":{\"users\":[{\"name\":\"user1\",\"id\":\"cjkrq5vtv0048073925e1yf6s\",\"__typename\":\"User\"},{\"name\":\"user2\",\"id\":\"cjkrq6319004c0739o64pqjb4\",\"__typename\":\"User\"},{\"name\":\"user3\",\"id\":\"cjkrq6dda004g0739milhdgsb\",\"__typename\":\"User\"},{\"name\":\"user4\",\"id\":\"cjkyiszsx000o07394qoto31s\",\"__typename\":\"User\"}]}}\n" | |
] | |
} | |
], | |
"source": [ | |
"from graphqlclient import GraphQLClient\n", | |
"\n", | |
"client = GraphQLClient('http://10.0.2.2:4000')\n", | |
"\n", | |
"result = client.execute('''\n", | |
"{\n", | |
" users {\n", | |
" ...namename\n", | |
" }\n", | |
"}\n", | |
"\n", | |
"fragment namename on User {\n", | |
" name\n", | |
" id\n", | |
" __typename\n", | |
"}\n", | |
"''')\n", | |
"\n", | |
"print(result)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\"data\":{\"createUser\":{\"id\":\"cjkyiszsx000o07394qoto31s\",\"name\":\"user4\",\"address\":\"Yokohama\"}}}\n" | |
] | |
} | |
], | |
"source": [ | |
"from graphqlclient import GraphQLClient\n", | |
"\n", | |
"client = GraphQLClient('http://10.0.2.2:4000')\n", | |
"\n", | |
"result = client.execute('''\n", | |
"mutation ExampleMutation($name: String!, $address: String!) {\n", | |
" createUser(data: {\n", | |
" name: $name\n", | |
" address: $address\n", | |
" }) {\n", | |
" id\n", | |
" name\n", | |
" address\n", | |
" }\n", | |
"}\n", | |
"''', variables='''\n", | |
"{\n", | |
" \"name\": \"user4\",\n", | |
" \"address\": \"Yokohama\"\n", | |
"}\n", | |
"''')\n", | |
"\n", | |
"print(result)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment