Last active
August 29, 2015 14:25
-
-
Save mkcor/2a9d470eaf65cd1e7b3a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Graph MySQL data in Python using MySQLdb and Plotly" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This notebook uses the MySQL world database, which can be downloaded here: http://dev.mysql.com/doc/index-other.html" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Instructions for setting up the world database in MySQL are here: https://dev.mysql.com/doc/world-setup/en/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This notebook was created for this article in <em>Modern Data</em>: http://mod.plot.ly/graph-data-from-mysql-database-in-python/ " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Questions or comments? Tweet to @plotlygraphs or email feedback[at]plot[dot]ly" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import MySQLdb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import plotly.plotly as py\n", | |
"import plotly.graph_objs as pgo" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"conn = MySQLdb.connect(host=\"localhost\", user=\"root\", passwd=\"\", db=\"world\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"cursor = conn.cursor()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"239L" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cursor.execute('SELECT name, continent, population, lifeexpectancy, gnp from Country;')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"rows = cursor.fetchall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(('Aruba', 'North America', 103000L, 78.4, 828.0),\n", | |
" ('Afghanistan', 'Asia', 22720000L, 45.9, 5976.0),\n", | |
" ('Angola', 'Africa', 12878000L, 38.3, 6648.0),\n", | |
" ('Anguilla', 'North America', 8000L, 76.1, 63.2),\n", | |
" ('Albania', 'Europe', 3401200L, 71.6, 3205.0),\n", | |
" ('Andorra', 'Europe', 78000L, 83.5, 1630.0),\n", | |
" ('Netherlands Antilles', 'North America', 217000L, 74.7, 1941.0),\n", | |
" ('United Arab Emirates', 'Asia', 2441000L, 74.1, 37966.0),\n", | |
" ('Argentina', 'South America', 37032000L, 75.1, 340238.0),\n", | |
" ('Armenia', 'Asia', 3520000L, 66.4, 1813.0))" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rows[:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame([[ij for ij in i] for i in rows])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df.columns = ['Name', 'Continent', 'Population', 'LifeExpectancy', 'GNP']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df = df.sort(['LifeExpectancy'])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Some country names cause serialization errors in early versions of Plotly's Python client. The code block below takes care of this." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning:\n", | |
"\n", | |
"\n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", | |
"\n", | |
"/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:6: SettingWithCopyWarning:\n", | |
"\n", | |
"\n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"country_names = df['Name']\n", | |
"for i in range(len(country_names)):\n", | |
" try:\n", | |
" country_names[i] = str(country_names[i]).decode('utf-8')\n", | |
" except:\n", | |
" country_names[i] = 'Country name decode error'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"237 Zambia\n", | |
"143 Mozambique\n", | |
"148 Malawi\n", | |
"238 Zimbabwe\n", | |
"2 Angola\n", | |
"35 Botswana\n", | |
"182 Rwanda\n", | |
"201 Swaziland\n", | |
"153 Niger\n", | |
"151 Namibia\n", | |
"219 Uganda\n", | |
"36 Central African Republic\n", | |
"68 Ethiopia\n", | |
"42 Country name decode error\n", | |
"191 Sierra Leone\n", | |
"80 Guinea\n", | |
"1 Afghanistan\n", | |
"211 East Timor\n", | |
"17 Burundi\n", | |
"194 Somalia\n", | |
"20 Burkina Faso\n", | |
"138 Mali\n", | |
"45 Congo\n", | |
"111 Kenya\n", | |
"44 Congo, The Democratic Republic of the\n", | |
"83 Guinea-Bissau\n", | |
"96 Haiti\n", | |
"65 Western Sahara\n", | |
"75 Gabon\n", | |
"19 Benin\n", | |
" ... \n", | |
"79 Gibraltar\n", | |
"106 Italy\n", | |
"37 Canada\n", | |
"104 Iceland\n", | |
"92 Hong Kong\n", | |
"200 Sweden\n", | |
"39 Switzerland\n", | |
"14 Australia\n", | |
"186 Singapore\n", | |
"109 Japan\n", | |
"193 San Marino\n", | |
"129 Macao\n", | |
"5 Andorra\n", | |
"11 Antarctica\n", | |
"12 French Southern territories\n", | |
"34 Bouvet Island\n", | |
"38 Cocos (Keeling) Islands\n", | |
"52 Christmas Island\n", | |
"71 Falkland Islands\n", | |
"93 Heard Island and McDonald Islands\n", | |
"100 British Indian Ocean Territory\n", | |
"154 Norfolk Island\n", | |
"157 Niue\n", | |
"166 Pitcairn\n", | |
"187 South Georgia and the South Sandwich Islands\n", | |
"189 Svalbard and Jan Mayen\n", | |
"209 Tokelau\n", | |
"221 United States Minor Outlying Islands\n", | |
"225 Holy See (Vatican City State)\n", | |
"232 Wallis and Futuna\n", | |
"Name: Name, dtype: object" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['Name'].apply(lambda x: x.decode('utf-8'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"trace1 = pgo.Scatter(\n", | |
" x=df['GNP'],\n", | |
" y=df['LifeExpectancy'],\n", | |
" text=df['Name'],\n", | |
" mode='markers'\n", | |
")\n", | |
"layout = pgo.Layout(\n", | |
" title='Life expectancy vs GNP from MySQL world database',\n", | |
" xaxis=pgo.XAxis( type='log', title='GNP'),\n", | |
" yaxis=pgo.YAxis( title='Life expectancy'),\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\"seamless=\"seamless\" src=\"https://plot.ly/~marianne2/2202.embed\" height=\"525\" width=\"100%\"></iframe>" | |
], | |
"text/plain": [ | |
"<plotly.tools.PlotlyDisplay object>" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data = pgo.Data([trace1])\n", | |
"fig = pgo.Figure(data=data, layout=layout)\n", | |
"py.iplot(fig, filename='world GNP vs life expectancy')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<iframe id=\"igraph\" scrolling=\"no\" style=\"border:none;\"seamless=\"seamless\" src=\"https://plot.ly/~marianne2/2205.embed\" height=\"525\" width=\"100%\"></iframe>" | |
], | |
"text/plain": [ | |
"<plotly.tools.PlotlyDisplay object>" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# (!) Set 'size' values to be proportional to rendered area,\n", | |
"# instead of diameter. This makes the range of bubble sizes smaller\n", | |
"sizemode = 'area' \n", | |
"\n", | |
"# (!) Set a reference for 'size' values (i.e. a population-to-pixel scaling).\n", | |
"# Here the max bubble area will be on the order of 100 pixels\n", | |
"sizeref = df['Population'].max() / 1e2**2\n", | |
"\n", | |
"colors = {\n", | |
" 'Asia':\"rgb(255,65,54)\",\n", | |
" 'Europe':\"rgb(133,20,75)\",\n", | |
" 'Africa':\"rgb(0,116,217)\",\n", | |
" 'North America':\"rgb(255,133,27)\",\n", | |
" 'South America':\"rgb(23,190,207)\",\n", | |
" 'Antarctica':\"rgb(61,153,112)\",\n", | |
" 'Oceania':\"rgb(255,220,0)\",\n", | |
"}\n", | |
"\n", | |
"# Define a hover-text generating function (returns a list of strings)\n", | |
"def make_text(X):\n", | |
" return 'Country: %s\\\n", | |
" <br>Life Expectancy: %s years\\\n", | |
" <br>Population: %s million'\\\n", | |
" % (X['Name'], X['LifeExpectancy'], X['Population'] / 1e6) \n", | |
"\n", | |
"# Define a trace-generating function (returns a Scatter object)\n", | |
"def make_trace(X, continent, sizes, color): \n", | |
" return pgo.Scatter(\n", | |
" x=X['GNP'], # GDP on the x-xaxis\n", | |
" y=X['LifeExpectancy'], # life Exp on th y-axis\n", | |
" name=continent, # label continent names on hover\n", | |
" mode='markers', # (!) point markers only on this plot\n", | |
" text=X.apply(make_text, axis=1).tolist(),\n", | |
" marker=pgo.Marker(\n", | |
" color=color, # marker color\n", | |
" size=sizes, # (!) marker sizes (sizes is a list)\n", | |
" sizeref=sizeref, # link sizeref\n", | |
" sizemode=sizemode, # link sizemode\n", | |
" opacity=0.6, # (!) partly transparent markers\n", | |
" line=pgo.Line(width=3,color=\"white\") # marker borders\n", | |
" )\n", | |
" )\n", | |
"\n", | |
"# Initialize data object \n", | |
"data = pgo.Data()\n", | |
"\n", | |
"# Group data frame by continent sub-dataframe (named X), \n", | |
"# make one trace object per continent and append to data object\n", | |
"for continent, X in df.groupby('Continent'):\n", | |
" \n", | |
" sizes = X['Population'] # get population array \n", | |
" color = colors[continent] # get bubble color\n", | |
" \n", | |
" data.append(make_trace(X, continent, sizes, color)) # append trace to data object\n", | |
"\n", | |
" # Set plot and axis titles\n", | |
"title = \"Life expectancy vs GNP from MySQL world database (bubble chart)\"\n", | |
"x_title = \"Gross National Product\"\n", | |
"y_title = \"Life Expectancy [in years]\"\n", | |
"\n", | |
"# Define a dictionary of axis style options\n", | |
"axis_style = dict( \n", | |
" type='log',\n", | |
" zeroline=False, # remove thick zero line\n", | |
" gridcolor='#FFFFFF', # white grid lines\n", | |
" ticks='outside', # draw ticks outside axes \n", | |
" ticklen=8, # tick length\n", | |
" tickwidth=1.5 # and width\n", | |
")\n", | |
"\n", | |
"# Make layout object\n", | |
"layout = pgo.Layout(\n", | |
" title=title, # set plot title\n", | |
" plot_bgcolor='#EFECEA', # set plot color to grey\n", | |
" hovermode=\"closest\",\n", | |
" xaxis=pgo.XAxis(\n", | |
" axis_style, # add axis style dictionary\n", | |
" title=x_title, # x-axis title\n", | |
" range=[2.0,7.2], # log of min and max x limits\n", | |
" ),\n", | |
" yaxis=pgo.YAxis(\n", | |
" axis_style, # add axis style dictionary\n", | |
" title=y_title, # y-axis title\n", | |
" )\n", | |
")\n", | |
"\n", | |
"# Make Figure object\n", | |
"fig = pgo.Figure(data=data, layout=layout)\n", | |
"\n", | |
"# (@) Send to Plotly and show in notebook\n", | |
"py.iplot(fig, filename='s3_life-gdp')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment