Created
June 12, 2020 20:27
-
-
Save serjee/b0e856e86efd28e49af287d768b990af to your computer and use it in GitHub Desktop.
Python: Pandas instead SQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Python: Pandas instead SQL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"# test data: https://ourairports.com/data/\n", | |
"airports = pd.read_csv('https://ourairports.com/data/airports.csv')\n", | |
"airport_freq = pd.read_csv('https://ourairports.com/data/airport-frequencies.csv')\n", | |
"runways = pd.read_csv('https://ourairports.com/data/runways.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## SELECT, WHERE, DISTINCT, LIMIT" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select * from airports\n", | |
"airports" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select * from airports limit 3\n", | |
"airports.head(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select id from airports where ident = 'KLAX'\n", | |
"airports[airports.ident == 'KLAX'].id" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select distinct type from airport\n", | |
"airports.type.unique()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select * from airports where iso_region = 'US-CA' and type = 'seaplane_base'\n", | |
"airports[(airports.iso_region == 'US-CA') & (airports.type == 'seaplane_base')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select ident, name, municipality from airports where iso_region = 'US-CA' and type = 'large_airport'\n", | |
"airports[(airports.iso_region == 'US-CA') & (airports.type == 'large_airport')][['ident', 'name', 'municipality']]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## ORDER BY" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type\n", | |
"airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type desc\n", | |
"airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type', ascending=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## IN и NOT IN" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select * from airports where type in ('heliport', 'balloonport')\n", | |
"airports[airports.type.isin(['heliport', 'balloonport'])]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select * from airports where type not in ('heliport', 'balloonport')\n", | |
"airports[~airports.type.isin(['heliport', 'balloonport'])]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## GROUP BY, COUNT, ORDER BY " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, type\n", | |
"airports.groupby(['iso_country', 'type']).size()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, count(*) desc\n", | |
"airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, type\n", | |
"airports.groupby(['iso_country', 'type']).size()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, count(*) desc\n", | |
"airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## HAVING" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select type, count(*) from airports where iso_country = 'US' group by type having count(*) > 1000 order by count(*) desc\n", | |
"airports[airports.iso_country == 'US'].groupby('type').filter(lambda g: len(g) > 1000).groupby('type').size().sort_values(ascending=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## First N records" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select iso_country from by_country order by size desc limit 10\n", | |
"by_country.nlargest(10, columns='airport_count')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select iso_country from by_country order by size desc limit 10 offset 10\n", | |
"by_country.nlargest(20, columns='airport_count').tail(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Aggregate Functions: MIN, MAX, MEAN" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select max(length_ft), min(length_ft), mean(length_ft), median(length_ft) from runways\n", | |
"runways.agg({'length_ft': ['min', 'max', 'mean', 'median']})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## JOIN" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select airport_ident, type, description, frequency_mhz from airport_freq join airports on airport_freq.airport_ref = airports.id where airports.ident = 'KLAX'\n", | |
"airport_freq.merge(airports[airports.ident == 'KLAX'][['id']], left_on='airport_ref', right_on='id', how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## UNION ALL и UNION" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: select name, municipality from airports where ident = 'KLAX' union all select name, municipality from airports where ident = 'KLGB'\n", | |
"pd.concat([airports[airports.ident == 'KLAX'][['name', 'municipality']], airports[airports.ident == 'KLGB'][['name', 'municipality']]])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## INSERT" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: create table heroes (id integer, name text);\n", | |
"# SQL: insert into heroes values (1, 'Harry Potter');\n", | |
"# SQL: insert into heroes values (2, 'Ron Weasley');\n", | |
"# SQL: insert into heroes values (3, 'Hermione Granger');\n", | |
"df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']})\n", | |
"df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']})\n", | |
"pd.concat([df1, df2]).reset_index(drop=True)\n", | |
"\n", | |
"# Note: By default more Pandas operators return new object.\n", | |
"# But some operators has accept param: inplace=True.\n", | |
"# It's mean that we'll work with current dataframe object instead new.\n", | |
"# If you need reset index for current datafreme object, you should use:\n", | |
"# df.reset_index(drop=True, inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## UPDATE" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: update airports set home_link = 'http://www.lawa.org/welcomelax.aspx' where ident == 'KLAX'\n", | |
"airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## DELETE" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# SQL: delete from lax_freq where type = 'MISC'\n", | |
"lax_freq = lax_freq[lax_freq.type != 'MISC']\n", | |
"lax_freq.drop(lax_freq[lax_freq.type == 'MISC'].index)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Pandas exprot to different formats " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.to_csv (...) # to the csv file\n", | |
"df.to_hdf (...) # to HDF5 file\n", | |
"df.to_pickle (...) # in a serialized object\n", | |
"df.to_sql (...) # to the SQL database\n", | |
"df.to_excel (...) # to Excel file\n", | |
"df.to_json (...) # to JSON string\n", | |
"df.to_html (...) # display as an HTML table\n", | |
"df.to_feather (...) # in binary feather format\n", | |
"df.to_latex (...) # to the table environment\n", | |
"df.to_stata (...) # to Stata binary data files\n", | |
"df.to_msgpack (...) # msgpack object (serialization)\n", | |
"df.to_gbq (...) # in the BigQuery table (Google)\n", | |
"df.to_string (...) # to console output\n", | |
"df.to_clipboard (...) # to the clipboard that can be pasted into Excel" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Create Graphs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"top_10.plot(\n", | |
" x='iso_country', \n", | |
" y='airport_count',\n", | |
" kind='barh',\n", | |
" figsize=(10, 7),\n", | |
" title='Top 10 countries with most airports')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment