serjee · June 12, 2020 20:27
diff --git a/pandas-instead-sql.ipynb b/pandas-instead-sql.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Python: Pandas instead SQL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# test data: https://ourairports.com/data/\n",
    "airports = pd.read_csv('https://ourairports.com/data/airports.csv')\n",
    "airport_freq = pd.read_csv('https://ourairports.com/data/airport-frequencies.csv')\n",
    "runways = pd.read_csv('https://ourairports.com/data/runways.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SELECT, WHERE, DISTINCT, LIMIT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# SQL: select * from airports\n",
    "airports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select * from airports limit 3\n",
    "airports.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select id from airports where ident = 'KLAX'\n",
    "airports[airports.ident == 'KLAX'].id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select distinct type from airport\n",
    "airports.type.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select * from airports where iso_region = 'US-CA' and type = 'seaplane_base'\n",
    "airports[(airports.iso_region == 'US-CA') & (airports.type == 'seaplane_base')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select ident, name, municipality from airports where iso_region = 'US-CA' and type = 'large_airport'\n",
    "airports[(airports.iso_region == 'US-CA') & (airports.type == 'large_airport')][['ident', 'name', 'municipality']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ORDER BY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type\n",
    "airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type desc\n",
    "airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## IN и NOT IN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select * from airports where type in ('heliport', 'balloonport')\n",
    "airports[airports.type.isin(['heliport', 'balloonport'])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select * from airports where type not in ('heliport', 'balloonport')\n",
    "airports[~airports.type.isin(['heliport', 'balloonport'])]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GROUP BY, COUNT, ORDER BY "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, type\n",
    "airports.groupby(['iso_country', 'type']).size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, count(*) desc\n",
    "airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, type\n",
    "airports.groupby(['iso_country', 'type']).size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, count(*) desc\n",
    "airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HAVING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select type, count(*) from airports where iso_country = 'US' group by type having count(*) > 1000 order by count(*) desc\n",
    "airports[airports.iso_country == 'US'].groupby('type').filter(lambda g: len(g) > 1000).groupby('type').size().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First N records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select iso_country from by_country order by size desc limit 10\n",
    "by_country.nlargest(10, columns='airport_count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select iso_country from by_country order by size desc limit 10 offset 10\n",
    "by_country.nlargest(20, columns='airport_count').tail(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregate Functions: MIN, MAX, MEAN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select max(length_ft), min(length_ft), mean(length_ft), median(length_ft) from runways\n",
    "runways.agg({'length_ft': ['min', 'max', 'mean', 'median']})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## JOIN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select airport_ident, type, description, frequency_mhz from airport_freq join airports on airport_freq.airport_ref = airports.id where airports.ident = 'KLAX'\n",
    "airport_freq.merge(airports[airports.ident == 'KLAX'][['id']], left_on='airport_ref', right_on='id', how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## UNION ALL и UNION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: select name, municipality from airports where ident = 'KLAX' union all select name, municipality from airports where ident = 'KLGB'\n",
    "pd.concat([airports[airports.ident == 'KLAX'][['name', 'municipality']], airports[airports.ident == 'KLGB'][['name', 'municipality']]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## INSERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: create table heroes (id integer, name text);\n",
    "# SQL: insert into heroes values (1, 'Harry Potter');\n",
    "# SQL: insert into heroes values (2, 'Ron Weasley');\n",
    "# SQL: insert into heroes values (3, 'Hermione Granger');\n",
    "df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']})\n",
    "df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']})\n",
    "pd.concat([df1, df2]).reset_index(drop=True)\n",
    "\n",
    "# Note: By default more Pandas operators return new object.\n",
    "# But some operators has accept param: inplace=True.\n",
    "# It's mean that we'll work with current dataframe object instead new.\n",
    "# If you need reset index for current datafreme object, you should use:\n",
    "#  df.reset_index(drop=True, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## UPDATE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: update airports set home_link = 'http://www.lawa.org/welcomelax.aspx' where ident == 'KLAX'\n",
    "airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DELETE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL: delete from lax_freq where type = 'MISC'\n",
    "lax_freq = lax_freq[lax_freq.type != 'MISC']\n",
    "lax_freq.drop(lax_freq[lax_freq.type == 'MISC'].index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pandas exprot to different formats "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv (...) # to the csv file\n",
    "df.to_hdf (...) # to HDF5 file\n",
    "df.to_pickle (...) # in a serialized object\n",
    "df.to_sql (...) # to the SQL database\n",
    "df.to_excel (...) # to Excel file\n",
    "df.to_json (...) # to JSON string\n",
    "df.to_html (...) # display as an HTML table\n",
    "df.to_feather (...) # in binary feather format\n",
    "df.to_latex (...) # to the table environment\n",
    "df.to_stata (...) # to Stata binary data files\n",
    "df.to_msgpack (...) # msgpack object (serialization)\n",
    "df.to_gbq (...) # in the BigQuery table (Google)\n",
    "df.to_string (...) # to console output\n",
    "df.to_clipboard (...) # to the clipboard that can be pasted into Excel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Graphs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_10.plot(\n",
    "    x='iso_country', \n",
    "    y='airport_count',\n",
    "    kind='barh',\n",
    "    figsize=(10, 7),\n",
    "    title='Top 10 countries with most airports')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Python: Pandas instead SQL"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"\n",
	"# test data: https://ourairports.com/data/\n",
	"airports = pd.read_csv('https://ourairports.com/data/airports.csv')\n",
	"airport_freq = pd.read_csv('https://ourairports.com/data/airport-frequencies.csv')\n",
	"runways = pd.read_csv('https://ourairports.com/data/runways.csv')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## SELECT, WHERE, DISTINCT, LIMIT"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"# SQL: select * from airports\n",
	"airports"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select * from airports limit 3\n",
	"airports.head(3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select id from airports where ident = 'KLAX'\n",
	"airports[airports.ident == 'KLAX'].id"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select distinct type from airport\n",
	"airports.type.unique()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select * from airports where iso_region = 'US-CA' and type = 'seaplane_base'\n",
	"airports[(airports.iso_region == 'US-CA') & (airports.type == 'seaplane_base')]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select ident, name, municipality from airports where iso_region = 'US-CA' and type = 'large_airport'\n",
	"airports[(airports.iso_region == 'US-CA') & (airports.type == 'large_airport')][['ident', 'name', 'municipality']]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## ORDER BY"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type\n",
	"airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type desc\n",
	"airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type', ascending=False)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## IN и NOT IN"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select * from airports where type in ('heliport', 'balloonport')\n",
	"airports[airports.type.isin(['heliport', 'balloonport'])]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select * from airports where type not in ('heliport', 'balloonport')\n",
	"airports[~airports.type.isin(['heliport', 'balloonport'])]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## GROUP BY, COUNT, ORDER BY "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, type\n",
	"airports.groupby(['iso_country', 'type']).size()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select iso_country, type, count() from airports group by iso_country, type order by iso_country, count() desc\n",
	"airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select iso_country, type, count(*) from airports group by iso_country, type order by iso_country, type\n",
	"airports.groupby(['iso_country', 'type']).size()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select iso_country, type, count() from airports group by iso_country, type order by iso_country, count() desc\n",
	"airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## HAVING"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select type, count() from airports where iso_country = 'US' group by type having count() > 1000 order by count(*) desc\n",
	"airports[airports.iso_country == 'US'].groupby('type').filter(lambda g: len(g) > 1000).groupby('type').size().sort_values(ascending=False)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## First N records"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select iso_country from by_country order by size desc limit 10\n",
	"by_country.nlargest(10, columns='airport_count')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select iso_country from by_country order by size desc limit 10 offset 10\n",
	"by_country.nlargest(20, columns='airport_count').tail(10)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Aggregate Functions: MIN, MAX, MEAN"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select max(length_ft), min(length_ft), mean(length_ft), median(length_ft) from runways\n",
	"runways.agg({'length_ft': ['min', 'max', 'mean', 'median']})"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## JOIN"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select airport_ident, type, description, frequency_mhz from airport_freq join airports on airport_freq.airport_ref = airports.id where airports.ident = 'KLAX'\n",
	"airport_freq.merge(airports[airports.ident == 'KLAX'][['id']], left_on='airport_ref', right_on='id', how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## UNION ALL и UNION"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: select name, municipality from airports where ident = 'KLAX' union all select name, municipality from airports where ident = 'KLGB'\n",
	"pd.concat([airports[airports.ident == 'KLAX'][['name', 'municipality']], airports[airports.ident == 'KLGB'][['name', 'municipality']]])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## INSERT"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: create table heroes (id integer, name text);\n",
	"# SQL: insert into heroes values (1, 'Harry Potter');\n",
	"# SQL: insert into heroes values (2, 'Ron Weasley');\n",
	"# SQL: insert into heroes values (3, 'Hermione Granger');\n",
	"df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']})\n",
	"df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']})\n",
	"pd.concat([df1, df2]).reset_index(drop=True)\n",
	"\n",
	"# Note: By default more Pandas operators return new object.\n",
	"# But some operators has accept param: inplace=True.\n",
	"# It's mean that we'll work with current dataframe object instead new.\n",
	"# If you need reset index for current datafreme object, you should use:\n",
	"# df.reset_index(drop=True, inplace=True)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## UPDATE"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: update airports set home_link = 'http://www.lawa.org/welcomelax.aspx' where ident == 'KLAX'\n",
	"airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## DELETE"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SQL: delete from lax_freq where type = 'MISC'\n",
	"lax_freq = lax_freq[lax_freq.type != 'MISC']\n",
	"lax_freq.drop(lax_freq[lax_freq.type == 'MISC'].index)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Pandas exprot to different formats "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"df.to_csv (...) # to the csv file\n",
	"df.to_hdf (...) # to HDF5 file\n",
	"df.to_pickle (...) # in a serialized object\n",
	"df.to_sql (...) # to the SQL database\n",
	"df.to_excel (...) # to Excel file\n",
	"df.to_json (...) # to JSON string\n",
	"df.to_html (...) # display as an HTML table\n",
	"df.to_feather (...) # in binary feather format\n",
	"df.to_latex (...) # to the table environment\n",
	"df.to_stata (...) # to Stata binary data files\n",
	"df.to_msgpack (...) # msgpack object (serialization)\n",
	"df.to_gbq (...) # in the BigQuery table (Google)\n",
	"df.to_string (...) # to console output\n",
	"df.to_clipboard (...) # to the clipboard that can be pasted into Excel"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Create Graphs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"top_10.plot(\n",
	" x='iso_country', \n",
	" y='airport_count',\n",
	" kind='barh',\n",
	" figsize=(10, 7),\n",
	" title='Top 10 countries with most airports')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}