naomi-henderson · February 3, 2020 20:15
diff --git a/PivotTables.ipynb b/PivotTables.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define a simple search on keywords\n",
    "def search_df(df, verbose= False, **search):\n",
    "    \"search by keywords - if list, then match exactly, otherwise match as substring\"\n",
    "    keys = ['activity_id','institution_id','source_id','experiment_id','member_id', 'table_id', 'variable_id', 'grid_label']\n",
    "    d = df\n",
    "    for skey in search.keys():\n",
    "        \n",
    "        if isinstance(search[skey], str):  # match a string as a substring\n",
    "            d = d[d[skey].str.contains(search[skey])]\n",
    "        else:\n",
    "            dk = []\n",
    "            for key in search[skey]:       # match a list of strings exactly\n",
    "                dk += [d[d[skey]==key]]\n",
    "            d = pd.concat(dk)\n",
    "            keys.remove(skey)\n",
    "    if verbose:\n",
    "        for key in keys:\n",
    "            print(key,' = ',list(d[key].unique()))      \n",
    "    return d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dfp = search_df(df, experiment_id=['historical','1pctCO2'], table_id='Omon', variable_id=['tos','o2','chl','dissic'], grid_label=['gn'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dm = dfp[['experiment_id','source_id','variable_id','member_id',]].groupby(['experiment_id','source_id','variable_id']).nunique()[['member_id']]\n",
    "\n",
    "table = pd.DataFrame.pivot_table(dm, values='member_id', index=['experiment_id','source_id'],\n",
    "                                 columns=['variable_id'], aggfunc=np.sum, fill_value=0)\n",
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "table[(table.chl>0)&(table.dissic>0)&(table.tos>0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pangeo-Oct2019",
   "language": "python",
   "name": "pangeo-oct2019"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# define a simple search on keywords\n",
	"def search_df(df, verbose= False, **search):\n",
	" \"search by keywords - if list, then match exactly, otherwise match as substring\"\n",
	" keys = ['activity_id','institution_id','source_id','experiment_id','member_id', 'table_id', 'variable_id', 'grid_label']\n",
	" d = df\n",
	" for skey in search.keys():\n",
	" \n",
	" if isinstance(search[skey], str): # match a string as a substring\n",
	" d = d[d[skey].str.contains(search[skey])]\n",
	" else:\n",
	" dk = []\n",
	" for key in search[skey]: # match a list of strings exactly\n",
	" dk += [d[d[skey]==key]]\n",
	" d = pd.concat(dk)\n",
	" keys.remove(skey)\n",
	" if verbose:\n",
	" for key in keys:\n",
	" print(key,' = ',list(d[key].unique())) \n",
	" return d"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"dfp = search_df(df, experiment_id=['historical','1pctCO2'], table_id='Omon', variable_id=['tos','o2','chl','dissic'], grid_label=['gn'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"dm = dfp[['experiment_id','source_id','variable_id','member_id',]].groupby(['experiment_id','source_id','variable_id']).nunique()[['member_id']]\n",
	"\n",
	"table = pd.DataFrame.pivot_table(dm, values='member_id', index=['experiment_id','source_id'],\n",
	" columns=['variable_id'], aggfunc=np.sum, fill_value=0)\n",
	"table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"table[(table.chl>0)&(table.dissic>0)&(table.tos>0)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "pangeo-Oct2019",
	"language": "python",
	"name": "pangeo-oct2019"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}