naomi-henderson · November 21, 2020 23:13
diff --git a/Basics1.ipynb b/Basics1.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Search the CMIP6 Google Public Collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the master CMIP6 Google Cloud catalog (published as a CSV file)\n",
    "#    into a pandas DataFrame, reading every column as text.\n",
    "\n",
    "catalog_url = 'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv'\n",
    "df_cloud = pd.read_csv(catalog_url, dtype='unicode')\n",
    "\n",
    "# Show the column names of the catalog\n",
    "df_cloud.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explore this Pandas DataFrame - see https://realpython.com/pandas-python-explore-dataset/\n",
    "#    uncomment the other lines to get more examples - results printed only for the last uncommented line\n",
    "\n",
    "df_cloud.describe()\n",
    "#df_cloud.activity_id.unique()\n",
    "#df_cloud.activity_id.value_counts()\n",
    "#df_cloud.loc[df_cloud.activity_id == \"ScenarioMIP\", \"experiment_id\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Examples of selecting subsets of the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows of the 'CMIP' activity, then list the distinct experiment names\n",
    "is_cmip = df_cloud['activity_id'] == 'CMIP'\n",
    "df_CMIP = df_cloud[is_cmip]\n",
    "df_CMIP['experiment_id'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a dataframe of the 3 hourly data: every row whose table_id ends in '3hr'.\n",
    "#   na=False counts a missing table_id as a non-match, so the boolean mask never\n",
    "#   contains NaN (which would make the row selection raise an error).\n",
    "df_3hourly = df_cloud[df_cloud.table_id.str.endswith('3hr', na=False)]\n",
    "\n",
    "# Compare the size of the full catalog with the size of the 3 hourly subset\n",
    "len(df_cloud), len(df_3hourly)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Okay, now let's get down to business.\n",
    "#   Make a dictionary of desired choices (catalog column -> required value):\n",
    "search = {\n",
    "    'activity_id': 'ScenarioMIP',\n",
    "    'table_id': 'Amon',\n",
    "    'variable_id': 'tas',\n",
    "    'grid_label': 'gn',\n",
    "}\n",
    "\n",
    "# Successively narrow the catalog, one search key at a time.\n",
    "#   Boolean indexing returns a new DataFrame on every pass, so df_cloud itself\n",
    "#   is never modified and no up-front copy of the whole catalog is needed.\n",
    "df = df_cloud\n",
    "for key, value in search.items():\n",
    "    df = df[df[key] == value]\n",
    "\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Or make a pivot_table to display key fields:\n",
    "pd.set_option('display.width', 1000)  # allow up to 1000 characters per printed line\n",
    "\n",
    "# Count the distinct member_id values for each (experiment_id, source_id) pair.\n",
    "#   Selecting the column before nunique() avoids counting every other column too.\n",
    "dm = df.groupby(['experiment_id', 'source_id'])[['member_id']].nunique()\n",
    "\n",
    "# Rows = source_id, columns = experiment_id; combinations with no data show as 0.\n",
    "#   aggfunc is given by name ('sum'): passing np.sum is deprecated in recent pandas.\n",
    "table = dm.pivot_table(values='member_id',\n",
    "                       index=['source_id'],\n",
    "                       columns=['experiment_id'],\n",
    "                       aggfunc='sum',\n",
    "                       fill_value=0)\n",
    "\n",
    "print(search)\n",
    "print('\\nNumber of ensemble members available:')\n",
    "print(table)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pangeo-Oct2019",
   "language": "python",
   "name": "pangeo-oct2019"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Search the CMIP6 Google Public Collection"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load the master CMIP6 Google Cloud catalog (published as a CSV file)\n",
	"# into a pandas DataFrame, reading every column as text.\n",
	"\n",
	"catalog_url = 'https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv'\n",
	"df_cloud = pd.read_csv(catalog_url, dtype='unicode')\n",
	"\n",
	"# Show the column names of the catalog\n",
	"df_cloud.keys()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Explore this Pandas DataFrame - see https://realpython.com/pandas-python-explore-dataset/\n",
	"# uncomment the other lines to get more examples - results printed only for the last uncommented line\n",
	"\n",
	"df_cloud.describe()\n",
	"#df_cloud.activity_id.unique()\n",
	"#df_cloud.activity_id.value_counts()\n",
	"#df_cloud.loc[df_cloud.activity_id == \"ScenarioMIP\", \"experiment_id\"].value_counts()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Examples of selecting subsets of the dataframe"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep only the rows of the 'CMIP' activity, then list the distinct experiment names\n",
	"is_cmip = df_cloud['activity_id'] == 'CMIP'\n",
	"df_CMIP = df_cloud[is_cmip]\n",
	"df_CMIP['experiment_id'].unique()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Create a dataframe of the 3 hourly data: every row whose table_id ends in '3hr'.\n",
	"# na=False counts a missing table_id as a non-match, so the boolean mask never\n",
	"# contains NaN (which would make the row selection raise an error).\n",
	"df_3hourly = df_cloud[df_cloud.table_id.str.endswith('3hr', na=False)]\n",
	"\n",
	"# Compare the size of the full catalog with the size of the 3 hourly subset\n",
	"len(df_cloud), len(df_3hourly)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Okay, now let's get down to business.\n",
	"# Make a dictionary of desired choices (catalog column -> required value):\n",
	"search = {\n",
	"    'activity_id': 'ScenarioMIP',\n",
	"    'table_id': 'Amon',\n",
	"    'variable_id': 'tas',\n",
	"    'grid_label': 'gn',\n",
	"}\n",
	"\n",
	"# Successively narrow the catalog, one search key at a time.\n",
	"# Boolean indexing returns a new DataFrame on every pass, so df_cloud itself\n",
	"# is never modified and no up-front copy of the whole catalog is needed.\n",
	"df = df_cloud\n",
	"for key, value in search.items():\n",
	"    df = df[df[key] == value]\n",
	"\n",
	"df.describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Or make a pivot_table to display key fields:\n",
	"pd.set_option('display.width', 1000)  # allow up to 1000 characters per printed line\n",
	"\n",
	"# Count the distinct member_id values for each (experiment_id, source_id) pair.\n",
	"# Selecting the column before nunique() avoids counting every other column too.\n",
	"dm = df.groupby(['experiment_id', 'source_id'])[['member_id']].nunique()\n",
	"\n",
	"# Rows = source_id, columns = experiment_id; combinations with no data show as 0.\n",
	"# aggfunc is given by name ('sum'): passing np.sum is deprecated in recent pandas.\n",
	"table = dm.pivot_table(values='member_id',\n",
	"                       index=['source_id'],\n",
	"                       columns=['experiment_id'],\n",
	"                       aggfunc='sum',\n",
	"                       fill_value=0)\n",
	"\n",
	"print(search)\n",
	"print('\\nNumber of ensemble members available:')\n",
	"print(table)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "pangeo-Oct2019",
	"language": "python",
	"name": "pangeo-oct2019"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}