Google Cloud CMIP6 sample notebook and Python code
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploring the Pangeo CMIP6 Google Cloud catalog\n",
"- You may need to update your `xarray` and `gcsfs` packages"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('0.3.1', '0.14.0')"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import xarray as xr\n",
"import gcsfs\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"gcsfs.__version__, xr.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Open the master collection catalog for CMIP6 data residing in Pangeo's Google Cloud Storage. "
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200903"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')\n",
"len(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"view the first 5 lines of the `pandas.DataFrame`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It is possible to interact with the `DataFrame`; for instance, we can see what the \"attributes\" of the datasets are by printing the columns."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.keys()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's connect to Google Cloud Storage and start looking at the data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fs = gcsfs.GCSFileSystem(token='anon', access='read_only')"
]
},
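{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check, and assuming anonymous listing is permitted on the public bucket, the standard `fsspec` method `fs.ls` shows its top level:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fs.ls('cmip6')[:10]  # first few entries at the top of the bucket"
]
},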
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Now we can import any line of the `pandas.DataFrame` into an `xarray.Dataset`\n",
"- Each zarr path (zstore) is the location of a single dataset\n",
"- The original netcdf files have been concatenated in time, thus there is one dataset per attribute set\n",
"- Opening the store only reads the metadata, not the data (lazy loading)\n",
"- The metadata has been 'consolidated' (all metadata in one file), so we take advantage of this by setting 'consolidated=True' "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Grab the first line as an example:\n",
"url = df.zstore.values[0]\n",
"xr.open_zarr(fs.get_mapper(url),consolidated=True)"
]
},
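{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the open is lazy, inspecting a variable touches only the consolidated metadata; bytes are read from GCS only when values are actually needed. A minimal sketch (which data variable comes first depends on the catalog row):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds = xr.open_zarr(fs.get_mapper(url), consolidated=True)\n",
"var = list(ds.data_vars)[0]           # first data variable, whatever it is\n",
"print(ds[var].shape, ds[var].dtype)   # known from the metadata alone, no data read\n",
"# ds[var].isel({ds[var].dims[0]: 0}).load()  # uncomment to fetch one slice on demand"
]
},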
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Searching the csv catalog file\n",
"- If you are familiar with pandas dataframes, you will know many ways to subset them\n",
"- One of our favorites is the use `panda.query`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_sub = df.query(\"source_id == 'CESM2'\")\n",
"df_sub.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment_id = 'historical'\n",
"source_id = 'CESM2'\n",
"table_id = 'Amon'\n",
"\n",
"df_sub = df.query(f\" experiment_id == '{experiment_id}' and source_id == '{source_id}' and table_id == '{table_id}' \")\n",
"\n",
"# what variables match this query?\n",
"df_sub.variable_id.unique()"
]
},
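{
"cell_type": "markdown",
"metadata": {},
"source": [
"pandas can also summarize the matches. For example, assuming the catalog's standard `member_id` column, we can count how many ensemble members each variable has:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_sub.groupby('variable_id')['member_id'].nunique().sort_values(ascending=False).head(10)"
]
},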
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lets make a dictionary for all 'tas' (surface air temperature) datasets in df_sub"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_tas = df_sub.query(\" variable_id == 'tas' \")\n",
"\n",
"dset_dict = {}\n",
"for zstore in df_tas.zstore.unique():\n",
" # make a nice concise name from the zstore value\n",
" name = zstore.split('gs://cmip6/CMIP6/')[1].replace('/','.')[:-1]\n",
" print(name)\n",
" ds = xr.open_zarr(fs.get_mapper(zstore),consolidated=True)\n",
" #print(dict(ds.dims))\n",
" dset_dict[name] = ds\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Note that you can use <TAB> after starting the dictionary name to see the choices\n",
"ds = dset_dict['CMIP.NCAR.CESM2.historical.r1i1p1f1.Amon.tas.gn.v20190308']\n",
"# plot first and last times and difference\n",
"ds.tas[0].plot(aspect=2, size=3)\n",
"ds.tas[-1].plot(aspect=2, size=3)\n",
"(ds.tas[-1] - ds.tas[0]).plot(aspect=2, size=3)\n",
"plt.tight_layout()\n",
"plt.draw()"
]
},
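{
"cell_type": "markdown",
"metadata": {},
"source": [
"One more sketch: an area-weighted global-mean time series. This assumes a regular latitude-longitude grid with `lat`/`lon` dimensions and uses cos(latitude) weights; the `.plot()` call is what finally pulls the data from GCS (plotting a `cftime` axis may also require the `nc-time-axis` package)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"w = np.cos(np.deg2rad(ds.lat))                          # cos(latitude) area weights\n",
"tas_gm = (ds.tas.mean('lon') * w).sum('lat') / w.sum()  # weighted global mean\n",
"tas_gm.plot()"
]
}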
],
"metadata": {
"kernelspec": {
"display_name": "pangeo-Oct2019",
"language": "python",
"name": "pangeo-oct2019"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
#!/usr/bin/env python
# coding: utf-8
# ## Exploring the Pangeo CMIP6 Google Cloud catalog
# - You may need to update your `xarray` and `gcsfs` packages
import numpy as np
import pandas as pd
import xarray as xr
import gcsfs
import matplotlib.pyplot as plt
print(gcsfs.__version__, xr.__version__)
#'0.3.1', '0.14.0'
# Open the master collection catalog for CMIP6 data residing in Pangeo's Google Cloud Storage.
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')
print(len(df))
# View the first 5 rows of the `pandas.DataFrame`
print(df.head())
# It is possible to interact with the `DataFrame`; for instance, we can see what the "attributes" of the datasets are by printing the columns.
print(df.keys())
# Now let's connect to Google Cloud Storage and start looking at the data
fs = gcsfs.GCSFileSystem(token='anon', access='read_only')
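# As an optional sanity check, and assuming anonymous listing is permitted on
# the public bucket, the standard fsspec method fs.ls shows its top level:
print(fs.ls('cmip6')[:10])  # first few entries at the top of the bucket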
# ### Now we can load any row of the `pandas.DataFrame` into an `xarray.Dataset`
# - Each zarr path (`zstore`) is the location of a single dataset
# - The original netCDF files have been concatenated in time, so there is one dataset per attribute set
# - Opening the store reads only the metadata, not the data (lazy loading; a short sketch of this follows below)
# - The metadata has been consolidated (all metadata in one file), so we take advantage of this by setting `consolidated=True`
# Grab the first line as an example:
url = df.zstore.values[0]
ds = xr.open_zarr(fs.get_mapper(url), consolidated=True)
print(ds)
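# Since the open is lazy, inspecting a variable touches only the consolidated
# metadata; bytes are read from GCS only when values are actually needed.
# A minimal sketch (which data variable comes first depends on the catalog row):
var = list(ds.data_vars)[0]           # first data variable, whatever it is
print(ds[var].shape, ds[var].dtype)   # known from the metadata alone, no data read
# ds[var].isel({ds[var].dims[0]: 0}).load()  # uncomment to fetch one slice on demand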
# ### Searching the CSV catalog file
# - If you are familiar with pandas DataFrames, you will know many ways to subset them
# - One of our favorites is `pandas.DataFrame.query`
df_sub = df.query("source_id == 'CESM2'")
print(df_sub.head())
experiment_id = 'historical'
source_id = 'CESM2'
table_id = 'Amon'
df_sub = df.query(f" experiment_id == '{experiment_id}' and source_id == '{source_id}' and table_id == '{table_id}' ")
# what variables match this query?
print(df_sub.variable_id.unique())
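# pandas can also summarize the matches. For example, assuming the catalog's
# standard 'member_id' column, count how many ensemble members each variable has:
print(df_sub.groupby('variable_id')['member_id'].nunique().sort_values(ascending=False).head(10))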
# ### Let's make a dictionary of all 'tas' (surface air temperature) datasets in df_sub
df_tas = df_sub.query(" variable_id == 'tas' ")
dset_dict = {}
for zstore in df_tas.zstore.unique():
    # make a nice concise name from the zstore value
    name = zstore.split('gs://cmip6/CMIP6/')[1].replace('/', '.')[:-1]
    print(name)
    ds = xr.open_zarr(fs.get_mapper(zstore), consolidated=True)
    # print(dict(ds.dims))  # uncomment to see the dimensions
    dset_dict[name] = ds
# Pick one dataset by its key (in a notebook, <TAB> after the dictionary name shows the choices)
ds = dset_dict['CMIP.NCAR.CESM2.historical.r1i1p1f1.Amon.tas.gn.v20190308']
print(ds)
# plot first and last times and their difference
ds.tas[0].plot(aspect=2, size=3)
ds.tas[-1].plot(aspect=2, size=3)
(ds.tas[-1] - ds.tas[0]).plot(aspect=2, size=3)
plt.tight_layout()
plt.show()
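# One more sketch: an area-weighted global-mean time series. This assumes a
# regular latitude-longitude grid with 'lat'/'lon' dimensions and uses
# cos(latitude) weights; the .plot() call is what finally pulls the data from
# GCS (plotting a cftime axis may also require the nc-time-axis package).
w = np.cos(np.deg2rad(ds.lat))                          # cos(latitude) area weights
tas_gm = (ds.tas.mean('lon') * w).sum('lat') / w.sum()  # weighted global mean
tas_gm.plot()
plt.show()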