naomi-henderson · February 14, 2021 12:20
diff --git a/GCS-CMIP6-sample_notebook.ipynb b/GCS-CMIP6-sample_notebook.ipynb
diff --git a/GCS-CMIP6-sample_notebook.py b/GCS-CMIP6-sample_notebook.py
 #!/usr/bin/env python
 # coding: utf-8

 # ## Exploring the Pangeo CMIP6 Google Cloud catalog
 # - You may need to update your `xarray` and `gcsfs` packages

 import numpy as np
 import pandas as pd
 import xarray as xr
 import gcsfs

 import matplotlib.pyplot as plt

 # Show the installed gcsfs and xarray versions.
 print(gcsfs.__version__, xr.__version__)
 # e.g. '0.3.1', '0.14.0'

 # Load the master CSV catalog of the CMIP6 zarr stores residing in Pangeo's Google Cloud Storage.

 catalog_url = 'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv'
 df = pd.read_csv(catalog_url)
 print(df.shape[0])  # number of catalog rows


 # Preview the first 5 rows of the `pandas.DataFrame`.
 print(df.head(5))


 # The `DataFrame` can be inspected interactively; its column labels are the "attributes" of the datasets.
 print(df.columns)


 # Connect to Google Cloud Storage anonymously, with read-only access.
 fs = gcsfs.GCSFileSystem(access='read_only', token='anon')


 # ### Any row of the `pandas.DataFrame` can be opened as an `xarray.Dataset`
 # - Each zarr path (zstore) is the location of a single dataset
 # - The original netcdf files were concatenated in time, so there is one dataset per attribute set
 # - Opening the store reads only the metadata, not the data (lazy loading)
 # - The metadata is 'consolidated' (all metadata in one file); passing 'consolidated=True' takes advantage of that

 # Use the first catalog row as an example:
 url = df['zstore'].values[0]
 store = fs.get_mapper(url)
 ds = xr.open_zarr(store, consolidated=True)
 print(ds)

 # ### Searching the csv catalog file
 # - If you are familiar with pandas dataframes, you will know many ways to subset them
 # - One of our favorites is to use `DataFrame.query`

 # Every catalog row for a single model.
 model_filter = "source_id == 'CESM2'"
 df_sub = df.query(model_filter)
 print(df_sub.head())

 # Narrow the search to one experiment / model / table combination.
 experiment_id, source_id, table_id = 'historical', 'CESM2', 'Amon'

 selection = f" experiment_id == '{experiment_id}' and source_id == '{source_id}' and table_id == '{table_id}' "
 df_sub = df.query(selection)

 # Which variables match this query?
 print(df_sub['variable_id'].unique())

 # ### Let's make a dictionary of all 'tas' (surface air temperature) datasets in df_sub

 df_tas = df_sub.query(" variable_id == 'tas' ")

 # Maps a concise dotted name to the opened xarray.Dataset, one entry per unique zarr store.
 dset_dict = {}
 for zstore in df_tas.zstore.unique():
     # Make a concise name from the zstore value: drop the bucket prefix,
     # strip any trailing '/', then turn the remaining '/' separators into '.'.
     # rstrip('/') is used instead of slicing off the last character so that a
     # path without a trailing slash does not lose the end of its name.
     name = zstore.split('gs://cmip6/CMIP6/')[1].rstrip('/').replace('/', '.')
     print(name)
     ds = xr.open_zarr(fs.get_mapper(zstore), consolidated=True)
     # Uncomment to inspect the dimension sizes:
     #print(dict(ds.dims))
     dset_dict[name] = ds

 # Note that you can use <TAB> after starting the dictionary name to see the choices
 ds = dset_dict['CMIP.NCAR.CESM2.historical.r1i1p1f1.Amon.tas.gn.v20190308']
 print(ds)
	#!/usr/bin/env python
	# coding: utf-8

	# ## Exploring the Pangeo CMIP6 Google Cloud catalog
	# - You may need to update your `xarray` and `gcsfs` packages

	import numpy as np
	import pandas as pd
	import xarray as xr
	import gcsfs

	import matplotlib.pyplot as plt

	# Show the installed gcsfs and xarray versions.
	print(gcsfs.__version__, xr.__version__)
	# e.g. '0.3.1', '0.14.0'

	# Load the master CSV catalog of the CMIP6 zarr stores residing in Pangeo's Google Cloud Storage.

	catalog_url = 'https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv'
	df = pd.read_csv(catalog_url)
	print(df.shape[0])  # number of catalog rows


	# Preview the first 5 rows of the `pandas.DataFrame`.
	print(df.head(5))


	# The `DataFrame` can be inspected interactively; its column labels are the "attributes" of the datasets.
	print(df.columns)


	# Connect to Google Cloud Storage anonymously, with read-only access.
	fs = gcsfs.GCSFileSystem(access='read_only', token='anon')


	# ### Any row of the `pandas.DataFrame` can be opened as an `xarray.Dataset`
	# - Each zarr path (zstore) is the location of a single dataset
	# - The original netcdf files were concatenated in time, so there is one dataset per attribute set
	# - Opening the store reads only the metadata, not the data (lazy loading)
	# - The metadata is 'consolidated' (all metadata in one file); passing 'consolidated=True' takes advantage of that

	# Use the first catalog row as an example:
	url = df['zstore'].values[0]
	store = fs.get_mapper(url)
	ds = xr.open_zarr(store, consolidated=True)
	print(ds)

	# ### Searching the csv catalog file
	# - If you are familiar with pandas dataframes, you will know many ways to subset them
	# - One of our favorites is to use `DataFrame.query`

	# Every catalog row for a single model.
	model_filter = "source_id == 'CESM2'"
	df_sub = df.query(model_filter)
	print(df_sub.head())

	# Narrow the search to one experiment / model / table combination.
	experiment_id, source_id, table_id = 'historical', 'CESM2', 'Amon'

	selection = f" experiment_id == '{experiment_id}' and source_id == '{source_id}' and table_id == '{table_id}' "
	df_sub = df.query(selection)

	# Which variables match this query?
	print(df_sub['variable_id'].unique())

	# ### Let's make a dictionary of all 'tas' (surface air temperature) datasets in df_sub

	df_tas = df_sub.query(" variable_id == 'tas' ")

	# Maps a concise dotted name to the opened xarray.Dataset, one entry per unique zarr store.
	dset_dict = {}
	for zstore in df_tas.zstore.unique():
	    # Make a concise name from the zstore value: drop the bucket prefix,
	    # strip any trailing '/', then turn the remaining '/' separators into '.'.
	    # rstrip('/') is used instead of slicing off the last character so that a
	    # path without a trailing slash does not lose the end of its name.
	    name = zstore.split('gs://cmip6/CMIP6/')[1].rstrip('/').replace('/', '.')
	    print(name)
	    ds = xr.open_zarr(fs.get_mapper(zstore), consolidated=True)
	    # Uncomment to inspect the dimension sizes:
	    #print(dict(ds.dims))
	    dset_dict[name] = ds

	# Note that you can use <TAB> after starting the dictionary name to see the choices
	ds = dset_dict['CMIP.NCAR.CESM2.historical.r1i1p1f1.Amon.tas.gn.v20190308']
	print(ds)