naomi-henderson · November 21, 2020 23:18
diff --git a/Basics2.ipynb b/Basics2.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read CMIP6 datasets and store locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xarray as xr\n",
    "import os\n",
    "import gcsfs\n",
    "from glob import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify a local path to put the netcdf files\n",
    "local_path = 'files_nc'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def search_df(df, verbose= False, **search):\n",
    "    '''search by keywords - if list, then match exactly, otherwise match as substring'''\n",
    "    keys = ['activity_id','institution_id','source_id','experiment_id','member_id', 'table_id', 'variable_id', 'grid_label']\n",
    "    d = df\n",
    "    for skey in search.keys():\n",
    "        if isinstance(search[skey], str):  # match a string as a substring\n",
    "            d = d[d[skey].str.contains(search[skey])]\n",
    "        else:\n",
    "            dk = []\n",
    "            for key in search[skey]:       # match a list of strings exactly\n",
    "                dk += [d[d[skey]==key]]\n",
    "            d = pd.concat(dk)\n",
    "            keys.remove(skey)\n",
    "    if verbose:\n",
    "        for key in keys:\n",
    "            print(key,' = ',list(d[key].unique()))      \n",
    "    return d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_zid(gsurl):\n",
    "    ''' given a GCS zarr location, return the dataset_id'''\n",
    "    assert gsurl[:10] == 'gs://cmip6'\n",
    "    return gsurl[11:-1].split('/')\n",
    "\n",
    "def get_zdict(gsurl):\n",
    "    ''' given a GCS zarr location, return a dictionary of keywords'''\n",
    "    zid = get_zid(gsurl)\n",
    "    keys = ['activity_id','institution_id','source_id','experiment_id','member_id','table_id','variable_id','grid_label']\n",
    "    values = list(zid)\n",
    "    return dict(zip(keys,values)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cloud = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv', dtype='unicode')\n",
    "\n",
    "fs = gcsfs.GCSFileSystem(token='anon', access='read_only')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Here we search the CMIP6 data for the datasets you need - using the same keywords as at the ESGF sites\n",
    "#       https://esgf-node.llnl.gov/search/cmip6/\n",
    "\n",
    "search = {}\n",
    "search['table_id'] = 'Amon'\n",
    "search['experiment_id'] = ['historical','ssp370']\n",
    "search['variable_id'] = ['tas']\n",
    "search['institution_id'] = ['NOAA-GFDL']\n",
    "    \n",
    "df_available = search_df(df_cloud, **search)\n",
    "\n",
    "print('number of matching datasets',len(df_available))\n",
    "df_available.zstore.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gsurls = df_available.zstore.values\n",
    "\n",
    "for gsurl in gsurls:\n",
    "    print(gsurl)\n",
    "    zdict = get_zdict(gsurl)\n",
    "    ncdir = local_path + gsurl[10:]\n",
    "    \n",
    "    model = zdict['source_id']\n",
    "    variable = zdict['variable_id']\n",
    "    \n",
    "    ncfiles = glob(f'{ncdir}{variable}*.nc')\n",
    "    if len(ncfiles) > 0:\n",
    "        print(ncfiles, 'already exists')\n",
    "        continue\n",
    "    \n",
    "    ds = xr.open_zarr(fs.get_mapper(gsurl),consolidated=True)\n",
    "\n",
    "    ncfile = f'{ncdir}{variable}.nc'\n",
    "    os.system(f'mkdir -p {ncdir}')\n",
    "    ds.to_netcdf(ncfile,mode='w',unlimited_dims='time')  \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! tree -L 9 files_nc"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pangeo-Oct2019",
   "language": "python",
   "name": "pangeo-oct2019"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Read CMIP6 datasets and store locally"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import xarray as xr\n",
	"import os\n",
	"import gcsfs\n",
	"from glob import glob"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# specify a local path to put the netcdf files\n",
	"local_path = 'files_nc'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def search_df(df, verbose= False, **search):\n",
	" '''search by keywords - if list, then match exactly, otherwise match as substring'''\n",
	" keys = ['activity_id','institution_id','source_id','experiment_id','member_id', 'table_id', 'variable_id', 'grid_label']\n",
	" d = df\n",
	" for skey in search.keys():\n",
	" if isinstance(search[skey], str): # match a string as a substring\n",
	" d = d[d[skey].str.contains(search[skey])]\n",
	" else:\n",
	" dk = []\n",
	" for key in search[skey]: # match a list of strings exactly\n",
	" dk += [d[d[skey]==key]]\n",
	" d = pd.concat(dk)\n",
	" keys.remove(skey)\n",
	" if verbose:\n",
	" for key in keys:\n",
	" print(key,' = ',list(d[key].unique())) \n",
	" return d"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_zid(gsurl):\n",
	" ''' given a GCS zarr location, return the dataset_id'''\n",
	" assert gsurl[:10] == 'gs://cmip6'\n",
	" return gsurl[11:-1].split('/')\n",
	"\n",
	"def get_zdict(gsurl):\n",
	" ''' given a GCS zarr location, return a dictionary of keywords'''\n",
	" zid = get_zid(gsurl)\n",
	" keys = ['activity_id','institution_id','source_id','experiment_id','member_id','table_id','variable_id','grid_label']\n",
	" values = list(zid)\n",
	" return dict(zip(keys,values)) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"df_cloud = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv', dtype='unicode')\n",
	"\n",
	"fs = gcsfs.GCSFileSystem(token='anon', access='read_only')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Here we search the CMIP6 data for the datasets you need - using the same keywords as at the ESGF sites\n",
	"# https://esgf-node.llnl.gov/search/cmip6/\n",
	"\n",
	"search = {}\n",
	"search['table_id'] = 'Amon'\n",
	"search['experiment_id'] = ['historical','ssp370']\n",
	"search['variable_id'] = ['tas']\n",
	"search['institution_id'] = ['NOAA-GFDL']\n",
	" \n",
	"df_available = search_df(df_cloud, **search)\n",
	"\n",
	"print('number of matching datasets',len(df_available))\n",
	"df_available.zstore.values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"gsurls = df_available.zstore.values\n",
	"\n",
	"for gsurl in gsurls:\n",
	" print(gsurl)\n",
	" zdict = get_zdict(gsurl)\n",
	" ncdir = local_path + gsurl[10:]\n",
	" \n",
	" model = zdict['source_id']\n",
	" variable = zdict['variable_id']\n",
	" \n",
	" ncfiles = glob(f'{ncdir}{variable}*.nc')\n",
	" if len(ncfiles) > 0:\n",
	" print(ncfiles, 'already exists')\n",
	" continue\n",
	" \n",
	" ds = xr.open_zarr(fs.get_mapper(gsurl),consolidated=True)\n",
	"\n",
	" ncfile = f'{ncdir}{variable}.nc'\n",
	" os.system(f'mkdir -p {ncdir}')\n",
	" ds.to_netcdf(ncfile,mode='w',unlimited_dims='time') \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"! tree -L 9 files_nc"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "pangeo-Oct2019",
	"language": "python",
	"name": "pangeo-oct2019"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}