martindurant · July 10, 2018 21:06 · martindurant · Jul 10, 2018
diff --git a/cache.yaml b/cache.yaml

 # Data source entries, keyed by source name.
 sources:
  # CSV source whose `urlpath` is a glob pattern (several remote files).
  glob_source:
    description: glob of files
    driver: csv
    cache:
      # NOTE(review): presumably the consumer replaces the part of the
      # `urlpath` argument matched by `regex` with `sub` — confirm.
      - argkey: urlpath
        regex: 's3://bucket'
        # Quoted so the untemplated file is still valid YAML: an unquoted
        # `{{ ... }}` is read as a nested flow mapping, not as a string.
        sub: '{{ CACHE_DIR }}'
    args:
      urlpath: 's3://bucket/example*.csv'
  # Single NetCDF file; per its description it cannot be read directly from
  # the remote location.
  single_file:
    description: a file that can't be read directly from remote
    driver: netcdf
    cache:
      - argkey: urlpath
        regex: 's3://bucket'
        # Quoted so the untemplated file is still valid YAML: an unquoted
        # `{{ ... }}` is read as a nested flow mapping, not as a string.
        sub: '{{ CACHE_DIR }}'
        # NOTE(review): presumably means the local copy is mandatory before
        # the driver is invoked — confirm against the consumer.
        required: true
    args:
      urlpath: 's3://bucket/data.nc'
      chunks: {x: 50}
  # Parquet dataset with a known directory layout; the member files are
  # enumerated explicitly under `files`.
  nested:
    description: known data tree
    driver: parquet
    cache:
      - argkey: urlpath
        regex: 's3://bucket/data.parquet'
        # Quoted so the untemplated file is still valid YAML: an unquoted
        # `{{ ... }}` is read as a nested flow mapping, not as a string.
        sub: '{{ CACHE_DIR }}'
        # Glob patterns are quoted: a plain scalar starting with `*` would be
        # parsed as a YAML alias.
        # NOTE(review): presumably resolved relative to the matched
        # `urlpath` — confirm.
        files:
          - '_metadata'
          - '*/cat*/part.*.parquet'
    args:
      urlpath: 's3://bucket/data.parquet'
  # Zarr store whose nesting depth is not listed explicitly; `depth` bounds
  # how many levels of globbing are attempted.
  complex:
    description: any number of levels
    driver: zarr
    cache:
      - argkey: urlpath
        # The pattern has to match the `urlpath` given under `args`, as it
        # does in the sibling entries; a pattern naming a different
        # scheme/path can never match it.
        regex: 's3://bucket/data.zarr'
        depth: 3  # levels of globbing to try
        # Quoted so the untemplated file is still valid YAML: an unquoted
        # `{{ ... }}` is read as a nested flow mapping, not as a string.
        sub: '{{ CACHE_DIR }}'
    args:
      urlpath: 's3://bucket/data.zarr'

 # NOTE(review): this stanza repeats the `sources` mapping that already appears
 # earlier in the file. A YAML mapping may define a key only once, so keep a
 # single copy when turning this into a real catalog file.
 #
 # Indentation uses spaces only; tab characters are not allowed for YAML
 # indentation and the nesting below cannot be expressed without it.
 sources:
  # CSV source whose `urlpath` is a glob pattern (several remote files).
  glob_source:
    description: glob of files
    driver: csv
    cache:
      # NOTE(review): presumably the consumer replaces the part of the
      # `urlpath` argument matched by `regex` with `sub` — confirm.
      - argkey: urlpath
        regex: 's3://bucket'
        # Templates are quoted throughout: an unquoted `{{ ... }}` is read by
        # a YAML parser as a nested flow mapping, not as a string.
        sub: '{{ CACHE_DIR }}'
    args:
      urlpath: 's3://bucket/example*.csv'
  # Single NetCDF file; per its description it cannot be read directly from
  # the remote location.
  single_file:
    description: a file that can't be read directly from remote
    driver: netcdf
    cache:
      - argkey: urlpath
        regex: 's3://bucket'
        sub: '{{ CACHE_DIR }}'
        # NOTE(review): presumably means the local copy is mandatory before
        # the driver is invoked — confirm against the consumer.
        required: true
    args:
      urlpath: 's3://bucket/data.nc'
      chunks: {x: 50}
  # Parquet dataset with a known directory layout; the member files are
  # enumerated explicitly under `files`.
  nested:
    description: known data tree
    driver: parquet
    cache:
      - argkey: urlpath
        regex: 's3://bucket/data.parquet'
        sub: '{{ CACHE_DIR }}'
        files:
          - '_metadata'
          # Wildcards restored to agree with the indented copy of this entry
          # earlier in the file; they had been dropped from this copy.
          - '*/cat*/part.*.parquet'
    args:
      urlpath: 's3://bucket/data.parquet'
  # Zarr store whose nesting depth is not listed explicitly; `depth` bounds
  # how many levels of globbing are attempted.
  complex:
    description: any number of levels
    driver: zarr
    cache:
      - argkey: urlpath
        # The pattern has to match the `urlpath` given under `args`, as it
        # does in the sibling entries; a pattern naming a different
        # scheme/path can never match it.
        regex: 's3://bucket/data.zarr'
        depth: 3  # levels of globbing to try
        sub: '{{ CACHE_DIR }}'
    args:
      urlpath: 's3://bucket/data.zarr'