Tested with Kedro 0.17.1 and Hydra 1.0.6.
Author: Martin Sotir
Kedro is a lightweight library and project template designed for fast collaborative prototyping of data-driven pipelines. Kedro formalizes project configuration (parameters, credentials, data catalog) and pipeline definition and execution (from Python functions), and provides an extensible command-line entry point (a Python makefile of sorts).
Kedro can be used from a notebook, using a `KedroContext` object to access the project configuration and data catalog, or from the command line, to run registered pipelines (each defined as a list of functions with specified inputs, outputs, and parameters).
Kedro encodes data science and data pipeline good practices as code in the most minimal way: each feature is kept as simple and bare-bones as possible. Because of this "skeletal" shape, Kedro will not fit every research and production need, but it is very easy to appropriate and extend to your needs. For example, the Kedro pipeline feature is far less featured than most production-ready solutions (Prefect, Dagster, Airflow, etc.), but as long as the core principles are respected (the separation of concerns between data loading, processing, and pipeline execution, the data engineering conventions, etc.), adding features or transitioning to another technology will not be a headache.
For these reasons, I think Kedro is a good first step before building a custom in-house ML system or switching to a more production-ready solution with all the bells and whistles (e.g., MLRun, Metaflow, Kubeflow, or any combination of the booming MLOps tool ecosystem). I feel that Kedro is particularly suited for R&D teams working on many industrial ML pipeline prototypes (e.g., in R&D service firms), aiming to reuse code, disseminate data science best practices, and facilitate collaboration with product development teams, while staying as agnostic as possible about the technology used in the end.
A typical Kedro project configuration looks like this:
```
── conf
   ├── base
   │   ├── catalog.yml
   │   ├── logging.yml
   │   ├── parameters.yml
   │   └── experiment1
   │       └── parameters.yml
   └── local
       ├── catalog.yml
       ├── credentials.yml
       └── parameters.yml
```
- Any number of environments can be created (though only two environments are loaded at the same time: `base` + the active environment).
- Configuration files are loaded by a `ConfigLoader` class with a simple API: the configuration loader is first provided with a list of root configuration paths, usually the `base` and the current active environment (`local` by default). Then the `ConfigLoader` has a single `get(*patterns: List[str])` method, where `patterns` are used to match configuration files within the root directories (see the sketch after this list).
- The minimal configuration structure is enforced by the `KedroContext` class. This class loads the `catalog`, `credentials`, `logging` and `parameters` configuration files using semi-flexible file patterns. For instance, when loading parameters, the `KedroContext` class uses `conf_loader.get("parameters*", "parameters*/**", "**/parameters*")`: this matches `parameters.yml` but also `parameters.json`, `experiment1/parameters.yml`, `parameters/model1.toml`, etc. Files matched in a `get` query are merged: duplicated keys in the same environment will raise an error, while keys in the active environment take precedence over keys in the `base` environment. (Note: directory paths are not taken into account when loading the configuration; every configuration file is loaded as root-level config.)
- `catalog`, `credentials`, `logging` and `parameters` are not merged together: they are loaded at different times and often interpreted separately. The `logging` configuration is loaded when a `KedroSession` starts, the `catalog` and `credentials` configurations are loaded when the dataset catalog is required (to run a pipeline), and the `parameters` configuration is provided to pipeline nodes on demand (each node specifies which part of the parameter configuration it needs).
- Usually, pipeline nodes do not have direct access to information in the `catalog` and `credentials` configuration: this enforces a "separation of concerns" between nodes (concern: how to transform inputs into outputs) and catalog "dataset" entries (concern: how and where to load/write data).
- The configuration can be extended: Kedro applications and plugins have access to the `ConfigLoader` to load additional files. For instance, the kedro-mlflow plugin loads the MLflow configuration with `conf_loader.get("mlflow*")`.
- The Kedro config loader is based on anyconfig and can read .yaml, .json, .ini, .toml, .xml (and more) configuration files. Kedro also provides a `TemplatedConfigLoader`, extending the `ConfigLoader` with Jinja templating syntax (string interpolation, loops, etc.).
- Parameters can be overridden from the command line when running a pipeline, with the `kedro run --params key=value` syntax, or by passing a `dict` to the `extra_params` parameter of `session.create(...)` (e.g., from a notebook).
Note that Kedro project settings are not defined in the main `conf` directory.
In Kedro, there is no dedicated feature to easily switch between configuration options for "sub-modules" of the configuration (e.g., to switch between several predefined model or optimizer parameter sets). There are, however, several non-straightforward ways to achieve such a mechanism.
First, we can leverage Kedro's configuration environment feature. However, environments affect the whole configuration, and there is no support for nested/hierarchical environments in Kedro.
The more generic approach is usually to define mutually exclusive parameter sets in a dictionary or a list, in the same YAML file or spread across several YAML files (taking care not to create duplicated keys). An option is then selected dynamically at runtime, depending on another parameter from the configuration or on command-line arguments (handled by the `kedro run` command).
Example:
```yaml
# File: parameters.yml
model: xgboost  # default, can be overridden by the --params argument of `kedro run`

# File: parameters/models/xgboost.yml
xgboost:
  n_estimators: 10000
  learning_rate: 0.01
  max_depth: 6

# File: parameters/models/randomforest.yml
randomforest:
  n_estimators: 200
  max_depth: 8
  max_features: "sqrt"
```
When instantiating a `Pipeline` (or within a notebook):
```python
from kedro.framework.session import get_current_session
from kedro.pipeline import Pipeline, node

def make_pipeline():
    ctx = get_current_session().load_context()
    model = ctx.params['model']  # 'xgboost' or 'randomforest'
    return Pipeline([
        node(train,
             inputs={'dataset': 'train_set', 'model_config': f"params:{model}"},
             outputs=['report'])
    ])

def train(dataset, model_config):
    ...
```
(the syntax `params:<param_key>` is used to pass parameters to Kedro pipeline nodes).
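With this setup, the model parameter set can be switched without editing any YAML file, e.g. with `kedro run --params model=randomforest`, or by passing `extra_params={'model': 'randomforest'}` to `session.create(...)` from a notebook.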
Another option would be to take advantage of the Jinja templating capabilities of the `TemplatedConfigLoader`.
With a powerful configuration loader and auto-generated program entry point, Hydra focuses on increasing researcher productivity while encouraging experiment reproducibility.
Hydra seems targeted at researchers, especially ML researchers who may run thousands of trials for the same task, adjusting a set of hyperparameters each time. Hydra provides a powerful hierarchical configuration loader and automatically generates application entry points that allow scientists to override and discover hyperparameters.
Like Kedro, Hydra is extensible and offers functionality beyond configuration and entry points: remote runners, logging utilities, bridges to hyperparameter search tools, etc.
The scope of Hydra is narrower and quite different from Kedro's: Hydra has no notion of a data catalog, no pipelines, no credentials management, no imposed folder structure. This makes Hydra more flexible and less cumbersome than Kedro for projects where those features are not needed. Hydra seems an excellent tool for experimentation and for developing ML models in 'isolation', e.g., for specialized benchmarks with stable and well-defined input/output data.
Hydra configurations are hierarchical, with a single root configuration file for each application/entry point (`config.yaml` in the example below):
```
── conf
   ├── config.yaml
   ├── db
   │   ├── mysql.yaml
   │   └── postgresql.yaml
   ├── schema
   │   ├── school.yaml
   │   ├── support.yaml
   │   └── warehouse.yaml
   └── ui
       ├── full.yaml
       └── view.yaml
```
The Hydra configuration loader is built on OmegaConf (which shares its author with Hydra). OmegaConf extends the YAML format with conventions and features designed to simplify the definition of complex software configurations:
- By default, OmegaConf loads YAML configurations into Python built-in types (dicts and lists), but it also provides experimental support for "structured" configs, where parts of the configuration are instantiated as user-defined Python classes.
- OmegaConf provides variable references and value interpolation features (having a parameter value, or part of a parameter value, depend on another parameter value). These are very useful to avoid repeating values while keeping the configuration well structured (for instance, in deep learning, parameters like the batch size, number of epochs, and learning rate are often used by multiple entities: the training loop, optimizer, LR scheduler, etc.). String interpolation is also useful to build experiment and run names from configuration parameters without any extra logic in the code. OmegaConf string interpolation can also retrieve system environment variables (see the sketch after this list).
- Mandatory values and read-only configurations.
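To make the interpolation features concrete, here is a small OmegaConf sketch with hypothetical values (OmegaConf 2.0 syntax, the version used by Hydra 1.0):

```python
from omegaconf import OmegaConf

conf = OmegaConf.create("""
train:
  batch_size: 32
  epochs: 10
optimizer:
  batch_size: ${train.batch_size}  # value interpolation: reuse the training batch size
run_name: "bs${train.batch_size}_ep${train.epochs}"  # string interpolation
data_dir: ${env:DATA_DIR}  # resolved from a system environment variable
""")

assert conf.optimizer.batch_size == 32
assert conf.run_name == "bs32_ep10"
```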
On top of OmegaConf, Hydra adds the ability to compose configurations from multiple sources (leveraging the merge utilities in OmegaConf); a minimal entry-point sketch follows this list:

- Directories as mutually exclusive parameter sets (config groups)
- Defaults lists
- Scopes (packages)
- Configuration loading in entry points, with experimental support to load configuration programmatically (e.g., from a notebook)
- A configurable syntax to override parameters from the command line (+ autocompletion)
- Hydra settings and job configuration
- The ability to generate parameter lists and search spaces
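A minimal Hydra 1.0 entry point, assuming the `conf/` tree shown earlier, looks like this (the composed configuration is printed for illustration):

```python
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(config_path="conf", config_name="config")
def my_app(cfg: DictConfig) -> None:
    # cfg is the full composed configuration (defaults + command-line overrides)
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    my_app()
```

Group choices can then be overridden from the command line, e.g. `python my_app.py db=postgresql ui=view`, with autocompletion suggesting the available options.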
Beyond its capabilities, Hydra is interesting because of its attractiveness to researchers, whereas Kedro can be very frustrating because of its imposed (but well-meaning) structure and limitations.
When experiments grow, when they get closer to industrial applications, and when models become pipelines, it becomes useful to organise the configuration in a way that keeps complexity under control.
In theory, bringing Hydra configuration to Kedro could:

- Increase Kedro's usability for ML researchers.
- Facilitate the transition from Hydra projects to Kedro.
- Bring additional features to Kedro projects: for example, an extended syntax for parameter overrides and parameter auto-completion.
The first approach is to override the `ConfigLoader._load_config_file` method so that each configuration file is loaded as a Hydra config:
```python
from pathlib import Path
from typing import Any, Dict

from kedro.config import ConfigLoader

class HydraConfigLoaderMinimal(ConfigLoader):

    def _load_config_file(self, config_file: Path) -> Dict[str, Any]:
        from hydra.experimental import initialize_config_dir, compose
        from omegaconf import OmegaConf
        # Parse the file with Hydra, using its parent directory as config root:
        with initialize_config_dir(config_dir=str(config_file.parent), job_name="app"):
            conf = compose(config_name=config_file.name, overrides=[])
            resolved_conf = OmegaConf.to_container(conf, resolve=self.resolve_interpolation)
            # Keys prefixed with "_" are hidden from the final configuration:
            return {k: v for k, v in resolved_conf.items() if not k.startswith("_")}
```
This code works but has been simplified; check the recommended implementation here: hydra_config_loader_minimal.py.
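For reference, registering this loader in the project's `hooks.py` could look like the following sketch (mirroring the `register_config_loader` hook signature used in the full example later in this post):

```python
from typing import Any, Dict, Iterable

from kedro.config import ConfigLoader
from kedro.framework.hooks import hook_impl

class ProjectHooks:
    @hook_impl
    def register_config_loader(
        self, conf_paths: Iterable[str], env: str, extra_params: Dict[str, Any]
    ) -> ConfigLoader:
        # Use the Hydra-backed loader instead of the default ConfigLoader:
        return HydraConfigLoaderMinimal(conf_paths)
```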
//TODO
If your Kedro configuration uses the string interpolation features of the `TemplatedConfigLoader`, a few edits are needed. First, import the globals variables file (`globals.yml`) inside each configuration file using the Hydra defaults list directive:
```yaml
defaults:
  - globals.yml  # <-- import globals.yml variables

example_iris_data:
  type: pandas.CSVDataSet
  filepath: "${_directories.raw}/iris.csv"  # String interpolation from imported variables
```
Note: if you are using the `.yaml` file extension instead of `.yml`, you need to drop the file extension from the `globals.yml` entry: just set `- globals` (Hydra prefers the `.yaml` file extension).
After this change, the Hydra configuration loader will raise a warning directing us to set the `globals.yml` package scope explicitly. To suppress this warning, we just need to add the `# @package _global_` directive in `globals.yml`:
```yaml
# @package _global_
_directories:
  raw: "./data/01_raw"
  interim: "/tmp/02_interim"
  processed: "/data/03_processed"
```
Note that we prefix global keys with an underscore to hide them from the final configuration structure (globals variables are only used for string interpolation).
✅ Can be used as a drop-in replacement for the Kedro `ConfigLoader`: most existing configurations should load unchanged.
✅ Mutually exclusive configuration groups: https://hydra.cc/docs/terminology#config-group
✅ Adds support for OmegaConf interpolation patterns (including access to environment variables and the Hydra configuration).
❌ No easy way to specify Hydra configuration overrides from the `kedro run` command-line tool. Parameters provided with the `--params` option will override the final configuration parameters but won't impact Hydra group choices (nor interpolation). This means that there is no way to change Hydra group choices apart from editing the `defaults` lists in the YAML files.
Last-resort hack: a workaround, if you really want to apply override commands to *one* particular file in your configuration (use at your own risk). Add a `hydra_overrides` argument in `cli.py`:
```diff
 @click.option(
     "--params", type=str, default="", help=PARAMS_ARG_HELP, callback=_split_params
 )
+@click.argument('hydra_overrides', nargs=-1)
 def run(
     tag,
     env,
     parallel,
     runner,
     is_async,
     node_names,
     to_nodes,
     from_nodes,
     from_inputs,
     to_outputs,
     load_version,
     pipeline,
     config,
     params,
+    hydra_overrides,
 ):
```
All CLI arguments are registered in the Kedro session object and can be retrieved within the `HydraConfigLoaderMinimal`:
```diff
 def _load_config_file(self, config_file: Path, overrides: List[str] = []) -> Dict[str, Any]:
     from hydra.experimental import compose, initialize_config_dir
     from omegaconf import OmegaConf
     overrides = overrides + self.global_overrides
+    override_path_pattern = Path(self.conf_paths[0]) / 'parameters.yml'
+    session = get_current_session(silent=True)
+    if session and (config_file.resolve() == override_path_pattern.resolve()):
+        overrides.extend(session.store['cli']['params']['hydra_overrides'])
     with initialize_config_dir(config_dir=str(config_file.parent), job_name=self.job_name):
         ...
```
Here the Hydra overrides will be applied only to the `base/parameters.yml` file.
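With this change, Hydra-style overrides can be appended directly to the command, e.g. `kedro run model=randomforest` (reusing the `model` key from the parameter files shown earlier); again, they will only affect `base/parameters.yml`.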
❌ Does not work well with Kedro config environments, as each file is parsed by Hydra independently.
❌ Powerful but complex: with several environments, each of which can have several root configuration files, themselves parsed as Hydra configurations that can have nested parameter group files, we may have created a monster!
This time we load the Kedro configuration from a single Hydra root configuration file.
The trick to make the Hydra configuration work with Kedro glob path patterns (the `conf_loader.get("parameters/**")` syntax) is to convert keys in the configuration into paths. For instance, the configuration entry `conf['catalog']['iris']['filepath']` is associated with the path `catalog/iris/filepath` and will be returned by the `.get("catalog/**")` call. This approach should remain compatible with most Kedro features and plugins.
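To make the key-to-path idea concrete, here is a hypothetical helper (not the actual loader implementation) that flattens nested keys into slash-separated paths and matches them with glob-style patterns:

```python
from fnmatch import fnmatch
from typing import Any, Dict

def flatten(conf: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
    """Map nested configuration keys to slash-separated paths."""
    flat: Dict[str, Any] = {}
    for key, value in conf.items():
        path = f"{prefix}{key}"
        if isinstance(value, dict):
            flat.update(flatten(value, prefix=f"{path}/"))
        else:
            flat[path] = value
    return flat

conf = {"catalog": {"iris": {"filepath": "data/iris.csv"}}}
flat = flatten(conf)
# {'catalog/iris/filepath': 'data/iris.csv'} matches the "catalog/**" pattern:
assert all(fnmatch(path, "catalog/**") for path in flat)
```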
We must first convert our Kedro configuration into a Hydra-compatible one:

- The catalog entries must be set inside a 'catalog' root dictionary in the Hydra configuration (the format remains the same as for Kedro: https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html). To keep a separate `catalog.yaml` file, we can leverage the Hydra defaults list feature (see the example below).
- The same applies to 'parameters', 'logging', 'credentials', and any other Kedro root configuration files.
- The Kedro configuration environment system is completely replaced by a Hydra parameter group 'env'. A configuration file must be defined for each environment (except 'base') in the 'env' directory. These files can override any setting from other configuration files.
For instance:

- `conf/config.yaml` (single root configuration file):

```yaml
defaults:  # Ordered list of parameter groups (order is important for overrides).
  - catalog
  - parameters
  - _self_  # This file's configuration
  - env: local  # Last, loads environment overrides (local by default)

directories:
  raw: "./data/01_raw"
  interim: "/tmp/02_interim"
  processed: "/data/03_processed"
```

- `conf/catalog.yaml` (note how we set the package scope with the `# @package <scope>` directive):

```yaml
# @package catalog
example_iris_data:
  type: pandas.CSVDataSet
  filepath: "${directories.raw}/iris.csv"
```

- `conf/env/local.yaml` (this time using a global scope):

```yaml
# @package _global_

# Override parameters:
parameters:
  example_test_data_ratio: 0.5

# Override catalog entry configuration:
catalog:
  example_iris_data2:
    filepath: "../data/iris.csv"
```
Notes:
- Hydra works better with the `.yaml` file extension (rather than `.yml`). When using an extension other than `.yaml`, the file extension must be set explicitly in defaults lists (otherwise Hydra throws a "Could not load ..." error).
- Patterns given to the `get` method can also match nested keys in the Hydra configuration. The depth of the lookup can be controlled with the `lookup_depth` parameter (1 by default).
- Root keys are not included in the returned configuration dict (the 'parameters' and 'catalog' keys will not appear in the configuration; only sub-dictionary keys and values are returned). This means that any parameter defined in the root configuration file cannot be accessed directly using the `.get(*patterns)` method; however, these values can still be used for string interpolation within other configuration files. The root configuration file can then replace the `globals.yml` file from the Kedro `TemplatedConfigLoader` class.
Next, we just need to register the Hydra config loader in `hooks.py`, adding the env variable to the Hydra overrides list:
```python
@hook_impl
def register_config_loader(
    self, conf_paths: Iterable[str], env: str, extra_params: Dict[str, Any]
) -> ConfigLoader:
    conf_root = Path(list(conf_paths)[0]).parent
    return FullHydraConfigLoader(conf_root=conf_root, overrides=[f'env={env}'])
```
First, make sure that autocompletion is enabled in your shell. For bash, we need to add the line `eval "$(_KEDRO_COMPLETE=source kedro)"` to `.bashrc` (refer to the Kedro documentation for other shells).
Next <>
✅ Simpler configuration structure, without environment directories
✅ One unique Hydra configuration
✅ Minimal edits to the configuration; separation of concerns preserved
❌ Configuration not backward compatible with the Kedro `ConfigLoader`
❌ Error-prone translation of path patterns