diff --git a/declearn/quickrun/__init__.py b/declearn/quickrun/__init__.py index 2bcb6b0f4e2730f0329a4db6d307b56d450d388c..d3686d55f54423bdb63ebae505f63102a54ad83f 100644 --- a/declearn/quickrun/__init__.py +++ b/declearn/quickrun/__init__.py @@ -15,6 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Script to quickly run example locally using declearn""" +"""Script to quickly run a simulated FL example locally using declearn.""" from .run import quickrun diff --git a/declearn/quickrun/_config.py b/declearn/quickrun/_config.py index ad32be23e20cbd96cd80c3cf7e9fd035111bbf56..52e7c31aca0f308089e8e0bb1b1e6266e868d66a 100644 --- a/declearn/quickrun/_config.py +++ b/declearn/quickrun/_config.py @@ -24,16 +24,15 @@ from declearn.metrics import MetricInputType, MetricSet from declearn.utils import TomlConfig __all__ = [ - "ModelConfig", "DataSourceConfig", "ExperimentConfig", + "ModelConfig", ] @dataclasses.dataclass class ModelConfig(TomlConfig): - """Dataclass used to provide custom model location and - class name""" + """Dataclass used to provide custom model location and class name.""" model_file: Optional[str] = None model_name: str = "MyModel" @@ -41,8 +40,7 @@ class ModelConfig(TomlConfig): @dataclasses.dataclass class DataSourceConfig(TomlConfig): - """Dataclass associated with the functions - declearn.quickrun._parser:parse_data_folder + """Dataclass associated with the quickrun's `parse_data_folder` function. data_folder: str Absolute path to the to the main folder hosting the data. @@ -52,7 +50,7 @@ class DataSourceConfig(TomlConfig): dataset_names: dict or None Dict of custom dataset names, to look for in each client folder. Expect 'train_data, train_target, valid_data, valid_target' as keys. - If None, , default to expected prefix search. + If None, default to expected prefix search. 
""" data_folder: Optional[str] = None @@ -62,11 +60,7 @@ class DataSourceConfig(TomlConfig): @dataclasses.dataclass class ExperimentConfig(TomlConfig): - """ - - Dataclass providing kwargs to - declearn.main._server.FederatedServer - and declearn.main._client.FederatedClient + """Dataclass providing kwargs to `FederatedServer` and `FederatedClient`. metrics: list[str] or None List of Metric childclass names, defining evaluation metrics diff --git a/declearn/quickrun/_parser.py b/declearn/quickrun/_parser.py index 34e30feb2e9f528f5929bde8e44fb6ac8fab4147..422d0999d6625b0959ebbd0931ec0d841f13d88d 100644 --- a/declearn/quickrun/_parser.py +++ b/declearn/quickrun/_parser.py @@ -15,28 +15,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Utils parsing a data folder following a standard format into a nested -dictionnary -""" +"""Util to parse the contents of a data folder into a nested dict of paths.""" import os from pathlib import Path -from typing import Any, Dict, Optional +from typing import Dict, List, Optional from declearn.quickrun._config import DataSourceConfig -# pylint: disable=too-many-arguments,too-many-branches,too-many-locals + +__all__ = [ + "parse_data_folder", +] def parse_data_folder( data_config: DataSourceConfig, folder: Optional[str] = None, -) -> Dict: - """Utils parsing a data folder following a standard format into a nested - dictionnary. +) -> Dict[str, Dict[str, str]]: + """Parse the contents of a data folder into a nested dict of file paths. - The default expected format is : + This function expects the folder to abide by the following standard: folder/ └─── data*/ @@ -48,96 +47,148 @@ def parse_data_folder( └─── client*/ │ ... - Parameters: - ----------- - data_config : DataSourceConfig - DataSourceConfig instance, see class documentation for details. 
- folder : str or None + Parameters + ---------- + data_config: DataSourceConfig + DataSourceConfig instance; see its documentation for details. + folder: str or None The main experiment folder in which to look for a `data*` folder. - Overwritten by data_folder. - """ - - data_folder = data_config.data_folder - client_names = data_config.client_names - dataset_names = data_config.dataset_names + Overridden by `data_config.data_folder` when specified. - if not folder and not data_folder: - raise ValueError( - "Please provide either a parent folder or a data folder" - ) - # Data_folder - if not data_folder: - gen_folders = Path(folder).glob("data*") # type: ignore - data_folder = next(gen_folders, False) # type: ignore - if not data_folder: - raise ValueError( - f"No folder starting with 'data' found in {folder}. " - "Please store your split data under a 'data_*' folder. " - "To use an example dataset run `declearn-split` first." - ) - if next(gen_folders, False): - raise ValueError( - "More than one folder starting with 'data' found" - f"in {folder}. Please store your data under a single" - "parent folder" - ) - else: - if os.path.isdir(data_folder): - data_folder = Path(data_folder) # type: ignore - else: - raise ValueError( - f"{data_folder} is not a valid path. To use an example " - "dataset run `declearn-split` first." 
- ) - # Get clients dir - if client_names: - if isinstance(client_names, list): - valid_names = [ - os.path.isdir(os.path.join(data_folder, n)) - for n in client_names - ] - if sum(valid_names) != len(client_names): - raise ValueError( - "Not all provided client names could be found in" - f"{data_folder}" - ) - clients = { - n: {} for n in client_names - } # type: Dict[str,Dict[Any,Any]] - else: - raise ValueError( - "Please provide a valid list of client names for " - "argument 'client_names'" - ) - else: - clients = {c: {} for c in next(os.walk(data_folder))[1]} - # Get train and valid files + Returns + ------- + paths: + Nested dictionary containing the parsed file paths, with structure + `{client_name: {file_key_name: file_path}}`, where the key names + are always the same: "train_data", "train_target", "valid_data" + and "valid_target". + """ + # Identify the root data folder. + data_folder = get_data_folder_path(data_config.data_folder, folder) + # Identify clients' data folders. + client_names = list_client_names(data_folder, data_config.client_names) + clients = {c: {} for c in client_names} # type: Dict[str, Dict[str, str]] + # Set up a mapping between expected files and their naming. data_items = [ "train_data", "train_target", "valid_data", "valid_target", ] + dataset_names = data_config.dataset_names if dataset_names: - if set(data_items) != set(dataset_names.keys()): + if set(data_items) != dataset_names.keys(): raise ValueError( - f"Please provide a properly formatted dictionnary as input" - f"using the following keys : {str(data_items)}" + "Please provide a properly formatted dictionnary as input, " + f"using the following keys: {data_items}" ) else: - dataset_names = {i: i for i in data_items} - for client, files in clients.items(): # type: ignore + dataset_names = {name: name for name in data_items} + # Gather client-wise file paths.
+ for client, paths in clients.items(): + client_dir = data_folder.joinpath(client) for key, val in dataset_names.items(): - filepath = Path(data_folder / client) # type: ignore - gen_file = filepath.glob(f"{val}*") - file = next(gen_file, False) - if not file: + files = [p for p in client_dir.glob(f"{val}*") if p.is_file()] + if not files: raise ValueError( - f"Could not find a file named '{val}.*' in {client}" + f"Could not find a '{val}.*' file for client '{client}'." ) - if next(gen_file, False): + if len(files) > 1: raise ValueError( - f"Found more than one file named '{val}.*' in {client}" + f"Found multiple '{val}.*' files for client '{client}'." ) - files[key] = str(file) + paths[key] = files[0].as_posix() + # Return the nested dictionary of parsed file paths. return clients + + +def get_data_folder_path( + data_folder: Optional[str], + root_folder: Optional[str], +) -> Path: + """Return the path to a data folder. + + Parameters + ---------- + data_folder: + Optional user-specified data folder. + root_folder: + Root folder, under which to look up a 'data*' folder. + Unused if `data_folder` is not None. + + Returns + ------- + dirpath: + pathlib.Path wrapping the path to the identified data folder. + + Raises + ------ + ValueError + If the input arguments point to non-existing folders, or a data + folder cannot be unambiguously found under the root folder. + """ + # Case when a data folder is explicitly designated. + if isinstance(data_folder, str): + if os.path.isdir(data_folder): + return Path(data_folder) + raise ValueError( + f"{data_folder} is not a valid path. To use an example " + "dataset, run `declearn-split` first." + ) + # Case when working from a root folder. + if not isinstance(root_folder, str): + raise ValueError( + "Please provide either a data folder or its parent folder." + ) + folders = list(Path(root_folder).glob("data*")) + if not folders: + raise ValueError( + f"No folder starting with 'data' found under {root_folder}.
" + "Please store your split data under a 'data_*' folder. " + "To use an example dataset, run `declearn-split` first." + ) + if len(folders) > 1: + raise ValueError( + "More than one folder starting with 'data' found under " + f"{root_folder}. Please store your data under a single " + "parent folder, or specify the target data folder." + ) + return folders[0] + + +def list_client_names( + data_folder: Path, + client_names: Optional[List[str]], +) -> List[str]: + """List client-wise subdirectories under a data folder. + + Parameters + ---------- + data_folder: + `pathlib.Path` designating the main data folder. + client_names: + Optional list of clients to restrict the outputs to. + + Raises + ------ + ValueError + If `client_names` is of improper type, or lists names that cannot + be found under `data_folder`. + """ + # Case when client names are provided: verify that they can be found. + if client_names: + if not isinstance(client_names, list): + raise ValueError( + "Please provide a valid list of client names for " + "argument 'client_names'" + ) + if not all( + data_folder.joinpath(name).is_dir() for name in client_names + ): + raise ValueError( + "Not all provided client names could be found under " + f"{data_folder}" + ) + return client_names.copy() + # Otherwise, list subdirectories of the data folder. + return [path.name for path in data_folder.iterdir() if path.is_dir()] diff --git a/declearn/quickrun/run.py b/declearn/quickrun/run.py index 21cb3a3653c4930b87f7d97a8c7a782e04bbf40c..293cf5297c9b9c8f3236c5a1bef9d50f8c4a1089 100644 --- a/declearn/quickrun/run.py +++ b/declearn/quickrun/run.py @@ -15,14 +15,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Script to quickly run example locally using declearn. +"""Script to quickly run a simulated FL example locally using declearn.
The script requires to be provided with the path to a folder containing: -* A declearn model -* A TOML file with all the elements required to configurate an FL experiment -* A data folder, structured in a specific way +* A python file in which a declearn model is instantiated (in main scope). +* A TOML file with all the elements required to configure an FL experiment. +* A data folder, structured in a specific way. If not provided with this, the script defaults to the MNIST example provided by declearn in `declearn.example.quickrun`. @@ -149,9 +148,7 @@ def get_toml_folder(config: str) -> Tuple[str, str]: * The path to the TOML config file * The path to the main folder of the experiment """ - # default to the mnist example config = os.path.abspath(config) - # check if config is TOML or dir if os.path.isfile(config): toml = config folder = os.path.dirname(config) @@ -184,24 +181,42 @@ def server_to_client_network( def quickrun(config: str) -> None: - """ - Run a server and its clients using multiprocessing. + """Run a server and its clients using multiprocessing. - The script requires to be provided with the path a TOML file + The script requires to be provided with the path to a TOML file with all the elements required to configurate an FL experiment, or the path to a folder containing : - * a TOML file with all the elements required to configurate an - FL experiment - * A declearn model - * A data folder, structured in a specific way - - Parameters: - ---- + * A TOML file with all the elements required to configure an FL experiment. + * A python file in which a declearn model is instantiated (in main scope). + * A data folder, structured in a specific way: + folder/ + [client_a]/ + train_data.(csv|npy|sparse|svmlight) + train_target.(csv|npy|sparse|svmlight) + valid_data.(csv|npy|sparse|svmlight) + valid_target.(csv|npy|sparse|svmlight) + [client_b]/ + ... + ... 
+ + Parameters + ---------- config: str Path to either a toml file or a properly formatted folder - containing the elements required to launch an FL experiment - + containing the elements required to launch the experiment. + + Notes + ----- + - The data folder structure may be obtained by using the `declearn-split` + commandline entry-point, or the `declearn.dataset.split_data` util. + - The quickrun mode works by simulating a federated learning process, where + all clients operate under parallel python processes, and communicate over + the localhost using un-encrypted websockets communications. + - When run without any argument, this script/function operates on a basic + MNIST example, for demonstration purposes. + - You may refer to a more detailed MNIST example on our GitLab repository. + See the `examples/mnist_quickrun` folder. """ # main script; pylint: disable=too-many-locals toml, folder = get_toml_folder(config) @@ -235,7 +250,7 @@ def quickrun(config: str) -> None: def main() -> None: - """Fire-wrapped quickrun""" + """Fire-wrapped `quickrun`.""" fire.Fire(quickrun) diff --git a/declearn/utils/_toml_config.py b/declearn/utils/_toml_config.py index 476d960e9da8ed3dc23510cc7a11dad082176b22..012a2255d09a48b58314e5dc2ddc789255b4754b 100644 --- a/declearn/utils/_toml_config.py +++ b/declearn/utils/_toml_config.py @@ -321,11 +321,11 @@ class TomlConfig: warn_user: bool, default=True Boolean indicating whether to raise a warning when some fields are unused. Useful for cases where unused fields are - expected, e.g. quickrun. + expected, e.g. in declearn-quickrun mode. use_section: optional(str), default=None If not None, points to a specific section of the TOML that should be used, rather than the whole file. Useful to parse - orchestrating TOML files, e.g. quickrun. + orchestrating TOML files, e.g. in declearn-quickrun mode. 
section_fail_ok: bool, default=False If True, allow the section specified in use_section to be missing from the TOML file without raising an Error.