Source code for fedjax.training.federated_experiment

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Federated experiment manager."""

import abc
import os.path
import time
from typing import Any, Mapping, NamedTuple, Optional, Sequence, Tuple

from absl import logging
from fedjax.core import client_datasets
from fedjax.core import client_samplers
from fedjax.core import federated_algorithm
from fedjax.core import federated_data
from fedjax.core import models
from fedjax.core import util
from fedjax.training import checkpoint
from fedjax.training import logging as fedjax_logging
import jax.numpy as jnp

tf = util.import_tf()


def set_tf_cpu_only():
  """Restricts TensorFlow device visibility to only CPU.

  TensorFlow is only used for data loading, so we prevent it from allocating
  GPU/TPU memory.
  """
  tf.config.experimental.set_visible_devices([], 'GPU')
  tf.config.experimental.set_visible_devices([], 'TPU')
class EvaluationFn(metaclass=abc.ABCMeta):
  """Evaluation function that is only fed state at every call.

  Typically used for full evaluation or evaluation on sampled clients from a
  test set.
  """

  @abc.abstractmethod
  def __call__(self, state: federated_algorithm.ServerState,
               round_num: int) -> Mapping[str, jnp.ndarray]:
    """Runs evaluation."""
class ModelSampleClientsEvaluationFn(EvaluationFn):
  """Evaluation on sampled clients using the centralized model.

  The state to be evaluated must contain a params field.
  """

  def __init__(self, client_sampler: client_samplers.ClientSampler,
               model: models.Model,
               batch_hparams: client_datasets.PaddedBatchHParams):
    self._client_sampler = client_sampler
    self._model = model
    self._batch_hparams = batch_hparams

  def __call__(self, state: federated_algorithm.ServerState,
               round_num: int) -> Mapping[str, jnp.ndarray]:
    params = state.params
    self._client_sampler.set_round_num(round_num)
    clients = self._client_sampler.sample()
    batches = client_datasets.padded_batch_client_datasets(
        [client_dataset for _, client_dataset, _ in clients],
        self._batch_hparams)
    return models.evaluate_model(self._model, params, batches)  # pytype: disable=wrong-arg-types  # jax-ndarray
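For illustration, a minimal construction sketch (not part of the module source). The names `test_fd` (a test-split `federated_data.FederatedData`) and `model` (a `models.Model`), along with the sampler choice and batch size, are assumptions made for the example:

    # Hypothetical setup: evaluate the server params on 16 test clients,
    # re-sampled each time the evaluation function is called.
    test_client_sampler = client_samplers.UniformGetClientSampler(
        test_fd, num_clients=16, seed=0)
    sampled_test_eval_fn = ModelSampleClientsEvaluationFn(
        client_sampler=test_client_sampler,
        model=model,
        batch_hparams=client_datasets.PaddedBatchHParams(batch_size=128))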
class ModelFullEvaluationFn(EvaluationFn):
  """Evaluation on an entire federated dataset using the centralized model."""

  def __init__(self, fd: federated_data.FederatedData, model: models.Model,
               batch_hparams: client_datasets.PaddedBatchHParams):
    self._fd = fd
    self._model = model
    self._batch_hparams = batch_hparams

  def __call__(self, state: federated_algorithm.ServerState,
               round_num: int) -> Mapping[str, jnp.ndarray]:
    del round_num
    params = state.params
    batches = federated_data.padded_batch_federated_data(
        self._fd, self._batch_hparams)
    return models.evaluate_model(self._model, params, batches)  # pytype: disable=wrong-arg-types  # jax-ndarray
class TrainClientsEvaluationFn(metaclass=abc.ABCMeta):
  """Evaluation function that is fed training clients at every call.

  Typically used for evaluation on the training clients used in a step.
  """

  @abc.abstractmethod
  def __call__(
      self, state: federated_algorithm.ServerState, round_num: int,
      train_clients: Sequence[Tuple[federated_data.ClientId,
                                    client_datasets.ClientDataset, Any]]
  ) -> Mapping[str, jnp.ndarray]:
    """Runs evaluation."""
class ModelTrainClientsEvaluationFn(TrainClientsEvaluationFn):
  """Evaluation on training clients using the centralized model.

  The state to be evaluated must contain a params field.
  """

  def __init__(self, model: models.Model,
               batch_hparams: client_datasets.PaddedBatchHParams):
    self._model = model
    self._batch_hparams = batch_hparams

  def __call__(
      self, state: federated_algorithm.ServerState, round_num: int,
      train_clients: Sequence[Tuple[federated_data.ClientId,
                                    client_datasets.ClientDataset, Any]]
  ) -> Mapping[str, jnp.ndarray]:
    del round_num
    params = state.params
    batches = client_datasets.padded_batch_client_datasets(
        [client_dataset for _, client_dataset, _ in train_clients],
        self._batch_hparams)
    return models.evaluate_model(self._model, params, batches)  # pytype: disable=wrong-arg-types  # jax-ndarray
class FederatedExperimentConfig(NamedTuple):
  """Common configurations of a federated experiment.

  Attributes:
    root_dir: Root directory for experiment outputs (e.g. metrics).
    num_rounds: Number of federated training rounds.
    checkpoint_frequency: Checkpoint frequency in rounds. If <= 0, no
      checkpointing is done.
    num_checkpoints_to_keep: Maximum number of checkpoints to keep.
    eval_frequency: Evaluation frequency in rounds. If <= 0, no evaluation is
      done.
  """
  root_dir: str
  num_rounds: int
  checkpoint_frequency: int = 0
  num_checkpoints_to_keep: int = 1
  eval_frequency: int = 0
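For illustration, a configuration for a modest experiment might look like the following (the directory and round counts are arbitrary example values, not part of the module source):

    config = FederatedExperimentConfig(
        root_dir='/tmp/federated_experiment',
        num_rounds=1500,
        checkpoint_frequency=100,
        num_checkpoints_to_keep=2,
        eval_frequency=10)

With these values, periodic evaluation runs every 10 rounds, a checkpoint is written every 100 rounds, and only the 2 most recent checkpoints are kept.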
def run_federated_experiment(
    algorithm: federated_algorithm.FederatedAlgorithm,
    init_state: federated_algorithm.ServerState,
    client_sampler: client_samplers.ClientSampler,
    config: FederatedExperimentConfig,
    periodic_eval_fn_map: Optional[Mapping[str, Any]] = None,
    final_eval_fn_map: Optional[Mapping[str, EvaluationFn]] = None
) -> federated_algorithm.ServerState:
  """Runs the training loop of a federated algorithm experiment.

  Args:
    algorithm: Federated algorithm to use.
    init_state: Initial server state.
    client_sampler: Sampler for training clients.
    config: FederatedExperimentConfig configurations.
    periodic_eval_fn_map: Mapping of name to evaluation functions that are run
      repeatedly over multiple federated training rounds. The frequency is
      defined in `FederatedExperimentConfig.eval_frequency`.
    final_eval_fn_map: Mapping of name to evaluation functions that are run at
      the very end of federated training. Typically, full test evaluation
      functions will be set here.

  Returns:
    Final state of the input federated algorithm after training.
  """
  if config.root_dir:
    tf.io.gfile.makedirs(config.root_dir)

  if periodic_eval_fn_map is None:
    periodic_eval_fn_map = {}
  if final_eval_fn_map is None:
    final_eval_fn_map = {}

  logger = fedjax_logging.Logger(config.root_dir)

  latest = checkpoint.load_latest_checkpoint(config.root_dir)
  if latest:
    state, last_round_num = latest
    start_round_num = last_round_num + 1
  else:
    state = init_state
    start_round_num = 1

  client_sampler.set_round_num(start_round_num)
  start = time.time()
  for round_num in range(start_round_num, config.num_rounds + 1):
    # Get a random state and randomly sample clients.
    clients = client_sampler.sample()
    client_ids = [i[0] for i in clients]
    logging.info('round_num %d: client_ids = %s', round_num, client_ids)

    # Run one round of the algorithm, where bulk of the work happens.
    state, _ = algorithm.apply(state, clients)

    # Save checkpoint.
    should_save_checkpoint = config.checkpoint_frequency and (
        round_num == start_round_num or
        round_num % config.checkpoint_frequency == 0)
    if should_save_checkpoint:
      checkpoint.save_checkpoint(config.root_dir, state, round_num,
                                 config.num_checkpoints_to_keep)

    # Run evaluation.
    should_run_eval = config.eval_frequency and (
        round_num == start_round_num or
        round_num % config.eval_frequency == 0)
    if should_run_eval:
      start_periodic_eval = time.time()
      for eval_name, eval_fn in periodic_eval_fn_map.items():
        if isinstance(eval_fn, EvaluationFn):
          metrics = eval_fn(state, round_num)
        elif isinstance(eval_fn, TrainClientsEvaluationFn):
          metrics = eval_fn(state, round_num, clients)
        else:
          raise ValueError(f'Invalid eval_fn type {type(eval_fn)}')
        if metrics:
          for metric_name, metric_value in metrics.items():
            logger.log(eval_name, metric_name, metric_value, round_num)
      logger.log('.', 'periodic_eval_duration_sec',
                 time.time() - start_periodic_eval, round_num)

    # Log the time it takes per round. Rough approximation since we're not
    # using DeviceArray.block_until_ready()
    logger.log('.', 'mean_round_duration_sec',
               (time.time() - start) / (round_num + 1 - start_round_num),
               round_num)

  # Block until previous work has finished.
  jnp.zeros([]).block_until_ready()

  # Logging overall time it took.
  num_rounds = config.num_rounds - start_round_num + 1
  mean_round_duration = ((time.time() - start) / num_rounds
                         if num_rounds > 0 else 0)

  # Final evaluation.
  final_eval_start = time.time()
  for eval_name, eval_fn in final_eval_fn_map.items():
    metrics = eval_fn(state, round_num)
    if metrics:
      metrics_path = os.path.join(config.root_dir, f'{eval_name}.tsv')
      with tf.io.gfile.GFile(metrics_path, 'w') as f:
        f.write('\t'.join(metrics.keys()) + '\n')
        f.write('\t'.join([str(v) for v in metrics.values()]))
  # DeviceArray.block_until_ready() isn't needed here since we write to file.
  final_eval_duration = time.time() - final_eval_start

  logging.info('mean_round_duration = %f sec.', mean_round_duration)
  logging.info('final_eval_duration = %f sec.', final_eval_duration)
  return state
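For illustration, a minimal end-to-end sketch of how this module is typically wired together. This is not part of the module source: `train_fd` and `test_fd` (`FederatedData` splits), `model` (a `models.Model`), and `algorithm` (e.g. a federated averaging `FederatedAlgorithm`) are assumed to have been constructed elsewhere, and all hyperparameter values are arbitrary:

    import jax

    set_tf_cpu_only()  # Keep TensorFlow data loading off the accelerators.

    batch_hparams = client_datasets.PaddedBatchHParams(batch_size=128)
    config = FederatedExperimentConfig(
        root_dir='/tmp/experiment', num_rounds=100,
        checkpoint_frequency=25, eval_frequency=5)

    periodic_eval_fn_map = {
        # Evaluated on the clients used for training in each evaluated round.
        'train_clients': ModelTrainClientsEvaluationFn(model, batch_hparams),
        # Evaluated on freshly sampled test clients.
        'sampled_test_clients': ModelSampleClientsEvaluationFn(
            client_samplers.UniformGetClientSampler(
                test_fd, num_clients=16, seed=1),
            model, batch_hparams),
    }
    final_eval_fn_map = {
        'full_test': ModelFullEvaluationFn(test_fd, model, batch_hparams),
    }

    final_state = run_federated_experiment(
        algorithm=algorithm,
        init_state=algorithm.init(model.init(jax.random.PRNGKey(0))),
        client_sampler=client_samplers.UniformGetClientSampler(
            train_fd, num_clients=10, seed=0),
        config=config,
        periodic_eval_fn_map=periodic_eval_fn_map,
        final_eval_fn_map=final_eval_fn_map)

Per-round and evaluation metrics are then written under `config.root_dir`, and final evaluation results are saved as TSV files named after the keys of `final_eval_fn_map`.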