# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from abc import ABC, abstractmethod
from typing import Callable, List, Optional, Union
from nvflare.apis.client import Client
from nvflare.apis.controller_spec import ClientTask, OperatorMethod, Task, TaskOperatorKey
from nvflare.apis.fl_constant import ReturnCode
from nvflare.apis.fl_context import FLContext
from nvflare.apis.impl.controller import Controller
from nvflare.apis.shareable import Shareable
from nvflare.apis.signal import Signal
from nvflare.app_common.abstract.fl_model import FLModel, ParamsType
from nvflare.app_common.abstract.learnable_persistor import LearnablePersistor
from nvflare.app_common.abstract.model import ModelLearnable, ModelLearnableKey, make_model_learnable
from nvflare.app_common.app_constant import AppConstants
from nvflare.app_common.app_event_type import AppEventType
from nvflare.app_common.utils.error_handling_utils import get_error_handling_message, should_ignore_result_error
from nvflare.app_common.utils.fl_component_wrapper import FLComponentWrapper
from nvflare.app_common.utils.fl_model_utils import FLModelUtils
from nvflare.fuel.utils.validation_utils import check_non_negative_int, check_positive_int, check_str
from nvflare.security.logging import secure_format_exception
[docs]
# Abstract base class for FLModel-based workflow controllers.
# It derives from Controller, FLComponentWrapper and ABC; concrete subclasses must
# implement the abstract `run()` method, which `control_flow()` invokes.
class BaseModelController(Controller, FLComponentWrapper, ABC):
def __init__(
    self,
    persistor_id: str = AppConstants.DEFAULT_PERSISTOR_ID,
    ignore_result_error: Optional[bool] = None,
    allow_empty_global_weights: bool = False,
    task_check_period: float = 0.5,
):
    """Create an FLModel based controller.

    Args:
        persistor_id (str, optional): ID of the persistor component. Defaults to
            AppConstants.DEFAULT_PERSISTOR_ID ("persistor").
        ignore_result_error (bool or None, optional): policy for client result errors.
            - None: dynamic mode (default) - ignore errors while min_responses is still reachable, panic otherwise.
            - False: strict mode - panic on any client error.
            - True: resilient mode - always ignore client errors.
        allow_empty_global_weights (bool, optional): whether empty global weights are acceptable. Some
            pipelines start the first round with empty global weights so that clients train from scratch.
            Defaults to False.
        task_check_period (float, optional): interval for checking status of tasks. Defaults to 0.5.

    Raises:
        TypeError: if task_check_period is neither an int nor a float.
        ValueError: if task_check_period is not greater than 0.
    """
    super().__init__(task_check_period=task_check_period)

    # Argument validation (performed after the parent constructor has run).
    check_str("persistor_id", persistor_id)
    if not isinstance(task_check_period, (int, float)):
        raise TypeError(f"task_check_period must be an int or float but got {type(task_check_period)}")
    if task_check_period <= 0:
        raise ValueError("task_check_period must be greater than 0.")

    # Configuration supplied by the caller.
    self._persistor_id = persistor_id
    self._task_check_period = task_check_period
    self._ignore_result_error = ignore_result_error
    self._allow_empty_global_weights = allow_empty_global_weights

    # The persistor component is resolved later, in start_controller().
    self.persistor = None

    # Results collected by _process_result() for the current blocking broadcast.
    self._results = []

    # Per-task bookkeeping for the dynamic error policy (ignore_result_error=None).
    # broadcast_model() resets these three values for every task it sends.
    self._current_min_responses = 0  # minimum successful responses needed for this task
    self._current_num_targets = 0  # total number of clients targeted for this task
    self._current_failed_clients = set()  # names of clients that returned errors in this task
[docs]
def start_controller(self, fl_ctx: FLContext) -> None:
    """Prepare the controller: bind the context and engine, then resolve the persistor.

    Args:
        fl_ctx: the FLContext to store on the controller and to obtain the engine from.
    """
    self.fl_ctx = fl_ctx
    self.info("Initializing BaseModelController workflow.")

    self.engine = self.fl_ctx.get_engine()

    if self._persistor_id:
        persistor = self.engine.get_component(self._persistor_id)
        self.persistor = persistor
        if not isinstance(persistor, LearnablePersistor):
            # A component of the wrong type is reported and then discarded,
            # so the controller continues without a persistor.
            self.warning(
                f"Persistor {self._persistor_id} must be a LearnablePersistor type object, "
                f"but got {type(self.persistor)}"
            )
            self.persistor = None

    FLComponentWrapper.initialize(self)
def _build_shareable(self, data: FLModel = None) -> Shareable:
    """Convert an FLModel into a Shareable and tag it with its contribution round.

    Args:
        data: the FLModel to convert.

    Returns:
        The Shareable produced by FLModelUtils.to_shareable, with a CONTRIBUTION_ROUND
        cookie copied from its CURRENT_ROUND header.
    """
    shareable: Shareable = FLModelUtils.to_shareable(data)
    round_number = shareable.get_header(AppConstants.CURRENT_ROUND)
    shareable.add_cookie(AppConstants.CONTRIBUTION_ROUND, round_number)
    return shareable
[docs]
def broadcast_model(
    self,
    data,
    task_name: str = AppConstants.TASK_TRAIN,
    targets: Union[List[Client], List[str], None] = None,
    min_responses: int = None,
    timeout: int = 0,
    wait_time_after_min_received: int = 0,
    blocking: bool = True,
    callback: Callable[[FLModel], None] = None,
) -> List:
    """Send a task carrying `data` to a list of targets.

    Args:
        data: FLModel to be sent to clients. Must be an FLModel instance.
        task_name (str, optional): name of the task. Defaults to "train".
        targets (List[str], optional): target clients (names or Client objects), or None for
            all clients. Defaults to None.
        min_responses (int, optional): the minimum number of responses expected. None means
            responses from all targeted clients are required. Defaults to None.
        timeout (int, optional): time to wait for clients to perform the task. Defaults to 0,
            i.e., never time out.
        wait_time_after_min_received (int, optional): time to wait after the minimum number of
            client responses has been received. Defaults to 0.
        blocking (bool, optional): whether to block until the task completes. Defaults to True.
        callback (Callable[[FLModel], None], optional): invoked with each received result.
            Required when blocking=False. Defaults to None.

    Returns:
        List[FLModel] if blocking=True else None

    Raises:
        TypeError: if task_name is not a str, data is not an FLModel, or blocking is False
            and callback is not callable.
    """
    # --- Validation; the order decides which error the caller sees first. ---
    if not isinstance(task_name, str):
        raise TypeError(f"task_name must be a string but got {type(task_name)}")
    if not isinstance(data, FLModel):
        raise TypeError(f"data must be a FLModel but got {type(data)}")

    if min_responses is None:
        # 0 is what the controller's broadcast uses internally to mean "all targets".
        min_responses = 0

    for arg_name, arg_value in (
        ("min_responses", min_responses),
        ("timeout", timeout),
        ("wait_time_after_min_received", wait_time_after_min_received),
    ):
        check_non_negative_int(arg_name, arg_value)

    if not (blocking or isinstance(callback, Callable)):
        raise TypeError("callback must be defined if blocking is False, but got {}".format(type(callback)))

    # --- Reset the per-task context used by the dynamic ignore_result_error mode. ---
    if targets:
        num_targets = len(targets)
    else:
        num_targets = len(self.engine.get_clients())
    self._current_num_targets = num_targets
    self._current_min_responses = min_responses if min_responses > 0 else num_targets
    self._current_failed_clients = set()

    # --- Announce the round and build the task. ---
    self.set_fl_context(data)
    self.fire_event(AppEventType.ROUND_STARTED, self.fl_ctx)

    task = self._prepare_task(data=data, task_name=task_name, timeout=timeout, callback=callback)

    if not targets:
        self.info(f"Sending task {task_name} to all clients")
    else:
        # Client objects are reduced to their names; plain names pass through unchanged.
        targets = [target.name if isinstance(target, Client) else target for target in targets]
        self.info(f"Sending task {task_name} to {targets}")

    # Arguments shared by the blocking and the non-blocking dispatch.
    dispatch_args = dict(
        task=task,
        targets=targets,
        min_responses=min_responses,
        wait_time_after_min_received=wait_time_after_min_received,
        fl_ctx=self.fl_ctx,
    )

    if not blocking:
        # Fire and forget: results are delivered through the callback.
        self.broadcast(**dispatch_args)
        return None

    self._results = []  # start from a clean result list
    self.broadcast_and_wait(abort_signal=self.abort_signal, **dispatch_args)

    if targets is not None:
        expected_responses = len(targets) if min_responses == 0 else min_responses
        if len(self._results) != expected_responses:
            self.warning(
                f"Number of results ({len(self._results)}) is different from number of expected responses ({expected_responses})."
            )

    # Hand the collected results to the caller and drop the internal reference.
    results, self._results = self._results, []
    return results
def _prepare_task(
    self,
    data: FLModel,
    task_name: str,
    timeout: int,
    callback: Callable,
):
    """Build the broadcast Task that carries `data` to the clients.

    Args:
        data: the FLModel to send; its meta is stored in the task props.
        task_name: name of the task, also used as the operator ID.
        timeout: timeout applied to both the task and its operator.
        callback: per-result callback stored in the task props (may be None).

    Returns:
        The Task, wired to _prepare_task_data (before sending) and _process_result
        (on each received result).
    """
    payload = self._build_shareable(data)

    broadcast_operator = {
        TaskOperatorKey.OP_ID: task_name,
        TaskOperatorKey.METHOD: OperatorMethod.BROADCAST,
        TaskOperatorKey.TIMEOUT: timeout,
    }
    task_props = {
        AppConstants.TASK_PROP_CALLBACK: callback,
        AppConstants.META_DATA: data.meta,
    }

    return Task(
        name=task_name,
        data=payload,
        operator=broadcast_operator,
        props=task_props,
        timeout=timeout,
        before_task_sent_cb=self._prepare_task_data,
        result_received_cb=self._process_result,
    )
def _prepare_task_data(self, client_task: ClientTask, fl_ctx: FLContext) -> None:
    """Fire BEFORE_TRAIN_TASK with the outgoing task data attached as TRAIN_SHAREABLE.

    Registered as the task's before_task_sent_cb.
    """
    outgoing_data = client_task.task.data
    self.fire_event_with_data(AppEventType.BEFORE_TRAIN_TASK, fl_ctx, AppConstants.TRAIN_SHAREABLE, outgoing_data)
def _process_result(self, client_task: ClientTask, fl_ctx: FLContext) -> None:
    """Handle one client result: accept or reject it, convert it to FLModel, then deliver it.

    An accepted result goes to the task's callback when one is set; otherwise it is
    appended to self._results. The raw result on `client_task` is cleared before
    returning normally, whatever the outcome.

    Args:
        client_task: the ClientTask holding the client, the task and the received result.
        fl_ctx: the FLContext of this callback; it becomes self.fl_ctx.
    """
    self.fl_ctx = fl_ctx
    result = client_task.result
    client_name = client_task.client.name

    # Copy the task's round into fl_ctx so widgets (e.g. IntimeModelSelector) and
    # aggregators see the right round; the callback fl_ctx may not carry it.
    round_number = client_task.task.data.get_header(AppConstants.CURRENT_ROUND, None)
    if round_number is not None:
        fl_ctx.set_prop(AppConstants.CURRENT_ROUND, round_number, private=True, sticky=False)

    # The return code is checked first, bracketed by the contribution-accept events.
    self.event(AppEventType.BEFORE_CONTRIBUTION_ACCEPT)
    accepted = self._accept_train_result(client_name=client_name, result=result, fl_ctx=fl_ctx)
    self.event(AppEventType.AFTER_CONTRIBUTION_ACCEPT)

    # A rejected result (error ignored or panic triggered) skips conversion and delivery.
    if accepted:
        try:
            model = FLModelUtils.from_shareable(result)
            model.meta["props"] = client_task.task.props[AppConstants.META_DATA]
            model.meta["client_name"] = client_name
        except Exception as e:
            self.warning(f"Failed to convert result from {client_name} to FLModel: {e}")
        else:
            callback = client_task.task.get_prop(AppConstants.TASK_PROP_CALLBACK)
            if not callback:
                self._results.append(model)
            else:
                try:
                    callback(model)
                except Exception as e:
                    self.error(f"Unsuccessful callback {callback} for task {client_task.task.name}: {e}")

    # Release the raw task result.
    client_task.result = None

    # Note: memory cleanup (gc.collect + malloc_trim) is left to subclasses,
    # via _maybe_cleanup_memory() driven by the memory_gc_rounds setting.
[docs]
def process_result_of_unknown_task(
    self, client: Client, task_name: str, client_task_id: str, result: Shareable, fl_ctx: FLContext
) -> None:
    """Handle a result that does not belong to a currently tracked task.

    Only results of the train task are considered; they go through
    _accept_train_result with is_unknown_task=True. Anything else is logged and dropped.
    """
    if task_name != AppConstants.TASK_TRAIN:
        self.error("Ignoring result from unknown task.")
        return

    accepted = self._accept_train_result(
        client_name=client.name, result=result, fl_ctx=fl_ctx, is_unknown_task=True
    )
    if accepted:
        self.info(f"Result of unknown task {task_name} sent to aggregator.")
def _accept_train_result(
    self, client_name: str, result: Shareable, fl_ctx: FLContext, is_unknown_task: bool = False
) -> bool:
    """Accept or reject a training result according to the error handling policy.

    Args:
        client_name: name of the client that sent the result.
        result: the Shareable result from the client.
        fl_ctx: the FLContext; it becomes self.fl_ctx.
        is_unknown_task: whether the result comes from an unknown/late task.

    Returns:
        True if the result was accepted (it is then stored in fl_ctx as TRAINING_RESULT),
        False if it was rejected, either because the error was ignored or because a
        panic was triggered.
    """
    self.fl_ctx = fl_ctx
    rc = result.get_return_code()
    current_round = result.get_header(AppConstants.CURRENT_ROUND, None)

    if is_unknown_task:
        # No valid tolerance context exists for unknown/late tasks:
        # always ignore their errors and use neutral tracking values.
        ignore_result_error = True
        failed_clients = set()
        num_targets = 0
        min_responses = 0
    else:
        # Normal tasks use the configured policy and the current task's tracking state.
        ignore_result_error = self._ignore_result_error
        failed_clients = self._current_failed_clients
        num_targets = self._current_num_targets
        min_responses = self._current_min_responses

    # Success path: no return code, or an OK return code.
    if not rc or rc == ReturnCode.OK:
        self.fl_ctx.set_prop(AppConstants.TRAINING_RESULT, result, private=True, sticky=False)
        return True

    # Error path. Both helpers receive the same policy arguments, including the same
    # failed_clients set; the decision is made before the message is built.
    policy_args = dict(
        ignore_result_error=ignore_result_error,
        client_name=client_name,
        failed_clients=failed_clients,
        num_targets=num_targets,
        min_responses=min_responses,
    )
    should_ignore = should_ignore_result_error(**policy_args)
    msg = get_error_handling_message(
        error_code=rc,
        current_round=current_round,
        controller_name=self.__class__.__name__,
        **policy_args,
    )

    if should_ignore:
        self.warning(msg)  # error tolerated
    else:
        self.panic(msg)  # error not tolerated
    return False
[docs]
@abstractmethod
def run(self):
    """Execute the workflow; invoked by `control_flow`.

    Subclasses must override this method.

    Returns: None.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
[docs]
def control_flow(self, abort_signal: Signal, fl_ctx: FLContext) -> None:
    """Run the workflow, turning any exception raised by `run()` into a panic.

    Args:
        abort_signal: the Signal stored on the controller as self.abort_signal.
        fl_ctx: the FLContext stored on the controller as self.fl_ctx.
    """
    self.fl_ctx, self.abort_signal = fl_ctx, abort_signal

    try:
        self.info("Beginning model controller run.")
        self.event(AppEventType.TRAINING_STARTED)

        self.run()
    except Exception as e:
        # Log the failure with a securely formatted message, then panic.
        error_msg = f"Exception in model controller run: {secure_format_exception(e)}"
        self.exception(error_msg)
        self.panic(error_msg)
[docs]
def load_model(self) -> Optional[FLModel]:
    """Load the initial global model and publish it to fl_ctx.

    With a persistor configured, the model comes from the persistor. If that model is
    empty and empty weights are not allowed, the GLOBAL_MODEL prop of fl_ctx is used
    as a fallback. Without a persistor, an empty FLModel is created.

    On success the model is stored in fl_ctx as GLOBAL_MODEL (in Learnable format) and
    INITIAL_MODEL_LOADED is fired.

    Returns:
        The initial FLModel, or None when loading failed and a panic was triggered.
    """
    # initialize global model
    model = None
    if self.persistor:
        self.info("loading initial model from persistor")
        global_weights = self.persistor.load(self.fl_ctx)

        if not isinstance(global_weights, ModelLearnable):
            self.panic(
                f"Expected global weights to be of type `ModelLearnable` but received {type(global_weights)}"
            )
            return

        if global_weights.is_empty() and not self._allow_empty_global_weights:
            # if empty not allowed, further check whether it is available from fl_ctx
            global_weights = self.fl_ctx.get_prop(AppConstants.GLOBAL_MODEL)

        # The fl_ctx fallback can yield None when no GLOBAL_MODEL prop has been set.
        # Guard against it so that case reaches the panic below instead of failing
        # with an AttributeError on `None.is_empty()`.
        if global_weights is not None and not global_weights.is_empty():
            model = FLModel(
                params_type=ParamsType.FULL,
                params=global_weights[ModelLearnableKey.WEIGHTS],
                meta=global_weights[ModelLearnableKey.META],
            )
        elif self._allow_empty_global_weights:
            model = FLModel(params_type=ParamsType.FULL, params={})
        else:
            self.panic(
                f"Neither `persistor` {self._persistor_id} or `fl_ctx` returned a global model! If this was intended, set `self._allow_empty_global_weights` to `True`."
            )
            return
    else:
        self.info("persistor not configured, creating empty initial FLModel")
        model = FLModel(params_type=ParamsType.FULL, params={})

    # persistor uses Learnable format to save model
    ml = make_model_learnable(weights=model.params, meta_props=model.meta)
    self.fl_ctx.set_prop(AppConstants.GLOBAL_MODEL, ml, private=True, sticky=True)
    self.event(AppEventType.INITIAL_MODEL_LOADED)

    return model
[docs]
def get_run_dir(self):
    """Return the run directory of the current job, as resolved by the engine's workspace."""
    workspace = self.engine.get_workspace()
    job_id = self.fl_ctx.get_job_id()
    return workspace.get_run_dir(job_id)
[docs]
def get_app_dir(self):
    """Return the app directory of the current job, as resolved by the engine's workspace."""
    workspace = self.engine.get_workspace()
    job_id = self.fl_ctx.get_job_id()
    return workspace.get_app_dir(job_id)
[docs]
def save_model(self, model):
    """Persist `model` through the configured persistor.

    BEFORE_LEARNABLE_PERSIST and AFTER_LEARNABLE_PERSIST are fired around the save.
    Without a persistor, an error is logged and nothing is saved.

    Args:
        model: object exposing `params` and `meta`, converted to Learnable format before saving.
    """
    if not self.persistor:
        self.error("persistor not configured, model will not be saved")
        return

    self.info("Start persist model on server.")
    self.event(AppEventType.BEFORE_LEARNABLE_PERSIST)

    # The persistor stores models in Learnable format.
    learnable = make_model_learnable(weights=model.params, meta_props=model.meta)
    self.persistor.save(learnable, self.fl_ctx)

    self.event(AppEventType.AFTER_LEARNABLE_PERSIST)
    self.info("End persist model on server.")
[docs]
def sample_clients(self, num_clients: int = None) -> List[str]:
    """Return the names of the clients to use, optionally a random subset.

    Args:
        num_clients (int, optional): number of clients to sample. A falsy value (None or 0)
            returns all available clients. A value larger than the number of available
            clients logs an error and also returns all of them.

    Returns:
        List of client names.
    """
    clients = []
    for client in self.engine.get_clients():
        clients.append(client.name)

    if num_clients:
        check_positive_int("num_clients", num_clients)

        available = len(clients)
        if num_clients < available:
            # Shuffle in place, then keep only the first num_clients names.
            random.shuffle(clients)
            del clients[num_clients:]
            self.info(
                f"num_clients ({num_clients}) is less than the number of available clients. Returning a random subset of ({num_clients}) clients."
            )
        elif num_clients > available:
            self.error(
                f"num_clients ({num_clients}) is greater than the number of available clients. Returning all ({len(clients)}) available clients."
            )

    self.info(f"Sampled clients: {clients}")
    return clients
[docs]
def set_fl_context(self, data: FLModel):
    """Copy CURRENT_ROUND and NUM_ROUNDS from an FLModel into fl_ctx so they stay current each round.

    A prop that already exists in fl_ctx is updated with its existing private/sticky
    attributes, so set_prop() accepts the update without warning. A prop that does not
    exist yet is created with private=True, sticky=False. Flows such as FedAvg do not set
    CURRENT_ROUND in fl_ctx before sending, and downstream components (e.g. aggregators)
    rely on it.

    Args:
        data: the FLModel providing `current_round` and `total_rounds`. A falsy value is a
            no-op, and fields that are None are skipped.
    """
    if not data:
        return

    def _sync_prop(key, value):
        # Write `value` under `key`, reusing the attributes of an existing prop.
        if value is None:
            return
        detail = self.fl_ctx.get_prop_detail(key)
        if detail is None:
            private, sticky = True, False
        else:
            private, sticky = detail["private"], detail["sticky"]
        self.fl_ctx.set_prop(key, value, private=private, sticky=sticky)

    _sync_prop(AppConstants.CURRENT_ROUND, data.current_round)
    _sync_prop(AppConstants.NUM_ROUNDS, data.total_rounds)
[docs]
def get_component(self, component_id: str):
    """Look up a component by ID through the engine.

    Args:
        component_id: ID of the component to retrieve.

    Returns:
        Whatever the engine's get_component returns for that ID.
    """
    engine = self.engine
    return engine.get_component(component_id)
[docs]
def build_component(self, config_dict: dict):
    """Build a component from a configuration dict through the engine.

    Args:
        config_dict: component configuration passed to the engine's build_component.

    Returns:
        Whatever the engine's build_component returns.
    """
    engine = self.engine
    return engine.build_component(config_dict)
[docs]
def stop_controller(self, fl_ctx: FLContext):
    """Stop the controller: store the given context, then run finalize().

    Args:
        fl_ctx: the FLContext stored on the controller before finalization.
    """
    # finalize() runs with the most recent context available on self.
    self.fl_ctx = fl_ctx
    self.finalize()