Source code for nvflare.app_opt.xgboost.histogram_based.executor

# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os

import xgboost as xgb
from xgboost import callback

from nvflare.apis.event_type import EventType
from nvflare.apis.executor import Executor
from nvflare.apis.fl_constant import FLContextKey, ReturnCode
from nvflare.apis.fl_context import FLContext
from nvflare.apis.shareable import Shareable, make_reply
from nvflare.apis.signal import Signal
from nvflare.apis.workspace import Workspace
from nvflare.app_common.app_constant import AppConstants
from nvflare.app_common.tracking.log_writer import LogWriter
from nvflare.app_opt.xgboost.data_loader import XGBDataLoader
from nvflare.app_opt.xgboost.histogram_based.constants import XGB_TRAIN_TASK, XGBShareableHeader
from nvflare.app_opt.xgboost.metrics_cb import MetricsCallback
from nvflare.security.logging import secure_format_exception, secure_log_traceback


class XGBoostParams:
    def __init__(
        self, xgb_params: dict, num_rounds: int = 10, early_stopping_rounds: int = 2, verbose_eval: bool = False
    ):
        """Container for all XGBoost parameters.

        Args:
            xgb_params: The Booster parameters. This dict is passed to `xgboost.train()` as the argument
                `params`. Please refer to the XGBoost documentation for details:
                https://xgboost.readthedocs.io/en/stable/parameter.html
            num_rounds: Number of boosting rounds.
            early_stopping_rounds: Number of rounds without improvement before training stops early.
            verbose_eval: Whether to print evaluation results during training.
        """
        self.num_rounds = num_rounds
        self.early_stopping_rounds = early_stopping_rounds
        self.verbose_eval = verbose_eval
        self.xgb_params: dict = xgb_params if xgb_params else {}
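
# --- Illustrative usage (not part of this module) ---
# A minimal sketch of how XGBoostParams might be filled in for histogram-based
# training. The Booster parameter values below are example assumptions, not
# defaults of this executor; the xgb_params dict is what ultimately reaches
# xgboost.train() as its `params` argument.
_example_params = XGBoostParams(
    xgb_params={
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",
        "max_depth": 8,
        "eta": 0.1,
    },
    num_rounds=100,
    early_stopping_rounds=2,
    verbose_eval=False,
)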
class FedXGBHistogramExecutor(Executor):
    """Federated XGBoost Executor Spec for histogram-based collaboration.

    This class implements basic xgb_train logic; feel free to override that method for custom behavior.
    """

    def __init__(
        self,
        num_rounds,
        early_stopping_rounds,
        xgb_params: dict,
        data_loader_id: str,
        verbose_eval=False,
        use_gpus=False,
        metrics_writer_id: str = None,
        model_file_name="test.model.json",
    ):
        """Federated XGBoost Executor for histogram-based collaboration.

        This class sets up the training environment for Federated XGBoost. It is the executor running on
        each NVFlare client, which starts XGBoost training.

        Args:
            num_rounds: number of boosting rounds
            early_stopping_rounds: early stopping rounds
            xgb_params: This dict is passed to `xgboost.train()` as the first argument `params`. It contains
                all the Booster parameters. Please refer to the XGBoost documentation for details:
                https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training
            data_loader_id: the ID of the XGBDataLoader component.
            verbose_eval: verbose_eval in xgboost.train
            use_gpus: flag to enable GPU training
            metrics_writer_id: the ID of a LogWriter component. If provided, a MetricsCallback is added so
                that metrics can be streamed to the receivers in nvflare.app_opt.tracking.
            model_file_name (str): where to save the model.
        """
        super().__init__()
        self.num_rounds = num_rounds
        self.early_stopping_rounds = early_stopping_rounds
        self.xgb_params = xgb_params
        self.data_loader_id = data_loader_id
        self.data_loader = None
        self.verbose_eval = verbose_eval
        self.use_gpus = use_gpus

        self.rank = None
        self.world_size = None
        self.client_id = None
        self._ca_cert_path = None
        self._client_key_path = None
        self._client_cert_path = None
        self._server_address = "localhost"
        self.train_data = None
        self.val_data = None
        self.model_file_name = model_file_name
        self._metrics_writer_id = metrics_writer_id
        self._metrics_writer = None
    def initialize(self, fl_ctx):
        self.client_id = fl_ctx.get_identity_name()
        self._server_address = self._get_server_address(fl_ctx)
        self.log_info(fl_ctx, f"server address is {self._server_address}")

        engine = fl_ctx.get_engine()

        self.data_loader = engine.get_component(self.data_loader_id)
        if not isinstance(self.data_loader, XGBDataLoader):
            self.system_panic("data_loader should be type XGBDataLoader", fl_ctx)

        if self._metrics_writer_id:
            self._metrics_writer = engine.get_component(self._metrics_writer_id)
            if not isinstance(self._metrics_writer, LogWriter):
                self.system_panic("writer should be type LogWriter", fl_ctx)
    def xgb_train(self, params: XGBoostParams) -> xgb.core.Booster:
        """XGBoost training logic.

        Args:
            params (XGBoostParams): xgboost parameters.

        Returns:
            An xgboost Booster.
        """
        # Use the DMatrix objects prepared by the data loader; data files are not sharded in federated mode.
        dtrain = self.train_data
        dval = self.val_data

        # Specify the validation set to watch performance
        watchlist = [(dval, "eval"), (dtrain, "train")]

        callbacks = [callback.EvaluationMonitor(rank=self.rank)]
        if self._metrics_writer:
            callbacks.append(MetricsCallback(self._metrics_writer))

        # Run training; all features of the training API are available.
        bst = xgb.train(
            params.xgb_params,
            dtrain,
            params.num_rounds,
            evals=watchlist,
            early_stopping_rounds=params.early_stopping_rounds,
            verbose_eval=params.verbose_eval,
            callbacks=callbacks,
        )

        return bst
    def handle_event(self, event_type: str, fl_ctx: FLContext):
        if event_type == EventType.START_RUN:
            self.initialize(fl_ctx)
    def _get_server_address(self, fl_ctx: FLContext):
        engine = fl_ctx.get_engine()
        if engine.client.overseer_agent:
            sp = engine.client.overseer_agent.get_primary_sp()
            if sp and sp.primary is True:
                return sp.name
        self.log_info(fl_ctx, "Unable to get primary sp from overseer. Using previously known server address")
        return self._server_address

    def _get_certificates(self, fl_ctx: FLContext):
        workspace: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
        bin_folder = workspace.get_startup_kit_dir()
        ca_cert_path = os.path.join(bin_folder, "rootCA.pem")
        if not os.path.exists(ca_cert_path):
            self.log_error(fl_ctx, "Missing ca certificate (rootCA.pem)")
            return False
        client_key_path = os.path.join(bin_folder, "client.key")
        if not os.path.exists(client_key_path):
            self.log_error(fl_ctx, "Missing client key (client.key)")
            return False
        client_cert_path = os.path.join(bin_folder, "client.crt")
        if not os.path.exists(client_cert_path):
            self.log_error(fl_ctx, "Missing client certificate (client.crt)")
            return False
        self._ca_cert_path = ca_cert_path
        self._client_key_path = client_key_path
        self._client_cert_path = client_cert_path
        return True
    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable:
        self.log_info(fl_ctx, f"Client trainer got task: {task_name}")

        try:
            if task_name == XGB_TRAIN_TASK:
                return self.train(shareable, fl_ctx, abort_signal)
            else:
                self.log_error(fl_ctx, f"Could not handle task: {task_name}")
                return make_reply(ReturnCode.TASK_UNKNOWN)
        except Exception as e:
            # Task execution error: return EXECUTION_EXCEPTION Shareable
            self.log_exception(fl_ctx, f"learner execute exception: {secure_format_exception(e)}")
            return make_reply(ReturnCode.EXECUTION_EXCEPTION)
    def train(self, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable:
        """XGBoost training task pipeline which handles NVFlare-specific tasks."""
        if abort_signal.triggered:
            return make_reply(ReturnCode.TASK_ABORTED)

        # Print round information
        current_round = shareable.get_header(AppConstants.CURRENT_ROUND)
        total_rounds = shareable.get_header(AppConstants.NUM_ROUNDS)
        client_name = fl_ctx.get_identity_name()
        self.log_info(fl_ctx, f"Client: {client_name} Round: {current_round}/{total_rounds}")

        rank_map = shareable.get_header(XGBShareableHeader.RANK_MAP)
        if client_name not in rank_map:
            self.log_error(fl_ctx, f"Train failed due to client {client_name} missing in rank_map: {rank_map}")
            return make_reply(ReturnCode.ERROR)

        world_size = shareable.get_header(XGBShareableHeader.WORLD_SIZE)
        if world_size is None:
            self.log_error(fl_ctx, f"Train failed in client {client_name}: missing xgboost world size in header.")
            return make_reply(ReturnCode.ERROR)

        xgb_fl_server_port = shareable.get_header(XGBShareableHeader.XGB_FL_SERVER_PORT)
        if xgb_fl_server_port is None:
            self.log_error(fl_ctx, f"Train failed in client {client_name}: missing xgboost FL server port in header.")
            return make_reply(ReturnCode.ERROR)

        secure_comm = shareable.get_header(XGBShareableHeader.XGB_FL_SERVER_SECURE)
        if secure_comm is None:
            self.log_error(fl_ctx, f"Train failed in client {client_name}: missing xgboost secure_comm in header.")
            return make_reply(ReturnCode.ERROR)

        self.rank = rank_map[client_name]
        self.world_size = world_size

        if self.use_gpus:
            # map each rank to the first GPU if the device is not set
            device = self.xgb_params.get("device", "cuda:0")
            self.log_info(fl_ctx, f"Training with GPU {device}")
            self.xgb_params["device"] = device

        self.log_info(fl_ctx, f"Using xgb params: {self.xgb_params}")
        params = XGBoostParams(
            xgb_params=self.xgb_params,
            num_rounds=self.num_rounds,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=self.verbose_eval,
        )

        self._server_address = self._get_server_address(fl_ctx)
        self.log_info(fl_ctx, f"server address is {self._server_address}")

        communicator_env = {
            "dmlc_communicator": "federated",
            "federated_server_address": f"{self._server_address}:{xgb_fl_server_port}",
            "federated_world_size": self.world_size,
            "federated_rank": self.rank,
        }
        if secure_comm:
            if not self._get_certificates(fl_ctx):
                return make_reply(ReturnCode.ERROR)

            communicator_env["federated_server_cert_path"] = self._ca_cert_path
            communicator_env["federated_client_key_path"] = self._client_key_path
            communicator_env["federated_client_cert_path"] = self._client_cert_path

        try:
            self.data_loader.initialize(
                client_id=self.client_id,
                rank=self.rank,
            )
            with xgb.collective.CommunicatorContext(**communicator_env):
                # Load the data. For vertical FL, the DMatrix must be created with column split mode
                # inside the CommunicatorContext.
                if not self.train_data or not self.val_data:
                    self.train_data, self.val_data = self.data_loader.load_data()

                bst = self.xgb_train(params)

                # Save the model.
                workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
                run_number = fl_ctx.get_prop(FLContextKey.CURRENT_RUN)
                run_dir = workspace.get_run_dir(run_number)
                bst.save_model(os.path.join(run_dir, self.model_file_name))
                xgb.collective.communicator_print("Finished training\n")
        except Exception as e:
            secure_log_traceback()
            self.log_error(fl_ctx, f"Exception happens when running xgb train: {secure_format_exception(e)}")
            return make_reply(ReturnCode.EXECUTION_EXCEPTION)

        return make_reply(ReturnCode.OK)
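
# --- Illustrative extension (not part of this module) ---
# The class docstring invites overriding xgb_train() for custom behavior. The subclass
# below is a minimal sketch of such an override: it reuses the DMatrix objects prepared
# by the configured XGBDataLoader (self.train_data / self.val_data) and simply drops the
# metrics callback. The class name MyXGBExecutor is hypothetical.
class MyXGBExecutor(FedXGBHistogramExecutor):
    def xgb_train(self, params: XGBoostParams) -> xgb.core.Booster:
        # Watch both validation and training performance, as the base class does.
        watchlist = [(self.val_data, "eval"), (self.train_data, "train")]
        return xgb.train(
            params.xgb_params,
            self.train_data,
            params.num_rounds,
            evals=watchlist,
            early_stopping_rounds=params.early_stopping_rounds,
            verbose_eval=params.verbose_eval,
            callbacks=[callback.EvaluationMonitor(rank=self.rank)],
        )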