Source code for nvflare.private.fed.client.client_aux_runner

# Copyright (c) 2021-2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import threading
import time

from nvflare.apis.event_type import EventType
from nvflare.apis.fl_constant import ReturnCode
from nvflare.apis.fl_context import FLContext
from nvflare.apis.shareable import ReservedHeaderKey, Shareable, make_reply
from nvflare.private.aux_runner import AuxRunner

from .client_engine_executor_spec import ClientEngineExecutorSpec


[docs]class ClientAuxRunner(AuxRunner): """ClientAuxRunner to send the aux messages to the server. Note: The ClientEngine must create a new ClientAuxRunner object for each RUN, and make sure it is added as an event handler! """ def __init__(self): """To init the ClientAuxRunner.""" AuxRunner.__init__(self) self.abort_signal = None self.sender = None self.asked_to_stop = False self.engine = None self.fnf_requests = [] self.fnf_lock = threading.Lock()
[docs] def handle_event(self, event_type: str, fl_ctx: FLContext): AuxRunner.handle_event(self, event_type, fl_ctx) if event_type == EventType.START_RUN: self.engine = fl_ctx.get_engine() self.abort_signal = fl_ctx.get_run_abort_signal() self.sender = threading.Thread(target=self._send_fnf_requests, args=()) self.sender.start() elif event_type == EventType.END_RUN: self.asked_to_stop = True if self.sender and self.sender.is_alive(): self.sender.join()
[docs] def send_aux_request(self, topic: str, request: Shareable, timeout: float, fl_ctx: FLContext) -> Shareable: if not isinstance(topic, str): raise TypeError("invalid topic: expects str but got {}".format(type(topic))) if not topic: raise ValueError("invalid topic: must not be empty") if topic == self.TOPIC_BULK: raise ValueError('topic value "{}" is reserved'.format(topic)) if not isinstance(timeout, float): raise TypeError("invalid timeout: expects float but got {}".format(type(timeout))) if timeout < 0: raise ValueError("invalid timeout value {}: must >= 0.0".format(timeout)) if not isinstance(fl_ctx, FLContext): raise TypeError("fl_ctx must be FLContext but got {}".format(type(fl_ctx))) req_to_send = request req_to_send.set_header(ReservedHeaderKey.TOPIC, topic) req_to_send.set_peer_props(fl_ctx.get_all_public_props()) if timeout <= 0.0: # this is fire-and-forget request with self.fnf_lock: self.fnf_requests.append(req_to_send) return make_reply(ReturnCode.OK) # send regular request engine = fl_ctx.get_engine() if not isinstance(engine, ClientEngineExecutorSpec): raise TypeError("engine must be ClientEngineExecutorSpec, but got {}".format(type(engine))) reply = engine.aux_send(topic=topic, request=req_to_send, timeout=timeout, fl_ctx=fl_ctx) # check whether the RUN should be aborted if not isinstance(reply, Shareable): self.log_error(fl_ctx, "bad reply from peer: expect Shareable but got {}".format(type(reply))) return make_reply(ReturnCode.ERROR) rc = reply.get_return_code() if rc == ReturnCode.RUN_MISMATCH: self.log_info(fl_ctx, "got RUN_MISMATCH - asked engine to abort app") engine.abort_app(job_id=self.run_num, fl_ctx=fl_ctx) return reply
def _send_fnf_requests(self): topic = self.TOPIC_BULK sleep_time = 0.5 while True: time.sleep(sleep_time) if self.abort_signal.triggered: break if len(self.fnf_requests) <= 0: if self.asked_to_stop: break else: sleep_time = 1.0 continue with self.engine.new_context() as fl_ctx: bulk = Shareable() bulk.set_header(ReservedHeaderKey.TOPIC, topic) bulk.set_peer_props(fl_ctx.get_all_public_props()) with self.fnf_lock: bulk[self.DATA_KEY_BULK] = self.fnf_requests reply = self.engine.aux_send(topic=topic, request=bulk, timeout=15.0, fl_ctx=fl_ctx) rc = reply.get_return_code() if rc != ReturnCode.COMMUNICATION_ERROR: # if communication error we'll retry self.fnf_requests = [] sleep_time = 0.5