Source code for torch.distributed.elastic.agent.server.health_check_server
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from typing import Callable
from torch.distributed.elastic.utils.logging import get_logger
log = get_logger(__name__)
__all__ = ["HealthCheckServer", "create_healthcheck_server"]
[docs]class HealthCheckServer:
    """
    Interface for health check monitoring server, which can be extended
    by starting tcp/http server on the specified port.
    Args:
        alive_callback: Callable[[], int], callback to last progress time of agent
        port: int, port number to start tcp/http server
        timeout: int, timeout seconds to decide agent is alive/dead
    """
    _alive_callback: Callable[[], int]
    _port: int
    _timeout: int
    def __init__(
        self, alive_callback: Callable[[], int], port: int, timeout: int
    ) -> None:
        self._alive_callback = alive_callback
        self._port = port
        self._timeout = timeout
[docs]    def start(self) -> None:
        """
        Unsupported functionality for Pytorch, doesn't start any health check server
        """
        log.warning("No health check server started")
[docs]    def stop(self) -> None:
        """
        Function to stop health check server
        """
        log.info("Stopping noop health check server.")
[docs]def create_healthcheck_server(
    alive_callback: Callable[[], int],
    port: int,
    timeout: int,
) -> HealthCheckServer:
    """
    creates health check server object
    """
    return HealthCheckServer(alive_callback, port, timeout)