Triton

pytriton.triton.Triton

Triton(*, config: Optional[TritonConfig] = None, workspace: Union[Workspace, str, Path, None] = None, triton_lifecycle_policy: Optional[TritonLifecyclePolicy] = None, security_config: Optional[TritonSecurityConfig] = None)

Bases: TritonBase

Triton Inference Server for Python models.

Initialize Triton Inference Server context for starting server and loading models.

Parameters:

config (Optional[TritonConfig], default: None ) –
TritonConfig object with optional customizations for Triton Inference Server. Configuration can be passed also through environment variables. See TritonConfig.from_env() class method for details.

Order of precedence:
- config defined through config parameter of init method.
- config defined in environment variables
- default TritonConfig values
workspace (Union[Workspace, str, Path, None], default: None ) –

workspace or path where the Triton Model Store and files used by pytriton will be created. If workspace is None random workspace will be created. Workspace will be deleted in Triton.stop().
triton_lifecycle_policy (Optional[TritonLifecyclePolicy], default: None ) –

policy indicating when Triton server is launched and where the model store is located (locally or remotely managed by Triton server). If triton_lifecycle_policy is None, DefaultTritonLifecyclePolicy is used by default (Triton server is launched on startup and model store is not local). Only if triton_lifecycle_policy is None and config.allow_vertex_ai is True, VertextAILifecyclePolicy is used instead.
security_config (Optional[TritonSecurityConfig], default: None ) –

TritonSecurityConfig object with security settings for token-based access restriction. If not provided, DefaultTritonSecurityConfig is used, which auto-generates a secure token and protects the default endpoints: shared-memory, model-repository, statistics, trace, logging.

Source code in pytriton/triton.py

def __init__(
    self,
    *,
    config: Optional[TritonConfig] = None,
    workspace: Union[Workspace, str, pathlib.Path, None] = None,
    triton_lifecycle_policy: Optional[TritonLifecyclePolicy] = None,
    security_config: Optional[TritonSecurityConfig] = None,
):
    """Initialize Triton Inference Server context for starting server and loading models.

    Args:
        config: TritonConfig object with optional customizations for Triton Inference Server.
            Configuration can be passed also through environment variables.
            See [TritonConfig.from_env()][pytriton.triton.TritonConfig.from_env] class method for details.

            Order of precedence:

              - config defined through `config` parameter of init method.
              - config defined in environment variables
              - default TritonConfig values
        workspace: workspace or path where the Triton Model Store and files used by pytriton will be created.
            If workspace is `None` random workspace will be created.
            Workspace will be deleted in [Triton.stop()][pytriton.triton.Triton.stop].
        triton_lifecycle_policy:  policy indicating when Triton server is launched and where the model store is located
            (locally or remotely managed by Triton server). If triton_lifecycle_policy is None,
            DefaultTritonLifecyclePolicy is used by default (Triton server is launched on startup and model store is not local).
            Only if triton_lifecycle_policy is None and config.allow_vertex_ai is True, VertextAILifecyclePolicy is used instead.
        security_config: TritonSecurityConfig object with security settings for token-based access restriction.
            If not provided, DefaultTritonSecurityConfig is used, which auto-generates a secure token and protects
            the default endpoints: shared-memory, model-repository, statistics, trace, logging.
    """
    _triton_lifecycle_policy = (
        VertextAILifecyclePolicy
        if triton_lifecycle_policy is None and config is not None and config.allow_vertex_ai
        else triton_lifecycle_policy
    ) or DefaultTritonLifecyclePolicy

    def _without_none_values(_d):
        return {name: value for name, value in _d.items() if value is not None}

    default_config_dict = _without_none_values(TritonConfig().to_dict())
    env_config_dict = _without_none_values(TritonConfig.from_env().to_dict())
    explicit_config_dict = _without_none_values(config.to_dict() if config else {})
    config_dict = {**default_config_dict, **env_config_dict, **explicit_config_dict}
    self._config = TritonConfig(**config_dict)
    self._security_config = security_config or TritonSecurityConfig()

    workspace_instance = workspace if isinstance(workspace, Workspace) else Workspace(workspace)
    self._prepare_triton_config(workspace_instance)
    endpoint_protocol = "http" if self._config.allow_http in [True, None] else "grpc"
    super().__init__(
        url=endpoint_utils.get_endpoint(self._triton_server_config, endpoint_protocol),
        workspace=workspace_instance,
        triton_lifecycle_policy=_triton_lifecycle_policy,
        access_token=self._security_config.access_token,
    )
    self._triton_server = None

enter

__enter__() -> Triton

Entering the context launches the triton server.

Returns:

Triton –

A Triton object

Source code in pytriton/triton.py

def __enter__(self) -> "Triton":
    """Entering the context launches the triton server.

    Returns:
        A Triton object
    """
    if self._triton_lifecycle_policy.launch_triton_on_startup:
        self._run_server()
    super().__enter__()
    return self

exit

__exit__(*_) -> None

Exit the context stopping the process and cleaning the workspace.

Parameters:

*_ –

unused arguments

Source code in pytriton/triton.py

def __exit__(self, *_) -> None:
    """Exit the context stopping the process and cleaning the workspace.

    Args:
        *_: unused arguments
    """
    self.stop()

_initialize_server

_initialize_server() -> None

Initialize Triton Inference Server before binary execution.

Source code in pytriton/triton.py

def _initialize_server(self) -> None:
    """Initialize Triton Inference Server before binary execution."""
    self._triton_inference_server_path = self._prepare_triton_inference_server()
    self._triton_server = TritonServer(
        path=(self._triton_inference_server_path / "bin" / "tritonserver").as_posix(),
        libs_path=get_libs_path(),
        config=self._triton_server_config,
    )

    url = (
        self._triton_server.get_endpoint("http")
        if (self._config.allow_http is None or self._config.allow_http)
        else self._triton_server.get_endpoint("grpc")
    )
    self._log_level_checker = _LogLevelChecker(url, access_token=self._security_config.access_token)

_is_alive_impl

_is_alive_impl() -> bool

Verify is deployed models and server are alive.

Returns:

bool –

True if server and loaded models are alive, False otherwise.

Source code in pytriton/triton.py

def _is_alive_impl(self) -> bool:
    """Verify is deployed models and server are alive.

    Returns:
        True if server and loaded models are alive, False otherwise.
    """
    if not self._triton_server:
        return False

    return self._triton_server.is_alive()

_on_tritonserver_exit

_on_tritonserver_exit(*_) -> None

Handle the Triton Inference Server process exit.

Parameters:

_ –

unused arguments

Source code in pytriton/triton.py

def _on_tritonserver_exit(self, *_) -> None:
    """Handle the Triton Inference Server process exit.

    Args:
        _: unused arguments
    """
    LOGGER.debug("Got callback that tritonserver process finished")
    self.stop()

_prepare_triton_inference_server

_prepare_triton_inference_server() -> Path

Prepare binaries and libraries of Triton Inference Server for execution.

Return

Path where Triton binaries are ready for execution

Source code in pytriton/triton.py

def _prepare_triton_inference_server(self) -> pathlib.Path:
    """Prepare binaries and libraries of Triton Inference Server for execution.

    Return:
        Path where Triton binaries are ready for execution
    """
    triton_inference_server_path = self._workspace.path / "tritonserver"

    LOGGER.debug("Preparing Triton Inference Server binaries and libs for execution.")
    shutil.copytree(
        TRITONSERVER_DIST_DIR,
        triton_inference_server_path,
        ignore=shutil.ignore_patterns("python_backend_stubs", "triton_python_backend_stub"),
    )
    LOGGER.debug("Triton Inference Server binaries copied to %s without stubs.", triton_inference_server_path)

    major = sys.version_info[0]
    minor = sys.version_info[1]
    version = f"{major}.{minor}"

    src_stub_path = get_stub_path(version)
    dst_stub_path = triton_inference_server_path / "backends" / "python" / "triton_python_backend_stub"

    LOGGER.debug("Copying stub for version %s from %s to %s", version, src_stub_path, dst_stub_path)
    shutil.copy(src_stub_path, dst_stub_path)

    LOGGER.debug("Triton Inference Server binaries ready in %s", triton_inference_server_path)

    self._triton_server_config["backend_directory"] = (triton_inference_server_path / "backends").as_posix()
    if "cache_directory" not in self._triton_server_config:
        self._triton_server_config["cache_directory"] = (triton_inference_server_path / "caches").as_posix()
    return triton_inference_server_path

_run_server

_run_server()

Run Triton Inference Server.

Source code in pytriton/triton.py

def _run_server(self):
    """Run Triton Inference Server."""
    if self._triton_server is None:
        self._initialize_server()
    if not self._triton_server.is_alive():
        with self._cv:
            self._stopped = False
        LOGGER.debug("Starting Triton Inference")
        self._triton_server.register_on_exit(self._on_tritonserver_exit)
        self._triton_server.start()

_shm_prefix

_shm_prefix() -> str

Generate unique prefix for shm memory.

Returns:

str –

String with prefix

Source code in pytriton/triton.py

def _shm_prefix(self) -> str:
    """Generate unique prefix for shm memory.

    Returns:
        String with prefix
    """
    hash = codecs.encode(os.urandom(4), "hex").decode()
    pid = os.getpid()
    return f"pytrtion{pid}-{hash}"

_validate_model_name `classmethod`

_validate_model_name(model_name: str) -> None

Validate model name.

Parameters:

model_name (str) –

Model name

Source code in pytriton/triton.py

@classmethod
def _validate_model_name(cls, model_name: str) -> None:
    """Validate model name.

    Args:
        model_name: Model name
    """
    if not model_name:
        raise PyTritonValidationError("Model name cannot be empty")

    if not re.match(r"^[a-zA-Z0-9._-]+$", model_name):
        raise PyTritonValidationError(
            "Model name can only contain alphanumeric characters, dots, underscores and dashes"
        )

_wait_for_models

_wait_for_models() -> None

Log loaded models in console to show the available endpoints.

Source code in pytriton/triton.py

def _wait_for_models(self) -> None:
    """Log loaded models in console to show the available endpoints."""
    self._log_level_checker.check()

    try:
        for model in self._model_manager.models:
            with ModelClient(
                url=self._url,
                model_name=model.model_name,
                model_version=str(model.model_version),
                access_token=self._access_token,
            ) as client:
                # This waits for only tritonserver and lightweight proxy backend to be ready
                # timeout should be short as model is loaded before execution of Triton.start() method
                client.wait_for_model(timeout_s=WAIT_FORM_MODEL_TIMEOUT_S)
    except TimeoutError:
        LOGGER.warning(
            "Could not verify locally if models are ready using %s. Please, check the server logs for details.",
            self._url,
        )

    for model in self._model_manager.models:
        LOGGER.info("Infer function available as model: `%s`", MODEL_URL.format(model_name=model.model_name))
        LOGGER.info("  Status:         `GET  %s`", MODEL_READY_URL.format(model_name=model.model_name))
        LOGGER.info("  Model config:   `GET  %s`", MODEL_CONFIG_URL.format(model_name=model.model_name))
        LOGGER.info("  Inference:      `POST %s`", MODEL_INFER_URL.format(model_name=model.model_name))

    LOGGER.info(
        "Read more about configuring and serving models in "
        "documentation: https://triton-inference-server.github.io/pytriton."
    )
    LOGGER.info("(Press CTRL+C or use the command `kill -SIGINT %d` to send a SIGINT signal and quit)", os.getpid())

_wait_for_server

_wait_for_server() -> None

Wait for Triton Inference Server to be ready.

Source code in pytriton/triton.py

def _wait_for_server(self) -> None:
    """Wait for Triton Inference Server to be ready."""
    self._log_level_checker.check()
    try:
        with contextlib.closing(create_client_from_url(self._url)) as client:
            wait_for_server_ready(client, timeout_s=DEFAULT_TRITON_STARTUP_TIMEOUT_S, headers=self._headers)
    except TimeoutError as e:
        LOGGER.warning(
            "Could not verify locally if Triton Inference Server is ready using %s. "
            "Please, check the server logs for details.",
            self._url,
        )
        raise TimeoutError("Triton Inference Server is not ready after timeout.") from e

bind

bind(model_name: str, infer_func: Union[Callable, Sequence[Callable]], inputs: Sequence[Tensor], outputs: Sequence[Tensor], model_version: int = 1, config: Optional[ModelConfig] = None, strict: bool = False, trace_config: Optional[List[str]] = None) -> None

Create a model with given name and inference callable binding into Triton Inference Server.

More information about model configuration: https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

Parameters:

infer_func (Union[Callable, Sequence[Callable]]) –

Inference callable to handle request/response from Triton Inference Server
inputs (Sequence[Tensor]) –

Definition of model inputs
outputs (Sequence[Tensor]) –

Definition of model outputs
model_name (str) –

Name under which model is available in Triton Inference Server. It can only contain
model_version (int, default: 1 ) –

Version of model
config (Optional[ModelConfig], default: None ) –

Model configuration for Triton Inference Server deployment
strict (bool, default: False ) –

Enable strict validation between model config outputs and inference function result
trace_config (Optional[List[str]], default: None ) –

List of trace config parameters

Source code in pytriton/triton.py

def bind(
    self,
    model_name: str,
    infer_func: Union[Callable, Sequence[Callable]],
    inputs: Sequence[Tensor],
    outputs: Sequence[Tensor],
    model_version: int = 1,
    config: Optional[ModelConfig] = None,
    strict: bool = False,
    trace_config: Optional[List[str]] = None,
) -> None:
    """Create a model with given name and inference callable binding into Triton Inference Server.

    More information about model configuration:
    https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

    Args:
        infer_func: Inference callable to handle request/response from Triton Inference Server
        (or list of inference callable for multi instance model)
        inputs: Definition of model inputs
        outputs: Definition of model outputs
        model_name: Name under which model is available in Triton Inference Server. It can only contain
        alphanumeric characters, dots, underscores and dashes.
        model_version: Version of model
        config: Model configuration for Triton Inference Server deployment
        strict: Enable strict validation between model config outputs and inference function result
        trace_config: List of trace config parameters
    """
    self._validate_model_name(model_name)
    model_kwargs = {}
    if trace_config is None:
        triton_config = getattr(self, "_config", None)
        if triton_config is not None:
            trace_config = getattr(triton_config, "trace_config", None)
            if trace_config is not None:
                LOGGER.info("Using trace config from TritonConfig: %s", trace_config)
                model_kwargs["trace_config"] = trace_config
    else:
        model_kwargs["trace_config"] = trace_config
    telemetry_tracer = get_telemetry_tracer()

    # Automatically set telemetry tracer if not set at the proxy side
    if telemetry_tracer is None and trace_config is not None:
        LOGGER.info("Setting telemetry tracer from TritonConfig")
        telemetry_tracer = build_proxy_tracer_from_triton_config(trace_config)
        set_telemetry_tracer(telemetry_tracer)

    model = Model(
        model_name=model_name,
        model_version=model_version,
        inference_fn=infer_func,
        inputs=inputs,
        outputs=outputs,
        config=config if config else ModelConfig(),
        workspace=self._workspace,
        triton_context=self._triton_context,
        strict=strict,
        **model_kwargs,
    )
    model.on_model_event(self._on_model_event)

    self._model_manager.add_model(model, self.is_connected())

connect

connect() -> None

Connect to Triton Inference Server.

Raises:

TimeoutError –

if Triton Inference Server is not ready after timeout

Source code in pytriton/triton.py

def connect(self) -> None:
    """Connect to Triton Inference Server.

    Raises:
        TimeoutError: if Triton Inference Server is not ready after timeout
    """
    with self._cv:
        if self._connected:
            LOGGER.debug("Triton Inference already connected.")
            return

        self._wait_for_server()
        if self._triton_lifecycle_policy.local_model_store:
            self._model_manager.setup_models()
        else:
            self._model_manager.load_models()

        self._wait_for_models()
        self._connected = True

get_access_token

get_access_token() -> str

Get the current access token for endpoint restrictions.

Returns:

str –

The access token string (explicit or generated)

Source code in pytriton/triton.py

def get_access_token(self) -> str:
    """Get the current access token for endpoint restrictions.

    Returns:
        The access token string (explicit or generated)
    """
    return self._security_config.access_token

is_alive

is_alive() -> bool

Check if Triton Inference Server is alive.

Source code in pytriton/triton.py

def is_alive(self) -> bool:
    """Check if Triton Inference Server is alive."""
    if not self._is_alive_impl():
        return False

    for model in self._model_manager.models:
        if not model.is_alive():
            return False
    return True

is_connected

is_connected() -> bool

Check if Triton Inference Server is connected.

Source code in pytriton/triton.py

def is_connected(self) -> bool:
    """Check if Triton Inference Server is connected."""
    with self._cv:
        return self._connected

run

run() -> None

Run Triton Inference Server.

Source code in pytriton/triton.py

def run(self) -> None:
    """Run Triton Inference Server."""
    self._run_server()
    self.connect()

serve

serve(monitoring_period_s: float = MONITORING_PERIOD_S) -> None

Run Triton Inference Server and lock thread for serving requests/response.

Parameters:

monitoring_period_s (float, default: MONITORING_PERIOD_S ) –

the timeout of monitoring if Triton and models are available. Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend are still alive and sleep again. If triton or proxy is not alive - method returns.

Source code in pytriton/triton.py

def serve(self, monitoring_period_s: float = MONITORING_PERIOD_S) -> None:
    """Run Triton Inference Server and lock thread for serving requests/response.

    Args:
        monitoring_period_s: the timeout of monitoring if Triton and models are available.
            Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend
            are still alive and sleep again. If triton or proxy is not alive - method returns.
    """
    self._run_server()
    super().serve(monitoring_period_s=monitoring_period_s)

stop

stop() -> bool

Stop Triton Inference Server and clean workspace.

Source code in pytriton/triton.py

def stop(self) -> bool:
    """Stop Triton Inference Server and clean workspace."""
    with self._cv:
        if self._stopped:
            LOGGER.debug("Triton Inference already stopped.")
            return False
        self._stopped = True
        self._connected = False
        atexit.unregister(self.stop)
    self._pre_stop_impl()
    self._model_manager.clean()
    self._workspace.clean()

    with self._cv:
        self._cv.notify_all()
    LOGGER.debug("Stopped Triton Inference server and proxy backends")
    self._log_level_checker.check(skip_update=True)

    return True

pytriton.triton.RemoteTriton

RemoteTriton(url: str, workspace: Union[Workspace, str, Path, None] = None, access_token: Optional[str] = None)

Bases: TritonBase

RemoteTriton connects to Triton Inference Server running on remote host.

Initialize RemoteTriton.

Parameters:

url (str) –

Triton Inference Server URL in form of ://: If scheme is not provided, http is used as default. If port is not provided, 8000 is used as default for http and 8001 for grpc.
workspace (Union[Workspace, str, Path, None], default: None ) –

path to be created where the files used by pytriton will be stored (e.g. socket files for communication). If workspace is None temporary workspace will be created. Workspace should be created in shared filesystem space between RemoteTriton and Triton Inference Server to allow access to socket files (if you use containers, folder must be shared between containers).
access_token (Optional[str], default: None ) –

Access token for the Triton Inference Server

Source code in pytriton/triton.py

def __init__(
    self, url: str, workspace: Union[Workspace, str, pathlib.Path, None] = None, access_token: Optional[str] = None
):
    """Initialize RemoteTriton.

    Args:
        url: Triton Inference Server URL in form of <scheme>://<host>:<port>
            If scheme is not provided, http is used as default.
            If port is not provided, 8000 is used as default for http and 8001 for grpc.
        workspace: path to be created where the files used by pytriton will be stored
            (e.g. socket files for communication).
            If workspace is `None` temporary workspace will be created.
            Workspace should be created in shared filesystem space between RemoteTriton
            and Triton Inference Server to allow access to socket files
            (if you use containers, folder must be shared between containers).
        access_token: Access token for the Triton Inference Server
    """
    super().__init__(
        url=TritonUrl.from_url(url).with_scheme,
        workspace=workspace,
        triton_lifecycle_policy=TritonLifecyclePolicy(launch_triton_on_startup=True, local_model_store=False),
        access_token=access_token,
    )

    with self._cv:
        self._stopped = False

enter

__enter__() -> RemoteTriton

Entering the context connects to remote Triton server.

Returns:

RemoteTriton –

A RemoteTriton object

Source code in pytriton/triton.py

def __enter__(self) -> "RemoteTriton":
    """Entering the context connects to remote Triton server.

    Returns:
        A RemoteTriton object
    """
    super().__enter__()
    return self

exit

__exit__(*_) -> None

Exit the context stopping the process and cleaning the workspace.

Parameters:

*_ –

unused arguments

Source code in pytriton/triton.py

def __exit__(self, *_) -> None:
    """Exit the context stopping the process and cleaning the workspace.

    Args:
        *_: unused arguments
    """
    self.stop()

_validate_model_name `classmethod`

_validate_model_name(model_name: str) -> None

Validate model name.

Parameters:

model_name (str) –

Model name

Source code in pytriton/triton.py

@classmethod
def _validate_model_name(cls, model_name: str) -> None:
    """Validate model name.

    Args:
        model_name: Model name
    """
    if not model_name:
        raise PyTritonValidationError("Model name cannot be empty")

    if not re.match(r"^[a-zA-Z0-9._-]+$", model_name):
        raise PyTritonValidationError(
            "Model name can only contain alphanumeric characters, dots, underscores and dashes"
        )

_wait_for_models

_wait_for_models() -> None

Log loaded models in console to show the available endpoints.

Source code in pytriton/triton.py

def _wait_for_models(self) -> None:
    """Log loaded models in console to show the available endpoints."""
    self._log_level_checker.check()

    try:
        for model in self._model_manager.models:
            with ModelClient(
                url=self._url,
                model_name=model.model_name,
                model_version=str(model.model_version),
                access_token=self._access_token,
            ) as client:
                # This waits for only tritonserver and lightweight proxy backend to be ready
                # timeout should be short as model is loaded before execution of Triton.start() method
                client.wait_for_model(timeout_s=WAIT_FORM_MODEL_TIMEOUT_S)
    except TimeoutError:
        LOGGER.warning(
            "Could not verify locally if models are ready using %s. Please, check the server logs for details.",
            self._url,
        )

    for model in self._model_manager.models:
        LOGGER.info("Infer function available as model: `%s`", MODEL_URL.format(model_name=model.model_name))
        LOGGER.info("  Status:         `GET  %s`", MODEL_READY_URL.format(model_name=model.model_name))
        LOGGER.info("  Model config:   `GET  %s`", MODEL_CONFIG_URL.format(model_name=model.model_name))
        LOGGER.info("  Inference:      `POST %s`", MODEL_INFER_URL.format(model_name=model.model_name))

    LOGGER.info(
        "Read more about configuring and serving models in "
        "documentation: https://triton-inference-server.github.io/pytriton."
    )
    LOGGER.info("(Press CTRL+C or use the command `kill -SIGINT %d` to send a SIGINT signal and quit)", os.getpid())

_wait_for_server

_wait_for_server() -> None

Wait for Triton Inference Server to be ready.

Source code in pytriton/triton.py

def _wait_for_server(self) -> None:
    """Wait for Triton Inference Server to be ready."""
    self._log_level_checker.check()
    try:
        with contextlib.closing(create_client_from_url(self._url)) as client:
            wait_for_server_ready(client, timeout_s=DEFAULT_TRITON_STARTUP_TIMEOUT_S, headers=self._headers)
    except TimeoutError as e:
        LOGGER.warning(
            "Could not verify locally if Triton Inference Server is ready using %s. "
            "Please, check the server logs for details.",
            self._url,
        )
        raise TimeoutError("Triton Inference Server is not ready after timeout.") from e

bind

bind(model_name: str, infer_func: Union[Callable, Sequence[Callable]], inputs: Sequence[Tensor], outputs: Sequence[Tensor], model_version: int = 1, config: Optional[ModelConfig] = None, strict: bool = False, trace_config: Optional[List[str]] = None) -> None

Create a model with given name and inference callable binding into Triton Inference Server.

More information about model configuration: https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

Parameters:

infer_func (Union[Callable, Sequence[Callable]]) –

Inference callable to handle request/response from Triton Inference Server
inputs (Sequence[Tensor]) –

Definition of model inputs
outputs (Sequence[Tensor]) –

Definition of model outputs
model_name (str) –

Name under which model is available in Triton Inference Server. It can only contain
model_version (int, default: 1 ) –

Version of model
config (Optional[ModelConfig], default: None ) –

Model configuration for Triton Inference Server deployment
strict (bool, default: False ) –

Enable strict validation between model config outputs and inference function result
trace_config (Optional[List[str]], default: None ) –

List of trace config parameters

Source code in pytriton/triton.py

def bind(
    self,
    model_name: str,
    infer_func: Union[Callable, Sequence[Callable]],
    inputs: Sequence[Tensor],
    outputs: Sequence[Tensor],
    model_version: int = 1,
    config: Optional[ModelConfig] = None,
    strict: bool = False,
    trace_config: Optional[List[str]] = None,
) -> None:
    """Create a model with given name and inference callable binding into Triton Inference Server.

    More information about model configuration:
    https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

    Args:
        infer_func: Inference callable to handle request/response from Triton Inference Server
        (or list of inference callable for multi instance model)
        inputs: Definition of model inputs
        outputs: Definition of model outputs
        model_name: Name under which model is available in Triton Inference Server. It can only contain
        alphanumeric characters, dots, underscores and dashes.
        model_version: Version of model
        config: Model configuration for Triton Inference Server deployment
        strict: Enable strict validation between model config outputs and inference function result
        trace_config: List of trace config parameters
    """
    self._validate_model_name(model_name)
    model_kwargs = {}
    if trace_config is None:
        triton_config = getattr(self, "_config", None)
        if triton_config is not None:
            trace_config = getattr(triton_config, "trace_config", None)
            if trace_config is not None:
                LOGGER.info("Using trace config from TritonConfig: %s", trace_config)
                model_kwargs["trace_config"] = trace_config
    else:
        model_kwargs["trace_config"] = trace_config
    telemetry_tracer = get_telemetry_tracer()

    # Automatically set telemetry tracer if not set at the proxy side
    if telemetry_tracer is None and trace_config is not None:
        LOGGER.info("Setting telemetry tracer from TritonConfig")
        telemetry_tracer = build_proxy_tracer_from_triton_config(trace_config)
        set_telemetry_tracer(telemetry_tracer)

    model = Model(
        model_name=model_name,
        model_version=model_version,
        inference_fn=infer_func,
        inputs=inputs,
        outputs=outputs,
        config=config if config else ModelConfig(),
        workspace=self._workspace,
        triton_context=self._triton_context,
        strict=strict,
        **model_kwargs,
    )
    model.on_model_event(self._on_model_event)

    self._model_manager.add_model(model, self.is_connected())

connect

connect() -> None

Connect to Triton Inference Server.

Raises:

TimeoutError –

if Triton Inference Server is not ready after timeout

Source code in pytriton/triton.py

def connect(self) -> None:
    """Connect to Triton Inference Server.

    Raises:
        TimeoutError: if Triton Inference Server is not ready after timeout
    """
    with self._cv:
        if self._connected:
            LOGGER.debug("Triton Inference already connected.")
            return

        self._wait_for_server()
        if self._triton_lifecycle_policy.local_model_store:
            self._model_manager.setup_models()
        else:
            self._model_manager.load_models()

        self._wait_for_models()
        self._connected = True

is_alive

is_alive() -> bool

Check if Triton Inference Server is alive.

Source code in pytriton/triton.py

def is_alive(self) -> bool:
    """Check if Triton Inference Server is alive."""
    if not self._is_alive_impl():
        return False

    for model in self._model_manager.models:
        if not model.is_alive():
            return False
    return True

is_connected

is_connected() -> bool

Check if Triton Inference Server is connected.

Source code in pytriton/triton.py

def is_connected(self) -> bool:
    """Check if Triton Inference Server is connected."""
    with self._cv:
        return self._connected

serve

serve(monitoring_period_s: float = MONITORING_PERIOD_S) -> None

Run Triton Inference Server and lock thread for serving requests/response.

Parameters:

monitoring_period_s (float, default: MONITORING_PERIOD_S ) –

the timeout of monitoring if Triton and models are available. Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend are still alive and sleep again. If triton or proxy is not alive - method returns.

Source code in pytriton/triton.py

def serve(self, monitoring_period_s: float = MONITORING_PERIOD_S) -> None:
    """Run Triton Inference Server and lock thread for serving requests/response.

    Args:
        monitoring_period_s: the timeout of monitoring if Triton and models are available.
            Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend
            are still alive and sleep again. If triton or proxy is not alive - method returns.
    """
    self.connect()
    with self._cv:
        try:
            while self.is_alive():
                self._cv.wait(timeout=monitoring_period_s)
        except KeyboardInterrupt:
            LOGGER.info("SIGINT received, exiting.")
        self.stop()

stop

stop() -> bool

Stop Triton Inference Server and clean workspace.

Source code in pytriton/triton.py

def stop(self) -> bool:
    """Stop Triton Inference Server and clean workspace."""
    with self._cv:
        if self._stopped:
            LOGGER.debug("Triton Inference already stopped.")
            return False
        self._stopped = True
        self._connected = False
        atexit.unregister(self.stop)
    self._pre_stop_impl()
    self._model_manager.clean()
    self._workspace.clean()

    with self._cv:
        self._cv.notify_all()
    LOGGER.debug("Stopped Triton Inference server and proxy backends")
    self._log_level_checker.check(skip_update=True)

    return True

pytriton.proxy.types.Request `dataclass`

Request(data: Dict[str, ndarray], parameters: Optional[Dict[str, Union[str, int, bool]]] = None, span: Optional[Any] = None, requested_output_names: Optional[List[str]] = None)

Data class for request data including numpy array inputs.

data `instance-attribute`

data: Dict[str, ndarray]

Input data for the request.

parameters `class-attribute` `instance-attribute`

parameters: Optional[Dict[str, Union[str, int, bool]]] = None

Parameters for the request.

requested_output_names `class-attribute` `instance-attribute`

requested_output_names: Optional[List[str]] = None

Requested output names for the request.

span `class-attribute` `instance-attribute`

span: Optional[Any] = None

Telemetry span for request

delitem

__delitem__(input_name: str)

Delete input data from request.

Source code in pytriton/proxy/types.py

def __delitem__(self, input_name: str):
    """Delete input data from request."""
    del self.data[input_name]

getitem

__getitem__(input_name: str) -> ndarray

Get input data.

Source code in pytriton/proxy/types.py

def __getitem__(self, input_name: str) -> np.ndarray:
    """Get input data."""
    return self.data[input_name]

iter

__iter__()

Iterate over input names.

Source code in pytriton/proxy/types.py

def __iter__(self):
    """Iterate over input names."""
    return iter(self.data)

len

__len__()

Get number of inputs.

Source code in pytriton/proxy/types.py

def __len__(self):
    """Get number of inputs."""
    return len(self.data)

setitem

__setitem__(input_name: str, input_data: ndarray)

Set input data.

Source code in pytriton/proxy/types.py

def __setitem__(self, input_name: str, input_data: np.ndarray):
    """Set input data."""
    self.data[input_name] = input_data

items

items()

Iterate over input names and data.

Source code in pytriton/proxy/types.py

def items(self):
    """Iterate over input names and data."""
    return self.data.items()

keys

keys()

Iterate over input names.

Source code in pytriton/proxy/types.py

def keys(self):
    """Iterate over input names."""
    return self.data.keys()

traced_span

traced_span(span_name)

Yields Open Telemetry a span for the request.

Parameters:

span_name (str) –

Name of the span

Source code in pytriton/proxy/types.py

def traced_span(self, span_name):
    """Yields Open Telemetry a span for the request.

    Args:
        span_name (str): Name of the span
    """
    return traced_span(self, span_name)

values

values()

Iterate over input data.

Source code in pytriton/proxy/types.py

def values(self):
    """Iterate over input data."""
    return self.data.values()

Triton

pytriton.triton.Triton

__enter__

__exit__

_initialize_server

_is_alive_impl

_on_tritonserver_exit

_prepare_triton_inference_server

_run_server

_shm_prefix

_validate_model_name classmethod

_wait_for_models

_wait_for_server

bind

connect

get_access_token

is_alive

is_connected

run

serve

stop

pytriton.triton.RemoteTriton

__enter__

__exit__

_validate_model_name classmethod

_wait_for_models

_wait_for_server

bind

connect

is_alive

is_connected

serve

stop

pytriton.proxy.types.Request dataclass

data instance-attribute

parameters class-attribute instance-attribute

requested_output_names class-attribute instance-attribute

span class-attribute instance-attribute

__delitem__

__getitem__

__iter__

__len__

__setitem__

items

keys

traced_span

values

enter

exit

_validate_model_name `classmethod`

enter

exit

_validate_model_name `classmethod`

pytriton.proxy.types.Request `dataclass`

data `instance-attribute`

parameters `class-attribute` `instance-attribute`

requested_output_names `class-attribute` `instance-attribute`

span `class-attribute` `instance-attribute`

delitem

getitem

iter

len

setitem