Triton

pytriton.triton.Triton

Triton(*, config: Optional[TritonConfig] = None, workspace: Union[Workspace, str, Path, None] = None, triton_lifecycle_policy: Optional[TritonLifecyclePolicy] = None)

Bases: TritonBase

Triton Inference Server for Python models.

Initialize Triton Inference Server context for starting server and loading models.

Parameters:

  • config (Optional[TritonConfig], default: None ) –

    TritonConfig object with optional customizations for Triton Inference Server. Configuration can also be passed through environment variables; see the TritonConfig.from_env() class method for details.

    Order of precedence:

    • config defined through the config parameter of the init method
    • config defined in environment variables
    • default TritonConfig values
  • workspace (Union[Workspace, str, Path, None], default: None ) –

    Workspace or path where the Triton Model Store and the files used by pytriton will be created. If workspace is None, a random workspace will be created. The workspace will be deleted in Triton.stop().

  • triton_lifecycle_policy (Optional[TritonLifecyclePolicy], default: None ) –

    Policy indicating when the Triton server is launched and where the model store is located (local, or managed remotely by the Triton server). If triton_lifecycle_policy is None, DefaultTritonLifecyclePolicy is used (the Triton server is launched on startup and the model store is not local); only if triton_lifecycle_policy is None and config.allow_vertex_ai is True is VertextAILifecyclePolicy used instead.

Source code in pytriton/triton.py
def __init__(
    self,
    *,
    config: Optional[TritonConfig] = None,
    workspace: Union[Workspace, str, pathlib.Path, None] = None,
    triton_lifecycle_policy: Optional[TritonLifecyclePolicy] = None,
):
    """Initialize Triton Inference Server context for starting server and loading models.

    Args:
        config: TritonConfig object with optional customizations for Triton Inference Server.
            Configuration can be passed also through environment variables.
            See [TritonConfig.from_env()][pytriton.triton.TritonConfig.from_env] class method for details.

            Order of precedence:

              - config defined through `config` parameter of init method.
              - config defined in environment variables
              - default TritonConfig values
        workspace: workspace or path where the Triton Model Store and files used by pytriton will be created.
            If workspace is `None` random workspace will be created.
            Workspace will be deleted in [Triton.stop()][pytriton.triton.Triton.stop].
        triton_lifecycle_policy:  policy indicating when Triton server is launched and where the model store is located
            (locally or remotely managed by Triton server). If triton_lifecycle_policy is None,
            DefaultTritonLifecyclePolicy is used by default (Triton server is launched on startup and model store is not local).
            Only if triton_lifecycle_policy is None and config.allow_vertex_ai is True, VertextAILifecyclePolicy is used instead.
    """
    _triton_lifecycle_policy = (
        VertextAILifecyclePolicy
        if triton_lifecycle_policy is None and config is not None and config.allow_vertex_ai
        else triton_lifecycle_policy
    ) or DefaultTritonLifecyclePolicy

    def _without_none_values(_d):
        return {name: value for name, value in _d.items() if value is not None}

    default_config_dict = _without_none_values(TritonConfig().to_dict())
    env_config_dict = _without_none_values(TritonConfig.from_env().to_dict())
    explicit_config_dict = _without_none_values(config.to_dict() if config else {})
    config_dict = {**default_config_dict, **env_config_dict, **explicit_config_dict}
    self._config = TritonConfig(**config_dict)
    workspace_instance = workspace if isinstance(workspace, Workspace) else Workspace(workspace)
    self._prepare_triton_config(workspace_instance)
    endpoint_protocol = "http" if self._config.allow_http in [True, None] else "grpc"
    super().__init__(
        url=endpoint_utils.get_endpoint(self._triton_server_config, endpoint_protocol),
        workspace=workspace_instance,
        triton_lifecycle_policy=_triton_lifecycle_policy,
    )
    self._triton_server = None
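
For example, a minimal sketch of creating the server context with an explicit configuration and a fixed workspace; the port numbers, log_verbose value and workspace path below are illustrative assumptions, with TritonConfig field names as used in PyTriton's examples:

import pathlib

from pytriton.triton import Triton, TritonConfig

# Values passed explicitly here take precedence over environment variables,
# which in turn take precedence over the TritonConfig defaults.
config = TritonConfig(http_port=8000, grpc_port=8001, log_verbose=0)

with Triton(config=config, workspace=pathlib.Path("/tmp/pytriton_workspace")) as triton:
    ...  # bind models here, then run() or serve(); the workspace is cleaned in stop()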

__enter__

__enter__() -> Triton

Entering the context launches the triton server.

Returns:

  • Triton – A Triton object

Source code in pytriton/triton.py
def __enter__(self) -> "Triton":
    """Entering the context launches the triton server.

    Returns:
        A Triton object
    """
    if self._triton_lifecycle_policy.launch_triton_on_startup:
        self._run_server()
    super().__enter__()
    return self

__exit__

__exit__(*_) -> None

Exit the context, stopping the process and cleaning the workspace.

Parameters:

  • *_

    unused arguments

Source code in pytriton/triton.py
def __exit__(self, *_) -> None:
    """Exit the context stopping the process and cleaning the workspace.

    Args:
        *_: unused arguments
    """
    self.stop()

bind

bind(model_name: str, infer_func: Union[Callable, Sequence[Callable]], inputs: Sequence[Tensor], outputs: Sequence[Tensor], model_version: int = 1, config: Optional[ModelConfig] = None, strict: bool = False) -> None

Create a model with the given name and bind the inference callable into Triton Inference Server.

More information about model configuration: https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

Parameters:

  • infer_func (Union[Callable, Sequence[Callable]]) –

    Inference callable (or a list of inference callables for a multi-instance model) to handle requests/responses from Triton Inference Server

  • inputs (Sequence[Tensor]) –

    Definition of model inputs

  • outputs (Sequence[Tensor]) –

    Definition of model outputs

  • model_name (str) –

    Name under which the model is available in Triton Inference Server. It can only contain alphanumeric characters, dots, underscores and dashes.

  • model_version (int, default: 1 ) –

    Version of model

  • config (Optional[ModelConfig], default: None ) –

    Model configuration for Triton Inference Server deployment

  • strict (bool, default: False ) –

    Enable strict validation between model config outputs and inference function result

Source code in pytriton/triton.py
def bind(
    self,
    model_name: str,
    infer_func: Union[Callable, Sequence[Callable]],
    inputs: Sequence[Tensor],
    outputs: Sequence[Tensor],
    model_version: int = 1,
    config: Optional[ModelConfig] = None,
    strict: bool = False,
) -> None:
    """Create a model with given name and inference callable binding into Triton Inference Server.

    More information about model configuration:
    https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

    Args:
        infer_func: Inference callable to handle request/response from Triton Inference Server
        (or list of inference callable for multi instance model)
        inputs: Definition of model inputs
        outputs: Definition of model outputs
        model_name: Name under which model is available in Triton Inference Server. It can only contain
        alphanumeric characters, dots, underscores and dashes.
        model_version: Version of model
        config: Model configuration for Triton Inference Server deployment
        strict: Enable strict validation between model config outputs and inference function result
    """
    self._validate_model_name(model_name)
    model = Model(
        model_name=model_name,
        model_version=model_version,
        inference_fn=infer_func,
        inputs=inputs,
        outputs=outputs,
        config=config if config else ModelConfig(),
        workspace=self._workspace,
        triton_context=self._triton_context,
        strict=strict,
    )
    model.on_model_event(self._on_model_event)

    self._model_manager.add_model(model, self.is_connected())
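
A sketch of a typical bind() call, modeled on PyTriton's add/sub example; it assumes triton is a Triton instance created as shown above, and the model name, tensor names and max_batch_size are illustrative:

import numpy as np

from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor

@batch
def _add_sub(**inputs):
    # @batch delivers already-batched numpy arrays keyed by input name.
    a, b = inputs["A"], inputs["B"]
    return {"ADD": a + b, "SUB": a - b}

triton.bind(
    model_name="AddSub",
    infer_func=_add_sub,
    inputs=[
        Tensor(name="A", dtype=np.float32, shape=(-1,)),
        Tensor(name="B", dtype=np.float32, shape=(-1,)),
    ],
    outputs=[
        Tensor(name="ADD", dtype=np.float32, shape=(-1,)),
        Tensor(name="SUB", dtype=np.float32, shape=(-1,)),
    ],
    config=ModelConfig(max_batch_size=16),
    strict=True,
)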

connect

connect() -> None

Connect to Triton Inference Server.

Raises:

  • TimeoutError

    if Triton Inference Server is not ready after timeout

Source code in pytriton/triton.py
def connect(self) -> None:
    """Connect to Triton Inference Server.

    Raises:
        TimeoutError: if Triton Inference Server is not ready after timeout
    """
    with self._cv:
        if self._connected:
            LOGGER.debug("Triton Inference already connected.")
            return

        self._wait_for_server()
        if self._triton_lifecycle_policy.local_model_store:
            self._model_manager.setup_models()
        else:
            self._model_manager.load_models()

        self._wait_for_models()
        self._connected = True

is_alive

is_alive() -> bool

Check if Triton Inference Server is alive.

Source code in pytriton/triton.py
def is_alive(self) -> bool:
    """Check if Triton Inference Server is alive."""
    if not self._is_alive_impl():
        return False

    for model in self._model_manager.models:
        if not model.is_alive():
            return False
    return True

is_connected

is_connected() -> bool

Check if Triton Inference Server is connected.

Source code in pytriton/triton.py
def is_connected(self) -> bool:
    """Check if Triton Inference Server is connected."""
    with self._cv:
        return self._connected

run

run() -> None

Run Triton Inference Server.

Source code in pytriton/triton.py
def run(self) -> None:
    """Run Triton Inference Server."""
    self._run_server()
    self.connect()
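
run() starts the server and connects without blocking the calling thread, so the same process can also act as a client. A sketch continuing the AddSub example above; the URL assumes the default HTTP port 8000:

import numpy as np

from pytriton.client import ModelClient

triton.run()  # launch the Triton server and connect; returns immediately
try:
    with ModelClient("localhost:8000", "AddSub") as client:
        result = client.infer_batch(
            A=np.ones((2, 4), dtype=np.float32),
            B=np.ones((2, 4), dtype=np.float32),
        )
finally:
    triton.stop()  # stop the server and clean the workspace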

serve

serve(monitoring_period_s: float = MONITORING_PERIOD_S) -> None

Run Triton Inference Server and block the thread to serve requests/responses.

Parameters:

  • monitoring_period_s (float, default: MONITORING_PERIOD_S ) –

    Interval (in seconds) at which the main thread wakes up to check whether the Triton server and the proxy backends are still alive, then sleeps again. If the server or a proxy is not alive, the method returns.

Source code in pytriton/triton.py
def serve(self, monitoring_period_s: float = MONITORING_PERIOD_S) -> None:
    """Run Triton Inference Server and lock thread for serving requests/response.

    Args:
        monitoring_period_s: the timeout of monitoring if Triton and models are available.
            Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend
            are still alive and sleep again. If triton or proxy is not alive - method returns.
    """
    self._run_server()
    super().serve(monitoring_period_s=monitoring_period_s)
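
Unlike run(), serve() blocks the calling thread and only returns when the server or a proxy backend stops (or on Ctrl+C), which suits standalone serving scripts. A minimal sketch:

with Triton() as triton:
    # bind one or more models here, as shown in the bind() example above
    triton.serve()  # blocks; wakes every monitoring_period_s seconds to check liveness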

stop

stop() -> bool

Stop Triton Inference Server and clean workspace.

Source code in pytriton/triton.py
def stop(self) -> bool:
    """Stop Triton Inference Server and clean workspace."""
    with self._cv:
        if self._stopped:
            LOGGER.debug("Triton Inference already stopped.")
            return False
        self._stopped = True
        self._connected = False
        atexit.unregister(self.stop)
    self._pre_stop_impl()
    self._model_manager.clean()
    self._workspace.clean()

    with self._cv:
        self._cv.notify_all()
    LOGGER.debug("Stopped Triton Inference server and proxy backends")
    self._log_level_checker.check(skip_update=True)

    return True

pytriton.triton.RemoteTriton

RemoteTriton(url: str, workspace: Union[Workspace, str, Path, None] = None)

Bases: TritonBase

RemoteTriton connects to a Triton Inference Server running on a remote host.

Initialize RemoteTriton.

Parameters:

  • url (str) –

    Triton Inference Server URL in the form <scheme>://<host>:<port>. If the scheme is not provided, http is used as the default. If the port is not provided, 8000 is used as the default for http and 8001 for grpc.

  • workspace (Union[Workspace, str, Path, None], default: None ) –

    Path to be created where the files used by pytriton will be stored (e.g. socket files for communication). If workspace is None, a temporary workspace will be created. The workspace should be created on a filesystem shared between RemoteTriton and the Triton Inference Server so that both can access the socket files (if you use containers, the folder must be shared between the containers).

Source code in pytriton/triton.py
def __init__(self, url: str, workspace: Union[Workspace, str, pathlib.Path, None] = None):
    """Initialize RemoteTriton.

    Args:
        url: Triton Inference Server URL in form of <scheme>://<host>:<port>
            If scheme is not provided, http is used as default.
            If port is not provided, 8000 is used as default for http and 8001 for grpc.
        workspace: path to be created where the files used by pytriton will be stored
            (e.g. socket files for communication).
            If workspace is `None` temporary workspace will be created.
            Workspace should be created in shared filesystem space between RemoteTriton
            and Triton Inference Server to allow access to socket files
            (if you use containers, folder must be shared between containers).

    """
    super().__init__(
        url=TritonUrl.from_url(url).with_scheme,
        workspace=workspace,
        triton_lifecycle_policy=TritonLifecyclePolicy(launch_triton_on_startup=True, local_model_store=False),
    )

    with self._cv:
        self._stopped = False
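
A sketch of attaching to an already running Triton server; the URL and the shared workspace path are assumptions for illustration:

from pytriton.triton import RemoteTriton

# The server at triton-host must already be running, and /shared/pytriton_workspace
# must be visible to both this process and the server (e.g. a volume mounted into
# both containers) so the proxy socket files can be shared.
with RemoteTriton(url="grpc://triton-host:8001", workspace="/shared/pytriton_workspace") as triton:
    # bind() works the same way as for Triton; the model is loaded into the remote server
    triton.serve()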

__enter__

__enter__() -> RemoteTriton

Entering the context connects to remote Triton server.

Returns:

  • RemoteTriton – A RemoteTriton object

Source code in pytriton/triton.py
def __enter__(self) -> "RemoteTriton":
    """Entering the context connects to remote Triton server.

    Returns:
        A RemoteTriton object
    """
    super().__enter__()
    return self

__exit__

__exit__(*_) -> None

Exit the context, stopping the process and cleaning the workspace.

Parameters:

  • *_

    unused arguments

Source code in pytriton/triton.py
def __exit__(self, *_) -> None:
    """Exit the context stopping the process and cleaning the workspace.

    Args:
        *_: unused arguments
    """
    self.stop()

bind

bind(model_name: str, infer_func: Union[Callable, Sequence[Callable]], inputs: Sequence[Tensor], outputs: Sequence[Tensor], model_version: int = 1, config: Optional[ModelConfig] = None, strict: bool = False) -> None

Create a model with the given name and bind the inference callable into Triton Inference Server.

More information about model configuration: https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

Parameters:

  • infer_func (Union[Callable, Sequence[Callable]]) –

    Inference callable (or a list of inference callables for a multi-instance model) to handle requests/responses from Triton Inference Server

  • inputs (Sequence[Tensor]) –

    Definition of model inputs

  • outputs (Sequence[Tensor]) –

    Definition of model outputs

  • model_name (str) –

    Name under which the model is available in Triton Inference Server. It can only contain alphanumeric characters, dots, underscores and dashes.

  • model_version (int, default: 1 ) –

    Version of model

  • config (Optional[ModelConfig], default: None ) –

    Model configuration for Triton Inference Server deployment

  • strict (bool, default: False ) –

    Enable strict validation between model config outputs and inference function result

Source code in pytriton/triton.py
def bind(
    self,
    model_name: str,
    infer_func: Union[Callable, Sequence[Callable]],
    inputs: Sequence[Tensor],
    outputs: Sequence[Tensor],
    model_version: int = 1,
    config: Optional[ModelConfig] = None,
    strict: bool = False,
) -> None:
    """Create a model with given name and inference callable binding into Triton Inference Server.

    More information about model configuration:
    https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

    Args:
        infer_func: Inference callable to handle request/response from Triton Inference Server
        (or list of inference callable for multi instance model)
        inputs: Definition of model inputs
        outputs: Definition of model outputs
        model_name: Name under which model is available in Triton Inference Server. It can only contain
        alphanumeric characters, dots, underscores and dashes.
        model_version: Version of model
        config: Model configuration for Triton Inference Server deployment
        strict: Enable strict validation between model config outputs and inference function result
    """
    self._validate_model_name(model_name)
    model = Model(
        model_name=model_name,
        model_version=model_version,
        inference_fn=infer_func,
        inputs=inputs,
        outputs=outputs,
        config=config if config else ModelConfig(),
        workspace=self._workspace,
        triton_context=self._triton_context,
        strict=strict,
    )
    model.on_model_event(self._on_model_event)

    self._model_manager.add_model(model, self.is_connected())

connect

connect() -> None

Connect to Triton Inference Server.

Raises:

  • TimeoutError

    if Triton Inference Server is not ready after timeout

Source code in pytriton/triton.py
def connect(self) -> None:
    """Connect to Triton Inference Server.

    Raises:
        TimeoutError: if Triton Inference Server is not ready after timeout
    """
    with self._cv:
        if self._connected:
            LOGGER.debug("Triton Inference already connected.")
            return

        self._wait_for_server()
        if self._triton_lifecycle_policy.local_model_store:
            self._model_manager.setup_models()
        else:
            self._model_manager.load_models()

        self._wait_for_models()
        self._connected = True

is_alive

is_alive() -> bool

Check if Triton Inference Server is alive.

Source code in pytriton/triton.py
def is_alive(self) -> bool:
    """Check if Triton Inference Server is alive."""
    if not self._is_alive_impl():
        return False

    for model in self._model_manager.models:
        if not model.is_alive():
            return False
    return True

is_connected

is_connected() -> bool

Check if Triton Inference Server is connected.

Source code in pytriton/triton.py
def is_connected(self) -> bool:
    """Check if Triton Inference Server is connected."""
    with self._cv:
        return self._connected

serve

serve(monitoring_period_s: float = MONITORING_PERIOD_S) -> None

Run Triton Inference Server and block the thread to serve requests/responses.

Parameters:

  • monitoring_period_s (float, default: MONITORING_PERIOD_S ) –

    Interval (in seconds) at which the main thread wakes up to check whether the Triton server and the proxy backends are still alive, then sleeps again. If the server or a proxy is not alive, the method returns.

Source code in pytriton/triton.py
def serve(self, monitoring_period_s: float = MONITORING_PERIOD_S) -> None:
    """Run Triton Inference Server and lock thread for serving requests/response.

    Args:
        monitoring_period_s: the timeout of monitoring if Triton and models are available.
            Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend
            are still alive and sleep again. If triton or proxy is not alive - method returns.
    """
    self.connect()
    with self._cv:
        try:
            while self.is_alive():
                self._cv.wait(timeout=monitoring_period_s)
        except KeyboardInterrupt:
            LOGGER.info("SIGINT received, exiting.")
        self.stop()

stop

stop() -> bool

Stop Triton Inference Server and clean workspace.

Source code in pytriton/triton.py
def stop(self) -> bool:
    """Stop Triton Inference Server and clean workspace."""
    with self._cv:
        if self._stopped:
            LOGGER.debug("Triton Inference already stopped.")
            return False
        self._stopped = True
        self._connected = False
        atexit.unregister(self.stop)
    self._pre_stop_impl()
    self._model_manager.clean()
    self._workspace.clean()

    with self._cv:
        self._cv.notify_all()
    LOGGER.debug("Stopped Triton Inference server and proxy backends")
    self._log_level_checker.check(skip_update=True)

    return True

pytriton.proxy.types.Request dataclass

Request(data: Dict[str, ndarray], parameters: Optional[Dict[str, Union[str, int, bool]]] = None)

Data class for request data including numpy array inputs.

data instance-attribute

data: Dict[str, ndarray]

Input data for the request.

parameters class-attribute instance-attribute

parameters: Optional[Dict[str, Union[str, int, bool]]] = None

Parameters for the request.

__delitem__

__delitem__(input_name: str)

Delete input data from request.

Source code in pytriton/proxy/types.py
def __delitem__(self, input_name: str):
    """Delete input data from request."""
    del self.data[input_name]

__getitem__

__getitem__(input_name: str) -> ndarray

Get input data.

Source code in pytriton/proxy/types.py
def __getitem__(self, input_name: str) -> np.ndarray:
    """Get input data."""
    return self.data[input_name]

__iter__

__iter__()

Iterate over input names.

Source code in pytriton/proxy/types.py
def __iter__(self):
    """Iterate over input names."""
    return iter(self.data)

__len__

__len__()

Get number of inputs.

Source code in pytriton/proxy/types.py
def __len__(self):
    """Get number of inputs."""
    return len(self.data)

__setitem__

__setitem__(input_name: str, input_data: ndarray)

Set input data.

Source code in pytriton/proxy/types.py
def __setitem__(self, input_name: str, input_data: np.ndarray):
    """Set input data."""
    self.data[input_name] = input_data

items

items()

Iterate over input names and data.

Source code in pytriton/proxy/types.py
def items(self):
    """Iterate over input names and data."""
    return self.data.items()

keys

keys()

Iterate over input names.

Source code in pytriton/proxy/types.py
def keys(self):
    """Iterate over input names."""
    return self.data.keys()

values

values()

Iterate over input data.

Source code in pytriton/proxy/types.py
def values(self):
    """Iterate over input data."""
    return self.data.values()
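
Taken together, these methods let an inference callable treat each Request as a mapping from input name to numpy array. A sketch of an undecorated callable, assuming it receives a list of Request objects and returns a list of response dictionaries; the tensor names and the optional "scale" parameter are illustrative:

import numpy as np

def infer_fn(requests):
    responses = []
    for request in requests:
        value = request["INPUT_1"]                             # dict-style input access
        scale = (request.parameters or {}).get("scale", 1.0)   # optional request parameter
        responses.append({"OUTPUT_1": np.asarray(value, dtype=np.float32) * float(scale)})
    return responses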