Triton

pytriton.triton.Triton

Triton(*, config: Optional[TritonConfig] = None, workspace: Union[Workspace, str, Path, None] = None, triton_lifecycle_policy: Optional[TritonLifecyclePolicy] = None)

Bases: TritonBase

Triton Inference Server for Python models.

Initialize Triton Inference Server context for starting server and loading models.

Parameters:

  • config (Optional[TritonConfig], default: None ) –

    TritonConfig object with optional customizations for Triton Inference Server. Configuration can also be passed through environment variables; see the TritonConfig.from_env() class method for details.

    Order of precedence:

    • config defined through the config parameter of the init method
    • config defined in environment variables
    • default TritonConfig values
  • workspace (Union[Workspace, str, Path, None], default: None ) –

    Workspace or path where the Triton Model Store and the files used by pytriton will be created. If workspace is None, a random workspace will be created. The workspace will be deleted in Triton.stop().

  • triton_lifecycle_policy (Optional[TritonLifecyclePolicy], default: None ) –

    Policy indicating when the Triton server is launched and where the model store is located (local, or managed remotely by the Triton server). If triton_lifecycle_policy is None, DefaultTritonLifecyclePolicy is used (the Triton server is launched on startup and the model store is not local); only if triton_lifecycle_policy is None and config.allow_vertex_ai is True is VertextAILifecyclePolicy used instead.

Source code in pytriton/triton.py
def __init__(
    self,
    *,
    config: Optional[TritonConfig] = None,
    workspace: Union[Workspace, str, pathlib.Path, None] = None,
    triton_lifecycle_policy: Optional[TritonLifecyclePolicy] = None,
):
    """Initialize Triton Inference Server context for starting server and loading models.

    Args:
        config: TritonConfig object with optional customizations for Triton Inference Server.
            Configuration can be passed also through environment variables.
            See [TritonConfig.from_env()][pytriton.triton.TritonConfig.from_env] class method for details.

            Order of precedence:

              - config defined through `config` parameter of init method.
              - config defined in environment variables
              - default TritonConfig values
        workspace: workspace or path where the Triton Model Store and files used by pytriton will be created.
            If workspace is `None` random workspace will be created.
            Workspace will be deleted in [Triton.stop()][pytriton.triton.Triton.stop].
        triton_lifecycle_policy:  policy indicating when Triton server is launched and where the model store is located
            (locally or remotely managed by Triton server). If triton_lifecycle_policy is None,
            DefaultTritonLifecyclePolicy is used by default (Triton server is launched on startup and model store is not local).
            Only if triton_lifecycle_policy is None and config.allow_vertex_ai is True, VertextAILifecyclePolicy is used instead.
    """
    _triton_lifecycle_policy = (
        VertextAILifecyclePolicy
        if triton_lifecycle_policy is None and config is not None and config.allow_vertex_ai
        else triton_lifecycle_policy
    ) or DefaultTritonLifecyclePolicy

    def _without_none_values(_d):
        return {name: value for name, value in _d.items() if value is not None}

    default_config_dict = _without_none_values(TritonConfig().to_dict())
    env_config_dict = _without_none_values(TritonConfig.from_env().to_dict())
    explicit_config_dict = _without_none_values(config.to_dict() if config else {})
    config_dict = {**default_config_dict, **env_config_dict, **explicit_config_dict}
    self._config = TritonConfig(**config_dict)
    workspace_instance = workspace if isinstance(workspace, Workspace) else Workspace(workspace)
    self._prepare_triton_config(workspace_instance)
    endpoint_protocol = "http" if self._config.allow_http in [True, None] else "grpc"
    super().__init__(
        url=endpoint_utils.get_endpoint(self._triton_server_config, endpoint_protocol),
        workspace=workspace_instance,
        triton_lifecycle_policy=_triton_lifecycle_policy,
    )
    self._triton_server = None
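
For example, a minimal sketch of creating the server context with an explicit configuration and a fixed workspace; the port numbers, log_verbose value and workspace path below are illustrative assumptions, with TritonConfig field names as used in PyTriton's examples:

import pathlib

from pytriton.triton import Triton, TritonConfig

# Values passed explicitly here take precedence over environment variables,
# which in turn take precedence over the TritonConfig defaults.
config = TritonConfig(http_port=8000, grpc_port=8001, log_verbose=0)

with Triton(config=config, workspace=pathlib.Path("/tmp/pytriton_workspace")) as triton:
    ...  # bind models here, then run() or serve(); the workspace is cleaned in stop()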

__enter__

__enter__() -> Triton

Entering the context launches the triton server.

Returns:

  • Triton – A Triton object

Source code in pytriton/triton.py
def __enter__(self) -> "Triton":
    """Entering the context launches the triton server.

    Returns:
        A Triton object
    """
    if self._triton_lifecycle_policy.launch_triton_on_startup:
        self._run_server()
    super().__enter__()
    return self

__exit__

__exit__(*_) -> None

Exit the context, stopping the process and cleaning the workspace.

Parameters:

  • *_

    unused arguments

Source code in pytriton/triton.py
def __exit__(self, *_) -> None:
    """Exit the context stopping the process and cleaning the workspace.

    Args:
        *_: unused arguments
    """
    self.stop()

bind

bind(model_name: str, infer_func: Union[Callable, Sequence[Callable]], inputs: Sequence[Tensor], outputs: Sequence[Tensor], model_version: int = 1, config: Optional[ModelConfig] = None, strict: bool = False) -> None

Create a model with the given name and bind the inference callable into Triton Inference Server.

More information about model configuration: https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

Parameters:

  • infer_func (Union[Callable, Sequence[Callable]]) –

    Inference callable (or a list of inference callables for a multi-instance model) to handle requests/responses from Triton Inference Server

  • inputs (Sequence[Tensor]) –

    Definition of model inputs

  • outputs (Sequence[Tensor]) –

    Definition of model outputs

  • model_name (str) –

    Name under which the model is available in Triton Inference Server. It can only contain alphanumeric characters, dots, underscores and dashes.

  • model_version (int, default: 1 ) –

    Version of model

  • config (Optional[ModelConfig], default: None ) –

    Model configuration for Triton Inference Server deployment

  • strict (bool, default: False ) –

    Enable strict validation between model config outputs and inference function result

Source code in pytriton/triton.py
def bind(
    self,
    model_name: str,
    infer_func: Union[Callable, Sequence[Callable]],
    inputs: Sequence[Tensor],
    outputs: Sequence[Tensor],
    model_version: int = 1,
    config: Optional[ModelConfig] = None,
    strict: bool = False,
) -> None:
    """Create a model with given name and inference callable binding into Triton Inference Server.

    More information about model configuration:
    https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

    Args:
        infer_func: Inference callable to handle request/response from Triton Inference Server
        (or list of inference callable for multi instance model)
        inputs: Definition of model inputs
        outputs: Definition of model outputs
        model_name: Name under which model is available in Triton Inference Server. It can only contain
        alphanumeric characters, dots, underscores and dashes.
        model_version: Version of model
        config: Model configuration for Triton Inference Server deployment
        strict: Enable strict validation between model config outputs and inference function result
    """
    self._validate_model_name(model_name)
    model = Model(
        model_name=model_name,
        model_version=model_version,
        inference_fn=infer_func,
        inputs=inputs,
        outputs=outputs,
        config=config if config else ModelConfig(),
        workspace=self._workspace,
        triton_context=self._triton_context,
        strict=strict,
    )
    model.on_model_event(self._on_model_event)

    self._model_manager.add_model(model, self.is_connected())
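
A sketch of a typical bind() call, modeled on PyTriton's add/sub example; it assumes triton is a Triton instance created as shown above, and the model name, tensor names and max_batch_size are illustrative:

import numpy as np

from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor

@batch
def _add_sub(**inputs):
    # @batch delivers already-batched numpy arrays keyed by input name.
    a, b = inputs["A"], inputs["B"]
    return {"ADD": a + b, "SUB": a - b}

triton.bind(
    model_name="AddSub",
    infer_func=_add_sub,
    inputs=[
        Tensor(name="A", dtype=np.float32, shape=(-1,)),
        Tensor(name="B", dtype=np.float32, shape=(-1,)),
    ],
    outputs=[
        Tensor(name="ADD", dtype=np.float32, shape=(-1,)),
        Tensor(name="SUB", dtype=np.float32, shape=(-1,)),
    ],
    config=ModelConfig(max_batch_size=16),
    strict=True,
)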

connect

connect() -> None

Connect to Triton Inference Server.

Raises:

  • TimeoutError

    if Triton Inference Server is not ready after timeout

Source code in pytriton/triton.py
def connect(self) -> None:
    """Connect to Triton Inference Server.

    Raises:
        TimeoutError: if Triton Inference Server is not ready after timeout
    """
    with self._cv:
        if self._connected:
            LOGGER.debug("Triton Inference already connected.")
            return

        self._wait_for_server()
        if self._triton_lifecycle_policy.local_model_store:
            self._model_manager.setup_models()
        else:
            self._model_manager.load_models()

        self._wait_for_models()
        self._connected = True

is_alive

is_alive() -> bool

Check if Triton Inference Server is alive.

Source code in pytriton/triton.py
def is_alive(self) -> bool:
    """Check if Triton Inference Server is alive."""
    if not self._is_alive_impl():
        return False

    for model in self._model_manager.models:
        if not model.is_alive():
            return False
    return True

is_connected

is_connected() -> bool

Check if Triton Inference Server is connected.

Source code in pytriton/triton.py
def is_connected(self) -> bool:
    """Check if Triton Inference Server is connected."""
    with self._cv:
        return self._connected

run

run() -> None

Run Triton Inference Server.

Source code in pytriton/triton.py
def run(self) -> None:
    """Run Triton Inference Server."""
    self._run_server()
    self.connect()
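
run() starts the server and connects without blocking the calling thread, so the same process can also act as a client. A sketch continuing the AddSub example above; the URL assumes the default HTTP port 8000:

import numpy as np

from pytriton.client import ModelClient

triton.run()  # launch the Triton server and connect; returns immediately
try:
    with ModelClient("localhost:8000", "AddSub") as client:
        result = client.infer_batch(
            A=np.ones((2, 4), dtype=np.float32),
            B=np.ones((2, 4), dtype=np.float32),
        )
finally:
    triton.stop()  # stop the server and clean the workspace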

serve

serve(monitoring_period_s: float = MONITORING_PERIOD_S) -> None

Run Triton Inference Server and block the thread to serve requests/responses.

Parameters:

  • monitoring_period_s (float, default: MONITORING_PERIOD_S ) –

    Interval (in seconds) at which the main thread wakes up to check whether the Triton server and the proxy backends are still alive, then sleeps again. If the server or a proxy is not alive, the method returns.

Source code in pytriton/triton.py
def serve(self, monitoring_period_s: float = MONITORING_PERIOD_S) -> None:
    """Run Triton Inference Server and lock thread for serving requests/response.

    Args:
        monitoring_period_s: the timeout of monitoring if Triton and models are available.
            Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend
            are still alive and sleep again. If triton or proxy is not alive - method returns.
    """
    self._run_server()
    super().serve(monitoring_period_s=monitoring_period_s)
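
Unlike run(), serve() blocks the calling thread and only returns when the server or a proxy backend stops (or on Ctrl+C), which suits standalone serving scripts. A minimal sketch:

with Triton() as triton:
    # bind one or more models here, as shown in the bind() example above
    triton.serve()  # blocks; wakes every monitoring_period_s seconds to check liveness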

stop

stop() -> bool

Stop Triton Inference Server and clean workspace.

Source code in pytriton/triton.py
def stop(self) -> bool:
    """Stop Triton Inference Server and clean workspace."""
    with self._cv:
        if self._stopped:
            LOGGER.debug("Triton Inference already stopped.")
            return False
        self._stopped = True
        self._connected = False
        atexit.unregister(self.stop)
    self._pre_stop_impl()
    self._model_manager.clean()
    self._workspace.clean()

    with self._cv:
        self._cv.notify_all()
    LOGGER.debug("Stopped Triton Inference server and proxy backends")
    self._log_level_checker.check(skip_update=True)

    return True

pytriton.triton.RemoteTriton

RemoteTriton(url: str, workspace: Union[Workspace, str, Path, None] = None)

Bases: TritonBase

RemoteTriton connects to a Triton Inference Server running on a remote host.

Initialize RemoteTriton.

Parameters:

  • url (str) –

    Triton Inference Server URL in the form <scheme>://<host>:<port>. If the scheme is not provided, http is used as the default. If the port is not provided, 8000 is used as the default for http and 8001 for grpc.

  • workspace (Union[Workspace, str, Path, None], default: None ) –

    Path to be created where the files used by pytriton will be stored (e.g. socket files for communication). If workspace is None, a temporary workspace will be created. The workspace should be created on a filesystem shared between RemoteTriton and the Triton Inference Server so that both can access the socket files (if you use containers, the folder must be shared between the containers).

Source code in pytriton/triton.py
def __init__(self, url: str, workspace: Union[Workspace, str, pathlib.Path, None] = None):
    """Initialize RemoteTriton.

    Args:
        url: Triton Inference Server URL in form of <scheme>://<host>:<port>
            If scheme is not provided, http is used as default.
            If port is not provided, 8000 is used as default for http and 8001 for grpc.
        workspace: path to be created where the files used by pytriton will be stored
            (e.g. socket files for communication).
            If workspace is `None` temporary workspace will be created.
            Workspace should be created in shared filesystem space between RemoteTriton
            and Triton Inference Server to allow access to socket files
            (if you use containers, folder must be shared between containers).

    """
    super().__init__(
        url=TritonUrl.from_url(url).with_scheme,
        workspace=workspace,
        triton_lifecycle_policy=TritonLifecyclePolicy(launch_triton_on_startup=True, local_model_store=False),
    )

    with self._cv:
        self._stopped = False
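
A sketch of attaching to an already running Triton server; the URL and the shared workspace path are assumptions for illustration:

from pytriton.triton import RemoteTriton

# The server at triton-host must already be running, and /shared/pytriton_workspace
# must be visible to both this process and the server (e.g. a volume mounted into
# both containers) so the proxy socket files can be shared.
with RemoteTriton(url="grpc://triton-host:8001", workspace="/shared/pytriton_workspace") as triton:
    # bind() works the same way as for Triton; the model is loaded into the remote server
    triton.serve()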

__enter__

__enter__() -> RemoteTriton

Entering the context connects to remote Triton server.

Returns:

  • RemoteTriton – A RemoteTriton object

Source code in pytriton/triton.py
def __enter__(self) -> "RemoteTriton":
    """Entering the context connects to remote Triton server.

    Returns:
        A RemoteTriton object
    """
    super().__enter__()
    return self

__exit__

__exit__(*_) -> None

Exit the context, stopping the process and cleaning the workspace.

Parameters:

  • *_

    unused arguments

Source code in pytriton/triton.py
def __exit__(self, *_) -> None:
    """Exit the context stopping the process and cleaning the workspace.

    Args:
        *_: unused arguments
    """
    self.stop()

bind

bind(model_name: str, infer_func: Union[Callable, Sequence[Callable]], inputs: Sequence[Tensor], outputs: Sequence[Tensor], model_version: int = 1, config: Optional[ModelConfig] = None, strict: bool = False) -> None

Create a model with the given name and bind the inference callable into Triton Inference Server.

More information about model configuration: https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

Parameters:

  • infer_func (Union[Callable, Sequence[Callable]]) –

    Inference callable (or a list of inference callables for a multi-instance model) to handle requests/responses from Triton Inference Server

  • inputs (Sequence[Tensor]) –

    Definition of model inputs

  • outputs (Sequence[Tensor]) –

    Definition of model outputs

  • model_name (str) –

    Name under which the model is available in Triton Inference Server. It can only contain alphanumeric characters, dots, underscores and dashes.

  • model_version (int, default: 1 ) –

    Version of model

  • config (Optional[ModelConfig], default: None ) –

    Model configuration for Triton Inference Server deployment

  • strict (bool, default: False ) –

    Enable strict validation between model config outputs and inference function result

Source code in pytriton/triton.py
def bind(
    self,
    model_name: str,
    infer_func: Union[Callable, Sequence[Callable]],
    inputs: Sequence[Tensor],
    outputs: Sequence[Tensor],
    model_version: int = 1,
    config: Optional[ModelConfig] = None,
    strict: bool = False,
) -> None:
    """Create a model with given name and inference callable binding into Triton Inference Server.

    More information about model configuration:
    https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md

    Args:
        infer_func: Inference callable to handle request/response from Triton Inference Server
        (or list of inference callable for multi instance model)
        inputs: Definition of model inputs
        outputs: Definition of model outputs
        model_name: Name under which model is available in Triton Inference Server. It can only contain
        alphanumeric characters, dots, underscores and dashes.
        model_version: Version of model
        config: Model configuration for Triton Inference Server deployment
        strict: Enable strict validation between model config outputs and inference function result
    """
    self._validate_model_name(model_name)
    model = Model(
        model_name=model_name,
        model_version=model_version,
        inference_fn=infer_func,
        inputs=inputs,
        outputs=outputs,
        config=config if config else ModelConfig(),
        workspace=self._workspace,
        triton_context=self._triton_context,
        strict=strict,
    )
    model.on_model_event(self._on_model_event)

    self._model_manager.add_model(model, self.is_connected())

connect

connect() -> None

Connect to Triton Inference Server.

Raises:

  • TimeoutError

    if Triton Inference Server is not ready after timeout

Source code in pytriton/triton.py
def connect(self) -> None:
    """Connect to Triton Inference Server.

    Raises:
        TimeoutError: if Triton Inference Server is not ready after timeout
    """
    with self._cv:
        if self._connected:
            LOGGER.debug("Triton Inference already connected.")
            return

        self._wait_for_server()
        if self._triton_lifecycle_policy.local_model_store:
            self._model_manager.setup_models()
        else:
            self._model_manager.load_models()

        self._wait_for_models()
        self._connected = True

is_alive

is_alive() -> bool

Check if Triton Inference Server is alive.

Source code in pytriton/triton.py
def is_alive(self) -> bool:
    """Check if Triton Inference Server is alive."""
    if not self._is_alive_impl():
        return False

    for model in self._model_manager.models:
        if not model.is_alive():
            return False
    return True

is_connected

is_connected() -> bool

Check if Triton Inference Server is connected.

Source code in pytriton/triton.py
def is_connected(self) -> bool:
    """Check if Triton Inference Server is connected."""
    with self._cv:
        return self._connected

serve

serve(monitoring_period_s: float = MONITORING_PERIOD_S) -> None

Run Triton Inference Server and block the thread to serve requests/responses.

Parameters:

  • monitoring_period_s (float, default: MONITORING_PERIOD_S ) –

    Interval (in seconds) at which the main thread wakes up to check whether the Triton server and the proxy backends are still alive, then sleeps again. If the server or a proxy is not alive, the method returns.

Source code in pytriton/triton.py
def serve(self, monitoring_period_s: float = MONITORING_PERIOD_S) -> None:
    """Run Triton Inference Server and lock thread for serving requests/response.

    Args:
        monitoring_period_s: the timeout of monitoring if Triton and models are available.
            Every monitoring_period_s seconds main thread wakes up and check if triton server and proxy backend
            are still alive and sleep again. If triton or proxy is not alive - method returns.
    """
    self.connect()
    with self._cv:
        try:
            while self.is_alive():
                self._cv.wait(timeout=monitoring_period_s)
        except KeyboardInterrupt:
            LOGGER.info("SIGINT received, exiting.")
        self.stop()

stop

stop() -> bool

Stop Triton Inference Server and clean workspace.

Source code in pytriton/triton.py
def stop(self) -> bool:
    """Stop Triton Inference Server and clean workspace."""
    with self._cv:
        if self._stopped:
            LOGGER.debug("Triton Inference already stopped.")
            return False
        self._stopped = True
        self._connected = False
        atexit.unregister(self.stop)
    self._pre_stop_impl()
    self._model_manager.clean()
    self._workspace.clean()

    with self._cv:
        self._cv.notify_all()
    LOGGER.debug("Stopped Triton Inference server and proxy backends")
    self._log_level_checker.check(skip_update=True)

    return True

pytriton.proxy.types.Request dataclass

Request(data: Dict[str, ndarray], parameters: Optional[Dict[str, Union[str, int, bool]]] = None)

Data class for request data including numpy array inputs.

data instance-attribute

data: Dict[str, ndarray]

Input data for the request.

parameters class-attribute instance-attribute

parameters: Optional[Dict[str, Union[str, int, bool]]] = None

Parameters for the request.

__delitem__

__delitem__(input_name: str)

Delete input data from request.

Source code in pytriton/proxy/types.py
def __delitem__(self, input_name: str):
    """Delete input data from request."""
    del self.data[input_name]

__getitem__

__getitem__(input_name: str) -> ndarray

Get input data.

Source code in pytriton/proxy/types.py
def __getitem__(self, input_name: str) -> np.ndarray:
    """Get input data."""
    return self.data[input_name]

__iter__

__iter__()

Iterate over input names.

Source code in pytriton/proxy/types.py
def __iter__(self):
    """Iterate over input names."""
    return iter(self.data)

__len__

__len__()

Get number of inputs.

Source code in pytriton/proxy/types.py
def __len__(self):
    """Get number of inputs."""
    return len(self.data)

__setitem__

__setitem__(input_name: str, input_data: ndarray)

Set input data.

Source code in pytriton/proxy/types.py
def __setitem__(self, input_name: str, input_data: np.ndarray):
    """Set input data."""
    self.data[input_name] = input_data

items

items()

Iterate over input names and data.

Source code in pytriton/proxy/types.py
def items(self):
    """Iterate over input names and data."""
    return self.data.items()

keys

keys()

Iterate over input names.

Source code in pytriton/proxy/types.py
def keys(self):
    """Iterate over input names."""
    return self.data.keys()

values

values()

Iterate over input data.

Source code in pytriton/proxy/types.py
def values(self):
    """Iterate over input data."""
    return self.data.values()
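
Taken together, these methods let an inference callable treat each Request as a mapping from input name to numpy array. A sketch of an undecorated callable, assuming it receives a list of Request objects and returns a list of response dictionaries; the tensor names and the optional "scale" parameter are illustrative:

import numpy as np

def infer_fn(requests):
    responses = []
    for request in requests:
        value = request["INPUT_1"]                             # dict-style input access
        scale = (request.parameters or {}).get("scale", 1.0)   # optional request parameter
        responses.append({"OUTPUT_1": np.asarray(value, dtype=np.float32) * float(scale)})
    return responses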