feat(ml)!: cuda and openvino acceleration (#5619)

* cuda and openvino ep, refactor, update dockerfile

* updated workflow

* typing fixes

* added tests

* updated ml test gh action

* updated README

* updated docker-compose

* added compute to hwaccel.yml

* updated gh matrix

give up

* remove cuda/arm64 build

* add hwaccel image tags to docker-compose

* remove unnecessary quotes

* add suffix to git tag

* fixed kwargs in base model

* armnn ld_library_path

* update pyproject.toml

* add armnn workflow

* formatting

* consolidate hwaccel files, update docker compose

* update hw transcoding docs

* add ml hwaccel docs

* update dev and prod docker-compose

* added armnn prerequisite docs

* support 3.10

* updated docker-compose comments

* formatting

* test coverage

* don't set arena extend strategy for openvino

* working openvino

* formatting

* fix dockerfile

* added type annotation

* add wsl configuration for openvino

* updated lock file

* copy python3

* comment out extends section

* fix platforms

* simplify workflow suffix tagging

* simplify aio transcoding doc

* update docs and workflow for `hwaccel.yml` change

* revert docs
Authored by Mert, 2024-01-21 18:22:39 -05:00, committed by GitHub.
Commit 95cfe22866 (parent 6b419a984c).
23 changed files with 962 additions and 460 deletions


```diff
@@ -11,6 +11,7 @@ from huggingface_hub import snapshot_download
 from typing_extensions import Buffer
 
 import ann.ann
+from app.models.constants import SUPPORTED_PROVIDERS
 
 from ..config import get_cache_dir, get_hf_model_name, log, settings
 from ..schemas import ModelType
@@ -24,36 +25,17 @@ class InferenceModel(ABC):
         self,
         model_name: str,
         cache_dir: Path | str | None = None,
-        inter_op_num_threads: int = settings.model_inter_op_threads,
-        intra_op_num_threads: int = settings.model_intra_op_threads,
+        providers: list[str] | None = None,
+        provider_options: list[dict[str, Any]] | None = None,
+        sess_options: ort.SessionOptions | None = None,
         **model_kwargs: Any,
     ) -> None:
-        self.model_name = model_name
         self.loaded = False
-        self._cache_dir = Path(cache_dir) if cache_dir is not None else None
-        self.providers = model_kwargs.pop("providers", ["CPUExecutionProvider"])
-        # don't pre-allocate more memory than needed
-        self.provider_options = model_kwargs.pop(
-            "provider_options", [{"arena_extend_strategy": "kSameAsRequested"}] * len(self.providers)
-        )
-        log.debug(
-            (
-                f"Setting '{self.model_name}' execution providers to {self.providers} "
-                "in descending order of preference"
-            ),
-        )
-        log.debug(f"Setting execution provider options to {self.provider_options}")
-        self.sess_options = PicklableSessionOptions()
-        # avoid thread contention between models
-        if inter_op_num_threads > 1:
-            self.sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
-        log.debug(f"Setting execution_mode to {self.sess_options.execution_mode.name}")
-        log.debug(f"Setting inter_op_num_threads to {inter_op_num_threads}")
-        log.debug(f"Setting intra_op_num_threads to {intra_op_num_threads}")
-        self.sess_options.inter_op_num_threads = inter_op_num_threads
-        self.sess_options.intra_op_num_threads = intra_op_num_threads
-        self.sess_options.enable_cpu_mem_arena = False
+        self.model_name = model_name
+        self.cache_dir = Path(cache_dir) if cache_dir is not None else self.cache_dir_default
+        self.providers = providers if providers is not None else self.providers_default
+        self.provider_options = provider_options if provider_options is not None else self.provider_options_default
+        self.sess_options = sess_options if sess_options is not None else self.sess_options_default
 
     def download(self) -> None:
         if not self.cached:
```
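Note on the refactored constructor above: each argument now falls back to a computed `*_default` property instead of being pulled out of `model_kwargs.pop(...)`. The pattern in isolation (a minimal runnable sketch with a hypothetical class, not Immich code):

```python
# Minimal sketch of the override-vs-default pattern used in __init__ above.
class Example:
    @property
    def providers_default(self) -> list[str]:
        return ["CPUExecutionProvider"]  # stand-in for real auto-detection

    def __init__(self, providers: list[str] | None = None) -> None:
        # explicit argument wins; None means "use the computed default"
        self.providers = providers if providers is not None else self.providers_default

assert Example().providers == ["CPUExecutionProvider"]
assert Example(providers=["CUDAExecutionProvider"]).providers == ["CUDAExecutionProvider"]
```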
```diff
@@ -95,33 +77,9 @@ class InferenceModel(ABC):
     def _load(self) -> None:
         ...
 
-    @property
-    def model_type(self) -> ModelType:
-        return self._model_type
-
-    @property
-    def cache_dir(self) -> Path:
-        return self._cache_dir if self._cache_dir is not None else get_cache_dir(self.model_name, self.model_type)
-
-    @cache_dir.setter
-    def cache_dir(self, cache_dir: Path) -> None:
-        self._cache_dir = cache_dir
-
-    @property
-    def cached(self) -> bool:
-        return self.cache_dir.exists() and any(self.cache_dir.iterdir())
-
-    @classmethod
-    def from_model_type(cls, model_type: ModelType, model_name: str, **model_kwargs: Any) -> InferenceModel:
-        subclasses = {subclass._model_type: subclass for subclass in cls.__subclasses__()}
-        if model_type not in subclasses:
-            raise ValueError(f"Unsupported model type: {model_type}")
-        return subclasses[model_type](model_name, **model_kwargs)
-
     def clear_cache(self) -> None:
         if not self.cache_dir.exists():
-            log.warn(
+            log.warning(
                 f"Attempted to clear cache for model '{self.model_name}', but cache directory does not exist",
             )
             return
@@ -132,7 +90,7 @@
             log.info(f"Cleared cache directory for model '{self.model_name}'.")
             rmtree(self.cache_dir)
         else:
-            log.warn(
+            log.warning(
                 (
                     f"Encountered file instead of directory at cache path "
                     f"for '{self.model_name}'. Removing file and replacing with a directory."
```
```diff
@@ -156,6 +114,107 @@
             raise ValueError(f"the file model_path='{model_path}' does not exist")
         return session
 
+    @property
+    def model_type(self) -> ModelType:
+        return self._model_type
+
+    @property
+    def cache_dir(self) -> Path:
+        return self._cache_dir
+
+    @cache_dir.setter
+    def cache_dir(self, cache_dir: Path) -> None:
+        self._cache_dir = cache_dir
+
+    @property
+    def cache_dir_default(self) -> Path:
+        return get_cache_dir(self.model_name, self.model_type)
+
+    @property
+    def cached(self) -> bool:
+        return self.cache_dir.exists() and any(self.cache_dir.iterdir())
+
+    @property
+    def providers(self) -> list[str]:
+        return self._providers
+
+    @providers.setter
+    def providers(self, providers: list[str]) -> None:
+        log.debug(
+            (f"Setting '{self.model_name}' execution providers to {providers}, " "in descending order of preference"),
+        )
+        self._providers = providers
+
+    @property
+    def providers_default(self) -> list[str]:
+        available_providers = set(ort.get_available_providers())
+        log.debug(f"Available ORT providers: {available_providers}")
+        return [provider for provider in SUPPORTED_PROVIDERS if provider in available_providers]
```
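A standalone sanity check of `providers_default` (the `SUPPORTED_PROVIDERS` value below is an assumption mirroring `app/models/constants.py`): filtering against the set of available providers keeps the preference order of `SUPPORTED_PROVIDERS`, not whatever order ONNX Runtime happens to report.

```python
import onnxruntime as ort

# Assumed preference order, best first (illustrative stand-in for the real constant)
SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"]

available = set(ort.get_available_providers())
providers = [p for p in SUPPORTED_PROVIDERS if p in available]
print(providers)  # e.g. ['CPUExecutionProvider'] on a CPU-only onnxruntime build
```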
```diff
+    @property
+    def provider_options(self) -> list[dict[str, Any]]:
+        return self._provider_options
+
+    @provider_options.setter
+    def provider_options(self, provider_options: list[dict[str, Any]]) -> None:
+        log.debug(f"Setting execution provider options to {provider_options}")
+        self._provider_options = provider_options
+
+    @property
+    def provider_options_default(self) -> list[dict[str, Any]]:
+        options = []
+        for provider in self.providers:
+            match provider:
+                case "CPUExecutionProvider" | "CUDAExecutionProvider":
+                    option = {"arena_extend_strategy": "kSameAsRequested"}
+                case "OpenVINOExecutionProvider":
+                    try:
+                        device_ids: list[str] = ort.capi._pybind_state.get_available_openvino_device_ids()
+                        log.debug(f"Available OpenVINO devices: {device_ids}")
+                        gpu_devices = [device_id for device_id in device_ids if device_id.startswith("GPU")]
+                        option = {"device_id": gpu_devices[0]} if gpu_devices else {}
+                    except AttributeError as e:
+                        log.warning("Failed to get OpenVINO device IDs. Using default options.")
+                        log.error(e)
+                        option = {}
+                case _:
+                    option = {}
+            options.append(option)
+        return options
```
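These lists are ultimately what gets handed to ONNX Runtime when a session is built; one options dict is matched positionally to each provider. A hedged sketch of that call (`model.onnx` is a placeholder path, and `GPU.0` a hypothetical OpenVINO device id; the real code goes through the session-creation helper). `kSameAsRequested` avoids ORT's default power-of-two arena growth, in line with the old "don't pre-allocate more memory than needed" comment.

```python
import onnxruntime as ort

# Sketch only: one provider_options dict per provider, positionally matched.
session = ort.InferenceSession(
    "model.onnx",  # placeholder path
    sess_options=ort.SessionOptions(),
    providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"],
    provider_options=[
        {"device_id": "GPU.0"},  # hypothetical OpenVINO GPU device id
        {"arena_extend_strategy": "kSameAsRequested"},
    ],
)
```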
```diff
+    @property
+    def sess_options(self) -> ort.SessionOptions:
+        return self._sess_options
+
+    @sess_options.setter
+    def sess_options(self, sess_options: ort.SessionOptions) -> None:
+        log.debug(f"Setting execution_mode to {sess_options.execution_mode.name}")
+        log.debug(f"Setting inter_op_num_threads to {sess_options.inter_op_num_threads}")
+        log.debug(f"Setting intra_op_num_threads to {sess_options.intra_op_num_threads}")
+        self._sess_options = sess_options
+
+    @property
+    def sess_options_default(self) -> ort.SessionOptions:
+        sess_options = PicklableSessionOptions()
+        sess_options.enable_cpu_mem_arena = False
+
+        # avoid thread contention between models
+        if settings.model_inter_op_threads > 0:
+            sess_options.inter_op_num_threads = settings.model_inter_op_threads
+        # these defaults work well for CPU, but bottleneck GPU
+        elif settings.model_inter_op_threads == 0 and self.providers == ["CPUExecutionProvider"]:
+            sess_options.inter_op_num_threads = 1
+
+        if settings.model_intra_op_threads > 0:
+            sess_options.intra_op_num_threads = settings.model_intra_op_threads
+        elif settings.model_intra_op_threads == 0 and self.providers == ["CPUExecutionProvider"]:
+            sess_options.intra_op_num_threads = 2
+
+        if sess_options.inter_op_num_threads > 1:
+            sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
+
+        return sess_options
```
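The thread logic above keeps the old CPU-friendly defaults (1 inter-op thread, 2 intra-op threads) only when running purely on CPU, since pinning threads this way throttles GPU providers. Restated as a self-contained function (a sketch; `inter`/`intra` stand in for the settings values, where 0 means "auto"):

```python
import onnxruntime as ort

def sketch_sess_options(providers: list[str], inter: int = 0, intra: int = 0) -> ort.SessionOptions:
    opts = ort.SessionOptions()
    opts.enable_cpu_mem_arena = False
    if inter > 0:
        opts.inter_op_num_threads = inter
    elif inter == 0 and providers == ["CPUExecutionProvider"]:
        opts.inter_op_num_threads = 1  # avoid thread contention between models
    if intra > 0:
        opts.intra_op_num_threads = intra
    elif intra == 0 and providers == ["CPUExecutionProvider"]:
        opts.intra_op_num_threads = 2
    if opts.inter_op_num_threads > 1:
        opts.execution_mode = ort.ExecutionMode.ORT_PARALLEL
    return opts

cpu = sketch_sess_options(["CPUExecutionProvider"])
print(cpu.inter_op_num_threads, cpu.intra_op_num_threads)  # 1 2

gpu = sketch_sess_options(["CUDAExecutionProvider", "CPUExecutionProvider"])
print(gpu.inter_op_num_threads, gpu.intra_op_num_threads)  # 0 0 (ORT decides)
```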
```diff
 
 # HF deep copies configs, so we need to make session options picklable
 class PicklableSessionOptions(ort.SessionOptions):  # type: ignore[misc]
```
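`PicklableSessionOptions` (its body is cut off by the diff context) exists because Hugging Face deep-copies configs, and `copy.deepcopy` falls back to pickling, which a plain `ort.SessionOptions` (a pybind11 object) does not support. A rough demonstration of the failure it works around (assumed behavior, not Immich code):

```python
import copy
import onnxruntime as ort

opts = ort.SessionOptions()
try:
    copy.deepcopy(opts)  # pybind11 objects aren't picklable by default
except TypeError as e:
    print(e)  # e.g. "cannot pickle 'onnxruntime...SessionOptions' object"
```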