chore(ml): improve shutdown (#5689)

2025-11-07 17:27:20 +00:00 · 2023-12-14 14:51:24 -05:00 · 2023-12-14 14:51:24 -05:00 · d729c863c8
commit d729c863c8
parent 9768931275
4 changed files with 80 additions and 48 deletions
--- a/machine-learning/app/main.py
+++ b/machine-learning/app/main.py
@ -1,15 +1,16 @@
 import asyncio
 import gc
 import os
+import signal
 import sys
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any
+from typing import Any, Iterator
 from zipfile import BadZipFile

 import orjson
-from fastapi import FastAPI, Form, HTTPException, UploadFile
+from fastapi import Depends, FastAPI, Form, HTTPException, UploadFile
 from fastapi.responses import ORJSONResponse
 from onnxruntime.capi.onnxruntime_pybind11_state import InvalidProtobuf, NoSuchFile
 from starlette.formparsers import MultiPartParser
@ -27,9 +28,16 @@ from .schemas import (
 MultiPartParser.max_file_size = 2**26  # spools to disk if payload is 64 MiB or larger
 app = FastAPI()

+model_cache = ModelCache(ttl=settings.model_ttl, revalidate=settings.model_ttl > 0)
+thread_pool: ThreadPoolExecutor | None = None
+lock = threading.Lock()
+active_requests = 0
+last_called: float | None = None

-def init_state() -> None:
-    app.state.model_cache = ModelCache(ttl=settings.model_ttl, revalidate=settings.model_ttl > 0)
+
+@app.on_event("startup")
+def startup() -> None:
+    global thread_pool
    log.info(
        (
            "Created in-memory cache with unloading "
@ -37,17 +45,30 @@ def init_state() -> None:
        )
    )
    # asyncio is a huge bottleneck for performance, so we use a thread pool to run blocking code
-    app.state.thread_pool = ThreadPoolExecutor(settings.request_threads) if settings.request_threads > 0 else None
-    app.state.lock = threading.Lock()
-    app.state.last_called = None
+    thread_pool = ThreadPoolExecutor(settings.request_threads) if settings.request_threads > 0 else None
    if settings.model_ttl > 0 and settings.model_ttl_poll_s > 0:
        asyncio.ensure_future(idle_shutdown_task())
    log.info(f"Initialized request thread pool with {settings.request_threads} threads.")


-@app.on_event("startup")
-async def startup_event() -> None:
-    init_state()
+@app.on_event("shutdown")
+def shutdown() -> None:
+    log.handlers.clear()
+    for model in model_cache.cache._cache.values():
+        del model
+    if thread_pool is not None:
+        thread_pool.shutdown()
+    gc.collect()
+
+
+def update_state() -> Iterator[None]:
+    global active_requests, last_called
+    active_requests += 1
+    last_called = time.time()
+    try:
+        yield
+    finally:
+        active_requests -= 1


@app.get("/", response_model=MessageResponse)
@ -60,7 +81,7 @@ def ping() -> str:
    return "pong"


-@app.post("/predict")
+@app.post("/predict", dependencies=[Depends(update_state)])
 async def predict(
    model_name: str = Form(alias="modelName"),
    model_type: ModelType = Form(alias="modelType"),
@ -79,17 +100,16 @@ async def predict(
    except orjson.JSONDecodeError:
        raise HTTPException(400, f"Invalid options JSON: {options}")

-    model = await load(await app.state.model_cache.get(model_name, model_type, **kwargs))
+    model = await load(await model_cache.get(model_name, model_type, **kwargs))
    model.configure(**kwargs)
    outputs = await run(model, inputs)
    return ORJSONResponse(outputs)


 async def run(model: InferenceModel, inputs: Any) -> Any:
-    app.state.last_called = time.time()
-    if app.state.thread_pool is None:
+    if thread_pool is None:
        return model.predict(inputs)
-    return await asyncio.get_running_loop().run_in_executor(app.state.thread_pool, model.predict, inputs)
+    return await asyncio.get_running_loop().run_in_executor(thread_pool, model.predict, inputs)


 async def load(model: InferenceModel) -> InferenceModel:
@ -97,15 +117,15 @@ async def load(model: InferenceModel) -> InferenceModel:
        return model

    def _load() -> None:
-        with app.state.lock:
+        with lock:
            model.load()

    loop = asyncio.get_running_loop()
    try:
-        if app.state.thread_pool is None:
+        if thread_pool is None:
            model.load()
        else:
-            await loop.run_in_executor(app.state.thread_pool, _load)
+            await loop.run_in_executor(thread_pool, _load)
        return model
    except (OSError, InvalidProtobuf, BadZipFile, NoSuchFile):
        log.warn(
@ -115,32 +135,23 @@ async def load(model: InferenceModel) -> InferenceModel:
            )
        )
        model.clear_cache()
-        if app.state.thread_pool is None:
+        if thread_pool is None:
            model.load()
        else:
-            await loop.run_in_executor(app.state.thread_pool, _load)
+            await loop.run_in_executor(thread_pool, _load)
        return model


 async def idle_shutdown_task() -> None:
    while True:
        log.debug("Checking for inactivity...")
-        if app.state.last_called is not None and time.time() - app.state.last_called > settings.model_ttl:
+        if (
+            last_called is not None
+            and not active_requests
+            and not lock.locked()
+            and time.time() - last_called > settings.model_ttl
+        ):
            log.info("Shutting down due to inactivity.")
-            loop = asyncio.get_running_loop()
-            for task in asyncio.all_tasks(loop):
-                if task is not asyncio.current_task():
-                    try:
-                        task.cancel()
-                    except asyncio.CancelledError:
-                        pass
-            sys.stderr.close()
-            sys.stdout.close()
-            sys.stdout = sys.stderr = open(os.devnull, "w")
-            try:
-                await app.state.model_cache.cache.clear()
-                gc.collect()
-                loop.stop()
-            except asyncio.CancelledError:
-                pass
+            os.kill(os.getpid(), signal.SIGINT)
+            break
        await asyncio.sleep(settings.model_ttl_poll_s)