feat: availability checks (#22185)

2025-11-07 17:27:20 +00:00 · 2025-09-19 12:18:42 -04:00 · 2025-09-19 12:18:42 -04:00 · 3f2e0780d5
commit 3f2e0780d5
parent 52363cf0fb
25 changed files with 361 additions and 138 deletions
--- a/server/src/bin/sync-sql.ts
+++ b/server/src/bin/sync-sql.ts
@ -15,6 +15,7 @@ import { repositories } from 'src/repositories';
 import { AccessRepository } from 'src/repositories/access.repository';
 import { ConfigRepository } from 'src/repositories/config.repository';
 import { LoggingRepository } from 'src/repositories/logging.repository';
+import { MachineLearningRepository } from 'src/repositories/machine-learning.repository';
 import { SyncRepository } from 'src/repositories/sync.repository';
 import { AuthService } from 'src/services/auth.service';
 import { getKyselyConfig } from 'src/utils/database';
@ -57,7 +58,7 @@ class SqlGenerator {
    try {
      await this.setup();
      for (const Repository of repositories) {
-        if (Repository === LoggingRepository) {
+        if (Repository === LoggingRepository || Repository === MachineLearningRepository) {
          continue;
        }
        await this.process(Repository);
--- a/server/src/config.ts
+++ b/server/src/config.ts
@ -54,6 +54,11 @@ export interface SystemConfig {
  machineLearning: {
    enabled: boolean;
    urls: string[];
+    availabilityChecks: {
+      enabled: boolean;
+      timeout: number;
+      interval: number;
+    };
    clip: {
      enabled: boolean;
      modelName: string;
@ -176,6 +181,8 @@ export interface SystemConfig {
  };
 }

+export type MachineLearningConfig = SystemConfig['machineLearning'];
+
 export const defaults = Object.freeze<SystemConfig>({
  backup: {
    database: {
@ -227,6 +234,11 @@ export const defaults = Object.freeze<SystemConfig>({
  machineLearning: {
    enabled: process.env.IMMICH_MACHINE_LEARNING_ENABLED !== 'false',
    urls: [process.env.IMMICH_MACHINE_LEARNING_URL || 'http://immich-machine-learning:3003'],
+    availabilityChecks: {
+      enabled: true,
+      timeout: Number(process.env.IMMICH_MACHINE_LEARNING_PING_TIMEOUT) || 2000,
+      interval: 30_000,
+    },
    clip: {
      enabled: true,
      modelName: 'ViT-B-32__openai',
--- a/server/src/constants.ts
+++ b/server/src/constants.ts
@ -51,11 +51,6 @@ export const serverVersion = new SemVer(version);
 export const AUDIT_LOG_MAX_DURATION = Duration.fromObject({ days: 100 });
 export const ONE_HOUR = Duration.fromObject({ hours: 1 });

-export const MACHINE_LEARNING_PING_TIMEOUT = Number(process.env.MACHINE_LEARNING_PING_TIMEOUT || 2000);
-export const MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME = Number(
-  process.env.MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME || 30_000,
-);
-
 export const citiesFile = 'cities500.txt';
 export const reverseGeocodeMaxDistance = 25_000;

--- a/server/src/dtos/system-config.dto.ts
+++ b/server/src/dtos/system-config.dto.ts
@ -1,5 +1,5 @@
 import { ApiProperty } from '@nestjs/swagger';
-import { Exclude, Transform, Type } from 'class-transformer';
+import { Type } from 'class-transformer';
 import {
  ArrayMinSize,
  IsInt,
@ -15,7 +15,6 @@ import {
  ValidateNested,
 } from 'class-validator';
 import { SystemConfig } from 'src/config';
-import { PropertyLifecycle } from 'src/decorators';
 import { CLIPConfig, DuplicateDetectionConfig, FacialRecognitionConfig } from 'src/dtos/model-config.dto';
 import {
  AudioCodec,
@ -257,21 +256,32 @@ class SystemConfigLoggingDto {
  level!: LogLevel;
 }

+// Validated shape of the `availabilityChecks` section of the machine learning system config.
+class MachineLearningAvailabilityChecksDto {
+  // Whether the periodic availability checks are enabled.
+  @ValidateBoolean()
+  enabled!: boolean;
+
+  // Per-request ping timeout in milliseconds (an integer).
+  // NOTE(review): no lower bound is validated here — confirm whether zero/negative values should be rejected.
+  @IsInt()
+  timeout!: number;
+
+  // Delay between availability check rounds in milliseconds (an integer).
+  // NOTE(review): no lower bound is validated here — confirm whether zero/negative values should be rejected.
+  @IsInt()
+  interval!: number;
+}
+
 class SystemConfigMachineLearningDto {
  @ValidateBoolean()
  enabled!: boolean;

-  @PropertyLifecycle({ deprecatedAt: 'v1.122.0' })
-  @Exclude()
-  url?: string;
-
  @IsUrl({ require_tld: false, allow_underscores: true }, { each: true })
  @ArrayMinSize(1)
-  @Transform(({ obj, value }) => (obj.url ? [obj.url] : value))
  @ValidateIf((dto) => dto.enabled)
  @ApiProperty({ type: 'array', items: { type: 'string', format: 'uri' }, minItems: 1 })
  urls!: string[];

+  @Type(() => MachineLearningAvailabilityChecksDto)
+  @ValidateNested()
+  @IsObject()
+  availabilityChecks!: MachineLearningAvailabilityChecksDto;
+
  @Type(() => CLIPConfig)
  @ValidateNested()
  @IsObject()
--- a/server/src/repositories/logging.repository.ts
+++ b/server/src/repositories/logging.repository.ts
@ -142,6 +142,10 @@ export class LoggingRepository {
    this.handleMessage(LogLevel.Fatal, message, details);
  }

+  // Emits a deprecation notice through `warn`, with the message prefixed by `[Deprecated]`.
+  deprecate(message: string) {
+    this.warn(`[Deprecated] ${message}`);
+  }
+
  private handleFunction(level: LogLevel, message: LogFunction, details: LogDetails[]) {
    if (this.logger.isLevelEnabled(level)) {
      this.handleMessage(level, message(), details);
--- a/server/src/repositories/machine-learning.repository.ts
+++ b/server/src/repositories/machine-learning.repository.ts
@ -1,6 +1,7 @@
 import { Injectable } from '@nestjs/common';
+import { Duration } from 'luxon';
 import { readFile } from 'node:fs/promises';
-import { MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME, MACHINE_LEARNING_PING_TIMEOUT } from 'src/constants';
+import { MachineLearningConfig } from 'src/config';
 import { CLIPConfig } from 'src/dtos/model-config.dto';
 import { LoggingRepository } from 'src/repositories/logging.repository';

@ -57,82 +58,100 @@ export type TextEncodingOptions = ModelOptions & { language?: string };

@Injectable()
 export class MachineLearningRepository {
-  // Note that deleted URL's are not removed from this map (ie: they're leaked)
-  // Cleaning them up is low priority since there should be very few over a
-  // typical server uptime cycle
-  private urlAvailability: {
-    [url: string]:
-      | {
-          active: boolean;
-          lastChecked: number;
-        }
-      | undefined;
-  };
+  private healthyMap: Record<string, boolean> = {};
+  private interval?: ReturnType<typeof setInterval>;
+  private _config?: MachineLearningConfig;
+
+  // Returns the machine learning config most recently stored by `setup()`.
+  // Throws if it is read before `setup()` has been called, so a missing
+  // initialization surfaces immediately instead of as an undefined access.
+  private get config(): MachineLearningConfig {
+    if (!this._config) {
+      throw new Error('Machine learning repository has not been set up');
+    }
+
+    return this._config;
+  }

  constructor(private logger: LoggingRepository) {
    this.logger.setContext(MachineLearningRepository.name);
-    this.urlAvailability = {};
  }

-  private setUrlAvailability(url: string, active: boolean) {
-    const current = this.urlAvailability[url];
-    if (current?.active !== active) {
-      this.logger.verbose(`Setting ${url} ML server to ${active ? 'active' : 'inactive'}.`);
+  setup(config: MachineLearningConfig) {
+    this._config = config;
+    this.teardown();
+
+    // delete old servers
+    for (const url of Object.keys(this.healthyMap)) {
+      if (!config.urls.includes(url)) {
+        delete this.healthyMap[url];
+      }
    }
-    this.urlAvailability[url] = {
-      active,
-      lastChecked: Date.now(),
-    };
+
+    if (!config.availabilityChecks.enabled) {
+      return;
+    }
+
+    this.tick();
+    this.interval = setInterval(
+      () => this.tick(),
+      Duration.fromObject({ milliseconds: config.availabilityChecks.interval }).as('milliseconds'),
+    );
  }

-  private async checkAvailability(url: string) {
-    let active = false;
+  // Stops the periodic availability check timer, if one is running.
+  // Safe to call repeatedly; `setup()` calls it before starting a new timer.
+  teardown() {
+    if (this.interval) {
+      clearInterval(this.interval);
+      // drop the handle so a cleared timer is not kept around as if it were still active
+      this.interval = undefined;
+    }
+  }
+
+  // Starts one availability check per configured URL.
+  // The checks are not awaited (`void`), so this returns before any of them completes.
+  private tick() {
+    for (const url of this.config.urls) {
+      void this.check(url);
+    }
+  }
+
+  private async check(url: string) {
+    let healthy = false;
    try {
      const response = await fetch(new URL('/ping', url), {
-        signal: AbortSignal.timeout(MACHINE_LEARNING_PING_TIMEOUT),
+        signal: AbortSignal.timeout(this.config.availabilityChecks.timeout),
      });
-      active = response.ok;
+      if (response.ok) {
+        healthy = true;
+      }
    } catch {
      // nothing to do here
    }
-    this.setUrlAvailability(url, active);
-    return active;
+
+    this.setHealthy(url, healthy);
  }

-  private async shouldSkipUrl(url: string) {
-    const availability = this.urlAvailability[url];
-    if (availability === undefined) {
-      // If this is a new endpoint, then check inline and skip if it fails
-      if (!(await this.checkAvailability(url))) {
-        return true;
-      }
-      return false;
+  private setHealthy(url: string, healthy: boolean) {
+    if (this.healthyMap[url] !== healthy) {
+      this.logger.log(`Machine learning server became ${healthy ? 'healthy' : 'unhealthy'} (${url}).`);
    }
-    if (!availability.active && Date.now() - availability.lastChecked < MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME) {
-      // If this is an old inactive endpoint that hasn't been checked in a
-      // while then check but don't wait for the result, just skip it
-      // This avoids delays on every search whilst allowing higher priority
-      // ML servers to recover over time.
-      void this.checkAvailability(url);
+
+    this.healthyMap[url] = healthy;
+  }
+
+  private isHealthy(url: string) {
+    if (!this.config.availabilityChecks.enabled) {
      return true;
    }
-    return false;
+
+    return this.healthyMap[url];
  }

-  private async predict<T>(urls: string[], payload: ModelPayload, config: MachineLearningRequest): Promise<T> {
+  private async predict<T>(payload: ModelPayload, config: MachineLearningRequest): Promise<T> {
    const formData = await this.getFormData(payload, config);
-    let urlCounter = 0;
-    for (const url of urls) {
-      urlCounter++;
-      const isLast = urlCounter >= urls.length;
-      if (!isLast && (await this.shouldSkipUrl(url))) {
-        continue;
-      }

+    for (const url of [
+      // try healthy servers first
+      ...this.config.urls.filter((url) => this.isHealthy(url)),
+      ...this.config.urls.filter((url) => !this.isHealthy(url)),
+    ]) {
      try {
        const response = await fetch(new URL('/predict', url), { method: 'POST', body: formData });
        if (response.ok) {
-          this.setUrlAvailability(url, true);
+          this.setHealthy(url, true);
          return response.json();
        }

@ -144,20 +163,21 @@ export class MachineLearningRepository {
          `Machine learning request to "${url}" failed: ${error instanceof Error ? error.message : error}`,
        );
      }
-      this.setUrlAvailability(url, false);
+
+      this.setHealthy(url, false);
    }

    throw new Error(`Machine learning request '${JSON.stringify(config)}' failed for all URLs`);
  }

-  async detectFaces(urls: string[], imagePath: string, { modelName, minScore }: FaceDetectionOptions) {
+  async detectFaces(imagePath: string, { modelName, minScore }: FaceDetectionOptions) {
    const request = {
      [ModelTask.FACIAL_RECOGNITION]: {
        [ModelType.DETECTION]: { modelName, options: { minScore } },
        [ModelType.RECOGNITION]: { modelName },
      },
    };
-    const response = await this.predict<FacialRecognitionResponse>(urls, { imagePath }, request);
+    const response = await this.predict<FacialRecognitionResponse>({ imagePath }, request);
    return {
      imageHeight: response.imageHeight,
      imageWidth: response.imageWidth,
@ -165,15 +185,15 @@ export class MachineLearningRepository {
    };
  }

-  async encodeImage(urls: string[], imagePath: string, { modelName }: CLIPConfig) {
+  async encodeImage(imagePath: string, { modelName }: CLIPConfig) {
    const request = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: { modelName } } };
-    const response = await this.predict<ClipVisualResponse>(urls, { imagePath }, request);
+    const response = await this.predict<ClipVisualResponse>({ imagePath }, request);
    return response[ModelTask.SEARCH];
  }

-  async encodeText(urls: string[], text: string, { language, modelName }: TextEncodingOptions) {
+  async encodeText(text: string, { language, modelName }: TextEncodingOptions) {
    const request = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: { modelName, options: { language } } } };
-    const response = await this.predict<ClipTextualResponse>(urls, { text }, request);
+    const response = await this.predict<ClipTextualResponse>({ text }, request);
    return response[ModelTask.SEARCH];
  }

--- a/server/src/services/person.service.spec.ts
+++ b/server/src/services/person.service.spec.ts
@ -729,7 +729,6 @@ describe(PersonService.name, () => {
      mocks.assetJob.getForDetectFacesJob.mockResolvedValue({ ...assetStub.image, files: [assetStub.image.files[1]] });
      await sut.handleDetectFaces({ id: assetStub.image.id });
      expect(mocks.machineLearning.detectFaces).toHaveBeenCalledWith(
-        ['http://immich-machine-learning:3003'],
        '/uploads/user-id/thumbs/path.jpg',
        expect.objectContaining({ minScore: 0.7, modelName: 'buffalo_l' }),
      );
--- a/server/src/services/person.service.ts
+++ b/server/src/services/person.service.ts
@ -316,7 +316,6 @@ export class PersonService extends BaseService {
    }

    const { imageHeight, imageWidth, faces } = await this.machineLearningRepository.detectFaces(
-      machineLearning.urls,
      previewFile.path,
      machineLearning.facialRecognition,
    );
--- a/server/src/services/search.service.spec.ts
+++ b/server/src/services/search.service.spec.ts
@ -211,7 +211,6 @@ describe(SearchService.name, () => {
      await sut.searchSmart(authStub.user1, { query: 'test' });

      expect(mocks.machineLearning.encodeText).toHaveBeenCalledWith(
-        [expect.any(String)],
        'test',
        expect.objectContaining({ modelName: expect.any(String) }),
      );
@ -225,7 +224,6 @@ describe(SearchService.name, () => {
      await sut.searchSmart(authStub.user1, { query: 'test', page: 2, size: 50 });

      expect(mocks.machineLearning.encodeText).toHaveBeenCalledWith(
-        [expect.any(String)],
        'test',
        expect.objectContaining({ modelName: expect.any(String) }),
      );
@ -243,7 +241,6 @@ describe(SearchService.name, () => {
      await sut.searchSmart(authStub.user1, { query: 'test' });

      expect(mocks.machineLearning.encodeText).toHaveBeenCalledWith(
-        [expect.any(String)],
        'test',
        expect.objectContaining({ modelName: 'ViT-B-16-SigLIP__webli' }),
      );
@ -253,7 +250,6 @@ describe(SearchService.name, () => {
      await sut.searchSmart(authStub.user1, { query: 'test', language: 'de' });

      expect(mocks.machineLearning.encodeText).toHaveBeenCalledWith(
-        [expect.any(String)],
        'test',
        expect.objectContaining({ language: 'de' }),
      );
--- a/server/src/services/search.service.ts
+++ b/server/src/services/search.service.ts
@ -118,7 +118,7 @@ export class SearchService extends BaseService {
      const key = machineLearning.clip.modelName + dto.query + dto.language;
      embedding = this.embeddingCache.get(key);
      if (!embedding) {
-        embedding = await this.machineLearningRepository.encodeText(machineLearning.urls, dto.query, {
+        embedding = await this.machineLearningRepository.encodeText(dto.query, {
          modelName: machineLearning.clip.modelName,
          language: dto.language,
        });
--- a/server/src/services/smart-info.service.spec.ts
+++ b/server/src/services/smart-info.service.spec.ts
@ -205,7 +205,6 @@ describe(SmartInfoService.name, () => {
      expect(await sut.handleEncodeClip({ id: assetStub.image.id })).toEqual(JobStatus.Success);

      expect(mocks.machineLearning.encodeImage).toHaveBeenCalledWith(
-        ['http://immich-machine-learning:3003'],
        '/uploads/user-id/thumbs/path.jpg',
        expect.objectContaining({ modelName: 'ViT-B-32__openai' }),
      );
@ -242,7 +241,6 @@ describe(SmartInfoService.name, () => {

      expect(mocks.database.wait).toHaveBeenCalledWith(512);
      expect(mocks.machineLearning.encodeImage).toHaveBeenCalledWith(
-        ['http://immich-machine-learning:3003'],
        '/uploads/user-id/thumbs/path.jpg',
        expect.objectContaining({ modelName: 'ViT-B-32__openai' }),
      );
--- a/server/src/services/smart-info.service.ts
+++ b/server/src/services/smart-info.service.ts
@ -108,11 +108,7 @@ export class SmartInfoService extends BaseService {
      return JobStatus.Skipped;
    }

-    const embedding = await this.machineLearningRepository.encodeImage(
-      machineLearning.urls,
-      asset.files[0].path,
-      machineLearning.clip,
-    );
+    const embedding = await this.machineLearningRepository.encodeImage(asset.files[0].path, machineLearning.clip);

    if (this.databaseRepository.isBusy(DatabaseLock.CLIPDimSize)) {
      this.logger.verbose(`Waiting for CLIP dimension size to be updated`);
--- a/server/src/services/system-config.service.spec.ts
+++ b/server/src/services/system-config.service.spec.ts
@ -82,6 +82,11 @@ const updatedConfig = Object.freeze<SystemConfig>({
  machineLearning: {
    enabled: true,
    urls: ['http://immich-machine-learning:3003'],
+    availabilityChecks: {
+      enabled: true,
+      interval: 30_000,
+      timeout: 2000,
+    },
    clip: {
      enabled: true,
      modelName: 'ViT-B-32__openai',
--- a/server/src/services/system-config.service.ts
+++ b/server/src/services/system-config.service.ts
@ -16,6 +16,20 @@ export class SystemConfigService extends BaseService {
  async onBootstrap() {
    const config = await this.getConfig({ withCache: false });
    await this.eventRepository.emit('ConfigInit', { newConfig: config });
+
+    // Warn when the availability-check settings are still supplied through the environment.
+    // NOTE(review): the constants removed from src/constants.ts read the unprefixed
+    // MACHINE_LEARNING_PING_TIMEOUT / MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME names;
+    // confirm whether those legacy names should trigger this warning as well.
+    if (
+      process.env.IMMICH_MACHINE_LEARNING_PING_TIMEOUT ||
+      process.env.IMMICH_MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME
+    ) {
+      this.logger.deprecate(
+        'IMMICH_MACHINE_LEARNING_PING_TIMEOUT and IMMICH_MACHINE_LEARNING_AVAILABILITY_BACKOFF_TIME have been moved to system config (`machineLearning.availabilityChecks`) and will be removed in a future release.',
+      );
+    }
+  }
+
+  @OnEvent({ name: 'AppShutdown' })
+  onShutdown() {
+    this.machineLearningRepository.teardown();
  }

  async getSystemConfig(): Promise<SystemConfigDto> {
@ -28,12 +42,14 @@ export class SystemConfigService extends BaseService {
  }

  @OnEvent({ name: 'ConfigInit', priority: -100 })
-  onConfigInit({ newConfig: { logging } }: ArgOf<'ConfigInit'>) {
+  onConfigInit({ newConfig: { logging, machineLearning } }: ArgOf<'ConfigInit'>) {
    const { logLevel: envLevel } = this.configRepository.getEnv();
    const configLevel = logging.enabled ? logging.level : false;
    const level = envLevel ?? configLevel;
    this.logger.setLogLevel(level);
    this.logger.log(`LogLevel=${level} ${envLevel ? '(set via IMMICH_LOG_LEVEL)' : '(set via system config)'}`);
+
+    this.machineLearningRepository.setup(machineLearning);
  }

  @OnEvent({ name: 'ConfigUpdate', server: true })