Mirror of https://github.com/immich-app/immich, synced 2025-11-14 17:36:12 +00:00
feat(ml): composable ml (#9973)
* modularize model classes
* various fixes
* expose port
* change response
* round coordinates
* simplify preload
* update server
* simplify interface simplify
* update tests
* composable endpoint
* cleanup fixes remove unnecessary interface support text input, cleanup
* ew camelcase
* update server server fixes fix typing
* ml fixes update locustfile fixes
* cleaner response
* better repo response
* update tests formatting and typing rename
* undo compose change
* linting fix type actually fix typing
* stricter typing fix detection-only response no need for defaultdict
* update spec file update api linting
* update e2e
* unnecessary dimension
* remove commented code
* remove duplicate code
* remove unused imports
* add batch dim
This commit is contained in:
parent
7a46f80ddc
commit
2b1b43a7e4
39 changed files with 982 additions and 999 deletions
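To illustrate the "composable" request shape this change introduces, below is a minimal sketch of payloads built from the ModelTask/ModelType enums and option types added in this diff. The model names are placeholder values and the endpoint and transport are not shown in this excerpt; only the shapes come from the new types.

// Sketch only: request payloads composed from the types added in this commit.
// Model names are placeholders; the endpoint and wire format are assumptions.

enum ModelTask {
  FACIAL_RECOGNITION = 'facial-recognition',
  SEARCH = 'clip',
}

enum ModelType {
  DETECTION = 'detection',
  RECOGNITION = 'recognition',
  TEXTUAL = 'textual',
  VISUAL = 'visual',
}

// A CLIP visual request: "run the visual encoder of this model on the attached image".
const clipVisual = {
  [ModelTask.SEARCH]: { [ModelType.VISUAL]: { modelName: 'ViT-B-32__openai' } },
};

// A facial-recognition request composes detection and recognition options in one payload.
const facialRecognition = {
  [ModelTask.FACIAL_RECOGNITION]: {
    [ModelType.DETECTION]: { modelName: 'buffalo_l', minScore: 0.7 },
    [ModelType.RECOGNITION]: { modelName: 'buffalo_l' },
  },
};

console.log(JSON.stringify(clipVisual), JSON.stringify(facialRecognition));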
@@ -1,15 +1,5 @@
-import { CLIPConfig, RecognitionConfig } from 'src/dtos/model-config.dto';
-
 export const IMachineLearningRepository = 'IMachineLearningRepository';
 
-export interface VisionModelInput {
-  imagePath: string;
-}
-
-export interface TextModelInput {
-  text: string;
-}
-
 export interface BoundingBox {
   x1: number;
   y1: number;
@@ -17,26 +7,51 @@ export interface BoundingBox {
   y2: number;
 }
 
-export interface DetectFaceResult {
-  imageWidth: number;
-  imageHeight: number;
-  boundingBox: BoundingBox;
-  score: number;
-  embedding: number[];
+export enum ModelTask {
+  FACIAL_RECOGNITION = 'facial-recognition',
+  SEARCH = 'clip',
 }
 
 export enum ModelType {
-  FACIAL_RECOGNITION = 'facial-recognition',
-  CLIP = 'clip',
+  DETECTION = 'detection',
+  PIPELINE = 'pipeline',
+  RECOGNITION = 'recognition',
+  TEXTUAL = 'textual',
+  VISUAL = 'visual',
 }
 
-export enum CLIPMode {
-  VISION = 'vision',
-  TEXT = 'text',
+export type ModelPayload = { imagePath: string } | { text: string };
+
+type ModelOptions = { modelName: string };
+
+export type FaceDetectionOptions = ModelOptions & { minScore: number };
+
+type VisualResponse = { imageHeight: number; imageWidth: number };
+export type ClipVisualRequest = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: ModelOptions } };
+export type ClipVisualResponse = { [ModelTask.SEARCH]: number[] } & VisualResponse;
+
+export type ClipTextualRequest = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: ModelOptions } };
+export type ClipTextualResponse = { [ModelTask.SEARCH]: number[] };
+
+export type FacialRecognitionRequest = {
+  [ModelTask.FACIAL_RECOGNITION]: {
+    [ModelType.DETECTION]: FaceDetectionOptions;
+    [ModelType.RECOGNITION]: ModelOptions;
+  };
+};
+
+export interface Face {
+  boundingBox: BoundingBox;
+  embedding: number[];
+  score: number;
 }
 
+export type FacialRecognitionResponse = { [ModelTask.FACIAL_RECOGNITION]: Face[] } & VisualResponse;
+export type DetectedFaces = { faces: Face[] } & VisualResponse;
+export type MachineLearningRequest = ClipVisualRequest | ClipTextualRequest | FacialRecognitionRequest;
+
 export interface IMachineLearningRepository {
-  encodeImage(url: string, input: VisionModelInput, config: CLIPConfig): Promise<number[]>;
-  encodeText(url: string, input: TextModelInput, config: CLIPConfig): Promise<number[]>;
-  detectFaces(url: string, input: VisionModelInput, config: RecognitionConfig): Promise<DetectFaceResult[]>;
+  encodeImage(url: string, imagePath: string, config: ModelOptions): Promise<number[]>;
+  encodeText(url: string, text: string, config: ModelOptions): Promise<number[]>;
+  detectFaces(url: string, imagePath: string, config: FaceDetectionOptions): Promise<DetectedFaces>;
 }
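A hedged sketch of a caller against the new repository signatures above: the image path or text now goes in directly (the removed VisionModelInput/TextModelInput wrappers are gone), and face detection returns DetectedFaces together with the image dimensions. The service URL and model names below are placeholder values, and the types are re-declared locally so the sketch stays self-contained.

// Sketch only: the method shapes mirror the interface in this diff; everything else is assumed.
type ModelOptions = { modelName: string };
type FaceDetectionOptions = ModelOptions & { minScore: number };
interface BoundingBox { x1: number; y1: number; x2: number; y2: number }
interface Face { boundingBox: BoundingBox; embedding: number[]; score: number }
type DetectedFaces = { faces: Face[]; imageWidth: number; imageHeight: number };

interface MachineLearningRepository {
  encodeImage(url: string, imagePath: string, config: ModelOptions): Promise<number[]>;
  encodeText(url: string, text: string, config: ModelOptions): Promise<number[]>;
  detectFaces(url: string, imagePath: string, config: FaceDetectionOptions): Promise<DetectedFaces>;
}

async function indexAsset(repo: MachineLearningRepository, imagePath: string) {
  const url = 'http://immich-machine-learning:3003'; // assumed service URL

  // CLIP image embedding: a plain path plus ModelOptions, returning number[] directly.
  const clip = await repo.encodeImage(url, imagePath, { modelName: 'ViT-B-32__openai' });

  // Face detection returns the faces together with the source image dimensions.
  const { faces, imageWidth, imageHeight } = await repo.detectFaces(url, imagePath, {
    modelName: 'buffalo_l',
    minScore: 0.7,
  });

  return { clip, faces, imageWidth, imageHeight };
}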
@@ -37,8 +37,6 @@ export interface SearchExploreItem<T> {
   items: SearchExploreItemSet<T>;
 }
 
-export type Embedding = number[];
-
 export interface SearchAssetIDOptions {
   checksum?: Buffer;
   deviceAssetId?: string;
@@ -106,7 +104,7 @@ export interface SearchExifOptions {
 }
 
 export interface SearchEmbeddingOptions {
-  embedding: Embedding;
+  embedding: number[];
   userIds: string[];
 }
 
@@ -154,7 +152,7 @@ export interface FaceEmbeddingSearch extends SearchEmbeddingOptions {
 
 export interface AssetDuplicateSearch {
   assetId: string;
-  embedding: Embedding;
+  embedding: number[];
   maxDistance?: number;
   type: AssetType;
   userIds: string[];
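With the Embedding alias removed in favour of plain number[], search options are built with raw vectors. A small sketch under that assumption; all values are placeholders and the AssetType value is shown as a bare string literal.

// Sketch: embedding fields are now plain number[] rather than the removed Embedding alias.
import { randomUUID } from 'node:crypto';

const duplicateSearch = {
  assetId: randomUUID(),           // placeholder asset id
  embedding: [0.12, -0.03, 0.58],  // raw vector, previously typed as Embedding
  maxDistance: 0.03,
  type: 'IMAGE' as const,          // placeholder for an AssetType value
  userIds: [randomUUID()],
};

console.log(duplicateSearch.embedding.length);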