feat(ml): composable ml (#9973)

* modularize model classes

* various fixes

* expose port

* change response

* round coordinates

* simplify preload

* update server

* simplify interface

simplify

* update tests

* composable endpoint

* cleanup

fixes

remove unnecessary interface

support text input, cleanup

* ew camelcase

* update server

server fixes

fix typing

* ml fixes

update locustfile

fixes

* cleaner response

* better repo response

* update tests

formatting and typing

rename

* undo compose change

* linting

fix type

actually fix typing

* stricter typing

fix detection-only response

no need for defaultdict

* update spec file

update api

linting

* update e2e

* unnecessary dimension

* remove commented code

* remove duplicate code

* remove unused imports

* add batch dim
Mert 2024-06-06 23:09:47 -04:00 committed by GitHub
parent 7a46f80ddc
commit 2b1b43a7e4
39 changed files with 982 additions and 999 deletions


@@ -1,15 +1,5 @@
-import { CLIPConfig, RecognitionConfig } from 'src/dtos/model-config.dto';
 export const IMachineLearningRepository = 'IMachineLearningRepository';
-export interface VisionModelInput {
-  imagePath: string;
-}
-export interface TextModelInput {
-  text: string;
-}
 export interface BoundingBox {
   x1: number;
   y1: number;
@@ -17,26 +7,51 @@ export interface BoundingBox {
   y2: number;
 }
-export interface DetectFaceResult {
-  imageWidth: number;
-  imageHeight: number;
-  boundingBox: BoundingBox;
-  score: number;
-  embedding: number[];
+export enum ModelTask {
+  FACIAL_RECOGNITION = 'facial-recognition',
+  SEARCH = 'clip',
 }
 export enum ModelType {
-  FACIAL_RECOGNITION = 'facial-recognition',
-  CLIP = 'clip',
+  DETECTION = 'detection',
+  PIPELINE = 'pipeline',
+  RECOGNITION = 'recognition',
+  TEXTUAL = 'textual',
+  VISUAL = 'visual',
 }
-export enum CLIPMode {
-  VISION = 'vision',
-  TEXT = 'text',
-}
+export type ModelPayload = { imagePath: string } | { text: string };
+type ModelOptions = { modelName: string };
+export type FaceDetectionOptions = ModelOptions & { minScore: number };
+type VisualResponse = { imageHeight: number; imageWidth: number };
+export type ClipVisualRequest = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: ModelOptions } };
+export type ClipVisualResponse = { [ModelTask.SEARCH]: number[] } & VisualResponse;
+export type ClipTextualRequest = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: ModelOptions } };
+export type ClipTextualResponse = { [ModelTask.SEARCH]: number[] };
+export type FacialRecognitionRequest = {
+  [ModelTask.FACIAL_RECOGNITION]: {
+    [ModelType.DETECTION]: FaceDetectionOptions;
+    [ModelType.RECOGNITION]: ModelOptions;
+  };
+};
+export interface Face {
+  boundingBox: BoundingBox;
+  embedding: number[];
+  score: number;
+}
+export type FacialRecognitionResponse = { [ModelTask.FACIAL_RECOGNITION]: Face[] } & VisualResponse;
+export type DetectedFaces = { faces: Face[] } & VisualResponse;
+export type MachineLearningRequest = ClipVisualRequest | ClipTextualRequest | FacialRecognitionRequest;
 export interface IMachineLearningRepository {
-  encodeImage(url: string, input: VisionModelInput, config: CLIPConfig): Promise<number[]>;
-  encodeText(url: string, input: TextModelInput, config: CLIPConfig): Promise<number[]>;
-  detectFaces(url: string, input: VisionModelInput, config: RecognitionConfig): Promise<DetectFaceResult[]>;
+  encodeImage(url: string, imagePath: string, config: ModelOptions): Promise<number[]>;
+  encodeText(url: string, text: string, config: ModelOptions): Promise<number[]>;
+  detectFaces(url: string, imagePath: string, config: FaceDetectionOptions): Promise<DetectedFaces>;
 }
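
The new request types are what make the single ML endpoint composable: a request nests one or more ModelType entries under a ModelTask key, and the response comes back keyed by the same task. A minimal usage sketch against the types above; the model names and the minScore value are illustrative placeholders, not defaults mandated by this interface:

const clipVisual: ClipVisualRequest = {
  [ModelTask.SEARCH]: { [ModelType.VISUAL]: { modelName: 'ViT-B-32__openai' } }, // illustrative model name
};

const facial: FacialRecognitionRequest = {
  [ModelTask.FACIAL_RECOGNITION]: {
    [ModelType.DETECTION]: { modelName: 'buffalo_l', minScore: 0.7 }, // illustrative values
    [ModelType.RECOGNITION]: { modelName: 'buffalo_l' },
  },
};

// Both shapes are members of the MachineLearningRequest union.
const requests: MachineLearningRequest[] = [clipVisual, facial];

Because ModelTask.SEARCH is the string 'clip' and ModelType.VISUAL is 'visual', the first request serializes as {"clip":{"visual":{"modelName":"ViT-B-32__openai"}}}, which is what lets the ML server route each nested entry to the matching model.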


@@ -37,8 +37,6 @@ export interface SearchExploreItem<T> {
   items: SearchExploreItemSet<T>;
 }
-export type Embedding = number[];
 export interface SearchAssetIDOptions {
   checksum?: Buffer;
   deviceAssetId?: string;
@@ -106,7 +104,7 @@ export interface SearchExifOptions {
 }
 export interface SearchEmbeddingOptions {
-  embedding: Embedding;
+  embedding: number[];
   userIds: string[];
 }
@@ -154,7 +152,7 @@ export interface FaceEmbeddingSearch extends SearchEmbeddingOptions {
 export interface AssetDuplicateSearch {
   assetId: string;
-  embedding: Embedding;
+  embedding: number[];
   maxDistance?: number;
   type: AssetType;
   userIds: string[];
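
With the Embedding alias removed, call sites spell number[] directly. A hypothetical construction of the duplicate-search options above, with illustrative values (and assuming the codebase's AssetType enum has an IMAGE member):

const duplicateSearch: AssetDuplicateSearch = {
  assetId: 'asset-uuid',          // illustrative id
  embedding: [0.12, -0.03, 0.54], // raw CLIP embedding, now a plain number[]
  maxDistance: 0.03,              // illustrative distance threshold
  type: AssetType.IMAGE,          // assumes an IMAGE member on AssetType
  userIds: ['user-uuid'],
};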