Mirror of https://github.com/immich-app/immich, synced 2025-11-14 17:36:12 +00:00
feat(ml): composable ml (#9973)
* modularize model classes
* various fixes
* expose port
* change response
* round coordinates
* simplify preload
* update server
* simplify interface simplify
* update tests
* composable endpoint
* cleanup fixes remove unnecessary interface support text input, cleanup
* ew camelcase
* update server server fixes fix typing
* ml fixes update locustfile fixes
* cleaner response
* better repo response
* update tests formatting and typing rename
* undo compose change
* linting fix type actually fix typing
* stricter typing fix detection-only response no need for defaultdict
* update spec file update api linting
* update e2e
* unnecessary dimension
* remove commented code
* remove duplicate code
* remove unused imports
* add batch dim
This commit is contained in:
parent
7a46f80ddc
commit
2b1b43a7e4
39 changed files with 982 additions and 999 deletions
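To illustrate the "composable" request shape this change introduces, below is a minimal sketch of payloads built from the ModelTask/ModelType enums and option types added in this diff. The model names are placeholder values and the endpoint and transport are not shown in this excerpt; only the shapes come from the new types.

// Sketch only: request payloads composed from the types added in this commit.
// Model names are placeholders; the endpoint and wire format are assumptions.

enum ModelTask {
  FACIAL_RECOGNITION = 'facial-recognition',
  SEARCH = 'clip',
}

enum ModelType {
  DETECTION = 'detection',
  RECOGNITION = 'recognition',
  TEXTUAL = 'textual',
  VISUAL = 'visual',
}

// A CLIP visual request: "run the visual encoder of this model on the attached image".
const clipVisual = {
  [ModelTask.SEARCH]: { [ModelType.VISUAL]: { modelName: 'ViT-B-32__openai' } },
};

// A facial-recognition request composes detection and recognition options in one payload.
const facialRecognition = {
  [ModelTask.FACIAL_RECOGNITION]: {
    [ModelType.DETECTION]: { modelName: 'buffalo_l', minScore: 0.7 },
    [ModelType.RECOGNITION]: { modelName: 'buffalo_l' },
  },
};

console.log(JSON.stringify(clipVisual), JSON.stringify(facialRecognition));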
@@ -1,15 +1,5 @@
-import { CLIPConfig, RecognitionConfig } from 'src/dtos/model-config.dto';
-
 export const IMachineLearningRepository = 'IMachineLearningRepository';
 
-export interface VisionModelInput {
-  imagePath: string;
-}
-
-export interface TextModelInput {
-  text: string;
-}
-
 export interface BoundingBox {
   x1: number;
   y1: number;
@@ -17,26 +7,51 @@ export interface BoundingBox {
   y2: number;
 }
 
-export interface DetectFaceResult {
-  imageWidth: number;
-  imageHeight: number;
-  boundingBox: BoundingBox;
-  score: number;
-  embedding: number[];
+export enum ModelTask {
+  FACIAL_RECOGNITION = 'facial-recognition',
+  SEARCH = 'clip',
 }
 
 export enum ModelType {
-  FACIAL_RECOGNITION = 'facial-recognition',
-  CLIP = 'clip',
+  DETECTION = 'detection',
+  PIPELINE = 'pipeline',
+  RECOGNITION = 'recognition',
+  TEXTUAL = 'textual',
+  VISUAL = 'visual',
 }
 
-export enum CLIPMode {
-  VISION = 'vision',
-  TEXT = 'text',
+export type ModelPayload = { imagePath: string } | { text: string };
+
+type ModelOptions = { modelName: string };
+
+export type FaceDetectionOptions = ModelOptions & { minScore: number };
+
+type VisualResponse = { imageHeight: number; imageWidth: number };
+export type ClipVisualRequest = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: ModelOptions } };
+export type ClipVisualResponse = { [ModelTask.SEARCH]: number[] } & VisualResponse;
+
+export type ClipTextualRequest = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: ModelOptions } };
+export type ClipTextualResponse = { [ModelTask.SEARCH]: number[] };
+
+export type FacialRecognitionRequest = {
+  [ModelTask.FACIAL_RECOGNITION]: {
+    [ModelType.DETECTION]: FaceDetectionOptions;
+    [ModelType.RECOGNITION]: ModelOptions;
+  };
+};
+
+export interface Face {
+  boundingBox: BoundingBox;
+  embedding: number[];
+  score: number;
 }
 
+export type FacialRecognitionResponse = { [ModelTask.FACIAL_RECOGNITION]: Face[] } & VisualResponse;
+export type DetectedFaces = { faces: Face[] } & VisualResponse;
+export type MachineLearningRequest = ClipVisualRequest | ClipTextualRequest | FacialRecognitionRequest;
+
 export interface IMachineLearningRepository {
-  encodeImage(url: string, input: VisionModelInput, config: CLIPConfig): Promise<number[]>;
-  encodeText(url: string, input: TextModelInput, config: CLIPConfig): Promise<number[]>;
-  detectFaces(url: string, input: VisionModelInput, config: RecognitionConfig): Promise<DetectFaceResult[]>;
+  encodeImage(url: string, imagePath: string, config: ModelOptions): Promise<number[]>;
+  encodeText(url: string, text: string, config: ModelOptions): Promise<number[]>;
+  detectFaces(url: string, imagePath: string, config: FaceDetectionOptions): Promise<DetectedFaces>;
 }
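A hedged sketch of a caller against the new repository signatures above: the image path or text now goes in directly (the removed VisionModelInput/TextModelInput wrappers are gone), and face detection returns DetectedFaces together with the image dimensions. The service URL and model names below are placeholder values, and the types are re-declared locally so the sketch stays self-contained.

// Sketch only: the method shapes mirror the interface in this diff; everything else is assumed.
type ModelOptions = { modelName: string };
type FaceDetectionOptions = ModelOptions & { minScore: number };
interface BoundingBox { x1: number; y1: number; x2: number; y2: number }
interface Face { boundingBox: BoundingBox; embedding: number[]; score: number }
type DetectedFaces = { faces: Face[]; imageWidth: number; imageHeight: number };

interface MachineLearningRepository {
  encodeImage(url: string, imagePath: string, config: ModelOptions): Promise<number[]>;
  encodeText(url: string, text: string, config: ModelOptions): Promise<number[]>;
  detectFaces(url: string, imagePath: string, config: FaceDetectionOptions): Promise<DetectedFaces>;
}

async function indexAsset(repo: MachineLearningRepository, imagePath: string) {
  const url = 'http://immich-machine-learning:3003'; // assumed service URL

  // CLIP image embedding: a plain path plus ModelOptions, returning number[] directly.
  const clip = await repo.encodeImage(url, imagePath, { modelName: 'ViT-B-32__openai' });

  // Face detection returns the faces together with the source image dimensions.
  const { faces, imageWidth, imageHeight } = await repo.detectFaces(url, imagePath, {
    modelName: 'buffalo_l',
    minScore: 0.7,
  });

  return { clip, faces, imageWidth, imageHeight };
}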
@@ -37,8 +37,6 @@ export interface SearchExploreItem<T> {
   items: SearchExploreItemSet<T>;
 }
 
-export type Embedding = number[];
-
 export interface SearchAssetIDOptions {
   checksum?: Buffer;
   deviceAssetId?: string;
@@ -106,7 +104,7 @@ export interface SearchExifOptions {
 }
 
 export interface SearchEmbeddingOptions {
-  embedding: Embedding;
+  embedding: number[];
   userIds: string[];
 }
 
@@ -154,7 +152,7 @@ export interface FaceEmbeddingSearch extends SearchEmbeddingOptions {
 
 export interface AssetDuplicateSearch {
   assetId: string;
-  embedding: Embedding;
+  embedding: number[];
   maxDistance?: number;
   type: AssetType;
   userIds: string[];
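With the Embedding alias removed in favour of plain number[], search options are built with raw vectors. A small sketch under that assumption; all values are placeholders and the AssetType value is shown as a bare string literal.

// Sketch: embedding fields are now plain number[] rather than the removed Embedding alias.
import { randomUUID } from 'node:crypto';

const duplicateSearch = {
  assetId: randomUUID(),           // placeholder asset id
  embedding: [0.12, -0.03, 0.58],  // raw vector, previously typed as Embedding
  maxDistance: 0.03,
  type: 'IMAGE' as const,          // placeholder for an AssetType value
  userIds: [randomUUID()],
};

console.log(duplicateSearch.embedding.length);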