feat(server): near-duplicate detection (#8228)

* duplicate detection job, entity, config

* queueing

* job panel, update api

* use embedding in db instead of fetching

* disable concurrency

* only queue visible assets

* handle multiple duplicateIds

* update concurrent queue check

* add provider

* add web placeholder, server endpoint, migration, various fixes

* update sql

* select embedding by default

* rename variable

* simplify

* remove separate entity, handle re-running with different threshold, set default back to 0.02

* fix tests

* add tests

* add index to entity

* formatting

* update asset mock

* fix `upsertJobStatus` signature

* update sql

* formatting

* default to 0.03

* optimize clustering

* use asset's `duplicateId` if present

* update sql

* update tests

* expose admin setting

* refactor

* formatting

* skip if ml is disabled

* debug trash e2e

* remove from web

* remove from sidebar

* test if ml is disabled

* update sql

* separate duplicate detection from clip in config, disable by default for now

* fix doc

* lower minimum `maxDistance`

* update api

* Add and Use Duplicate Detection Feature Flag (#9364)

* Add Duplicate Detection Flag

* Use Duplicate Detection Flag

* Attempt Fixes for Failing Checks

* lower minimum `maxDistance`

* fix tests

---------

Co-authored-by: mertalev <101130780+mertalev@users.noreply.github.com>

* chore: fixes and additions after rebase

* chore: update api (remove new Role enum)

* fix: left join smart search so getAll works without machine learning

* test: trash e2e go back to checking length of assets is zero

* chore: regen api after rebase

* test: fix tests after rebase

* redundant join

---------

Co-authored-by: Nicholas Flamy <30300649+NicholasFlamy@users.noreply.github.com>
Co-authored-by: Zack Pollard <zackpollard@ymail.com>
Co-authored-by: Zack Pollard <zack@futo.org>
This commit is contained in:
Mert 2024-05-16 13:08:37 -04:00 committed by GitHub
parent 673e97e71d
commit 64636c0618
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
61 changed files with 1254 additions and 61 deletions

View file

@ -40,6 +40,7 @@ export enum WithoutProperty {
ENCODED_VIDEO = 'encoded-video',
EXIF = 'exif',
SMART_SEARCH = 'smart-search',
DUPLICATE = 'duplicate',
OBJECT_TAGS = 'object-tags',
FACES = 'faces',
PERSON = 'person',
@ -60,6 +61,7 @@ export interface AssetBuilderOptions {
isArchived?: boolean;
isFavorite?: boolean;
isTrashed?: boolean;
isDuplicate?: boolean;
albumId?: string;
personId?: string;
userIds?: string[];
@ -143,6 +145,12 @@ export interface AssetDeltaSyncOptions {
limit: number;
}
export interface AssetUpdateDuplicateOptions {
targetDuplicateId: string | null;
assetIds: string[];
duplicateIds: string[];
}
export type AssetPathEntity = Pick<AssetEntity, 'id' | 'originalPath' | 'isOffline'>;
export const IAssetRepository = 'IAssetRepository';
@ -176,6 +184,7 @@ export interface IAssetRepository {
getAll(pagination: PaginationOptions, options?: AssetSearchOptions): Paginated<AssetEntity>;
getAllByDeviceId(userId: string, deviceId: string): Promise<string[]>;
updateAll(ids: string[], options: Partial<AssetUpdateAllOptions>): Promise<void>;
updateDuplicates(options: AssetUpdateDuplicateOptions): Promise<void>;
update(asset: AssetUpdateOptions): Promise<void>;
remove(asset: AssetEntity): Promise<void>;
softDeleteAll(ids: string[]): Promise<void>;
@ -186,9 +195,10 @@ export interface IAssetRepository {
getTimeBuckets(options: TimeBucketOptions): Promise<TimeBucketItem[]>;
getTimeBucket(timeBucket: string, options: TimeBucketOptions): Promise<AssetEntity[]>;
upsertExif(exif: Partial<ExifEntity>): Promise<void>;
upsertJobStatus(jobStatus: Partial<AssetJobStatusEntity>): Promise<void>;
upsertJobStatus(...jobStatus: Partial<AssetJobStatusEntity>[]): Promise<void>;
getAssetIdByCity(userId: string, options: AssetExploreFieldOptions): Promise<SearchExploreItem<string>>;
getAssetIdByTag(userId: string, options: AssetExploreFieldOptions): Promise<SearchExploreItem<string>>;
getDuplicates(options: AssetBuilderOptions): Promise<AssetEntity[]>;
getAllForUserFullSync(options: AssetFullSyncOptions): Promise<AssetEntity[]>;
getChangedDeltaSync(options: AssetDeltaSyncOptions): Promise<AssetEntity[]>;
}

View file

@ -5,6 +5,7 @@ export enum QueueName {
FACE_DETECTION = 'faceDetection',
FACIAL_RECOGNITION = 'facialRecognition',
SMART_SEARCH = 'smartSearch',
DUPLICATE_DETECTION = 'duplicateDetection',
BACKGROUND_TASK = 'backgroundTask',
STORAGE_TEMPLATE_MIGRATION = 'storageTemplateMigration',
MIGRATION = 'migration',
@ -16,7 +17,7 @@ export enum QueueName {
export type ConcurrentQueueName = Exclude<
QueueName,
QueueName.STORAGE_TEMPLATE_MIGRATION | QueueName.FACIAL_RECOGNITION
QueueName.STORAGE_TEMPLATE_MIGRATION | QueueName.FACIAL_RECOGNITION | QueueName.DUPLICATE_DETECTION
>;
export enum JobCommand {
@ -86,6 +87,10 @@ export enum JobName {
QUEUE_SMART_SEARCH = 'queue-smart-search',
SMART_SEARCH = 'smart-search',
// duplicate detection
QUEUE_DUPLICATE_DETECTION = 'queue-duplicate-detection',
DUPLICATE_DETECTION = 'duplicate-detection',
// XMP sidecars
QUEUE_SIDECAR = 'queue-sidecar',
SIDECAR_DISCOVERY = 'sidecar-discovery',
@ -212,6 +217,10 @@ export type JobItem =
| { name: JobName.QUEUE_SMART_SEARCH; data: IBaseJob }
| { name: JobName.SMART_SEARCH; data: IEntityJob }
// Duplicate Detection
| { name: JobName.QUEUE_DUPLICATE_DETECTION; data: IBaseJob }
| { name: JobName.DUPLICATE_DETECTION; data: IEntityJob }
// Filesystem
| { name: JobName.DELETE_FILES; data: IDeleteFilesJob }

View file

@ -152,15 +152,29 @@ export interface FaceEmbeddingSearch extends SearchEmbeddingOptions {
maxDistance?: number;
}
export interface AssetDuplicateSearch {
assetId: string;
embedding: Embedding;
userIds: string[];
maxDistance?: number;
}
export interface FaceSearchResult {
distance: number;
face: AssetFaceEntity;
}
export interface AssetDuplicateResult {
assetId: string;
duplicateId: string | null;
distance: number;
}
export interface ISearchRepository {
init(modelName: string): Promise<void>;
searchMetadata(pagination: SearchPaginationOptions, options: AssetSearchOptions): Paginated<AssetEntity>;
searchSmart(pagination: SearchPaginationOptions, options: SmartSearchOptions): Paginated<AssetEntity>;
searchDuplicates(options: AssetDuplicateSearch): Promise<AssetDuplicateResult[]>;
searchFaces(search: FaceEmbeddingSearch): Promise<FaceSearchResult[]>;
upsert(assetId: string, embedding: number[]): Promise<void>;
searchPlaces(placeName: string): Promise<GeodataPlacesEntity[]>;