From 76109c5c6b2ff7746601fd0c6a02a159b7de5356 Mon Sep 17 00:00:00 2001
From: Ryan Gregg
Date: Wed, 8 Oct 2025 21:39:18 -0700
Subject: [PATCH] Add additional dedupe logic

suggestDuplicate now ranks duplicates by original file format before
file size and exif count. Format preference, best first: DNG, other
camera RAW formats, HEIC/HEIF, AVIF, JPEG; any format that is not
listed ranks last.

The format is looked up from the lower-cased originalMimeType, then
from the extension of originalFileName, then from the extension of
originalPath; the first candidate found in the lookup table wins.

Within the same format rank the largest fileSizeInByte wins, then the
highest exif count. An empty list returns undefined.

Note: assets that tie on all three keys now resolve to the first one in
input order (sortBy is stable and the head of the list is returned);
the previous implementation returned the last one.

---
 web/src/lib/utils/duplicate-utils.spec.ts | 16 ++++
 web/src/lib/utils/duplicate-utils.ts      | 94 ++++++++++++++++++++---
 2 files changed, 98 insertions(+), 12 deletions(-)

diff --git a/web/src/lib/utils/duplicate-utils.spec.ts b/web/src/lib/utils/duplicate-utils.spec.ts
index 4fa427989a..5937cd6dfb 100644
--- a/web/src/lib/utils/duplicate-utils.spec.ts
+++ b/web/src/lib/utils/duplicate-utils.spec.ts
@@ -11,6 +11,22 @@ describe('choosing a duplicate', () => {
     expect(suggestDuplicate(assets as AssetResponseDto[])).toEqual(assets[0]);
   });
 
+  it('prefers DNG over CR2 even when the DNG is smaller', () => {
+    const assets = [
+      { originalMimeType: 'image/x-canon-cr2', exifInfo: { fileSizeInByte: 500 } },
+      { originalMimeType: 'image/x-adobe-dng', exifInfo: { fileSizeInByte: 200 } },
+    ];
+    expect(suggestDuplicate(assets as AssetResponseDto[])).toEqual(assets[1]);
+  });
+
+  it('prefers HEIC over JPEG even when the HEIC is smaller', () => {
+    const assets = [
+      { originalMimeType: 'image/jpeg', exifInfo: { fileSizeInByte: 400 } },
+      { originalMimeType: 'image/heic', exifInfo: { fileSizeInByte: 150 } },
+    ];
+    expect(suggestDuplicate(assets as AssetResponseDto[])).toEqual(assets[1]);
+  });
+
   it('picks the asset with the most exif data if multiple assets have the same file size', () => {
     const assets = [
       { exifInfo: { fileSizeInByte: 200, rating: 5, fNumber: 1 } },
diff --git a/web/src/lib/utils/duplicate-utils.ts b/web/src/lib/utils/duplicate-utils.ts
index 1c783a3667..1279742c4c 100644
--- a/web/src/lib/utils/duplicate-utils.ts
+++ b/web/src/lib/utils/duplicate-utils.ts
@@ -2,10 +2,83 @@ import { getExifCount } from '$lib/utils/exif-utils';
 import type { AssetResponseDto } from '@immich/sdk';
 import { sortBy } from 'lodash-es';
 
+const formatPreferenceGroups: string[][] = [
+  ['image/x-adobe-dng', 'image/dng', 'dng'],
+  [
+    'image/x-canon-cr3',
+    'image/x-canon-cr2',
+    'image/x-nikon-nef',
+    'image/x-sony-arw',
+    'image/x-olympus-orf',
+    'image/x-fuji-raf',
+    'image/x-panasonic-rw2',
+    'image/x-panasonic-raw',
+    'image/x-pentax-pef',
+    'image/x-samsung-srw',
+    'cr3',
+    'cr2',
+    'nef',
+    'arw',
+    'orf',
+    'raf',
+    'rw2',
+    'raw',
+    'pef',
+    'srw',
+  ],
+  ['image/heic', 'image/heif', 'heic', 'heif'],
+  ['image/avif', 'avif'],
+  ['image/jpeg', 'image/jpg', 'jpeg', 'jpg'],
+];
+
+const DEFAULT_FORMAT_PRIORITY = formatPreferenceGroups.length;
+
+const formatPriorityLookup = formatPreferenceGroups.reduce<Map<string, number>>((lookup, group, index) => {
+  for (const format of group) {
+    lookup.set(format, index);
+  }
+  return lookup;
+}, new Map<string, number>());
+
+const getExtension = (path?: string) => {
+  if (!path) {
+    return undefined;
+  }
+
+  const index = path.lastIndexOf('.');
+  if (index === -1 || index === path.length - 1) {
+    return undefined;
+  }
+
+  return path.slice(index + 1).toLowerCase();
+};
+
+const getAssetFormatPriority = (asset: AssetResponseDto) => {
+  const candidates = [
+    asset.originalMimeType?.toLowerCase(),
+    getExtension(asset.originalFileName),
+    getExtension(asset.originalPath),
+  ];
+
+  for (const candidate of candidates) {
+    if (!candidate) {
+      continue;
+    }
+
+    const priority = formatPriorityLookup.get(candidate);
+    if (priority !== undefined) {
+      return priority;
+    }
+  }
+
+  return DEFAULT_FORMAT_PRIORITY;
+};
+
 /**
  * Suggests the best duplicate asset to keep from a list of duplicates.
  *
  * The best asset is determined by the following criteria:
+ * - Preferred original file format (based on mime type or extension)
  * - Largest image file size in bytes
  * - Largest count of exif data
  *
@@ -13,18 +86,15 @@ import { sortBy } from 'lodash-es';
  * @returns The best asset to keep
  */
 export const suggestDuplicate = (assets: AssetResponseDto[]): AssetResponseDto | undefined => {
-  let duplicateAssets = sortBy(assets, (asset) => asset.exifInfo?.fileSizeInByte ?? 0);
-
-  // Update the list to only include assets with the largest file size
-  duplicateAssets = duplicateAssets.filter(
-    (asset) => asset.exifInfo?.fileSizeInByte === duplicateAssets.at(-1)?.exifInfo?.fileSizeInByte,
-  );
-
-  // If there are multiple assets with the same file size, sort the list by the count of exif data
-  if (duplicateAssets.length >= 2) {
-    duplicateAssets = sortBy(duplicateAssets, getExifCount);
+  if (assets.length === 0) {
+    return undefined;
   }
 
-  // Return the last asset in the list
-  return duplicateAssets.pop();
+  const sorted = sortBy(assets, [
+    (asset) => getAssetFormatPriority(asset),
+    (asset) => -(asset.exifInfo?.fileSizeInByte ?? 0),
+    (asset) => -getExifCount(asset),
+  ]);
+
+  return sorted[0];
 };