Add additional dedupe logic

This commit is contained in:
Ryan Gregg 2025-10-08 21:39:18 -07:00
parent 53680d9643
commit 76109c5c6b
2 changed files with 98 additions and 12 deletions

View file

@ -11,6 +11,22 @@ describe('choosing a duplicate', () => {
expect(suggestDuplicate(assets as AssetResponseDto[])).toEqual(assets[0]);
});
// Format preference must outrank file size: the CR2 entry is the larger file
// (500 vs 200 bytes), yet the DNG entry is the expected suggestion.
it('prefers DNG over CR2 even when the DNG is smaller', () => {
const assets = [
{ originalMimeType: 'image/x-canon-cr2', exifInfo: { fileSizeInByte: 500 } },
{ originalMimeType: 'image/x-adobe-dng', exifInfo: { fileSizeInByte: 200 } },
];
// The fixtures are partial objects, hence the cast to AssetResponseDto[].
expect(suggestDuplicate(assets as AssetResponseDto[])).toEqual(assets[1]);
});
// Same rule for consumer formats: the JPEG entry is the larger file
// (400 vs 150 bytes), yet the HEIC entry is the expected suggestion.
it('prefers HEIC over JPEG even when the HEIC is smaller', () => {
const assets = [
{ originalMimeType: 'image/jpeg', exifInfo: { fileSizeInByte: 400 } },
{ originalMimeType: 'image/heic', exifInfo: { fileSizeInByte: 150 } },
];
// The fixtures are partial objects, hence the cast to AssetResponseDto[].
expect(suggestDuplicate(assets as AssetResponseDto[])).toEqual(assets[1]);
});
it('picks the asset with the most exif data if multiple assets have the same file size', () => {
const assets = [
{ exifInfo: { fileSizeInByte: 200, rating: 5, fNumber: 1 } },

View file

@ -2,10 +2,83 @@ import { getExifCount } from '$lib/utils/exif-utils';
import type { AssetResponseDto } from '@immich/sdk';
import { sortBy } from 'lodash-es';
// Groups of interchangeable format identifiers (lowercase mime types and bare
// file extensions). A group's position in this array is the priority given to
// every identifier it contains; the duplicate ranking sorts ascending on that
// number, so earlier groups are preferred over later ones.
const formatPreferenceGroups: string[][] = [
  // Adobe DNG
  ['image/x-adobe-dng', 'image/dng', 'dng'],
  // Camera-vendor raw formats: mime types first, then the matching extensions
  [
    'image/x-canon-cr3',
    'image/x-canon-cr2',
    'image/x-nikon-nef',
    'image/x-sony-arw',
    'image/x-olympus-orf',
    'image/x-fuji-raf',
    'image/x-panasonic-rw2',
    'image/x-panasonic-raw',
    'image/x-pentax-pef',
    'image/x-samsung-srw',
    'cr3',
    'cr2',
    'nef',
    'arw',
    'orf',
    'raf',
    'rw2',
    'raw',
    'pef',
    'srw',
  ],
  // HEIC / HEIF
  ['image/heic', 'image/heif', 'heic', 'heif'],
  // AVIF
  ['image/avif', 'avif'],
  // JPEG
  ['image/jpeg', 'image/jpg', 'jpeg', 'jpg'],
];

// Priority for any identifier that is in none of the groups: one past the
// index of the last group, so unknown formats rank after every known one.
const DEFAULT_FORMAT_PRIORITY = formatPreferenceGroups.length;

// Flat identifier -> priority map, built once at module load from the groups.
const formatPriorityLookup = new Map<string, number>(
  formatPreferenceGroups.flatMap((group, priority) =>
    group.map((format): [string, number] => [format, priority]),
  ),
);
/**
 * Extracts the lowercase file extension (without the dot) from a file name or path.
 *
 * Only the final path segment is inspected, so a dot inside a directory name
 * (e.g. `/photos.2024/IMG_0001`) is not mistaken for an extension separator.
 *
 * @param path File name or path; may be missing or empty.
 * @returns The extension in lowercase, or `undefined` when the path is missing
 * or empty, the last segment contains no dot, or the path ends with a dot.
 */
const getExtension = (path?: string): string | undefined => {
  if (!path) {
    return undefined;
  }

  // Index of the first character of the last path segment ('/' and '\' both count as separators).
  const segmentStart = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\')) + 1;
  const dotIndex = path.lastIndexOf('.');

  // No dot in the last segment (this also covers "no dot at all", where dotIndex is -1),
  // or a trailing dot with nothing after it.
  if (dotIndex < segmentStart || dotIndex === path.length - 1) {
    return undefined;
  }

  return path.slice(dotIndex + 1).toLowerCase();
};
/**
 * Resolves the format priority of an asset.
 *
 * Identifiers are tried in this order: the lowercased original mime type, the
 * extension of the original file name, then the extension of the original
 * path. The first identifier present in `formatPriorityLookup` decides the
 * result; when none is known, `DEFAULT_FORMAT_PRIORITY` is returned.
 *
 * @param asset Asset whose format should be ranked.
 * @returns The priority index of the asset's format.
 */
const getAssetFormatPriority = (asset: AssetResponseDto) => {
  // Missing or empty identifiers are never looked up.
  const priorityOf = (identifier?: string) => (identifier ? formatPriorityLookup.get(identifier) : undefined);

  // `??` (not `||`) so that a legitimate priority of 0 is kept.
  return (
    priorityOf(asset.originalMimeType?.toLowerCase()) ??
    priorityOf(getExtension(asset.originalFileName)) ??
    priorityOf(getExtension(asset.originalPath)) ??
    DEFAULT_FORMAT_PRIORITY
  );
};
/**
* Suggests the best duplicate asset to keep from a list of duplicates.
*
* The best asset is determined by the following criteria:
* - Preferred original file format (based on mime type or extension)
* - Largest image file size in bytes
* - Largest count of exif data
*
@ -13,18 +86,15 @@ import { sortBy } from 'lodash-es';
* @returns The best asset to keep
*/
export const suggestDuplicate = (assets: AssetResponseDto[]): AssetResponseDto | undefined => {
  // Nothing to choose from.
  if (assets.length === 0) {
    return undefined;
  }

  // Rank ascending on three keys, each consulted only when the previous ones tie:
  //   1. format priority (a lower number is a more preferred format),
  //   2. file size, negated so that larger files come first (missing size counts as 0),
  //   3. exif field count, negated so that richer metadata comes first.
  const sorted = sortBy(assets, [
    (asset) => getAssetFormatPriority(asset),
    (asset) => -(asset.exifInfo?.fileSizeInByte ?? 0),
    (asset) => -getExifCount(asset),
  ]);

  // The best candidate is at the front of the ranking.
  return sorted[0];
};