feat(cli): use a queue for duplicate and upload (#10750)

* feat(cli): use a queue for duplicate and upload

Processing the files through a queue makes duplicate detection and asset upload more stable and more tolerant of network errors. If an error occurs, the whole command no longer stops: the failing task is retried (up to 3 times), and if it still fails the error is logged before moving to the next step.

The new queue abstraction uses [fastq](https://www.npmjs.com/package/fastq) internally.

* chore(cli): queue.push returns a promise which resolves with the task

* test(cli): add spec for uploadFiles and checkForDuplicates
This commit is contained in:
Simon Thiboutôt 2024-07-08 23:39:07 -04:00 committed by GitHub
parent af94f0f979
commit eb89208abb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 456 additions and 39 deletions

View file

@ -1,10 +1,18 @@
import { platform } from 'node:os';
import { UploadOptionsDto, getAlbumName } from 'src/commands/asset';
import { describe, expect, it } from 'vitest';
import * as fs from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { describe, expect, it, vi } from 'vitest';
describe('Unit function tests', () => {
import { Action, checkBulkUpload, defaults, Reason } from '@immich/sdk';
import createFetchMock from 'vitest-fetch-mock';
import { checkForDuplicates, getAlbumName, uploadFiles, UploadOptionsDto } from './asset';
vi.mock('@immich/sdk');
describe('getAlbumName', () => {
it('should return a non-undefined value', () => {
if (platform() === 'win32') {
if (os.platform() === 'win32') {
// This is meaningless for Unix systems.
expect(getAlbumName(String.raw`D:\test\Filename.txt`, {} as UploadOptionsDto)).toBe('test');
}
@ -17,3 +25,177 @@ describe('Unit function tests', () => {
);
});
});
/**
 * Specs for `uploadFiles`, exercised against a mocked `fetch`.
 *
 * A temp directory is created once for the suite. Before every test the fixture file is
 * (re)written, the mocked SDK `defaults` are pointed at `baseUrl`, and the fetch mock is
 * enabled and reset. Each test then registers its own response for requests whose URL ends
 * in `/assets`.
 */
describe('uploadFiles', () => {
  const testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-'));
  const testFilePath = path.join(testDir, 'test.png');
  const testFileData = 'test';
  const baseUrl = 'http://example.com';
  const apiKey = 'key';
  // Attempt number on which the flaky mock below starts succeeding.
  // NOTE(review): presumably mirrors the retry count configured inside `uploadFiles` — confirm.
  const retry = 3;
  const fetchMocker = createFetchMock(vi);

  // Shared by every test: the URL matcher for the upload endpoint and the canned success reply.
  const assetsEndpoint = new RegExp(`${baseUrl}/assets$`);
  const uploadedAssetId = 'fc5621b1-86f6-44a1-9905-403e607df9f5';
  const createdResponse = () => ({
    status: 200,
    body: JSON.stringify({ id: uploadedAssetId, status: 'created' }),
  });

  beforeEach(() => {
    // Create a test file
    fs.writeFileSync(testFilePath, testFileData);

    // Defaults
    vi.mocked(defaults).baseUrl = baseUrl;
    vi.mocked(defaults).headers = { 'x-api-key': apiKey };

    fetchMocker.enableMocks();
    fetchMocker.resetMocks();
  });

  it('returns new assets when upload file is successful', async () => {
    fetchMocker.doMockIf(assetsEndpoint, () => createdResponse());

    await expect(uploadFiles([testFilePath], { concurrency: 1 })).resolves.toEqual([
      {
        filepath: testFilePath,
        id: uploadedAssetId,
      },
    ]);
  });

  it('returns new assets when upload file retry is successful', async () => {
    let counter = 0;
    fetchMocker.doMockIf(assetsEndpoint, () => {
      counter++;
      // Calls 1 .. retry-1 fail; call number `retry` is the first one that succeeds.
      if (counter < retry) {
        throw new Error('Network error');
      }
      return createdResponse();
    });

    await expect(uploadFiles([testFilePath], { concurrency: 1 })).resolves.toEqual([
      {
        filepath: testFilePath,
        id: uploadedAssetId,
      },
    ]);
  });

  // Title fixed: the assertion expects an empty list, so nothing is "returned as new" here.
  it('returns no assets when every upload attempt fails', async () => {
    fetchMocker.doMockIf(assetsEndpoint, () => {
      throw new Error('Network error');
    });

    await expect(uploadFiles([testFilePath], { concurrency: 1 })).resolves.toEqual([]);
  });
});
/**
 * Specs for `checkForDuplicates`, with the SDK's `checkBulkUpload` replaced by a mock.
 *
 * A temp directory is created once for the suite and the fixture file is (re)written before
 * every test. `testFileChecksum` is the SHA-1 of the fixture's contents, which is what the
 * first test expects to be sent to `checkBulkUpload`.
 */
describe('checkForDuplicates', () => {
  const testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'test-'));
  const testFilePath = path.join(testDir, 'test.png');
  const testFileData = 'test';
  const testFileChecksum = 'a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'; // SHA1
  // Attempt number on which the flaky mock below starts succeeding.
  // NOTE(review): presumably mirrors the retry count configured inside `checkForDuplicates` — confirm.
  const retry = 3;

  beforeEach(() => {
    // Create a test file
    fs.writeFileSync(testFilePath, testFileData);

    // Drop call history and any implementation left over from the previous test so that each
    // test only observes the behaviour it configures itself (keeps the suite order-independent).
    vi.mocked(checkBulkUpload).mockReset();
  });

  it('checks duplicates', async () => {
    vi.mocked(checkBulkUpload).mockResolvedValue({
      results: [
        {
          action: Action.Accept,
          id: testFilePath,
        },
      ],
    });

    await checkForDuplicates([testFilePath], { concurrency: 1 });

    expect(checkBulkUpload).toHaveBeenCalledWith({
      assetBulkUploadCheckDto: {
        assets: [
          {
            checksum: testFileChecksum,
            id: testFilePath,
          },
        ],
      },
    });
  });

  it('returns duplicates when check duplicates is rejected', async () => {
    vi.mocked(checkBulkUpload).mockResolvedValue({
      results: [
        {
          action: Action.Reject,
          id: testFilePath,
          assetId: 'fc5621b1-86f6-44a1-9905-403e607df9f5',
          reason: Reason.Duplicate,
        },
      ],
    });

    await expect(checkForDuplicates([testFilePath], { concurrency: 1 })).resolves.toEqual({
      duplicates: [
        {
          filepath: testFilePath,
          id: 'fc5621b1-86f6-44a1-9905-403e607df9f5',
        },
      ],
      newFiles: [],
    });
  });

  it('returns new assets when check duplicates is accepted', async () => {
    vi.mocked(checkBulkUpload).mockResolvedValue({
      results: [
        {
          action: Action.Accept,
          id: testFilePath,
        },
      ],
    });

    await expect(checkForDuplicates([testFilePath], { concurrency: 1 })).resolves.toEqual({
      duplicates: [],
      newFiles: [testFilePath],
    });
  });

  it('returns results when check duplicates retry is successful', async () => {
    const mocked = vi.mocked(checkBulkUpload);
    // Queue retry-1 one-shot rejections; the call after those falls through to the resolved value.
    for (let i = 1; i < retry; i++) {
      mocked.mockRejectedValueOnce(new Error('Network error'));
    }
    mocked.mockResolvedValue({
      results: [
        {
          action: Action.Accept,
          id: testFilePath,
        },
      ],
    });

    await expect(checkForDuplicates([testFilePath], { concurrency: 1 })).resolves.toEqual({
      duplicates: [],
      newFiles: [testFilePath],
    });
  });

  it('returns results when check duplicates retry is failed', async () => {
    // Every call rejects, so the file ends up in neither list.
    vi.mocked(checkBulkUpload).mockRejectedValue(new Error('Network error'));

    await expect(checkForDuplicates([testFilePath], { concurrency: 1 })).resolves.toEqual({
      duplicates: [],
      newFiles: [],
    });
  });
});

View file

@ -16,6 +16,7 @@ import { chunk } from 'lodash-es';
import { Stats, createReadStream } from 'node:fs';
import { stat, unlink } from 'node:fs/promises';
import path, { basename } from 'node:path';
import { Queue } from 'src/queue';
import { BaseOptions, authenticate, crawl, sha1 } from 'src/utils';
const s = (count: number) => (count === 1 ? '' : 's');
@ -83,7 +84,7 @@ const scan = async (pathsToCrawl: string[], options: UploadOptionsDto) => {
return files;
};
const checkForDuplicates = async (files: string[], { concurrency, skipHash }: UploadOptionsDto) => {
export const checkForDuplicates = async (files: string[], { concurrency, skipHash }: UploadOptionsDto) => {
if (skipHash) {
console.log('Skipping hash check, assuming all files are new');
return { newFiles: files, duplicates: [] };
@ -99,32 +100,50 @@ const checkForDuplicates = async (files: string[], { concurrency, skipHash }: Up
const newFiles: string[] = [];
const duplicates: Asset[] = [];
try {
// TODO refactor into a queue
for (const items of chunk(files, concurrency)) {
const dto = await Promise.all(items.map(async (filepath) => ({ id: filepath, checksum: await sha1(filepath) })));
const { results } = await checkBulkUpload({ assetBulkUploadCheckDto: { assets: dto } });
for (const { id: filepath, assetId, action } of results as AssetBulkUploadCheckResults) {
const queue = new Queue<string[], AssetBulkUploadCheckResults>(
async (filepaths: string[]) => {
const dto = await Promise.all(
filepaths.map(async (filepath) => ({ id: filepath, checksum: await sha1(filepath) })),
);
const response = await checkBulkUpload({ assetBulkUploadCheckDto: { assets: dto } });
const results = response.results as AssetBulkUploadCheckResults;
for (const { id: filepath, assetId, action } of results) {
if (action === Action.Accept) {
newFiles.push(filepath);
} else {
// rejects are always duplicates
duplicates.push({ id: assetId as string, filepath });
}
progressBar.increment();
}
}
} finally {
progressBar.stop();
progressBar.increment(filepaths.length);
return results;
},
{ concurrency, retry: 3 },
);
for (const items of chunk(files, concurrency)) {
await queue.push(items);
}
await queue.drained();
progressBar.stop();
console.log(`Found ${newFiles.length} new files and ${duplicates.length} duplicate${s(duplicates.length)}`);
// Report failures
const failedTasks = queue.tasks.filter((task) => task.status === 'failed');
if (failedTasks.length > 0) {
console.log(`Failed to verify ${failedTasks.length} file${s(failedTasks.length)}:`);
for (const task of failedTasks) {
console.log(`- ${task.data} - ${task.error}`);
}
}
return { newFiles, duplicates };
};
const uploadFiles = async (files: string[], { dryRun, concurrency }: UploadOptionsDto): Promise<Asset[]> => {
export const uploadFiles = async (files: string[], { dryRun, concurrency }: UploadOptionsDto): Promise<Asset[]> => {
if (files.length === 0) {
console.log('All assets were already uploaded, nothing to do.');
return [];
@ -158,37 +177,52 @@ const uploadFiles = async (files: string[], { dryRun, concurrency }: UploadOptio
const newAssets: Asset[] = [];
try {
for (const items of chunk(files, concurrency)) {
await Promise.all(
items.map(async (filepath) => {
const stats = statsMap.get(filepath) as Stats;
const response = await uploadFile(filepath, stats);
const queue = new Queue<string, AssetMediaResponseDto>(
async (filepath: string) => {
const stats = statsMap.get(filepath);
if (!stats) {
throw new Error(`Stats not found for ${filepath}`);
}
newAssets.push({ id: response.id, filepath });
const response = await uploadFile(filepath, stats);
newAssets.push({ id: response.id, filepath });
if (response.status === AssetMediaStatus.Duplicate) {
duplicateCount++;
duplicateSize += stats.size ?? 0;
} else {
successCount++;
successSize += stats.size ?? 0;
}
if (response.status === AssetMediaStatus.Duplicate) {
duplicateCount++;
duplicateSize += stats.size ?? 0;
} else {
successCount++;
successSize += stats.size ?? 0;
}
uploadProgress.update(successSize, { value_formatted: byteSize(successSize + duplicateSize) });
uploadProgress.update(successSize, { value_formatted: byteSize(successSize + duplicateSize) });
return response;
},
{ concurrency, retry: 3 },
);
return response;
}),
);
}
} finally {
uploadProgress.stop();
for (const filepath of files) {
await queue.push(filepath);
}
await queue.drained();
uploadProgress.stop();
console.log(`Successfully uploaded ${successCount} new asset${s(successCount)} (${byteSize(successSize)})`);
if (duplicateCount > 0) {
console.log(`Skipped ${duplicateCount} duplicate asset${s(duplicateCount)} (${byteSize(duplicateSize)})`);
}
// Report failures
const failedTasks = queue.tasks.filter((task) => task.status === 'failed');
if (failedTasks.length > 0) {
console.log(`Failed to upload ${failedTasks.length} asset${s(failedTasks.length)}:`);
for (const task of failedTasks) {
console.log(`- ${task.data} - ${task.error}`);
}
}
return newAssets;
};