diff --git a/i18n/en.json b/i18n/en.json index 18517733a2..f6dd1ed27f 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -149,8 +149,12 @@ "machine_learning_ocr_description": "Use machine learning to recognize text in images", "machine_learning_ocr_enabled": "Enable OCR", "machine_learning_ocr_enabled_description": "If disabled, images will not be encoded for text recognition.", - "machine_learning_ocr_min_score": "Minimum recognition score", - "machine_learning_ocr_min_score_description": "Minimum confidence score for text to be recognized from 0-1. Lower values will recognize more text but may result in false positives.", + "machine_learning_ocr_min_detection_box_score": "Minimum detection box score", + "machine_learning_ocr_min_detection_box_score_description": "Minimum confidence score for a text box to be detected from 0-1. The detection result box is considered a text box if the average score of all pixels within the box is greater than this threshold. Lower values will detect more text boxes but may result in false positives.", + "machine_learning_ocr_min_detection_score": "Minimum detection score", + "machine_learning_ocr_min_detection_score_description": "Minimum confidence score for text to be detected from 0-1. In the output probability map, only pixels with scores greater than this threshold are considered text pixels. Lower values will detect more text but may result in false positives.", + "machine_learning_ocr_min_recognition_score": "Minimum recognition score", + "machine_learning_ocr_min_recognition_score_description": "Minimum confidence score for text to be recognized from 0-1. Only text results with scores greater than this threshold are retained. The default value for this parameter is 0, meaning no threshold is applied.", "machine_learning_ocr_model": "OCR model", "machine_learning_ocr_model_description": "Choose an OCR model. PP‑OCRv5_server is based on a deeper network architecture, resulting in a larger model size and higher accuracy. 
It is suitable for deployment on high-performance servers and can perform robust text recognition in complex images. In contrast, PP‑OCRv5_mobile employs pruning and lightweight design, with fewer parameters, faster loading, and more efficient computation, making it ideal for edge devices while still maintaining excellent OCR performance.", "machine_learning_ocr_orientation_classify_enabled": "Enable orientation classify", diff --git a/i18n/zh_SIMPLIFIED.json b/i18n/zh_SIMPLIFIED.json index 0552e2a81d..5e9ef86f75 100644 --- a/i18n/zh_SIMPLIFIED.json +++ b/i18n/zh_SIMPLIFIED.json @@ -149,8 +149,12 @@ "machine_learning_ocr_description": "使用机器学习识别图片中的文本", "machine_learning_ocr_enabled": "启用文本识别", "machine_learning_ocr_enabled_description": "如果禁用,则不会对图像编码以用于文本识别。", - "machine_learning_ocr_min_score": "最低识别分数", - "machine_learning_ocr_min_score_description": "文本识别的最小置信度分数范围是0到1。较低的值将识别出更多的文本,但可能导致误报。", + "machine_learning_ocr_min_detection_box_score": "最低检测框分数", + "machine_learning_ocr_min_detection_box_score_description": "文本框被检测到的最小置信度分数范围是0到1。检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域。较低的值将检测到更多的文本框,但可能导致误报。", + "machine_learning_ocr_min_detection_score": "最低检测分数", + "machine_learning_ocr_min_detection_score_description": "文本检测到的最小置信度分数范围是0到1。输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点。较低的值将检测到更多的文本,但可能导致误报。", + "machine_learning_ocr_min_recognition_score": "最低识别分数", + "machine_learning_ocr_min_recognition_score_description": "文本识别的最小置信度分数范围是0到1。得分大于该阈值的文本结果会被保留。默认值为 0,表示不设置阈值。", "machine_learning_ocr_model": "文本识别模型", "machine_learning_ocr_model_description": "选择一个文本识别模型。PP‑OCRv5_server 基于更深的网络结构,模型体积较大,准确率更高,适用于部署在高性能服务器上,可在复杂图像中进行稳健的文本识别;而 PP‑OCRv5_mobile 则通过剪枝与轻量化设计,参数量更少、加载更快、计算更高效,适合在边缘设备上运行,同时仍保持出色的 OCR 性能。", "machine_learning_ocr_orientation_classify_enabled": "启用方向分类", diff --git a/machine-learning/immich_ml/models/ocr/paddle.py b/machine-learning/immich_ml/models/ocr/paddle.py index 880a38179f..801f5b5281 100644 --- a/machine-learning/immich_ml/models/ocr/paddle.py +++ 
b/machine-learning/immich_ml/models/ocr/paddle.py @@ -12,10 +12,9 @@ class PaddleOCRecognizer(InferenceModel): depends = [] identity = (ModelType.OCR, ModelTask.OCR) - def __init__(self, model_name: str, min_score: float = 0.9, **model_kwargs: Any) -> None: - self.min_score = model_kwargs.pop("minScore", min_score) - self.orientation_classify_enabled = model_kwargs.pop("orientationClassifyEnabled", True) - self.unwarping_enabled = model_kwargs.pop("unwarpingEnabled", True) + def __init__(self, model_name: str, **model_kwargs: Any) -> None: + self.orientation_classify_enabled = model_kwargs.get("orientationClassifyEnabled", False) + self.unwarping_enabled = model_kwargs.get("unwarpingEnabled", False) super().__init__(model_name, **model_kwargs) self._load() self.loaded = True @@ -28,23 +27,25 @@ class PaddleOCRecognizer(InferenceModel): use_doc_unwarping=self.unwarping_enabled, ) + def configure(self, **kwargs: Any) -> None: + self.min_detection_score = kwargs.get("minDetectionScore", 0.3) + self.min_detection_box_score = kwargs.get("minDetectionBoxScore", 0.6) + self.min_recognition_score = kwargs.get("minRecognitionScore", 0.0) + def _predict(self, inputs: NDArray[np.uint8] | bytes | Image.Image, **kwargs: Any) -> List[OCROutput]: inputs = decode_cv2(inputs) - results = self.model.predict(inputs) - valid_texts_and_scores = [ - (text, score, box) - for result in results - for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_polys']) - if score >= self.min_score - ] - if not valid_texts_and_scores: - return [] - + results = self.model.predict( + inputs, + text_det_thresh=self.min_detection_score, + text_det_box_thresh=self.min_detection_box_score, + text_rec_score_thresh=self.min_recognition_score + ) return [ OCROutput( text=text, confidence=score, x1=box[0][0], y1=box[0][1], x2=box[1][0], y2=box[1][1], x3=box[2][0], y3=box[2][1], x4=box[3][0], y4=box[3][1] ) - for text, score, box in valid_texts_and_scores + for result in results + 
for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_polys']) ] diff --git a/open-api/immich-openapi-specs.json b/open-api/immich-openapi-specs.json index 97a7af6715..28125efdb3 100644 --- a/open-api/immich-openapi-specs.json +++ b/open-api/immich-openapi-specs.json @@ -12918,10 +12918,22 @@ "enabled": { "type": "boolean" }, - "minScore": { + "minDetectionBoxScore": { "format": "double", "maximum": 1, - "minimum": 0.1, + "minimum": 0, + "type": "number" + }, + "minDetectionScore": { + "format": "double", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "minRecognitionScore": { + "format": "double", + "maximum": 1, + "minimum": 0, "type": "number" }, "modelName": { @@ -12936,7 +12948,9 @@ }, "required": [ "enabled", - "minScore", + "minDetectionBoxScore", + "minDetectionScore", + "minRecognitionScore", "modelName", "orientationClassifyEnabled", "unwarpingEnabled" diff --git a/server/src/config.ts b/server/src/config.ts index ce6eceb6fe..88f1325203 100644 --- a/server/src/config.ts +++ b/server/src/config.ts @@ -72,7 +72,9 @@ export interface SystemConfig { ocr: { enabled: boolean; modelName: string; - minScore: number; + minDetectionBoxScore: number; + minDetectionScore: number; + minRecognitionScore: number; unwarpingEnabled: boolean; orientationClassifyEnabled: boolean; }; @@ -253,7 +255,9 @@ export const defaults = Object.freeze({ ocr: { enabled: true, modelName: 'PP-OCRv5_server', - minScore: 0.9, + minDetectionBoxScore: 0.6, + minDetectionScore: 0.3, + minRecognitionScore: 0.0, unwarpingEnabled: false, orientationClassifyEnabled: false, }, diff --git a/server/src/dtos/model-config.dto.ts b/server/src/dtos/model-config.dto.ts index 8f63ed8a54..5baaf5bb04 100644 --- a/server/src/dtos/model-config.dto.ts +++ b/server/src/dtos/model-config.dto.ts @@ -49,11 +49,25 @@ export class FacialRecognitionConfig extends ModelConfig { export class OcrConfig extends ModelConfig { @IsNumber() - @Min(0.1) + @Min(0) @Max(1) @Type(() => 
Number) @ApiProperty({ type: 'number', format: 'double' }) - minScore!: number; + minDetectionBoxScore!: number; + + @IsNumber() + @Min(0) + @Max(1) + @Type(() => Number) + @ApiProperty({ type: 'number', format: 'double' }) + minDetectionScore!: number; + + @IsNumber() + @Min(0) + @Max(1) + @Type(() => Number) + @ApiProperty({ type: 'number', format: 'double' }) + minRecognitionScore!: number; @ValidateBoolean() unwarpingEnabled!: boolean; diff --git a/server/src/repositories/asset-job.repository.ts b/server/src/repositories/asset-job.repository.ts index d78ffb37bc..0e545e12b1 100644 --- a/server/src/repositories/asset-job.repository.ts +++ b/server/src/repositories/asset-job.repository.ts @@ -356,10 +356,10 @@ export class AssetJobRepository { .$if(!force, (qb) => qb .innerJoin('asset_job_status', 'asset_job_status.assetId', 'assets.id') - .where('asset_job_status.ocrAt', 'is', null) - .where('assets.visibility', '!=', AssetVisibility.HIDDEN), + .where('asset_job_status.ocrAt', 'is', null), ) .where('assets.deletedAt', 'is', null) + .where('assets.visibility', '!=', AssetVisibility.HIDDEN) .stream(); } diff --git a/server/src/repositories/machine-learning.repository.ts b/server/src/repositories/machine-learning.repository.ts index 6bae7eeaaf..633d495296 100644 --- a/server/src/repositories/machine-learning.repository.ts +++ b/server/src/repositories/machine-learning.repository.ts @@ -31,7 +31,7 @@ export type ModelPayload = { imagePath: string } | { text: string }; type ModelOptions = { modelName: string }; export type FaceDetectionOptions = ModelOptions & { minScore: number }; -export type OcrOptions = ModelOptions & { minScore: number, unwarpingEnabled: boolean, orientationClassifyEnabled: boolean }; +export type OcrOptions = ModelOptions & { minDetectionBoxScore: number, minDetectionScore: number, minRecognitionScore: number, unwarpingEnabled: boolean, orientationClassifyEnabled: boolean }; type VisualResponse = { imageHeight: number; imageWidth: number }; 
export type ClipVisualRequest = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: ModelOptions } }; export type ClipVisualResponse = { [ModelTask.SEARCH]: string } & VisualResponse; @@ -49,9 +49,10 @@ export type OCR = { x4: number; y4: number; text: string; + confidence: number; }; -export type OcrRequest = { [ModelTask.OCR]: { [ModelType.OCR]: ModelOptions & { options: { minScore: number } } } }; +export type OcrRequest = { [ModelTask.OCR]: { [ModelType.OCR]: ModelOptions & { options: { minDetectionScore: number, minRecognitionScore: number } } } }; export type OcrResponse = { [ModelTask.OCR]: OCR[] } & VisualResponse; export type FacialRecognitionRequest = { @@ -210,8 +211,8 @@ export class MachineLearningRepository { return formData; } - async ocr(urls: string[], imagePath: string, { modelName, minScore, unwarpingEnabled, orientationClassifyEnabled }: OcrOptions) { - const request = { [ModelTask.OCR]: { [ModelType.OCR]: { modelName, options: { minScore, unwarpingEnabled, orientationClassifyEnabled } } } }; + async ocr(urls: string[], imagePath: string, { modelName, minDetectionBoxScore, minDetectionScore, minRecognitionScore, unwarpingEnabled, orientationClassifyEnabled }: OcrOptions) { + const request = { [ModelTask.OCR]: { [ModelType.OCR]: { modelName, options: { minDetectionBoxScore, minDetectionScore, minRecognitionScore, unwarpingEnabled, orientationClassifyEnabled } } } }; const response = await this.predict(urls, { imagePath }, request); return response[ModelTask.OCR]; } diff --git a/server/src/repositories/ocr.repository.ts b/server/src/repositories/ocr.repository.ts index 38c1d3e52f..a9fd594ed3 100644 --- a/server/src/repositories/ocr.repository.ts +++ b/server/src/repositories/ocr.repository.ts @@ -1,11 +1,10 @@ import { Injectable } from '@nestjs/common'; -import { Kysely, sql } from 'kysely'; +import { Kysely, QueryCreator, sql } from 'kysely'; import { InjectKysely } from 'nestjs-kysely'; import { DB } from 'src/db'; import { DummyValue, GenerateSql } 
from 'src/decorators'; export interface OcrInsertData { - assetId: string; x1: number; y1: number; x2: number; @@ -15,6 +14,7 @@ export interface OcrInsertData { x4: number; y4: number; text: string; + confidence: number; } @Injectable() @@ -32,16 +32,47 @@ export class OcrRepository { async deleteAll(): Promise { await sql`truncate ${sql.table('asset_ocr')}`.execute(this.db); + await sql`truncate ${sql.table('ocr_search')}`.execute(this.db); } - async insertMany(ocrDataList: OcrInsertData[]): Promise { + async upsert(assetId: string, ocrDataList: OcrInsertData[]): Promise { if (ocrDataList.length === 0) { return; } - - await this.db - .insertInto('asset_ocr') - .values(ocrDataList) - .execute(); + + const assetOcrData = ocrDataList.map(item => ({ + assetId, + ...item, + })); + + const searchText = ocrDataList.map(item => item.text.trim()).join(''); + + await this.db.transaction().execute(async (trx: Kysely) => { + await trx + .with('deleted_ocr', (db: QueryCreator) => + db.deleteFrom('asset_ocr').where('assetId', '=', assetId).returningAll() + ) + .insertInto('asset_ocr') + .values(assetOcrData) + .execute(); + + if (searchText.trim()) { + await trx + .with('deleted_search', (db: QueryCreator) => + db.deleteFrom('ocr_search').where('assetId', '=', assetId).returningAll() + ) + .insertInto('ocr_search') + .values({ + assetId, + text: searchText, + }) + .execute(); + } else { + await trx + .deleteFrom('ocr_search') + .where('assetId', '=', assetId) + .execute(); + } + }); } } diff --git a/server/src/repositories/search.repository.ts b/server/src/repositories/search.repository.ts index bacd953c92..06898bc09e 100644 --- a/server/src/repositories/search.repository.ts +++ b/server/src/repositories/search.repository.ts @@ -322,8 +322,8 @@ export class SearchRepository { } const items = await searchAssetBuilder(this.db, options) - .innerJoin('asset_ocr', 'assets.id', 'asset_ocr.assetId') - .where('asset_ocr.text', 'ilike', `%${options.ocr}%`) + .innerJoin('ocr_search', 
'assets.id', 'ocr_search.assetId') + .where('ocr_search.text', 'ilike', `%${options.ocr}%`) .limit(pagination.size + 1) .offset((pagination.page - 1) * pagination.size) .execute(); diff --git a/server/src/schema/index.ts b/server/src/schema/index.ts index a57bdef1b9..337a18e1b7 100644 --- a/server/src/schema/index.ts +++ b/server/src/schema/index.ts @@ -48,6 +48,7 @@ import { MemoryTable } from 'src/schema/tables/memory.table'; import { MoveTable } from 'src/schema/tables/move.table'; import { NaturalEarthCountriesTable } from 'src/schema/tables/natural-earth-countries.table'; import { NotificationTable } from 'src/schema/tables/notification.table'; +import { OcrSearchTable } from 'src/schema/tables/ocr-search.table'; import { PartnerAuditTable } from 'src/schema/tables/partner-audit.table'; import { PartnerTable } from 'src/schema/tables/partner.table'; import { PersonAuditTable } from 'src/schema/tables/person-audit.table'; @@ -103,6 +104,7 @@ export class ImmichDatabase { MoveTable, NaturalEarthCountriesTable, NotificationTable, + OcrSearchTable, PartnerAuditTable, PartnerTable, PersonTable, diff --git a/server/src/schema/migrations/1748871815291-CreateOCRTable.ts b/server/src/schema/migrations/1748926208942-CreateAssetOCRTable.ts similarity index 90% rename from server/src/schema/migrations/1748871815291-CreateOCRTable.ts rename to server/src/schema/migrations/1748926208942-CreateAssetOCRTable.ts index 2c1b210424..4bf4d61dd8 100644 --- a/server/src/schema/migrations/1748871815291-CreateOCRTable.ts +++ b/server/src/schema/migrations/1748926208942-CreateAssetOCRTable.ts @@ -1,7 +1,7 @@ import { Kysely, sql } from 'kysely'; export async function up(db: Kysely): Promise { - await sql`CREATE TABLE "asset_ocr" ("id" uuid NOT NULL DEFAULT uuid_generate_v4(), "assetId" uuid NOT NULL, "x1" integer NOT NULL, "y1" integer NOT NULL, "x2" integer NOT NULL, "y2" integer NOT NULL, "x3" integer NOT NULL, "y3" integer NOT NULL, "x4" integer NOT NULL, "y4" integer NOT NULL, 
"text" text NOT NULL);`.execute(db); + await sql`CREATE TABLE "asset_ocr" ("id" uuid NOT NULL DEFAULT uuid_generate_v4(), "assetId" uuid NOT NULL, "x1" integer NOT NULL, "y1" integer NOT NULL, "x2" integer NOT NULL, "y2" integer NOT NULL, "x3" integer NOT NULL, "y3" integer NOT NULL, "x4" integer NOT NULL, "y4" integer NOT NULL, "text" text NOT NULL, "confidence" double precision NOT NULL);`.execute(db); await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "PK_5c37b36ceef9ac1f688b6c6bf22" PRIMARY KEY ("id");`.execute(db); await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "FK_dc592ec504976f5636e28bb84c6" FOREIGN KEY ("assetId") REFERENCES "assets" ("id") ON UPDATE CASCADE ON DELETE CASCADE;`.execute(db); await sql`CREATE INDEX "IDX_dc592ec504976f5636e28bb84c" ON "asset_ocr" ("assetId")`.execute(db); diff --git a/server/src/schema/migrations/1748929348618-CreateOCRSearchTable.ts b/server/src/schema/migrations/1748929348618-CreateOCRSearchTable.ts new file mode 100644 index 0000000000..388779a2fd --- /dev/null +++ b/server/src/schema/migrations/1748929348618-CreateOCRSearchTable.ts @@ -0,0 +1,13 @@ +import { Kysely, sql } from 'kysely'; + +export async function up(db: Kysely): Promise { + await sql`CREATE TABLE "ocr_search" ("assetId" uuid NOT NULL, "text" text NOT NULL);`.execute(db); + await sql`ALTER TABLE "ocr_search" ADD CONSTRAINT "PK_a8299b7f08ef223f6d32f4482a7" PRIMARY KEY ("assetId");`.execute(db); + await sql`ALTER TABLE "ocr_search" ADD CONSTRAINT "FK_a8299b7f08ef223f6d32f4482a7" FOREIGN KEY ("assetId") REFERENCES "assets" ("id") ON UPDATE CASCADE ON DELETE CASCADE;`.execute(db); +} + +export async function down(db: Kysely): Promise { + await sql`ALTER TABLE "ocr_search" DROP CONSTRAINT "PK_a8299b7f08ef223f6d32f4482a7";`.execute(db); + await sql`ALTER TABLE "ocr_search" DROP CONSTRAINT "FK_a8299b7f08ef223f6d32f4482a7";`.execute(db); + await sql`DROP TABLE "ocr_search";`.execute(db); +} diff --git a/server/src/schema/tables/asset-ocr.table.ts 
b/server/src/schema/tables/asset-ocr.table.ts index 90c2b23913..e7244c2bfa 100644 --- a/server/src/schema/tables/asset-ocr.table.ts +++ b/server/src/schema/tables/asset-ocr.table.ts @@ -39,4 +39,7 @@ export class AssetOcrTable { @Column({ type: 'text' }) text!: string; + + @Column({ type: 'double precision' }) + confidence!: number; } diff --git a/server/src/schema/tables/ocr-search.table.ts b/server/src/schema/tables/ocr-search.table.ts new file mode 100644 index 0000000000..e23fd963a9 --- /dev/null +++ b/server/src/schema/tables/ocr-search.table.ts @@ -0,0 +1,15 @@ +import { AssetTable } from 'src/schema/tables/asset.table'; +import { Column, ForeignKeyColumn, Table } from 'src/sql-tools'; + +@Table('ocr_search') +export class OcrSearchTable { + @ForeignKeyColumn(() => AssetTable, { + onDelete: 'CASCADE', + onUpdate: 'CASCADE', + primary: true, + }) + assetId!: string; + + @Column({ type: 'text' }) + text!: string; +} \ No newline at end of file diff --git a/server/src/services/ocr.service.ts b/server/src/services/ocr.service.ts index 15ef589524..e027fbf8cf 100644 --- a/server/src/services/ocr.service.ts +++ b/server/src/services/ocr.service.ts @@ -77,7 +77,6 @@ export class OcrService extends BaseService { try { const ocrDataList = ocrResults.map(result => ({ - assetId: id, x1: result.x1, y1: result.y1, x2: result.x2, @@ -87,10 +86,10 @@ export class OcrService extends BaseService { x4: result.x4, y4: result.y4, text: result.text.trim(), + confidence: result.confidence, })); - await this.ocrRepository.insertMany(ocrDataList); - + await this.ocrRepository.upsert(id, ocrDataList); await this.assetRepository.upsertJobStatus({ assetId: asset.id, ocrAt: new Date(), diff --git a/web/src/lib/components/admin-page/settings/machine-learning-settings/machine-learning-settings.svelte b/web/src/lib/components/admin-page/settings/machine-learning-settings/machine-learning-settings.svelte index 9ab8e88eb7..182dccae62 100644 --- 
a/web/src/lib/components/admin-page/settings/machine-learning-settings/machine-learning-settings.svelte +++ b/web/src/lib/components/admin-page/settings/machine-learning-settings/machine-learning-settings.svelte @@ -262,14 +262,38 @@ + + + +