feat: enhance OCR configuration and functionality

- Updated OCR settings to include minimum detection box score, minimum detection score, and minimum recognition score.
- Refactored PaddleOCRecognizer to use the new scoring parameters.
- Introduced new database tables for asset OCR data and search functionality.
- Modified related services and repositories to support the new OCR features.
- Updated translations for improved clarity in the settings UI.
2025-11-07 17:27:20 +00:00 · 2025-06-03 16:19:50 +08:00 · 2025-06-03 16:19:50 +08:00 · 4d8e51ede6
commit 4d8e51ede6
parent df36a09cd3
17 changed files with 180 additions and 51 deletions
--- a/machine-learning/immich_ml/models/ocr/paddle.py
+++ b/machine-learning/immich_ml/models/ocr/paddle.py
@@ -12,10 +12,9 @@ class PaddleOCRecognizer(InferenceModel):
    depends = []
    identity = (ModelType.OCR, ModelTask.OCR)

-    def __init__(self, model_name: str, min_score: float = 0.9, **model_kwargs: Any) -> None:
-        self.min_score = model_kwargs.pop("minScore", min_score)
-        self.orientation_classify_enabled = model_kwargs.pop("orientationClassifyEnabled", True)
-        self.unwarping_enabled = model_kwargs.pop("unwarpingEnabled", True)
+    def __init__(self, model_name: str, **model_kwargs: Any) -> None:
+        self.orientation_classify_enabled = model_kwargs.get("orientationClassifyEnabled", False)
+        self.unwarping_enabled = model_kwargs.get("unwarpingEnabled", False)
        super().__init__(model_name, **model_kwargs)
        self._load()
        self.loaded = True
@@ -28,23 +27,25 @@ class PaddleOCRecognizer(InferenceModel):
            use_doc_unwarping=self.unwarping_enabled,
        )

+    def configure(self, **kwargs: Any) -> None:
+        self.min_detection_score = kwargs.get("minDetectionScore", 0.3)
+        self.min_detection_box_score = kwargs.get("minDetectionBoxScore", 0.6)
+        self.min_recognition_score = kwargs.get("minRecognitionScore", 0.0)
+
    def _predict(self, inputs: NDArray[np.uint8] | bytes | Image.Image, **kwargs: Any) -> List[OCROutput]:
        inputs = decode_cv2(inputs)
-        results = self.model.predict(inputs)
-        valid_texts_and_scores = [
-            (text, score, box)
-            for result in results
-            for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_polys'])
-            if score >= self.min_score
-        ]
-        if not valid_texts_and_scores:
-            return []
-        
+        results = self.model.predict(
+            inputs,
+            text_det_thresh=self.min_detection_score,
+            text_det_box_thresh=self.min_detection_box_score,
+            text_rec_score_thresh=self.min_recognition_score
+        )
        return [
            OCROutput(
                text=text, confidence=score,
                x1=box[0][0], y1=box[0][1], x2=box[1][0], y2=box[1][1],
                x3=box[2][0], y3=box[2][1], x4=box[3][0], y4=box[3][1]
            )
-            for text, score, box in valid_texts_and_scores
+            for result in results
+            for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_polys'])
        ]