mirror of
https://github.com/immich-app/immich
synced 2025-11-07 17:27:20 +00:00
refactor(ocr): update the OCR schema and response structure to store the four corner points of each detected text polygon (x1, y1 … x4, y4) instead of a two-corner bounding box, and adjust the related service and repository files
This commit is contained in:
parent
46ef02342d
commit
df36a09cd3
7 changed files with 66 additions and 34 deletions
|
|
@ -34,13 +34,17 @@ class PaddleOCRecognizer(InferenceModel):
|
|||
valid_texts_and_scores = [
|
||||
(text, score, box)
|
||||
for result in results
|
||||
for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_boxes'].tolist())
|
||||
for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_polys'])
|
||||
if score >= self.min_score
|
||||
]
|
||||
if not valid_texts_and_scores:
|
||||
return []
|
||||
|
||||
return [
|
||||
OCROutput(text=text, confidence=score, boundingBox={"x1": box[0], "y1": box[1], "x2": box[2], "y2": box[3]})
|
||||
OCROutput(
|
||||
text=text, confidence=score,
|
||||
x1=box[0][0], y1=box[0][1], x2=box[1][0], y2=box[1][1],
|
||||
x3=box[2][0], y3=box[2][1], x4=box[3][0], y4=box[3][1]
|
||||
)
|
||||
for text, score, box in valid_texts_and_scores
|
||||
]
|
||||
|
|
|
|||
|
|
@ -90,7 +90,14 @@ FacialRecognitionOutput = list[DetectedFace]
|
|||
class OCROutput(TypedDict):
|
||||
text: str
|
||||
confidence: float
|
||||
boundingBox: BoundingBox
|
||||
x1: int
|
||||
y1: int
|
||||
x2: int
|
||||
y2: int
|
||||
x3: int
|
||||
y3: int
|
||||
x4: int
|
||||
y4: int
|
||||
|
||||
|
||||
class PipelineEntry(TypedDict):
|
||||
|
|
|
|||
|
|
@ -40,13 +40,19 @@ export type ClipTextualRequest = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: Mo
|
|||
export type ClipTextualResponse = { [ModelTask.SEARCH]: string };
|
||||
|
||||
export type OCR = {
|
||||
boundingBox: BoundingBox;
|
||||
x1: number;
|
||||
y1: number;
|
||||
x2: number;
|
||||
y2: number;
|
||||
x3: number;
|
||||
y3: number;
|
||||
x4: number;
|
||||
y4: number;
|
||||
text: string;
|
||||
confidence: number;
|
||||
};
|
||||
|
||||
export type OcrRequest = { [ModelTask.OCR]: { [ModelType.OCR]: ModelOptions & { options: { minScore: number } } } };
|
||||
export type OcrResponse = { [ModelTask.OCR]: OCR | OCR[] } & VisualResponse;
|
||||
export type OcrResponse = { [ModelTask.OCR]: OCR[] } & VisualResponse;
|
||||
|
||||
export type FacialRecognitionRequest = {
|
||||
[ModelTask.FACIAL_RECOGNITION]: {
|
||||
|
|
|
|||
|
|
@ -6,10 +6,14 @@ import { DummyValue, GenerateSql } from 'src/decorators';
|
|||
|
||||
export interface OcrInsertData {
|
||||
assetId: string;
|
||||
boundingBoxX1: number;
|
||||
boundingBoxY1: number;
|
||||
boundingBoxX2: number;
|
||||
boundingBoxY2: number;
|
||||
x1: number;
|
||||
y1: number;
|
||||
x2: number;
|
||||
y2: number;
|
||||
x3: number;
|
||||
y3: number;
|
||||
x4: number;
|
||||
y4: number;
|
||||
text: string;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,14 @@
|
|||
import { Kysely, sql } from 'kysely';
|
||||
|
||||
export async function up(db: Kysely<any>): Promise<void> {
|
||||
await sql`CREATE TABLE "asset_ocr" ("id" uuid NOT NULL DEFAULT uuid_generate_v4(), "assetId" uuid NOT NULL, "boundingBoxX1" integer NOT NULL DEFAULT 0, "boundingBoxY1" integer NOT NULL DEFAULT 0, "boundingBoxX2" integer NOT NULL DEFAULT 0, "boundingBoxY2" integer NOT NULL DEFAULT 0, "text" text NOT NULL);`.execute(db);
|
||||
await sql`CREATE TABLE "asset_ocr" ("id" uuid NOT NULL DEFAULT uuid_generate_v4(), "assetId" uuid NOT NULL, "x1" integer NOT NULL, "y1" integer NOT NULL, "x2" integer NOT NULL, "y2" integer NOT NULL, "x3" integer NOT NULL, "y3" integer NOT NULL, "x4" integer NOT NULL, "y4" integer NOT NULL, "text" text NOT NULL);`.execute(db);
|
||||
await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "PK_5c37b36ceef9ac1f688b6c6bf22" PRIMARY KEY ("id");`.execute(db);
|
||||
await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "FK_dc592ec504976f5636e28bb84c6" FOREIGN KEY ("assetId") REFERENCES "assets" ("id") ON UPDATE CASCADE ON DELETE CASCADE;`.execute(db);
|
||||
await sql`CREATE INDEX "IDX_dc592ec504976f5636e28bb84c" ON "asset_ocr" ("assetId")`.execute(db);
|
||||
}
|
||||
|
||||
export async function down(db: Kysely<any>): Promise<void> {
|
||||
await sql`DROP INDEX "IDX_dc592ec504976f5636e28bb84c";`.execute(db);
|
||||
await sql`ALTER TABLE "asset_ocr" DROP CONSTRAINT "PK_5c37b36ceef9ac1f688b6c6bf22";`.execute(db);
|
||||
await sql`ALTER TABLE "asset_ocr" DROP CONSTRAINT "FK_dc592ec504976f5636e28bb84c6";`.execute(db);
|
||||
await sql`DROP TABLE "asset_ocr";`.execute(db);
|
||||
|
|
@ -9,21 +9,33 @@ export class AssetOcrTable {
|
|||
@ForeignKeyColumn(() => AssetTable, {
|
||||
onDelete: 'CASCADE',
|
||||
onUpdate: 'CASCADE',
|
||||
index: false,
|
||||
index: true,
|
||||
})
|
||||
assetId!: string;
|
||||
|
||||
@Column({ default: 0, type: 'integer' })
|
||||
boundingBoxX1!: number;
|
||||
@Column({ type: 'integer' })
|
||||
x1!: number;
|
||||
|
||||
@Column({ default: 0, type: 'integer' })
|
||||
boundingBoxY1!: number;
|
||||
@Column({ type: 'integer' })
|
||||
y1!: number;
|
||||
|
||||
@Column({ default: 0, type: 'integer' })
|
||||
boundingBoxX2!: number;
|
||||
@Column({ type: 'integer' })
|
||||
x2!: number;
|
||||
|
||||
@Column({ default: 0, type: 'integer' })
|
||||
boundingBoxY2!: number;
|
||||
@Column({ type: 'integer' })
|
||||
y2!: number;
|
||||
|
||||
@Column({ type: 'integer' })
|
||||
x3!: number;
|
||||
|
||||
@Column({ type: 'integer' })
|
||||
y3!: number;
|
||||
|
||||
@Column({ type: 'integer' })
|
||||
x4!: number;
|
||||
|
||||
@Column({ type: 'integer' })
|
||||
y4!: number;
|
||||
|
||||
@Column({ type: 'text' })
|
||||
text!: string;
|
||||
|
|
|
|||
|
|
@ -66,14 +66,7 @@ export class OcrService extends BaseService {
|
|||
machineLearning.ocr
|
||||
);
|
||||
|
||||
const resultsArray = Array.isArray(ocrResults) ? ocrResults : [ocrResults];
|
||||
const validResults = resultsArray.filter(result =>
|
||||
result &&
|
||||
result.text &&
|
||||
result.text.trim().length > 0
|
||||
);
|
||||
|
||||
if (validResults.length === 0) {
|
||||
if (ocrResults.length === 0) {
|
||||
this.logger.warn(`No valid OCR results for document ${id}`);
|
||||
await this.assetRepository.upsertJobStatus({
|
||||
assetId: asset.id,
|
||||
|
|
@ -83,12 +76,16 @@ export class OcrService extends BaseService {
|
|||
}
|
||||
|
||||
try {
|
||||
const ocrDataList = validResults.map(result => ({
|
||||
const ocrDataList = ocrResults.map(result => ({
|
||||
assetId: id,
|
||||
boundingBoxX1: result.boundingBox.x1,
|
||||
boundingBoxY1: result.boundingBox.y1,
|
||||
boundingBoxX2: result.boundingBox.x2,
|
||||
boundingBoxY2: result.boundingBox.y2,
|
||||
x1: result.x1,
|
||||
y1: result.y1,
|
||||
x2: result.x2,
|
||||
y2: result.y2,
|
||||
x3: result.x3,
|
||||
y3: result.y3,
|
||||
x4: result.x4,
|
||||
y4: result.y4,
|
||||
text: result.text.trim(),
|
||||
}));
|
||||
|
||||
|
|
@ -99,7 +96,7 @@ export class OcrService extends BaseService {
|
|||
ocrAt: new Date(),
|
||||
});
|
||||
|
||||
this.logger.debug(`Processed ${validResults.length} OCR result(s) for ${id}`);
|
||||
this.logger.debug(`Processed ${ocrResults.length} OCR result(s) for ${id}`);
|
||||
return JobStatus.SUCCESS;
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to insert OCR results for ${id}:`, error);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue