mirror of
https://github.com/immich-app/immich
synced 2025-11-07 17:27:20 +00:00
refactor(ocr): update OCR schema and response structure to use individual coordinates instead of bounding box, and adjust related service and repository files
This commit is contained in:
parent
46ef02342d
commit
df36a09cd3
7 changed files with 66 additions and 34 deletions
|
|
@ -34,13 +34,17 @@ class PaddleOCRecognizer(InferenceModel):
|
||||||
valid_texts_and_scores = [
|
valid_texts_and_scores = [
|
||||||
(text, score, box)
|
(text, score, box)
|
||||||
for result in results
|
for result in results
|
||||||
for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_boxes'].tolist())
|
for text, score, box in zip(result['rec_texts'], result['rec_scores'], result['rec_polys'])
|
||||||
if score >= self.min_score
|
if score >= self.min_score
|
||||||
]
|
]
|
||||||
if not valid_texts_and_scores:
|
if not valid_texts_and_scores:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
return [
|
return [
|
||||||
OCROutput(text=text, confidence=score, boundingBox={"x1": box[0], "y1": box[1], "x2": box[2], "y2": box[3]})
|
OCROutput(
|
||||||
|
text=text, confidence=score,
|
||||||
|
x1=box[0][0], y1=box[0][1], x2=box[1][0], y2=box[1][1],
|
||||||
|
x3=box[2][0], y3=box[2][1], x4=box[3][0], y4=box[3][1]
|
||||||
|
)
|
||||||
for text, score, box in valid_texts_and_scores
|
for text, score, box in valid_texts_and_scores
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,14 @@ FacialRecognitionOutput = list[DetectedFace]
|
||||||
class OCROutput(TypedDict):
|
class OCROutput(TypedDict):
|
||||||
text: str
|
text: str
|
||||||
confidence: float
|
confidence: float
|
||||||
boundingBox: BoundingBox
|
x1: int
|
||||||
|
y1: int
|
||||||
|
x2: int
|
||||||
|
y2: int
|
||||||
|
x3: int
|
||||||
|
y3: int
|
||||||
|
x4: int
|
||||||
|
y4: int
|
||||||
|
|
||||||
|
|
||||||
class PipelineEntry(TypedDict):
|
class PipelineEntry(TypedDict):
|
||||||
|
|
|
||||||
|
|
@ -40,13 +40,19 @@ export type ClipTextualRequest = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: Mo
|
||||||
export type ClipTextualResponse = { [ModelTask.SEARCH]: string };
|
export type ClipTextualResponse = { [ModelTask.SEARCH]: string };
|
||||||
|
|
||||||
export type OCR = {
|
export type OCR = {
|
||||||
boundingBox: BoundingBox;
|
x1: number;
|
||||||
|
y1: number;
|
||||||
|
x2: number;
|
||||||
|
y2: number;
|
||||||
|
x3: number;
|
||||||
|
y3: number;
|
||||||
|
x4: number;
|
||||||
|
y4: number;
|
||||||
text: string;
|
text: string;
|
||||||
confidence: number;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export type OcrRequest = { [ModelTask.OCR]: { [ModelType.OCR]: ModelOptions & { options: { minScore: number } } } };
|
export type OcrRequest = { [ModelTask.OCR]: { [ModelType.OCR]: ModelOptions & { options: { minScore: number } } } };
|
||||||
export type OcrResponse = { [ModelTask.OCR]: OCR | OCR[] } & VisualResponse;
|
export type OcrResponse = { [ModelTask.OCR]: OCR[] } & VisualResponse;
|
||||||
|
|
||||||
export type FacialRecognitionRequest = {
|
export type FacialRecognitionRequest = {
|
||||||
[ModelTask.FACIAL_RECOGNITION]: {
|
[ModelTask.FACIAL_RECOGNITION]: {
|
||||||
|
|
|
||||||
|
|
@ -6,10 +6,14 @@ import { DummyValue, GenerateSql } from 'src/decorators';
|
||||||
|
|
||||||
export interface OcrInsertData {
|
export interface OcrInsertData {
|
||||||
assetId: string;
|
assetId: string;
|
||||||
boundingBoxX1: number;
|
x1: number;
|
||||||
boundingBoxY1: number;
|
y1: number;
|
||||||
boundingBoxX2: number;
|
x2: number;
|
||||||
boundingBoxY2: number;
|
y2: number;
|
||||||
|
x3: number;
|
||||||
|
y3: number;
|
||||||
|
x4: number;
|
||||||
|
y4: number;
|
||||||
text: string;
|
text: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
import { Kysely, sql } from 'kysely';
|
import { Kysely, sql } from 'kysely';
|
||||||
|
|
||||||
export async function up(db: Kysely<any>): Promise<void> {
|
export async function up(db: Kysely<any>): Promise<void> {
|
||||||
await sql`CREATE TABLE "asset_ocr" ("id" uuid NOT NULL DEFAULT uuid_generate_v4(), "assetId" uuid NOT NULL, "boundingBoxX1" integer NOT NULL DEFAULT 0, "boundingBoxY1" integer NOT NULL DEFAULT 0, "boundingBoxX2" integer NOT NULL DEFAULT 0, "boundingBoxY2" integer NOT NULL DEFAULT 0, "text" text NOT NULL);`.execute(db);
|
await sql`CREATE TABLE "asset_ocr" ("id" uuid NOT NULL DEFAULT uuid_generate_v4(), "assetId" uuid NOT NULL, "x1" integer NOT NULL, "y1" integer NOT NULL, "x2" integer NOT NULL, "y2" integer NOT NULL, "x3" integer NOT NULL, "y3" integer NOT NULL, "x4" integer NOT NULL, "y4" integer NOT NULL, "text" text NOT NULL);`.execute(db);
|
||||||
await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "PK_5c37b36ceef9ac1f688b6c6bf22" PRIMARY KEY ("id");`.execute(db);
|
await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "PK_5c37b36ceef9ac1f688b6c6bf22" PRIMARY KEY ("id");`.execute(db);
|
||||||
await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "FK_dc592ec504976f5636e28bb84c6" FOREIGN KEY ("assetId") REFERENCES "assets" ("id") ON UPDATE CASCADE ON DELETE CASCADE;`.execute(db);
|
await sql`ALTER TABLE "asset_ocr" ADD CONSTRAINT "FK_dc592ec504976f5636e28bb84c6" FOREIGN KEY ("assetId") REFERENCES "assets" ("id") ON UPDATE CASCADE ON DELETE CASCADE;`.execute(db);
|
||||||
|
await sql`CREATE INDEX "IDX_dc592ec504976f5636e28bb84c" ON "asset_ocr" ("assetId")`.execute(db);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function down(db: Kysely<any>): Promise<void> {
|
export async function down(db: Kysely<any>): Promise<void> {
|
||||||
|
await sql`DROP INDEX "IDX_dc592ec504976f5636e28bb84c";`.execute(db);
|
||||||
await sql`ALTER TABLE "asset_ocr" DROP CONSTRAINT "PK_5c37b36ceef9ac1f688b6c6bf22";`.execute(db);
|
await sql`ALTER TABLE "asset_ocr" DROP CONSTRAINT "PK_5c37b36ceef9ac1f688b6c6bf22";`.execute(db);
|
||||||
await sql`ALTER TABLE "asset_ocr" DROP CONSTRAINT "FK_dc592ec504976f5636e28bb84c6";`.execute(db);
|
await sql`ALTER TABLE "asset_ocr" DROP CONSTRAINT "FK_dc592ec504976f5636e28bb84c6";`.execute(db);
|
||||||
await sql`DROP TABLE "asset_ocr";`.execute(db);
|
await sql`DROP TABLE "asset_ocr";`.execute(db);
|
||||||
|
|
@ -9,21 +9,33 @@ export class AssetOcrTable {
|
||||||
@ForeignKeyColumn(() => AssetTable, {
|
@ForeignKeyColumn(() => AssetTable, {
|
||||||
onDelete: 'CASCADE',
|
onDelete: 'CASCADE',
|
||||||
onUpdate: 'CASCADE',
|
onUpdate: 'CASCADE',
|
||||||
index: false,
|
index: true,
|
||||||
})
|
})
|
||||||
assetId!: string;
|
assetId!: string;
|
||||||
|
|
||||||
@Column({ default: 0, type: 'integer' })
|
@Column({ type: 'integer' })
|
||||||
boundingBoxX1!: number;
|
x1!: number;
|
||||||
|
|
||||||
@Column({ default: 0, type: 'integer' })
|
@Column({ type: 'integer' })
|
||||||
boundingBoxY1!: number;
|
y1!: number;
|
||||||
|
|
||||||
@Column({ default: 0, type: 'integer' })
|
@Column({ type: 'integer' })
|
||||||
boundingBoxX2!: number;
|
x2!: number;
|
||||||
|
|
||||||
@Column({ default: 0, type: 'integer' })
|
@Column({ type: 'integer' })
|
||||||
boundingBoxY2!: number;
|
y2!: number;
|
||||||
|
|
||||||
|
@Column({ type: 'integer' })
|
||||||
|
x3!: number;
|
||||||
|
|
||||||
|
@Column({ type: 'integer' })
|
||||||
|
y3!: number;
|
||||||
|
|
||||||
|
@Column({ type: 'integer' })
|
||||||
|
x4!: number;
|
||||||
|
|
||||||
|
@Column({ type: 'integer' })
|
||||||
|
y4!: number;
|
||||||
|
|
||||||
@Column({ type: 'text' })
|
@Column({ type: 'text' })
|
||||||
text!: string;
|
text!: string;
|
||||||
|
|
|
||||||
|
|
@ -66,14 +66,7 @@ export class OcrService extends BaseService {
|
||||||
machineLearning.ocr
|
machineLearning.ocr
|
||||||
);
|
);
|
||||||
|
|
||||||
const resultsArray = Array.isArray(ocrResults) ? ocrResults : [ocrResults];
|
if (ocrResults.length === 0) {
|
||||||
const validResults = resultsArray.filter(result =>
|
|
||||||
result &&
|
|
||||||
result.text &&
|
|
||||||
result.text.trim().length > 0
|
|
||||||
);
|
|
||||||
|
|
||||||
if (validResults.length === 0) {
|
|
||||||
this.logger.warn(`No valid OCR results for document ${id}`);
|
this.logger.warn(`No valid OCR results for document ${id}`);
|
||||||
await this.assetRepository.upsertJobStatus({
|
await this.assetRepository.upsertJobStatus({
|
||||||
assetId: asset.id,
|
assetId: asset.id,
|
||||||
|
|
@ -83,12 +76,16 @@ export class OcrService extends BaseService {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const ocrDataList = validResults.map(result => ({
|
const ocrDataList = ocrResults.map(result => ({
|
||||||
assetId: id,
|
assetId: id,
|
||||||
boundingBoxX1: result.boundingBox.x1,
|
x1: result.x1,
|
||||||
boundingBoxY1: result.boundingBox.y1,
|
y1: result.y1,
|
||||||
boundingBoxX2: result.boundingBox.x2,
|
x2: result.x2,
|
||||||
boundingBoxY2: result.boundingBox.y2,
|
y2: result.y2,
|
||||||
|
x3: result.x3,
|
||||||
|
y3: result.y3,
|
||||||
|
x4: result.x4,
|
||||||
|
y4: result.y4,
|
||||||
text: result.text.trim(),
|
text: result.text.trim(),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
|
@ -99,7 +96,7 @@ export class OcrService extends BaseService {
|
||||||
ocrAt: new Date(),
|
ocrAt: new Date(),
|
||||||
});
|
});
|
||||||
|
|
||||||
this.logger.debug(`Processed ${validResults.length} OCR result(s) for ${id}`);
|
this.logger.debug(`Processed ${ocrResults.length} OCR result(s) for ${id}`);
|
||||||
return JobStatus.SUCCESS;
|
return JobStatus.SUCCESS;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.error(`Failed to insert OCR results for ${id}:`, error);
|
this.logger.error(`Failed to insert OCR results for ${id}:`, error);
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue