Files
drivers_bot/app/services/ocr_provider.py
2026-05-12 19:45:08 +09:00

49 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from dataclasses import dataclass
from app.services.vehicle_identity import normalize_license_plate, validate_vin
@dataclass
class OcrCandidate:
    """One identifier guess extracted from recognized text."""

    # Candidate category; the providers in this module emit "vin" and "license_plate".
    type: str
    # The extracted identifier string.
    value: str
    # Score attached to the guess by the provider.
    confidence: float
@dataclass
class OcrResult:
    """Outcome of a recognition pass: the text plus the candidates found in it."""

    # Text the provider recognized.
    recognized_text: str
    # Identifier guesses extracted from the recognized text.
    candidates: list[OcrCandidate]
class OCRProvider:
    """Base class for OCR backends; subclasses are expected to override ``recognize``."""

    async def recognize(self, content: bytes, filename: str | None = None) -> OcrResult:
        """Recognize text in *content* and return it with extracted candidates.

        Args:
            content: Raw bytes of the uploaded file.
            filename: Optional name of the uploaded file.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
class StubOCRProvider(OCRProvider):
    """Placeholder OCR backend that performs no image recognition.

    The upload's bytes are decoded as UTF-8 (undecodable bytes are dropped) and,
    together with the file name, searched for VIN-like and plate-like tokens.
    """

    # Exactly 17 characters from A-Z/0-9 excluding I, O and Q, delimited by word boundaries.
    _VIN_PATTERN = re.compile(r"\b[A-HJ-NPR-Z0-9]{17}\b")
    # A digit, Latin/Cyrillic capital or Hangul syllable, followed by 4-10 more such
    # characters, hyphens or whitespace, delimited by word boundaries.
    _PLATE_PATTERN = re.compile(r"\b[0-9A-ZА-Я가-힣][0-9A-ZА-Я가-힣\-\s]{4,10}\b")
    # Upper bound on the number of candidates returned per call.
    _MAX_CANDIDATES = 8

    async def recognize(self, content: bytes, filename: str | None = None) -> OcrResult:
        """Extract VIN and license-plate candidates from the file name and content.

        Args:
            content: Raw bytes of the upload; decoded as UTF-8 with errors ignored.
            filename: Optional file name, searched along with the decoded content.

        Returns:
            An ``OcrResult`` whose ``recognized_text`` is the whitespace-collapsed
            text and whose ``candidates`` holds at most eight unique entries,
            VINs first, then license plates.
        """
        text = " ".join(
            [
                filename or "",
                content.decode("utf-8", errors="ignore"),
            ]
        )
        compact = re.sub(r"\s+", " ", text).strip()
        # Both patterns only know upper-case letters, so match on one upper-cased copy.
        searchable = compact.upper()

        candidates: list[OcrCandidate] = []
        # Values already emitted; shared by both passes so that neither repeated VINs
        # nor plates equal to an earlier candidate use up the result cap.
        seen: set[str] = set()

        for raw in self._VIN_PATTERN.findall(searchable):
            try:
                # Fall back to the raw match when the validator returns a falsy value.
                vin = validate_vin(raw) or raw
            except ValueError:
                # The validator rejected this token; skip it.
                continue
            if vin in seen:
                continue
            seen.add(vin)
            candidates.append(OcrCandidate(type="vin", value=vin, confidence=0.84))

        for raw in self._PLATE_PATTERN.findall(searchable):
            plate = normalize_license_plate(raw)
            if not plate or not 5 <= len(plate) <= 10 or plate in seen:
                continue
            seen.add(plate)
            candidates.append(OcrCandidate(type="license_plate", value=plate, confidence=0.62))

        return OcrResult(recognized_text=compact, candidates=candidates[: self._MAX_CANDIDATES])
def get_ocr_provider() -> OCRProvider:
    """Return a new ``StubOCRProvider`` instance as the OCR provider to use."""
    provider = StubOCRProvider()
    return provider