improve mini app UX and analytics
This commit is contained in:
108
app/api/ocr.py
108
app/api/ocr.py
@@ -19,23 +19,111 @@ class ReceiptSuggestion(BaseModel):
|
||||
@router.post("/fuel-receipt", response_model=ReceiptSuggestion)
async def scan_fuel_receipt(file: UploadFile = File(...)) -> ReceiptSuggestion:
    """Suggest fuel-entry form values parsed from an uploaded receipt.

    The upload is decoded as UTF-8 (undecodable bytes are dropped) and joined
    with the file name, so digits in either place can be picked up. Station,
    litres, unit price and total are extracted by the helper functions; a
    missing price or total is derived from the other two values.

    Args:
        file: The uploaded receipt.

    Returns:
        A ``ReceiptSuggestion`` with the extracted values, a confidence score
        (0 when no number was found at all) and a user-facing message.
    """
    content = await file.read()
    text = " ".join(
        [
            file.filename or "",
            content.decode("utf-8", errors="ignore"),
        ]
    )
    # Normalise non-breaking spaces and decimal commas so one numeric regex
    # covers both "45,50" and "45.50".
    normalized = text.replace("\xa0", " ").replace(",", ".")
    compact = re.sub(r"\s+", " ", normalized).strip()
    numbers = [Decimal(item) for item in re.findall(r"\d+(?:\.\d+)?", compact)]

    station = detect_station(compact)
    liters = find_liters(compact, numbers)
    price = find_price_per_liter(compact, numbers)
    total = find_total(compact, numbers, liters, price)
    # Fill in whichever of price/total is missing from the other two values.
    if total and liters and not price and liters > 0:
        price = (total / liters).quantize(Decimal("0.01"))
    if liters and price and not total:
        total = (liters * price).quantize(Decimal("0.01"))

    # Base score: more recognised fields and more numbers raise confidence.
    signals = sum(value is not None for value in (total, liters, price, station))
    confidence = min(0.88, 0.18 + signals * 0.17 + min(len(numbers), 12) * 0.015)
    if liters and price and total:
        expected = liters * price
        if expected:
            # Reward receipts where total is within 8% of liters * price,
            # penalise the rest.
            delta = abs((total - expected) / expected)
            confidence += 0.1 if delta <= Decimal("0.08") else -0.08
    confidence = max(0, min(float(confidence), 0.95))

    return ReceiptSuggestion(
        total_cost=total,
        liters=liters,
        price_per_liter=price,
        station=station,
        confidence=round(confidence, 2) if numbers else 0,
        message=(
            "Распознал данные чека и заполнил форму. Проверь значения перед сохранением."
            if numbers
            else "Не удалось прочитать данные чека. Попробуй фото крупнее или заполни поля вручную."
        ),
    )
|
||||
|
||||
|
||||
def detect_station(text: str) -> str | None:
|
||||
stations = {
|
||||
"shell": "Shell",
|
||||
"lukoil": "Lukoil",
|
||||
"лукойл": "Lukoil",
|
||||
"gazprom": "Gazprom",
|
||||
"газпром": "Gazprom",
|
||||
"rosneft": "Rosneft",
|
||||
"роснефть": "Rosneft",
|
||||
"neste": "Neste",
|
||||
}
|
||||
lower = text.lower()
|
||||
for needle, name in stations.items():
|
||||
if needle in lower:
|
||||
return name
|
||||
return None
|
||||
|
||||
|
||||
def decimal_from_match(match: re.Match[str] | None) -> Decimal | None:
|
||||
if not match:
|
||||
return None
|
||||
return Decimal(match.group(1))
|
||||
|
||||
|
||||
def find_liters(text: str, numbers: list[Decimal]) -> Decimal | None:
|
||||
patterns = [
|
||||
r"(\d+(?:\.\d+)?)\s*(?:l|литр|литра|литров|л)\b",
|
||||
r"(?:volume|qty|кол-?во|количество|объем)\D{0,12}(\d+(?:\.\d+)?)",
|
||||
]
|
||||
for pattern in patterns:
|
||||
value = decimal_from_match(re.search(pattern, text, re.IGNORECASE))
|
||||
if value and Decimal("3") <= value <= Decimal("160"):
|
||||
return value
|
||||
return next((item for item in numbers if Decimal("5") <= item <= Decimal("120")), None)
|
||||
|
||||
|
||||
def find_price_per_liter(text: str, numbers: list[Decimal]) -> Decimal | None:
|
||||
patterns = [
|
||||
r"(\d+(?:\.\d+)?)\s*(?:/|за)\s*(?:l|литр|л)\b",
|
||||
r"(?:price|цена|ppu|руб/л|₽/л)\D{0,12}(\d+(?:\.\d+)?)",
|
||||
]
|
||||
for pattern in patterns:
|
||||
value = decimal_from_match(re.search(pattern, text, re.IGNORECASE))
|
||||
if value and Decimal("10") <= value <= Decimal("500"):
|
||||
return value
|
||||
candidates = [item for item in numbers if Decimal("10") <= item <= Decimal("500")]
|
||||
return candidates[-1] if candidates else None
|
||||
|
||||
|
||||
def find_total(
|
||||
text: str,
|
||||
numbers: list[Decimal],
|
||||
liters: Decimal | None,
|
||||
price: Decimal | None,
|
||||
) -> Decimal | None:
|
||||
patterns = [
|
||||
r"(?:total|sum|amount|итого|сумма|к\s*оплате)\D{0,16}(\d+(?:\.\d+)?)",
|
||||
r"(\d+(?:\.\d+)?)\s*(?:rub|₽|руб|krw|₩)",
|
||||
]
|
||||
for pattern in patterns:
|
||||
value = decimal_from_match(re.search(pattern, text, re.IGNORECASE))
|
||||
if value and value > Decimal("50"):
|
||||
return value
|
||||
ignored = {value for value in (liters, price) if value is not None}
|
||||
candidates = [item for item in numbers if item > Decimal("50") and item not in ignored]
|
||||
return max(candidates) if candidates else None
|
||||
|
||||
@@ -85,5 +85,10 @@ class OdometerPrediction(BaseModel):
|
||||
predicted_30_days: int | None
|
||||
avg_km_per_day: float | None
|
||||
avg_km_per_month: float | None
|
||||
current_price_per_liter: float | None = None
|
||||
predicted_price_per_liter_30_days: float | None = None
|
||||
avg_price_per_liter: float | None = None
|
||||
price_samples: int = 0
|
||||
price_confidence: float = 0
|
||||
confidence: float
|
||||
insight: str
|
||||
|
||||
@@ -64,6 +64,7 @@ async def dataframe_from_query(session: AsyncSession, stmt: Select) -> pd.DataFr
|
||||
|
||||
|
||||
async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPrediction:
|
||||
price_prediction = await predict_fuel_price(session, car_id)
|
||||
fuel = await dataframe_from_query(
|
||||
session,
|
||||
select(FuelEntry.entry_date.label("date"), FuelEntry.odometer.label("odometer")).where(
|
||||
@@ -85,13 +86,16 @@ async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPredic
|
||||
predicted_30_days=None,
|
||||
avg_km_per_day=None,
|
||||
avg_km_per_month=None,
|
||||
**price_prediction,
|
||||
confidence=0,
|
||||
insight="Недостаточно данных: добавь одометр в заправках или сервисных записях.",
|
||||
)
|
||||
|
||||
df = pd.concat([fuel, service]).dropna().drop_duplicates().sort_values("date")
|
||||
df["date"] = pd.to_datetime(df["date"])
|
||||
df = df[df["odometer"] >= 0]
|
||||
df = df.sort_values(["date", "odometer"]).drop_duplicates(subset=["date"], keep="last")
|
||||
df = df[df["odometer"].diff().fillna(0) >= 0]
|
||||
if len(df) < 2:
|
||||
current = int(df.iloc[-1]["odometer"])
|
||||
return OdometerPrediction(
|
||||
@@ -102,24 +106,43 @@ async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPredic
|
||||
predicted_30_days=None,
|
||||
avg_km_per_day=None,
|
||||
avg_km_per_month=None,
|
||||
**price_prediction,
|
||||
confidence=0.2,
|
||||
insight="Есть только одна точка пробега. Для прогноза нужны минимум две записи.",
|
||||
)
|
||||
|
||||
first = df.iloc[0]
|
||||
last = df.iloc[-1]
|
||||
days = max((last["date"] - first["date"]).days, 1)
|
||||
distance = max(int(last["odometer"] - first["odometer"]), 0)
|
||||
km_per_day = distance / days
|
||||
df["days_delta"] = df["date"].diff().dt.days
|
||||
df["km_delta"] = df["odometer"].diff()
|
||||
intervals = df[(df["days_delta"] > 0) & (df["km_delta"] >= 0)].copy()
|
||||
intervals["km_per_day"] = intervals["km_delta"] / intervals["days_delta"]
|
||||
intervals = intervals[(intervals["km_per_day"] >= 0) & (intervals["km_per_day"] <= 500)]
|
||||
if intervals.empty:
|
||||
km_per_day = 0
|
||||
else:
|
||||
recent = intervals.tail(6).copy()
|
||||
recent["weight"] = range(1, len(recent) + 1)
|
||||
weighted = (recent["km_per_day"] * recent["weight"]).sum() / recent["weight"].sum()
|
||||
median = recent["km_per_day"].median()
|
||||
km_per_day = float((weighted * 0.7) + (median * 0.3))
|
||||
today = pd.Timestamp.utcnow().tz_localize(None).normalize()
|
||||
days_since_last = max((today - last["date"]).days, 0)
|
||||
predicted_today = int(last["odometer"] + km_per_day * days_since_last)
|
||||
predicted_30 = int(predicted_today + km_per_day * 30)
|
||||
confidence = min(0.95, 0.35 + len(df) * 0.035 + min(days, 365) / 730)
|
||||
span_days = max((last["date"] - df.iloc[0]["date"]).days, 1)
|
||||
interval_count = len(intervals)
|
||||
variability = 0 if interval_count < 3 or km_per_day == 0 else min(
|
||||
float(intervals["km_per_day"].std() / max(km_per_day, 1)),
|
||||
1,
|
||||
)
|
||||
confidence = min(
|
||||
0.95,
|
||||
max(0.25, 0.3 + interval_count * 0.055 + min(span_days, 365) / 900 - variability * 0.18),
|
||||
)
|
||||
insight = (
|
||||
"Пробег стабилен, прогноз надежный."
|
||||
if confidence >= 0.75
|
||||
else "Прогноз предварительный: точность вырастет после нескольких новых записей."
|
||||
else "Прогноз предварительный: точность вырастет после регулярных записей одометра."
|
||||
)
|
||||
return OdometerPrediction(
|
||||
car_id=car_id,
|
||||
@@ -129,6 +152,57 @@ async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPredic
|
||||
predicted_30_days=predicted_30,
|
||||
avg_km_per_day=round(km_per_day, 1),
|
||||
avg_km_per_month=round(km_per_day * 30.4, 1),
|
||||
**price_prediction,
|
||||
confidence=round(confidence, 2),
|
||||
insight=insight,
|
||||
)
|
||||
|
||||
|
||||
async def predict_fuel_price(session: AsyncSession, car_id: int) -> dict[str, float | int | None]:
    """Summarise and extrapolate the fuel price paid for one car.

    Loads the date and per-litre price of the car's fuel entries, keeps
    prices strictly between 0 and 10000, and works on the eight most recent
    rows by date.

    Returns:
        A dict with the keys ``current_price_per_liter``,
        ``predicted_price_per_liter_30_days``, ``avg_price_per_liter``,
        ``price_samples`` and ``price_confidence``. When no usable rows
        exist, the three price values are ``None`` and the other two are 0.
    """
    no_data: dict[str, float | int | None] = {
        "current_price_per_liter": None,
        "predicted_price_per_liter_30_days": None,
        "avg_price_per_liter": None,
        "price_samples": 0,
        "price_confidence": 0,
    }
    query = select(
        FuelEntry.entry_date.label("date"),
        FuelEntry.price_per_liter.label("price"),
    ).where(FuelEntry.car_id == car_id)
    frame = await dataframe_from_query(session, query)
    if frame.empty:
        return no_data

    frame = frame.dropna().copy()
    if frame.empty:
        return no_data

    frame["date"] = pd.to_datetime(frame["date"])
    frame["price"] = pd.to_numeric(frame["price"], errors="coerce")
    # Values that failed numeric coercion are NaN and fail both comparisons.
    plausible = (frame["price"] > 0) & (frame["price"] < 10000)
    frame = frame[plausible].sort_values("date")
    if frame.empty:
        return no_data

    window = frame.tail(8).copy()
    window_size = len(window)
    oldest = window.iloc[0]
    newest = window.iloc[-1]

    current = float(newest["price"])
    avg = float(window["price"].mean())
    predicted = current
    confidence = min(0.72, 0.22 + window_size * 0.055)

    if window_size >= 2:
        span_days = max((newest["date"] - oldest["date"]).days, 1)
        # Linear trend between the oldest and newest price in the window,
        # extrapolated 30 days ahead and then blended with the window mean.
        change_per_day = float((newest["price"] - oldest["price"]) / span_days)
        predicted = current + change_per_day * 30
        predicted = (predicted * 0.65) + (avg * 0.35)
        if window_size >= 3:
            volatility = float(window["price"].std() / max(avg, 1))
        else:
            volatility = 0
        confidence = min(0.9, max(0.3, confidence + min(span_days, 180) / 600 - volatility))

    return {
        "current_price_per_liter": round(current, 2),
        "predicted_price_per_liter_30_days": round(max(predicted, 0), 2),
        "avg_price_per_liter": round(avg, 2),
        # Counts every usable row, not only the eight-row window.
        "price_samples": int(len(frame)),
        "price_confidence": round(confidence, 2),
    }
|
||||
|
||||
Reference in New Issue
Block a user