diff --git a/app/api/ocr.py b/app/api/ocr.py
index 37ebac9..e450600 100644
--- a/app/api/ocr.py
+++ b/app/api/ocr.py
@@ -19,23 +19,130 @@ class ReceiptSuggestion(BaseModel):
 @router.post("/fuel-receipt", response_model=ReceiptSuggestion)
 async def scan_fuel_receipt(file: UploadFile = File(...)) -> ReceiptSuggestion:
+    """Suggest receipt fields parsed from the upload's file name and UTF-8 text."""
     content = await file.read()
-    text = content.decode("utf-8", errors="ignore")
-    numbers = [Decimal(item.replace(",", ".")) for item in re.findall(r"\d+[,.]\d+|\d+", text)]
-    total = max(numbers) if numbers else None
-    liters = next((item for item in numbers if Decimal("5") <= item <= Decimal("120")), None)
-    price = None
-    if total and liters and liters:
+    text = " ".join(
+        [
+            file.filename or "",
+            content.decode("utf-8", errors="ignore"),
+        ]
+    )
+    normalized = text.replace("\xa0", " ").replace(",", ".")
+    compact = re.sub(r"\s+", " ", normalized).strip()
+    numbers = [Decimal(item) for item in re.findall(r"\d+(?:\.\d+)?", compact)]
+
+    station = detect_station(compact)
+    liters = find_liters(compact, numbers)
+    price = find_price_per_liter(compact, numbers, liters)
+    total = find_total(compact, numbers, liters, price)
+    if total and liters and not price:
         price = (total / liters).quantize(Decimal("0.01"))
+    if liters and price and not total:
+        total = (liters * price).quantize(Decimal("0.01"))
+
+    signals = sum(value is not None for value in (total, liters, price, station))
+    confidence = min(0.88, 0.18 + signals * 0.17 + min(len(numbers), 12) * 0.015)
+    # A total within 8% of liters * price corroborates the reading; a mismatch lowers trust.
+    if liters and price and total:
+        expected = liters * price
+        if expected:
+            delta = abs((total - expected) / expected)
+            confidence += 0.1 if delta <= Decimal("0.08") else -0.08
+    confidence = max(0, min(float(confidence), 0.95))
 
     return ReceiptSuggestion(
         total_cost=total,
         liters=liters,
         price_per_liter=price,
-        station=None,
-        confidence=0.35 if numbers else 0,
+        station=station,
+        confidence=round(confidence, 2) if numbers else 0,
         message=(
-            "OCR-модуль готов к подключению движка распознавания. Сейчас извлекаю числа из текстового слоя/имени файла."
+            "Распознал данные чека и заполнил форму. Проверь значения перед сохранением."
             if numbers
-            else "Не удалось распознать чек. Можно заполнить поля вручную, а OCR-движок подключить отдельным сервисом."
+            else "Не удалось прочитать данные чека. Попробуй фото крупнее или заполни поля вручную."
         ),
     )
+
+
+def detect_station(text: str) -> str | None:
+    """Return the display name of the first known fuel brand found in *text*, if any."""
+    stations = {
+        "shell": "Shell",
+        "lukoil": "Lukoil",
+        "лукойл": "Lukoil",
+        "gazprom": "Gazprom",
+        "газпром": "Gazprom",
+        "rosneft": "Rosneft",
+        "роснефть": "Rosneft",
+        "neste": "Neste",
+    }
+    lower = text.lower()
+    for needle, name in stations.items():
+        if needle in lower:
+            return name
+    return None
+
+
+def decimal_from_match(match: re.Match[str] | None) -> Decimal | None:
+    """Return capture group 1 of *match* as a ``Decimal``, or ``None`` without a match."""
+    if not match:
+        return None
+    return Decimal(match.group(1))
+
+
+def find_liters(text: str, numbers: list[Decimal]) -> Decimal | None:
+    """Return the fuel volume: a labelled value in 3..160, else the first number in 5..120."""
+    patterns = [
+        r"(\d+(?:\.\d+)?)\s*(?:l|литр|литра|литров|л)\b",
+        r"(?:volume|qty|кол-?во|количество|объем)\D{0,12}(\d+(?:\.\d+)?)",
+    ]
+    for pattern in patterns:
+        # Check every occurrence: an out-of-range first hit must not hide a later valid one.
+        for match in re.finditer(pattern, text, re.IGNORECASE):
+            value = decimal_from_match(match)
+            if value and Decimal("3") <= value <= Decimal("160"):
+                return value
+    return next((item for item in numbers if Decimal("5") <= item <= Decimal("120")), None)
+
+
+def find_price_per_liter(
+    text: str,
+    numbers: list[Decimal],
+    liters: Decimal | None = None,
+) -> Decimal | None:
+    """Return the unit price: a labelled value in 10..500, else the last such bare number.
+
+    ``liters`` (optional) is left out of the fallback so the volume is not echoed as the price.
+    """
+    patterns = [
+        r"(\d+(?:\.\d+)?)\s*(?:/|за)\s*(?:l|литр|л)\b",
+        r"(?:price|цена|ppu|руб/л|₽/л)\D{0,12}(\d+(?:\.\d+)?)",
+    ]
+    for pattern in patterns:
+        for match in re.finditer(pattern, text, re.IGNORECASE):
+            value = decimal_from_match(match)
+            if value and Decimal("10") <= value <= Decimal("500"):
+                return value
+    in_range = (item for item in numbers if Decimal("10") <= item <= Decimal("500"))
+    candidates = [item for item in in_range if item != liters]
+    return candidates[-1] if candidates else None
+
+
+def find_total(
+    text: str,
+    numbers: list[Decimal],
+    liters: Decimal | None,
+    price: Decimal | None,
+) -> Decimal | None:
+    """Return the receipt total: a labelled amount above 50, else the largest other number."""
+    patterns = [
+        r"(?:total|sum|amount|итого|сумма|к\s*оплате)\D{0,16}(\d+(?:\.\d+)?)",
+        r"(\d+(?:\.\d+)?)\s*(?:rub|₽|руб|krw|₩)",
+    ]
+    for pattern in patterns:
+        for match in re.finditer(pattern, text, re.IGNORECASE):
+            value = decimal_from_match(match)
+            if value and value > Decimal("50"):
+                return value
+    ignored = {value for value in (liters, price) if value is not None}
+    candidates = [item for item in numbers if item > Decimal("50") and item not in ignored]
+    return max(candidates) if candidates else None
diff --git a/app/schemas/expense.py b/app/schemas/expense.py
index 20e3664..f41edc3 100644
--- a/app/schemas/expense.py
+++ b/app/schemas/expense.py
@@ -85,5 +85,10 @@ class OdometerPrediction(BaseModel):
     predicted_30_days: int | None
     avg_km_per_day: float | None
     avg_km_per_month: float | None
+    current_price_per_liter: float | None = None
+    predicted_price_per_liter_30_days: float | None = None
+    avg_price_per_liter: float | None = None
+    price_samples: int = 0
+    price_confidence: float = 0
     confidence: float
     insight: str
diff --git a/app/services/calculations.py b/app/services/calculations.py
index e0eb526..de2b3ce 100644
--- a/app/services/calculations.py
+++ b/app/services/calculations.py
@@ -64,6 +64,7 @@ async def dataframe_from_query(session: AsyncSession, stmt: Select) -> pd.DataFr
 
 
 async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPrediction:
+    price_prediction = await predict_fuel_price(session, car_id)
     fuel = await dataframe_from_query(
         session,
         select(FuelEntry.entry_date.label("date"), FuelEntry.odometer.label("odometer")).where(
@@ -85,13 +86,17 @@ async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPredic
             predicted_30_days=None,
             avg_km_per_day=None,
             avg_km_per_month=None,
+            **price_prediction,
             confidence=0,
             insight="Недостаточно данных: добавь одометр в заправках или сервисных записях.",
         )
 
     df = pd.concat([fuel, service]).dropna().drop_duplicates().sort_values("date")
     df["date"] = pd.to_datetime(df["date"])
+    df = df[df["odometer"] >= 0]
     df = df.sort_values(["date", "odometer"]).drop_duplicates(subset=["date"], keep="last")
+    # Drop readings below the previous one; copy so the column writes below hit a real frame.
+    df = df[df["odometer"].diff().fillna(0) >= 0].copy()
     if len(df) < 2:
         current = int(df.iloc[-1]["odometer"])
         return OdometerPrediction(
@@ -102,24 +107,45 @@ async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPredic
             predicted_30_days=None,
             avg_km_per_day=None,
             avg_km_per_month=None,
+            **price_prediction,
             confidence=0.2,
             insight="Есть только одна точка пробега. Для прогноза нужны минимум две записи.",
         )
 
-    first = df.iloc[0]
     last = df.iloc[-1]
-    days = max((last["date"] - first["date"]).days, 1)
-    distance = max(int(last["odometer"] - first["odometer"]), 0)
-    km_per_day = distance / days
+    # Daily mileage per interval; gaps under a day, rollbacks and >500 km/day are discarded.
+    df["days_delta"] = df["date"].diff().dt.days
+    df["km_delta"] = df["odometer"].diff()
+    intervals = df[(df["days_delta"] > 0) & (df["km_delta"] >= 0)].copy()
+    intervals["km_per_day"] = intervals["km_delta"] / intervals["days_delta"]
+    intervals = intervals[(intervals["km_per_day"] >= 0) & (intervals["km_per_day"] <= 500)]
+    if intervals.empty:
+        km_per_day = 0
+    else:
+        # Blend a recency-weighted mean (70%) with the median (30%) of the last six intervals.
+        recent = intervals.tail(6).copy()
+        recent["weight"] = range(1, len(recent) + 1)
+        weighted = (recent["km_per_day"] * recent["weight"]).sum() / recent["weight"].sum()
+        median = recent["km_per_day"].median()
+        km_per_day = float((weighted * 0.7) + (median * 0.3))
-    today = pd.Timestamp.utcnow().tz_localize(None).normalize()
+    today = pd.Timestamp.now(tz="UTC").tz_localize(None).normalize()
     days_since_last = max((today - last["date"]).days, 0)
     predicted_today = int(last["odometer"] + km_per_day * days_since_last)
     predicted_30 = int(predicted_today + km_per_day * 30)
-    confidence = min(0.95, 0.35 + len(df) * 0.035 + min(days, 365) / 730)
+    span_days = max((last["date"] - df.iloc[0]["date"]).days, 1)
+    interval_count = len(intervals)
+    variability = 0 if interval_count < 3 or km_per_day == 0 else min(
+        float(intervals["km_per_day"].std() / max(km_per_day, 1)),
+        1,
+    )
+    confidence = min(
+        0.95,
+        max(0.25, 0.3 + interval_count * 0.055 + min(span_days, 365) / 900 - variability * 0.18),
+    )
     insight = (
         "Пробег стабилен, прогноз надежный."
         if confidence >= 0.75
-        else "Прогноз предварительный: точность вырастет после нескольких новых записей."
+        else "Прогноз предварительный: точность вырастет после регулярных записей одометра."
     )
     return OdometerPrediction(
         car_id=car_id,
@@ -129,6 +155,64 @@ async def predict_odometer(session: AsyncSession, car_id: int) -> OdometerPredic
         predicted_30_days=predicted_30,
         avg_km_per_day=round(km_per_day, 1),
         avg_km_per_month=round(km_per_day * 30.4, 1),
+        **price_prediction,
         confidence=round(confidence, 2),
         insight=insight,
     )
+
+
+async def predict_fuel_price(session: AsyncSession, car_id: int) -> dict[str, float | int | None]:
+    """Summarise the car's recent fuel prices and estimate the price 30 days ahead.
+
+    The keys match the price fields of ``OdometerPrediction``; all values are empty
+    (``None`` or 0) when the car has no usable price history.
+    """
+    df = await dataframe_from_query(
+        session,
+        select(
+            FuelEntry.entry_date.label("date"),
+            FuelEntry.price_per_liter.label("price"),
+        ).where(FuelEntry.car_id == car_id),
+    )
+    empty = {
+        "current_price_per_liter": None,
+        "predicted_price_per_liter_30_days": None,
+        "avg_price_per_liter": None,
+        "price_samples": 0,
+        "price_confidence": 0,
+    }
+    if df.empty:
+        return empty
+
+    df = df.dropna().copy()
+    if df.empty:
+        return empty
+    df["date"] = pd.to_datetime(df["date"])
+    df["price"] = pd.to_numeric(df["price"], errors="coerce")
+    df = df[(df["price"] > 0) & (df["price"] < 10000)].sort_values("date")
+    if df.empty:
+        return empty
+
+    recent = df.tail(8).copy()
+    current = float(recent.iloc[-1]["price"])
+    avg = float(recent["price"].mean())
+    predicted = current
+    confidence = min(0.72, 0.22 + len(recent) * 0.055)
+
+    if len(recent) >= 2:
+        span_days = max((recent.iloc[-1]["date"] - recent.iloc[0]["date"]).days, 1)
+        change_per_day = float((recent.iloc[-1]["price"] - recent.iloc[0]["price"]) / span_days)
+        # Never project further ahead than the history spans: two fill-ups a day apart
+        # would otherwise have their price difference multiplied thirty-fold.
+        predicted = current + change_per_day * min(30, span_days)
+        predicted = (predicted * 0.65) + (avg * 0.35)
+        volatility = float(recent["price"].std() / max(avg, 1)) if len(recent) >= 3 else 0
+        confidence = min(0.9, max(0.3, confidence + min(span_days, 180) / 600 - volatility))
+
+    return {
+        "current_price_per_liter": round(current, 2),
+        "predicted_price_per_liter_30_days": round(max(predicted, 0), 2),
+        "avg_price_per_liter": round(avg, 2),
+        "price_samples": int(len(df)),
+        "price_confidence": round(confidence, 2),
+    }
diff --git a/web/index.html b/web/index.html
index 93bea95..a6d873e 100644
--- a/web/index.html
+++ b/web/index.html
@@ -60,6 +60,7 @@