Bot become a Community Guard & Post send manager

added: dictionary support for censore
message/user management with dict triggers
This commit is contained in:
2025-08-22 21:44:14 +09:00
parent efdafb0efa
commit c16ec54891
27 changed files with 1746 additions and 184 deletions

338
app/moderation/engine.py Normal file
View File

@@ -0,0 +1,338 @@
# app/moderation/engine.py
import re
from typing import List, Optional
from datetime import datetime, timedelta
from urllib.parse import urlparse
# Быстрый и стабильный хеш контента
try:
import xxhash
def _hash_init():
return xxhash.xxh3_128()
def _hash_hex(h): # noqa: ANN001
return h.hexdigest()
except Exception: # fallback на hashlib
import hashlib
def _hash_init():
return hashlib.sha256()
def _hash_hex(h): # noqa: ANN001
return h.hexdigest()
# Быстрый поиск plain-терминов
try:
import ahocorasick # pyahocorasick
except Exception:
ahocorasick = None # не критично — останутся только regex-словаря
from sqlalchemy import select, func
from app.moderation.cache import TTLCache
from app.infra.redis_client import get_redis
from app.db.models import (
SecurityPolicy, ChatSecurity, Delivery,
SpamDictionary, DictionaryEntry, PolicyDictionaryLink, DomainRule,
ModerationLog, UserStrike, MessageEvent,
)
URL_RE = re.compile(r'https?://[^\s)]+', re.IGNORECASE)
MENTION_RE = re.compile(r'@\w+', re.UNICODE)
# Кеши (уменьшают число запросов к БД и компиляций)
policy_cache = TTLCache(ttl_seconds=60, max_size=4096) # chat_id -> snapshot(dict)
dict_cache = TTLCache(ttl_seconds=60, max_size=512) # policy_id -> (ac_automaton|None, [regex...])
domain_cache = TTLCache(ttl_seconds=60, max_size=1024) # policy_id -> (whitelist_set, blacklist_set)
def snapshot_policy(p: SecurityPolicy) -> dict:
return {
"id": p.id,
"cooldown_seconds": p.cooldown_seconds,
"duplicate_window_seconds": p.duplicate_window_seconds,
"max_links": p.max_links,
"max_mentions": p.max_mentions,
"use_whitelist": p.use_whitelist,
"block_adult": p.block_adult,
"block_spam": p.block_spam,
"block_scam": p.block_scam,
"block_profanity": p.block_profanity,
"enforce_action_default": p.enforce_action_default,
"timeout_minutes": p.timeout_minutes,
"strikes_to_warn": p.strikes_to_warn,
"strikes_to_timeout": p.strikes_to_timeout,
"strikes_to_ban": p.strikes_to_ban,
"user_msg_per_minute": p.user_msg_per_minute,
}
def compute_content_hash(text: str, media_ids: List[str]) -> str:
h = _hash_init()
h.update(text or "")
for m in media_ids or []:
h.update("|")
h.update(m or "")
return _hash_hex(h)
def _find_domains(text: str) -> list[str]:
domains = []
for m in URL_RE.findall(text or ""):
try:
d = urlparse(m).netloc.lower()
if d.startswith("www."):
d = d[4:]
if d:
domains.append(d)
except Exception:
pass
return domains
def get_policy_for_chat(session, chat_id: int) -> Optional[SecurityPolicy]:
"""Возвращает активную (enabled) политику для чата или None."""
snap = policy_cache.get(chat_id)
if snap:
return session.get(SecurityPolicy, snap["id"])
cs = session.execute(
select(ChatSecurity).where(
ChatSecurity.chat_id == chat_id,
ChatSecurity.enabled.is_(True),
)
).scalar_one_or_none()
if not cs:
return None
p = session.get(SecurityPolicy, cs.policy_id)
if p:
policy_cache.set(chat_id, snapshot_policy(p))
return p
def _active_dicts(session, policy: SecurityPolicy) -> list[SpamDictionary]:
# Явно привязанные к политике словари
linked = session.execute(
select(SpamDictionary)
.join(PolicyDictionaryLink, PolicyDictionaryLink.dictionary_id == SpamDictionary.id)
.where(PolicyDictionaryLink.policy_id == policy.id)
).scalars().all()
# Глобальные словари включённых категорий
cats = []
if policy.block_adult: cats.append("adult")
if policy.block_spam: cats.append("spam")
if policy.block_scam: cats.append("scam")
if policy.block_profanity: cats.append("profanity")
globals_by_cat = session.execute(
select(SpamDictionary).where(
SpamDictionary.owner_user_id.is_(None),
SpamDictionary.category.in_(cats) if cats else False # если пусто — не брать
)
).scalars().all()
# unique по id
got = {d.id: d for d in (linked + globals_by_cat)}
return list(got.values())
def _compile_dicts(session, policy: SecurityPolicy):
cached = dict_cache.get(policy.id)
if cached is not None:
return cached
dicts = _active_dicts(session, policy)
if not dicts:
dict_cache.set(policy.id, (None, []))
return (None, [])
ids = [d.id for d in dicts]
entries = session.execute(
select(DictionaryEntry).where(DictionaryEntry.dictionary_id.in_(ids))
).scalars().all()
ac = None
regex_list: list[re.Pattern] = []
if entries and ahocorasick is not None:
ac = ahocorasick.Automaton()
plain_count = 0
kinds = {d.id: d.kind for d in dicts}
for e in entries:
use_regex = e.is_regex or kinds.get(e.dictionary_id) == "regex"
if use_regex:
try:
regex_list.append(re.compile(e.pattern, re.IGNORECASE))
except re.error:
continue
else:
if ac is not None:
try:
term = (e.pattern or "").lower()
if term:
ac.add_word(term, term)
plain_count += 1
except Exception:
continue
if ac is not None and plain_count > 0:
ac.make_automaton()
else:
ac = None
dict_cache.set(policy.id, (ac, regex_list))
return ac, regex_list
def _domain_sets(session, policy: SecurityPolicy) -> tuple[set[str], set[str]]:
cached = domain_cache.get(policy.id)
if cached is not None:
return cached
wl = session.execute(
select(DomainRule).where(DomainRule.policy_id == policy.id, DomainRule.kind == "whitelist")
).scalars().all()
bl = session.execute(
select(DomainRule).where(DomainRule.policy_id == policy.id, DomainRule.kind == "blacklist")
).scalars().all()
wl_set = {r.domain for r in wl}
bl_set = {r.domain for r in bl}
domain_cache.set(policy.id, (wl_set, bl_set))
return wl_set, bl_set
# ==========================
# Outgoing: проверка рассылки
# ==========================
def check_message_allowed(session, chat_id: int, owner_user_id: int, text: str, media_ids: List[str]):
"""
Проверка перед отправкой сообщения в конкретный чат по привязанной политике.
Возвращает кортеж:
(ok: bool, reasons: list[str], content_hash: str)
Где reasons — причины блокировки (если ok == False), а content_hash — хеш
контента (текст + список media_id) для анти-дубликатов.
"""
policy = get_policy_for_chat(session, chat_id)
reasons: list[str] = []
content_hash = compute_content_hash(text or "", media_ids or [])
# Если к чату не привязана политика — разрешаем
if not policy:
return True, reasons, content_hash
# 1) Кулдаун между отправками в этот чат
last = session.execute(
select(Delivery)
.where(Delivery.chat_id == chat_id, Delivery.status == "sent")
.order_by(Delivery.created_at.desc())
).scalars().first()
if last:
delta = datetime.utcnow() - last.created_at
if delta.total_seconds() < policy.cooldown_seconds:
reasons.append(f"cooldown<{policy.cooldown_seconds}s")
# 2) Дубликаты за окно
if policy.duplicate_window_seconds > 0:
since = datetime.utcnow() - timedelta(seconds=policy.duplicate_window_seconds)
dupe = session.execute(
select(Delivery).where(
Delivery.chat_id == chat_id,
Delivery.content_hash == content_hash,
Delivery.created_at >= since,
Delivery.status == "sent",
)
).scalars().first()
if dupe:
reasons.append("duplicate")
# 3) Лимиты ссылок и упоминаний
links = URL_RE.findall(text or "")
if policy.max_links >= 0 and len(links) > policy.max_links:
reasons.append(f"links>{policy.max_links}")
mentions = MENTION_RE.findall(text or "")
if policy.max_mentions >= 0 and len(mentions) > policy.max_mentions:
reasons.append(f"mentions>{policy.max_mentions}")
# 4) Доменные правила
wl, bl = _domain_sets(session, policy)
domains = _find_domains(text or "")
if policy.use_whitelist and wl:
bad = [d for d in domains if d not in wl]
if bad:
reasons.append("not_whitelisted:" + ",".join(sorted(set(bad))))
else:
bad = [d for d in domains if d in bl]
if bad:
reasons.append("blacklisted:" + ",".join(sorted(set(bad))))
# 5) Словари (plain + regex)
ac, regex_list = _compile_dicts(session, policy)
if ac is not None:
lo = (text or "").lower()
for _, _term in ac.iter(lo):
reasons.append("dictionary_match")
break
if not reasons and regex_list:
for r in regex_list:
if r.search(text or ""):
reasons.append("dictionary_match")
break
return (len(reasons) == 0), reasons, content_hash
# ==========================
# Ниже — helpers для модерации входящих (если используете «страж» в группе)
# ==========================
async def redis_rate_check(chat_id: int, user_id: int, per_minute: int) -> bool:
"""True — если укладывается в лимит per_minute сообщений/минуту."""
if per_minute <= 0:
return True
r = await get_redis()
if r is None:
return True
key = f"rl:{chat_id}:{user_id}"
pipe = r.pipeline()
pipe.incr(key, 1)
pipe.expire(key, 60)
cnt, _ = await pipe.execute()
return int(cnt) <= per_minute
async def redis_dupe_check(chat_id: int, user_id: int, content_hash: str, window_s: int) -> bool:
"""True — если не дубликат за окно window_s (сек.), иначе False."""
if window_s <= 0:
return True
r = await get_redis()
if r is None:
return True
key = f"dupe:{chat_id}:{user_id}:{content_hash}"
ok = await r.set(key, "1", ex=window_s, nx=True)
return ok is True
def add_strike_and_decide_action(session, policy: SecurityPolicy, chat_id: int, tg_user_id: int) -> str:
"""
Увеличивает страйки и возвращает действие: warn|timeout|ban|delete|none
(эскалация по порогам политики).
"""
us = session.execute(
select(UserStrike).where(UserStrike.chat_id == chat_id, UserStrike.tg_user_id == tg_user_id)
).scalar_one_or_none()
if not us:
us = UserStrike(chat_id=chat_id, tg_user_id=tg_user_id, strikes=0)
session.add(us)
session.commit()
session.refresh(us)
us.strikes += 1
us.updated_at = datetime.utcnow()
session.commit()
if us.strikes >= policy.strikes_to_ban:
return "ban"
if us.strikes >= policy.strikes_to_timeout:
return "timeout"
if us.strikes >= policy.strikes_to_warn:
return "warn"
return policy.enforce_action_default or "delete"