Files
Touchh/antifroud/data_sync.py
2024-12-18 13:40:50 +09:00

261 lines
10 KiB
Python

import logging
import pymysql
from datetime import datetime
from urllib.parse import unquote, parse_qs
from django.utils import timezone
import html
from hotels.models import Room, Hotel
from .models import UserActivityLog, ExternalDBSettings
from touchh.utils.log import CustomLogger
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from decouple import config
class DatabaseConnector:
    """Wrapper around a pymysql connection configured from ``ExternalDBSettings``.

    The settings row is loaded once in ``__init__``; the actual connection is
    only opened by :meth:`connect` and released by :meth:`close`.
    """

    def __init__(self, db_settings_id):
        """Load the settings identified by ``db_settings_id`` (no connection yet).

        Raises:
            ValueError: if no ``ExternalDBSettings`` row has that id.
        """
        self.db_settings_id = db_settings_id
        self.logger = CustomLogger(name="DatabaseConnector", log_level="DEBUG").get_logger()
        self.connection = None
        self.db_settings = self.get_db_settings()

    def get_db_settings(self):
        """Return the connection settings as a plain dict.

        Keys: ``host``, ``port``, ``user``, ``password``, ``database``,
        ``table_name``.

        Raises:
            ValueError: if the settings row does not exist.
            Exception: any other lookup error is logged and re-raised as is.
        """
        try:
            settings = ExternalDBSettings.objects.get(id=self.db_settings_id)
            self.logger.info(f"Retrieved DB settings: {settings}")
            return {
                "host": settings.host,
                "port": settings.port,
                "user": settings.user,
                "password": settings.password,
                "database": settings.database,
                "table_name": settings.table_name,
            }
        except ExternalDBSettings.DoesNotExist as e:
            self.logger.error(f"Settings with ID {self.db_settings_id} not found.")
            raise ValueError("Invalid db_settings_id") from e
        except Exception as e:
            self.logger.error(f"Error retrieving settings: {e}")
            raise

    def _masked_settings(self):
        """Return a copy of the settings that is safe to write to logs."""
        masked = dict(self.db_settings)
        if "password" in masked:
            masked["password"] = "***"
        return masked

    def connect(self):
        """Open the pymysql connection described by ``self.db_settings``.

        Raises:
            ConnectionError: on any failure; the driver error is chained as
                the cause.
        """
        try:
            # Log a masked copy: the raw dict contains the plaintext password.
            self.logger.info(f"Connecting to DB with settings: {self._masked_settings()}")
            self.connection = pymysql.connect(
                host=self.db_settings["host"],
                port=self.db_settings["port"],
                user=self.db_settings["user"],
                password=self.db_settings["password"],
                database=self.db_settings["database"],
                charset="utf8mb4",
                cursorclass=pymysql.cursors.DictCursor,
            )
            self.logger.info("Database connection established successfully.")
        except pymysql.err.OperationalError as e:
            self.logger.error(f"Operational error during DB connection: {e}")
            raise ConnectionError(f"Operational error: {e}") from e
        except Exception as e:
            self.logger.error(f"Unexpected database connection error: {e}")
            raise ConnectionError(e) from e

    def execute_query(self, query, params=None):
        """Run ``query`` and return all fetched rows.

        Args:
            query: SQL text, optionally containing driver placeholders.
            params: optional values for the placeholders; passed to the
                cursor's ``execute`` so the driver does the escaping.

        Returns:
            Whatever ``cursor.fetchall()`` returns, or ``[]`` if anything
            fails (including calling this before :meth:`connect`). Failures
            are logged, not raised.
        """
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(query, params)
                return cursor.fetchall()
        except Exception as e:
            self.logger.error(f"Query execution error: {e}")
            return []

    def close(self):
        """Close the connection if one is open; safe to call repeatedly."""
        if self.connection:
            try:
                self.connection.close()
            finally:
                # Drop the handle so a second close() is a no-op instead of
                # touching an already-closed connection.
                self.connection = None
            self.logger.info("Database connection closed.")
class DataProcessor:
    """Small decoding/parsing helpers used while importing external rows.

    Every helper is best-effort: parsing problems are reported through the
    injected logger and turned into a neutral return value.
    """

    def __init__(self, logger):
        self.logger = logger

    def decode_html_entities(self, text):
        """Percent-decode ``text`` and then resolve HTML entities in it.

        Falsy values and non-string values are returned untouched.
        """
        if not text or not isinstance(text, str):
            return text
        percent_decoded = unquote(text)
        return html.unescape(percent_decoded)

    def parse_datetime(self, dt_str):
        """Convert ``dt_str`` to a timezone-aware datetime.

        Accepts a ``datetime`` (made aware only if it is naive) or a string in
        ``%Y-%m-%d %H:%M:%S`` form. Returns ``None`` for empty input or when
        parsing fails; failures are logged.
        """
        try:
            if isinstance(dt_str, datetime):
                if timezone.is_naive(dt_str):
                    return timezone.make_aware(dt_str)
                return dt_str
            if dt_str:
                naive_value = datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S")
                return timezone.make_aware(naive_value)
        except Exception as exc:
            self.logger.error(f"Datetime parsing error: {exc}")
        return None

    def url_parameters_parser(self, url_parameters):
        """Parse a query string into a flat ``{name: first_value}`` dict.

        The input is percent-decoded before being handed to ``parse_qs``.
        Returns ``{}`` for empty input or on any parsing error (logged).
        """
        try:
            if not url_parameters:
                return {}
            first_values = {}
            for name, values in parse_qs(unquote(url_parameters)).items():
                # parse_qs yields a list per key; only the first entry is kept.
                first_values[name] = values[0]
            return first_values
        except Exception as exc:
            self.logger.error(f"URL parameters parsing error: {exc}")
            return {}
class HotelRoomManager:
    """Looks up, or creates on first sight, the Hotel and Room rows for a visit."""

    def __init__(self, logger):
        self.logger = logger

    @staticmethod
    def _resolve_hotel_name(hotel_id, page_title):
        """Return the display name to use for a newly created hotel.

        Uses the HTML-unescaped ``page_title`` when it is a non-empty string,
        otherwise falls back to a generated name. ``html.unescape`` raises
        ``TypeError`` on ``None``, so the title is checked first; without the
        check the fallback could never be reached for a missing title.
        """
        title = html.unescape(page_title) if isinstance(page_title, str) else ""
        return title or f"Отель {hotel_id}"

    def get_or_create_hotel(self, hotel_id, page_title):
        """Return the Hotel with ``hotel_id``, creating it if necessary.

        Args:
            hotel_id: external hotel identifier; a falsy value skips creation.
            page_title: used as the name of a newly created hotel; may be
                ``None`` or empty, in which case a generated name is used.

        Returns:
            The Hotel instance, or ``None`` when ``hotel_id`` is missing.
        """
        if not hotel_id:
            self.logger.warning("Hotel creation skipped: missing hotel_id.")
            return None
        hotel, created = Hotel.objects.get_or_create(
            hotel_id=hotel_id,
            defaults={
                "name": self._resolve_hotel_name(hotel_id, page_title),
                "description": "Автоматически созданный отель",
            },
        )
        if created:
            self.logger.info(f"Hotel '{hotel.name}' created with hotel_id: {hotel_id}")
        else:
            self.logger.info(f"Hotel '{hotel.name}' already exists with hotel_id: {hotel_id}")
        return hotel

    def get_or_create_room(self, hotel, room_number):
        """Return the Room ``room_number`` of ``hotel``, creating it if necessary.

        A new room gets ``external_id`` set to the lower-cased
        ``"<hotel.hotel_id>_<room_number>"``.

        Returns:
            The Room instance, or ``None`` when ``hotel`` or ``room_number``
            is missing.
        """
        if not hotel:
            self.logger.warning("Room creation skipped: missing hotel.")
            return None
        if not room_number:
            self.logger.warning(f"Room creation skipped: missing room_number for hotel {hotel.name}.")
            return None
        external_id = f"{hotel.hotel_id}_{room_number}".lower()
        room, created = Room.objects.get_or_create(
            hotel=hotel,
            number=room_number,
            defaults={
                "external_id": external_id,
                "description": "Автоматически созданная комната",
            },
        )
        if created:
            self.logger.info(f"Room '{room.number}' (external_id: {external_id}) created in hotel '{hotel.name}'")
        else:
            self.logger.info(f"Room '{room.number}' already exists in hotel '{hotel.name}'")
        return room
class DataSyncManager:
    """Copies new visit rows from one external MySQL table into UserActivityLog."""

    def __init__(self, db_settings_id):
        """Build the connector and helpers for the settings row ``db_settings_id``."""
        self.logger = CustomLogger(name="DataSyncManager", log_level="DEBUG").get_logger()
        self.db_connector = DatabaseConnector(db_settings_id)
        # Keep the database settings at hand; the table name is needed for queries.
        self.db_settings = self.db_connector.db_settings
        self.data_processor = DataProcessor(self.logger)
        self.hotel_manager = HotelRoomManager(self.logger)

    def get_last_saved_record(self):
        """Return the ``id`` of the newest local UserActivityLog row, or 0 if none.

        NOTE(review): this is the local primary key, while ``fetch_new_data``
        compares it with the external table's ``id`` and rows are stored under
        ``external_id`` - confirm the two sequences are meant to line up.
        """
        record = UserActivityLog.objects.order_by("-id").first()
        return record.id if record else 0

    def _build_fetch_query(self, last_id):
        """Return the SELECT statement for rows with an id above ``last_id``."""
        # int() guarantees that only a number is spliced into the SQL text.
        safe_last_id = int(last_id)
        # Doubling backticks keeps the table name inside its quoted identifier.
        table = str(self.db_settings.get("table_name")).replace("`", "``")
        return f"""
            SELECT * FROM `{table}`
            WHERE id > {safe_last_id}
            AND url_parameters IS NOT NULL
            AND url_parameters LIKE '%utm_medium%'
            AND page_url IS NOT NULL
            ORDER BY id ASC
            LIMIT 1000;
        """

    def fetch_new_data(self, last_id):
        """Fetch up to 1000 external rows whose id is greater than ``last_id``.

        Only rows with a non-null ``page_url`` and ``url_parameters``
        containing ``utm_medium`` are selected.

        Raises:
            ValueError, TypeError: if ``last_id`` cannot be converted to int.
        """
        query = self._build_fetch_query(last_id)
        self.logger.info(f"Fetching new data with query: {query}")
        return self.db_connector.execute_query(query)

    def process_and_save_data(self, rows):
        """Create or update one UserActivityLog entry per usable row.

        For every row the hotel (``utm_content``) and room (``utm_term``) are
        looked up or created first. The log entry itself is written only when
        ``hits`` is non-zero and ``page_title`` is present. A failure in one
        row is logged and does not stop the remaining rows.
        """
        for row in rows:
            try:
                url_params = self.data_processor.decode_html_entities(row.get("url_parameters", ""))
                params = self.data_processor.url_parameters_parser(url_params)
                hotel_id = params.get("utm_content")
                room_number = params.get("utm_term")
                page_title = row.get("page_title")
                external_id = row.get("id")
                hits = row.get("hits") or 0
                hotel = self.hotel_manager.get_or_create_hotel(hotel_id, page_title)
                room = self.hotel_manager.get_or_create_room(hotel, room_number)
                page_url = row.get("page_url")
                if hits != 0 and page_title is not None:
                    UserActivityLog.objects.update_or_create(
                        external_id=external_id,
                        defaults={
                            "user_id": row.get("user_id") or 0,
                            "timestamp": row.get("timestamp"),
                            "date_time": row.get("date_time"),
                            "ip": row.get("ip") or "0.0.0.0",
                            "created": self.data_processor.parse_datetime(row.get("created")) or timezone.now(),
                            "url_parameters": url_params,
                            "page_id": room.id if room else None,
                            "page_title": html.unescape(page_title),
                            "hits": hits,
                            "page_url": html.unescape(page_url),
                        },
                    )
                else:
                    self.logger.warning("Invalid data for UserActivityLog.")
                # Logged for every row that did not raise, including rows
                # that were skipped as invalid above.
                self.logger.info(f"Record ID {external_id} processed successfully.")
            except Exception as e:
                self.logger.error(f"Error processing record ID {row.get('id')}: {e}")

    def sync(self):
        """Connect, import one batch of new rows, and always close the connection."""
        self.db_connector.connect()
        try:
            last_id = self.get_last_saved_record()
            rows = self.fetch_new_data(last_id)
            self.process_and_save_data(rows)
            self.logger.info("Sync completed.")
        finally:
            self.db_connector.close()
def scheduled_sync():
    """Run one sync pass for every active external database connection.

    Each active ``ExternalDBSettings`` row is handed to its own
    ``DataSyncManager`` on a pool of at most five worker threads. Errors
    inside a task are logged there; waiting on a task result is capped at
    300 seconds, after which a timeout is logged. Returns ``None``.
    """
    logger = CustomLogger(name="DatabaseSyncScheduler", log_level="DEBUG").get_logger()
    logger.info("Starting scheduled sync.")

    active_connections = ExternalDBSettings.objects.filter(is_active=True)
    if not active_connections.exists():
        logger.warning("No active database connections found.")
        return

    connection_count = len(active_connections)
    logger.info(f"Found {connection_count} active database connections.")

    def run_single_sync(connection_settings):
        # One task must never break the others, so every error stops here.
        try:
            logger.info(f"Syncing connection: {connection_settings.name} (ID={connection_settings.id})")
            DataSyncManager(connection_settings.id).sync()
            logger.info(f"Sync completed for connection: {connection_settings}")
        except Exception as exc:
            logger.error(f"Error syncing connection {connection_settings}: {exc}")

    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = []
        for connection_settings in active_connections:
            pending.append(pool.submit(run_single_sync, connection_settings))

        for task in pending:
            try:
                task.result(timeout=300)
            except TimeoutError:
                logger.error("Sync task timed out.")

    logger.info("Scheduled sync completed.")