import os
import time
from datetime import datetime, timezone

import boto3
import requests
from botocore.client import Config
from botocore.exceptions import ClientError
from django.conf import settings
from django.core.cache import cache
from django.db import connection, Error as DjangoDBError

# Repository and model types from the Model Registry app.
from model_registry.models import AiModel
from model_registry.repositories.ai_model_repository import AiModelRepository

# Module-level repository instance shared by the model health checks.
ai_model_repo = AiModelRepository()

# Base URL of the AI service (falls back to localhost for local development).
AI_SERVICE_URL = os.getenv("AI_SERVICE_INTERNAL_URL", "http://localhost:8001")


class HealthCheckError(Exception):
    """Exception type for health-check failures (not raised by the code visible here)."""


def _elapsed_ms(started):
    """Return the milliseconds elapsed since *started*, rounded to 2 decimals.

    *started* must be a ``time.perf_counter()`` reading. A monotonic clock is
    used so that wall-clock adjustments cannot produce negative or wildly
    wrong latencies.
    """
    return round((time.perf_counter() - started) * 1000, 2)


class HealthService:
    """Runs dependency probes and aggregates them into one health report.

    Every ``_check_*`` probe returns a ``(status, details)`` tuple where
    ``status`` is ``"UP"`` or ``"DOWN"``; probes report failures through that
    tuple instead of raising.
    """

    def __init__(self):
        pass

    def _check_database(self):
        """Probe the database by executing ``SELECT 1`` on the Django connection.

        Returns:
            ``("UP", message_with_latency)`` on success, otherwise
            ``("DOWN", error_message)``.
        """
        started = time.perf_counter()
        try:
            with connection.cursor() as cursor:
                cursor.execute("SELECT 1")
        except DjangoDBError as e:
            return "DOWN", f"DB Connection Error: {e}"
        except Exception as e:
            return "DOWN", f"Unknown DB Error: {e}"
        return "UP", f"Query executed successfully. Latency: {_elapsed_ms(started)}ms"

    def _check_cache(self):
        """Probe the Django cache with a short-lived write followed by a read.

        Returns:
            ``("UP", message_with_latency)`` when the value written is read
            back, otherwise ``("DOWN", reason)``.
        """
        started = time.perf_counter()
        test_key = 'health_test_key'
        try:
            # The 1-second timeout lets the probe key expire on its own.
            cache.set(test_key, 'ok', timeout=1)
            result = cache.get(test_key)
        except Exception as e:
            return "DOWN", f"Redis Error: {e}"
        if result == 'ok':
            return "UP", f"Read/Write successful. Latency: {_elapsed_ms(started)}ms"
        return "DOWN", "Failed to retrieve test key."

    def _check_minio(self):
        """Probe object storage (MinIO) through boto3's S3 client.

        Returns:
            ``("UP", message)`` when the configured bucket answers
            ``head_bucket``, otherwise ``("DOWN", reason)``.
        """
        # Resolved before the try block so every error handler can reference it.
        bucket_name = os.getenv("MODEL_BUCKET", "models")
        try:
            # NOTE(review): the fallback access key / secret below are
            # hard-coded development defaults; confirm that real deployments
            # always set MINIO_ACCESS_KEY and MINIO_SECRET_KEY.
            s3_client = boto3.client(
                "s3",
                endpoint_url=os.getenv("MINIO_ENDPOINT", "http://localhost:9000"),
                aws_access_key_id=os.getenv("MINIO_ACCESS_KEY", "minio_admin"),
                aws_secret_access_key=os.getenv("MINIO_SECRET_KEY", "minio_p@ssw0rd!"),
                # Config carries the signature version and the timeouts.
                config=Config(signature_version="s3v4", connect_timeout=5, read_timeout=10),
            )
            # head_bucket raises ClientError when the bucket is missing or
            # access is denied; it is used instead of listing all buckets.
            s3_client.head_bucket(Bucket=bucket_name)
        except ClientError as e:
            # Defensive lookups: a malformed error payload must not raise
            # from inside the handler.
            error_code = e.response.get('Error', {}).get('Code', 'Unknown')
            if error_code == '404':
                return "DOWN", f"Bucket '{bucket_name}' not found."
            if error_code == '403':
                return "DOWN", "MinIO S3 Access Denied. Check Key/Secret."
            return "DOWN", f"MinIO S3 Error (Code {error_code}): {e}"
        except Exception as e:
            return "DOWN", f"MinIO Connection Error: {e}"
        return "UP", f"Bucket '{bucket_name}' accessible via boto3."

    def _check_ai_service(self):
        """Probe the AI service at ``AI_SERVICE_URL``.

        The service counts as UP only when it answers with a success status
        and its JSON body has a truthy ``model_loaded`` field.

        Returns:
            ``("UP", message_with_latency)`` or ``("DOWN", reason)``.
        """
        started = time.perf_counter()
        try:
            response = requests.get(AI_SERVICE_URL, timeout=5)
            response.raise_for_status()
            data = response.json()
            model_loaded = data.get("model_loaded", False)
        except requests.exceptions.RequestException as e:
            return "DOWN", f"FastAPI/MONAI Service Unreachable: {e}"
        except Exception as e:
            return "DOWN", f"AI Service Check Failed: {e}"
        latency = _elapsed_ms(started)
        if model_loaded:
            return "UP", f"Service running & Model loaded. Latency: {latency}ms"
        return "DOWN", f"Service running, but Model load status: {model_loaded}. Latency: {latency}ms"

    # -----------------------------------------------
    # Per-model health checks
    # -----------------------------------------------
    def _check_single_ai_model(self, model: AiModel):
        """Call the health endpoint of a single model.

        The health URL is the model's full inference URL with any trailing
        slash removed and ``/health`` appended.

        Returns:
            ``("UP", message_with_latency)`` when the endpoint answers with a
            success status, otherwise ``("DOWN", reason)``.
        """
        health_url = model.full_inference_url().rstrip('/') + '/health'
        started = time.perf_counter()
        try:
            response = requests.get(health_url, timeout=5)
            # Raises for 4xx/5xx responses, which then count as DOWN.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            return "DOWN", f"Health Check Failed at {health_url}: {e}"
        except Exception as e:
            return "DOWN", f"Check Failed: {e}"
        return "UP", f"Model Health Check successful. Latency: {_elapsed_ms(started)}ms"

    def _check_all_ai_models(self):
        """Check every model whose status is ``ACTIVE``.

        Returns:
            ``(overall_status, model_statuses)``. ``overall_status`` is
            ``"UP"`` only when every checked model is UP; with no active
            models it is also ``"UP"``, because ``all()`` of an empty
            sequence is true.

        Raises:
            Whatever the repository query or ``full_inference_url()`` raises;
            the caller is responsible for handling it.
        """
        active_models = ai_model_repo.get_all().filter(status='ACTIVE')
        model_statuses = []
        for model in active_models:
            status, detail = self._check_single_ai_model(model)
            model_statuses.append({
                "name": f"{model.name} (v{model.model_version})",
                # Sent as a string to avoid number truncation in JavaScript clients.
                "id": str(model.id),
                "status": status,
                "endpoint": model.full_inference_url(),
                "details": detail,
            })
        overall_model_health = "UP" if all(m['status'] == "UP" for m in model_statuses) else "DOWN"
        return overall_model_health, model_statuses

    def get_system_health(self):
        """Run every probe and combine the results into one report.

        Returns:
            A dict with ``status`` (``"Healthy"`` when every probe is UP,
            otherwise ``"Degraded"``), ``services`` (per-probe results keyed
            by ``model_endpoints``, ``database``, ``cache``, ``storage`` and
            ``ai_service``) and ``last_checked`` (a UTC ISO-8601 timestamp).
        """
        results = {}

        # The model check queries the registry, so it can raise (for example
        # when the database is down). Report that as DOWN rather than letting
        # the health report itself fail.
        try:
            ai_overall_status, model_details = self._check_all_ai_models()
            results['model_endpoints'] = {
                "name": "AI Model Endpoints",
                "status": ai_overall_status,
                "models": model_details,
            }
        except Exception as e:
            results['model_endpoints'] = {
                "name": "AI Model Endpoints",
                "status": "DOWN",
                "models": [],
                "details": f"Model registry lookup failed: {e}",
            }

        # (result key, display name, probe) in the order they are reported.
        infrastructure_checks = (
            ('database', "CockroachDB", self._check_database),
            ('cache', "Redis Cache", self._check_cache),
            ('storage', "MinIO S3", self._check_minio),
            ('ai_service', "MONAI FastAPI", self._check_ai_service),
        )
        for key, display_name, probe in infrastructure_checks:
            status, details = probe()
            results[key] = {"name": display_name, "status": status, "details": details}

        all_up = all(entry["status"] == 'UP' for entry in results.values())
        return {
            "status": "Healthy" if all_up else "Degraded",
            "services": results,
            "last_checked": datetime.now(timezone.utc).isoformat(),
        }