# 186 lines
# 8.3 KiB
# Python

import os
import requests
import time
from django.db import connection, Error as DjangoDBError
from django.core.cache import cache
from django.conf import settings
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError
from datetime import datetime, timezone
# Import the Model Registry repository and the AiModel model class
from model_registry.repositories.ai_model_repository import AiModelRepository
from model_registry.models import AiModel
# Create the module-level repository instance (original note: only if one
# does not already exist in this file).
ai_model_repo = AiModelRepository()
# Base URL of the AI service, read from AI_SERVICE_INTERNAL_URL; falls back to
# localhost for local development.
AI_SERVICE_URL = os.getenv("AI_SERVICE_INTERNAL_URL", "http://localhost:8001")
# Original note: reuse the existing exception from model_registry.
# NOTE(review): the class below is defined locally rather than imported -
# confirm which one callers are expected to catch.
class HealthCheckError(Exception):
    """Exception type for health-check errors.

    Not raised by any code visible in this section of the file.
    """
class HealthService:
    """Run health probes against the backing services and aggregate them.

    Each infrastructure probe (``_check_database``, ``_check_cache``,
    ``_check_minio``, ``_check_ai_service``) returns a ``(status, details)``
    tuple whose ``status`` is ``"UP"`` or ``"DOWN"``. ``get_system_health``
    combines them, together with the per-model endpoint checks, into a single
    report dictionary.
    """

    def __init__(self):
        """Create the service; it keeps no per-instance state."""

    def _check_database(self):
        """Probe the Django database connection by executing ``SELECT 1``.

        Returns:
            ``("UP", message)`` including the latency in milliseconds on
            success, otherwise ``("DOWN", error description)``.
        """
        # perf_counter() is used for elapsed time because, unlike time.time(),
        # it is not affected by system clock adjustments.
        started = time.perf_counter()
        try:
            with connection.cursor() as cursor:
                cursor.execute("SELECT 1")
            latency = round((time.perf_counter() - started) * 1000, 2)
            return "UP", f"Query executed successfully. Latency: {latency}ms"
        except DjangoDBError as e:
            return "DOWN", f"DB Connection Error: {e}"
        except Exception as e:
            return "DOWN", f"Unknown DB Error: {e}"

    def _check_cache(self):
        """Probe the Django cache by writing and reading back a test key.

        The key ``health_test_key`` is written with ``timeout=1`` and is not
        explicitly deleted afterwards.

        Returns:
            ``("UP", message)`` when the value read back matches what was
            written, otherwise ``("DOWN", reason)``.
        """
        started = time.perf_counter()
        test_key = 'health_test_key'
        try:
            cache.set(test_key, 'ok', timeout=1)
            result = cache.get(test_key)
            latency = round((time.perf_counter() - started) * 1000, 2)
            if result == 'ok':
                return "UP", f"Read/Write successful. Latency: {latency}ms"
            return "DOWN", "Failed to retrieve test key."
        except Exception as e:
            return "DOWN", f"Redis Error: {e}"

    def _check_minio(self):
        """Probe the object storage (MinIO) through a boto3 S3 client.

        Endpoint, credentials and bucket name are read from the environment
        variables ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY``,
        ``MINIO_SECRET_KEY`` and ``MODEL_BUCKET``.

        Returns:
            ``("UP", message)`` when ``head_bucket`` succeeds, otherwise
            ``("DOWN", reason)`` distinguishing 404, 403 and other errors.
        """
        # Resolved before the try block so that every except handler can
        # safely mention the bucket name.
        bucket_name = os.getenv("MODEL_BUCKET", "models")
        try:
            # 1. Build the S3 client.
            # NOTE(review): the fallback credentials below are hard-coded in
            # source; consider requiring the environment variables instead.
            s3_client = boto3.client(
                "s3",
                endpoint_url=os.getenv("MINIO_ENDPOINT", "http://localhost:9000"),
                aws_access_key_id=os.getenv("MINIO_ACCESS_KEY", "minio_admin"),
                aws_secret_access_key=os.getenv("MINIO_SECRET_KEY", "minio_p@ssw0rd!"),
                # Config carries the signature version and the timeouts.
                config=Config(signature_version="s3v4", connect_timeout=5, read_timeout=10),
            )
            # 2. head_bucket is used instead of list_buckets (the original
            #    note calls it the more efficient option); it returns without
            #    raising when the bucket is reachable.
            s3_client.head_bucket(Bucket=bucket_name)
            return "UP", f"Bucket '{bucket_name}' accessible via boto3."
        except ClientError as e:
            # Tolerate a malformed error payload: a KeyError raised inside
            # this handler would escape the probe entirely.
            error_code = e.response.get('Error', {}).get('Code', 'Unknown')
            if error_code == '404':
                return "DOWN", f"Bucket '{bucket_name}' not found."
            if error_code == '403':
                return "DOWN", "MinIO S3 Access Denied. Check Key/Secret."
            return "DOWN", f"MinIO S3 Error (Code {error_code}): {e}"
        except Exception as e:
            return "DOWN", f"MinIO Connection Error: {e}"

    def _check_ai_service(self):
        """Probe the AI service (FastAPI/MONAI) at ``AI_SERVICE_URL``.

        The service counts as ``"UP"`` only when the request succeeds and the
        JSON body has a truthy ``model_loaded`` field.

        Returns:
            ``(status, details)`` with the latency in milliseconds whenever a
            response body was obtained.
        """
        started = time.perf_counter()
        try:
            response = requests.get(AI_SERVICE_URL, timeout=5)
            response.raise_for_status()
            data = response.json()
            model_loaded = data.get("model_loaded", False)
            latency = round((time.perf_counter() - started) * 1000, 2)
            if model_loaded:
                return "UP", f"Service running & Model loaded. Latency: {latency}ms"
            return "DOWN", f"Service running, but Model load status: {model_loaded}. Latency: {latency}ms"
        except requests.exceptions.RequestException as e:
            return "DOWN", f"FastAPI/MONAI Service Unreachable: {e}"
        except Exception as e:
            return "DOWN", f"AI Service Check Failed: {e}"

    # -----------------------------------------------
    # Health checks for each individual AI model
    # -----------------------------------------------
    def _check_single_ai_model(self, model: "AiModel"):
        """Call the health endpoint of a single model.

        The URL is ``model.full_inference_url()`` with any trailing slash
        removed and ``/health`` appended.

        Args:
            model: Model whose endpoint should be probed.

        Returns:
            ``("UP", message)`` including the latency in milliseconds when the
            request succeeds, otherwise ``("DOWN", reason)``.
        """
        health_url = model.full_inference_url().rstrip('/') + '/health'
        started = time.perf_counter()
        try:
            response = requests.get(health_url, timeout=5)
            # Raises HTTPError for 4xx/5xx responses; other statuses pass.
            response.raise_for_status()
            latency = round((time.perf_counter() - started) * 1000, 2)
            return "UP", f"Model Health Check successful. Latency: {latency}ms"
        except requests.exceptions.RequestException as e:
            return "DOWN", f"Health Check Failed at {health_url}: {e}"
        except Exception as e:
            return "DOWN", f"Check Failed: {e}"

    def _check_all_ai_models(self):
        """Check every model whose status is ``ACTIVE``.

        Returns:
            ``(overall, model_statuses)`` where ``model_statuses`` is a list
            of per-model dictionaries and ``overall`` is ``"UP"`` only when
            every entry is ``"UP"``. With no active models the list is empty
            and ``overall`` is ``"UP"`` (``all()`` of nothing is true).

        Raises:
            Any exception raised by the repository lookup or by
            ``full_inference_url()`` propagates to the caller.
        """
        # presumably a Django QuerySet - TODO confirm against AiModelRepository
        active_models = ai_model_repo.get_all().filter(status='ACTIVE')
        model_statuses = []
        for model in active_models:
            status, detail = self._check_single_ai_model(model)
            model_statuses.append({
                "name": f"{model.name} (v{model.model_version})",
                # Sent as a string to avoid truncation in JavaScript clients.
                "id": str(model.id),
                "status": status,
                "endpoint": model.full_inference_url(),
                "details": detail,
            })
        overall_model_health = "UP" if all(m['status'] == "UP" for m in model_statuses) else "DOWN"
        return overall_model_health, model_statuses

    def get_system_health(self):
        """Run every probe and merge the results into one report.

        Returns:
            A dictionary with ``"status"`` (``"Healthy"`` when every service
            is ``"UP"``, otherwise ``"Degraded"``), ``"services"`` (one entry
            per probe, in execution order) and ``"last_checked"`` (the
            current UTC time in ISO 8601 format).
        """
        results = {}

        # The model checks query the model registry and are the only probe
        # without their own catch-all, so guard them here: a failure must
        # degrade the report instead of aborting it.
        try:
            ai_overall_status, model_details = self._check_all_ai_models()
            results['model_endpoints'] = {
                "name": "AI Model Endpoints",
                "status": ai_overall_status,
                "models": model_details,
            }
        except Exception as e:
            results['model_endpoints'] = {
                "name": "AI Model Endpoints",
                "status": "DOWN",
                "models": [],
                "details": f"Model endpoint checks failed: {e}",
            }

        # (result key, display name, probe), run in this order.
        probes = (
            ('database', "CockroachDB", self._check_database),
            ('cache', "Redis Cache", self._check_cache),
            ('storage', "MinIO S3", self._check_minio),
            ('ai_service', "MONAI FastAPI", self._check_ai_service),
        )
        for key, display_name, probe in probes:
            status, details = probe()
            results[key] = {"name": display_name, "status": status, "details": details}

        all_up = all(entry["status"] == 'UP' for entry in results.values())
        overall_status = "Healthy" if all_up else "Degraded"

        return {
            "status": overall_status,
            "services": results,
            "last_checked": datetime.now(timezone.utc).isoformat(),
        }