# Code-viewer metadata: 186 lines, 8.3 KiB, Python
import os
|
|
import requests
|
|
import time
|
|
from django.db import connection, Error as DjangoDBError
|
|
from django.core.cache import cache
|
|
from django.conf import settings
|
|
import boto3
|
|
from botocore.client import Config
|
|
from botocore.exceptions import ClientError
|
|
from datetime import datetime, timezone
|
|
|
|
# นำเข้า Repository และ Service ของ Model Registry
|
|
from model_registry.repositories.ai_model_repository import AiModelRepository
|
|
from model_registry.models import AiModel
|
|
|
|
# Module-level Model Registry repository instance used by the health checks
# in this file.
ai_model_repo = AiModelRepository()
|
|
|
|
# Base URL of the AI service. Read from AI_SERVICE_INTERNAL_URL, falling back
# to localhost for local development.
AI_SERVICE_URL = os.environ.get("AI_SERVICE_INTERNAL_URL", "http://localhost:8001")
|
|
|
|
# ใช้ Exception เดิมจาก model_registry
|
|
class HealthCheckError(Exception):
    """Exception type for health-check failures.

    Adds no behaviour on top of ``Exception``.
    """
|
|
|
|
|
|
class HealthService:
    """Run health probes against the services this application depends on.

    The infrastructure probes (``_check_database``, ``_check_cache``,
    ``_check_minio``, ``_check_ai_service``) and ``_check_single_ai_model``
    return a ``(status, details)`` tuple where ``status`` is ``"UP"`` or
    ``"DOWN"`` and ``details`` is a human-readable string.
    ``_check_all_ai_models`` returns ``(status, list_of_model_entries)``.
    ``get_system_health`` runs every probe and folds the results into one
    report. Probes report failures as ``"DOWN"`` instead of raising, so a
    single broken dependency cannot crash the whole report.
    """

    # Timeout, in seconds, applied to outbound HTTP probes.
    HTTP_TIMEOUT_SECONDS = 5

    def __init__(self):
        """Create the service; no per-instance state is needed."""

    @staticmethod
    def _elapsed_ms(started):
        """Return milliseconds elapsed since ``started``, rounded to 2 places.

        ``started`` must come from ``time.perf_counter()``; a monotonic clock
        keeps latency figures immune to wall-clock adjustments.
        """
        return round((time.perf_counter() - started) * 1000, 2)

    @staticmethod
    def _overall_status(statuses):
        """Return ``"Healthy"`` if every status is ``"UP"``, else ``"Degraded"``."""
        return "Healthy" if all(s == "UP" for s in statuses) else "Degraded"

    def _check_database(self):
        """Probe the default Django database connection with ``SELECT 1``.

        Returns:
            ``("UP", details)`` including the measured latency on success,
            ``("DOWN", details)`` with the error text otherwise.
        """
        started = time.perf_counter()
        try:
            with connection.cursor() as cursor:
                cursor.execute("SELECT 1")
            latency = self._elapsed_ms(started)
            return "UP", f"Query executed successfully. Latency: {latency}ms"
        except DjangoDBError as e:
            return "DOWN", f"DB Connection Error: {e}"
        except Exception as e:
            return "DOWN", f"Unknown DB Error: {e}"

    def _check_cache(self):
        """Probe the Django cache with a write followed by a read.

        A short-lived key (1 second timeout) is written and read back.

        Returns:
            ``("UP", details)`` when the value round-trips,
            ``("DOWN", details)`` when it does not or the backend raises.
        """
        started = time.perf_counter()
        test_key = 'health_test_key'
        try:
            cache.set(test_key, 'ok', timeout=1)
            result = cache.get(test_key)
            latency = self._elapsed_ms(started)
            if result == 'ok':
                return "UP", f"Read/Write successful. Latency: {latency}ms"
            return "DOWN", "Failed to retrieve test key."
        except Exception as e:
            return "DOWN", f"Redis Error: {e}"

    def _check_minio(self):
        """Probe object storage (MinIO) through boto3's S3 client.

        Endpoint, credentials and bucket are read from the environment
        (``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY``, ``MINIO_SECRET_KEY``,
        ``MODEL_BUCKET``) with local-development fallbacks.

        Returns:
            ``("UP", details)`` when ``head_bucket`` succeeds,
            ``("DOWN", details)`` describing the failure otherwise.
        """
        # Resolved before the try block so every error handler can reference it.
        bucket_name = os.getenv("MODEL_BUCKET", "models")
        try:
            # NOTE(review): the fallback credentials below are hard-coded
            # defaults kept for backward compatibility; deployments should
            # always supply them through the environment.
            s3_client = boto3.client(
                "s3",
                endpoint_url=os.getenv("MINIO_ENDPOINT", "http://localhost:9000"),
                aws_access_key_id=os.getenv("MINIO_ACCESS_KEY", "minio_admin"),
                aws_secret_access_key=os.getenv("MINIO_SECRET_KEY", "minio_p@ssw0rd!"),
                # Config controls the signature version and the timeouts.
                config=Config(signature_version="s3v4", connect_timeout=5, read_timeout=10),
            )

            # head_bucket raises ClientError when the bucket is missing or
            # access is denied; no exception means the bucket is reachable.
            s3_client.head_bucket(Bucket=bucket_name)
            return "UP", f"Bucket '{bucket_name}' accessible via boto3."
        except ClientError as e:
            error_code = e.response.get('Error', {}).get('Code', 'Unknown')
            if error_code == '404':
                return "DOWN", f"Bucket '{bucket_name}' not found."
            if error_code == '403':
                return "DOWN", "MinIO S3 Access Denied. Check Key/Secret."
            return "DOWN", f"MinIO S3 Error (Code {error_code}): {e}"
        except Exception as e:
            return "DOWN", f"MinIO Connection Error: {e}"

    def _check_ai_service(self):
        """Probe the AI service at ``AI_SERVICE_URL``.

        The service counts as ``"UP"`` only when it answers with a successful
        HTTP status and its JSON body has a truthy ``model_loaded`` field.

        Returns:
            ``("UP", details)`` or ``("DOWN", details)``.
        """
        started = time.perf_counter()

        # Transport-level failures (connection, timeout, HTTP error status).
        try:
            response = requests.get(AI_SERVICE_URL, timeout=self.HTTP_TIMEOUT_SECONDS)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            return "DOWN", f"FastAPI/MONAI Service Unreachable: {e}"
        except Exception as e:
            return "DOWN", f"AI Service Check Failed: {e}"

        # Payload handling is kept separate: newer requests versions raise a
        # JSON decode error that subclasses RequestException, which would
        # otherwise be misreported as the service being unreachable.
        try:
            data = response.json()
            model_loaded = data.get("model_loaded", False)
        except Exception as e:
            return "DOWN", f"AI Service Check Failed: {e}"

        latency = self._elapsed_ms(started)
        if model_loaded:
            return "UP", f"Service running & Model loaded. Latency: {latency}ms"
        return "DOWN", f"Service running, but Model load status: {model_loaded}. Latency: {latency}ms"

    # -----------------------------------------------
    # Health checks for individual AI models
    # -----------------------------------------------
    def _check_single_ai_model(self, model: "AiModel"):
        """Call the health endpoint of a single model.

        The endpoint is the model's full inference URL with ``/health``
        appended.

        Args:
            model: Object exposing ``full_inference_url()``.

        Returns:
            ``("UP", details)`` when the endpoint answers with a successful
            HTTP status, ``("DOWN", details)`` otherwise.
        """
        health_url = model.full_inference_url().rstrip('/') + '/health'
        started = time.perf_counter()

        try:
            response = requests.get(health_url, timeout=self.HTTP_TIMEOUT_SECONDS)
            response.raise_for_status()  # passes only for a successful status
            latency = self._elapsed_ms(started)
            return "UP", f"Model Health Check successful. Latency: {latency}ms"
        except requests.exceptions.RequestException as e:
            return "DOWN", f"Health Check Failed at {health_url}: {e}"
        except Exception as e:
            return "DOWN", f"Check Failed: {e}"

    def _check_all_ai_models(self):
        """Check every model whose status is ``ACTIVE``.

        Returns:
            ``(overall, entries)`` where ``overall`` is ``"UP"`` only if every
            entry is ``"UP"`` (vacuously ``"UP"`` with no active models) and
            ``entries`` is a list of dicts with the keys ``name``, ``id``,
            ``status``, ``endpoint`` and ``details``. If the model list itself
            cannot be loaded, ``overall`` is ``"DOWN"`` and ``entries`` holds
            one placeholder entry (``id`` and ``endpoint`` are ``None``)
            carrying the error text.
        """
        try:
            # Materialise inside the try: the lookup may be lazy and only hit
            # the database on iteration, and a database outage must degrade
            # the report rather than crash it.
            active_models = list(ai_model_repo.get_all().filter(status='ACTIVE'))
        except Exception as e:
            return "DOWN", [{
                "name": "Model Registry",
                "id": None,
                "status": "DOWN",
                "endpoint": None,
                "details": f"Failed to load active models: {e}",
            }]

        model_statuses = []
        for model in active_models:
            status, detail = self._check_single_ai_model(model)
            model_statuses.append({
                "name": f"{model.name} (v{model.model_version})",
                # Sent as a string to prevent truncation in JavaScript clients.
                "id": str(model.id),
                "status": status,
                "endpoint": model.full_inference_url(),
                "details": detail,
            })

        overall_model_health = "UP" if all(m['status'] == "UP" for m in model_statuses) else "DOWN"
        return overall_model_health, model_statuses

    def get_system_health(self):
        """Run every probe and combine the results into one report.

        Returns:
            A dict with ``status`` (``"Healthy"`` when every probe is ``"UP"``,
            otherwise ``"Degraded"``), ``services`` (per-probe results keyed
            by ``model_endpoints``, ``database``, ``cache``, ``storage`` and
            ``ai_service``) and ``last_checked`` (UTC timestamp in ISO 8601
            format).
        """
        models_status, model_details = self._check_all_ai_models()
        results = {
            'model_endpoints': {
                "name": "AI Model Endpoints",
                "status": models_status,
                "models": model_details,
            },
        }

        # (result key, display name, probe) in reporting order.
        probes = (
            ('database', "CockroachDB", self._check_database),
            ('cache', "Redis Cache", self._check_cache),
            ('storage', "MinIO S3", self._check_minio),
            ('ai_service', "MONAI FastAPI", self._check_ai_service),
        )
        for key, label, probe in probes:
            status, details = probe()
            results[key] = {"name": label, "status": status, "details": details}

        return {
            "status": self._overall_status(entry["status"] for entry in results.values()),
            "services": results,
            "last_checked": datetime.now(timezone.utc).isoformat(),
        }