diff --git a/backend/api/services/__init__.py b/backend/api/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/api/services/health_service.py b/backend/api/services/health_service.py
new file mode 100644
index 0000000..55be93e
--- /dev/null
+++ b/backend/api/services/health_service.py
@@ -0,0 +1,227 @@
+"""Health probes for the backend's infrastructure dependencies.
+
+Every probe returns a ``(status, details)`` tuple whose ``status`` is either
+``"UP"`` or ``"DOWN"``; ``HealthService.get_system_health`` aggregates them
+into a single report.
+"""
+import logging
+import os
+import requests
+import time
+from django.db import connection, Error as DjangoDBError
+from django.core.cache import cache
+from django.conf import settings
+import boto3
+from botocore.client import Config
+from botocore.exceptions import ClientError
+from datetime import datetime, timezone
+
+# Model Registry repository and model.
+from model_registry.repositories.ai_model_repository import AiModelRepository
+from model_registry.models import AiModel
+
+logger = logging.getLogger(__name__)
+
+# Module-level repository instance used to look up registered AI models.
+ai_model_repo = AiModelRepository()
+
+# Base URL of the AI service (defaults to localhost for local development).
+AI_SERVICE_URL = os.getenv("AI_SERVICE_INTERNAL_URL", "http://localhost:8001")
+
+
+class HealthCheckError(Exception):
+    """Error type for health-check failures (not raised within this module)."""
+
+
+class HealthService:
+    """Runs the individual dependency probes and aggregates their results."""
+
+    @staticmethod
+    def _elapsed_ms(started):
+        """Return the milliseconds elapsed since ``started``, rounded to 2 places.
+
+        ``started`` must come from ``time.perf_counter()``, which is monotonic
+        and therefore immune to wall-clock adjustments.
+        """
+        return round((time.perf_counter() - started) * 1000, 2)
+
+    def _check_database(self):
+        """Probe the database by executing ``SELECT 1`` on the default connection.
+
+        Returns:
+            ``("UP", details)`` with the latency, or ``("DOWN", details)`` with
+            the error text.
+        """
+        started = time.perf_counter()
+        try:
+            with connection.cursor() as cursor:
+                cursor.execute("SELECT 1")
+        except DjangoDBError as e:
+            return "DOWN", f"DB Connection Error: {e}"
+        except Exception as e:
+            return "DOWN", f"Unknown DB Error: {e}"
+        latency = self._elapsed_ms(started)
+        return "UP", f"Query executed successfully. Latency: {latency}ms"
+
+    def _check_cache(self):
+        """Probe the cache with a write/read round trip of a short-lived key.
+
+        Returns:
+            ``("UP", details)`` when the value read back matches what was
+            written, otherwise ``("DOWN", details)``.
+        """
+        started = time.perf_counter()
+        test_key = 'health_test_key'
+        try:
+            cache.set(test_key, 'ok', timeout=1)
+            result = cache.get(test_key)
+        except Exception as e:
+            return "DOWN", f"Redis Error: {e}"
+        latency = self._elapsed_ms(started)
+        if result == 'ok':
+            return "UP", f"Read/Write successful. Latency: {latency}ms"
+        return "DOWN", "Failed to retrieve test key."
+
+    def _check_minio(self):
+        """Probe object storage (MinIO) by issuing ``head_bucket`` via boto3.
+
+        Returns:
+            ``("UP", details)`` when the bucket is reachable, otherwise
+            ``("DOWN", details)`` describing the S3 or connection error.
+        """
+        # Resolved before the ``try`` so every error message below can name it.
+        bucket_name = os.getenv("MODEL_BUCKET", "models")
+        try:
+            s3_client = boto3.client(
+                "s3",
+                endpoint_url=os.getenv("MINIO_ENDPOINT", "http://localhost:9000"),
+                # NOTE(review): these fallback credentials are hard-coded in
+                # source; confirm they are only ever valid for local development.
+                aws_access_key_id=os.getenv("MINIO_ACCESS_KEY", "minio_admin"),
+                aws_secret_access_key=os.getenv("MINIO_SECRET_KEY", "minio_p@ssw0rd!"),
+                # Bounded timeouts keep an unreachable MinIO from stalling the probe.
+                config=Config(signature_version="s3v4", connect_timeout=5, read_timeout=10),
+            )
+            # head_bucket raises ClientError when the bucket is missing or forbidden.
+            s3_client.head_bucket(Bucket=bucket_name)
+        except ClientError as e:
+            # Read the code defensively: a malformed error response must not
+            # raise from inside this handler.
+            error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+            if error_code == '404':
+                return "DOWN", f"Bucket '{bucket_name}' not found."
+            if error_code == '403':
+                return "DOWN", "MinIO S3 Access Denied. Check Key/Secret."
+            return "DOWN", f"MinIO S3 Error (Code {error_code}): {e}"
+        except Exception as e:
+            return "DOWN", f"MinIO Connection Error: {e}"
+        return "UP", f"Bucket '{bucket_name}' accessible via boto3."
+
+    def _check_ai_service(self):
+        """Probe the AI service at ``AI_SERVICE_URL`` and read its ``model_loaded`` flag.
+
+        Returns:
+            ``("UP", details)`` only when the service answers successfully and
+            reports a truthy ``model_loaded``; otherwise ``("DOWN", details)``.
+        """
+        started = time.perf_counter()
+        try:
+            response = requests.get(AI_SERVICE_URL, timeout=5)
+            response.raise_for_status()
+            model_loaded = response.json().get("model_loaded", False)
+        except requests.exceptions.RequestException as e:
+            return "DOWN", f"FastAPI/MONAI Service Unreachable: {e}"
+        except Exception as e:
+            return "DOWN", f"AI Service Check Failed: {e}"
+        latency = self._elapsed_ms(started)
+        if model_loaded:
+            return "UP", f"Service running & Model loaded. Latency: {latency}ms"
+        return "DOWN", f"Service running, but Model load status: {model_loaded}. Latency: {latency}ms"
+
+    def _check_single_ai_model(self, model: AiModel):
+        """Probe one model by requesting its full inference URL plus ``/health``.
+
+        Args:
+            model: The registered model whose endpoint is probed.
+
+        Returns:
+            ``("UP", details)`` on a successful HTTP response, otherwise
+            ``("DOWN", details)``.
+        """
+        health_url = model.full_inference_url().rstrip('/') + '/health'
+        started = time.perf_counter()
+        try:
+            response = requests.get(health_url, timeout=5)
+            # Raises for 4xx/5xx responses, which are reported as DOWN below.
+            response.raise_for_status()
+        except requests.exceptions.RequestException as e:
+            return "DOWN", f"Health Check Failed at {health_url}: {e}"
+        except Exception as e:
+            return "DOWN", f"Check Failed: {e}"
+        latency = self._elapsed_ms(started)
+        return "UP", f"Model Health Check successful. Latency: {latency}ms"
+
+    def _check_all_ai_models(self):
+        """Probe every model whose status is ``ACTIVE``.
+
+        Returns:
+            ``(overall, model_statuses)`` where ``overall`` is ``"UP"`` only if
+            every probed model is UP (vacuously so when there are none) and
+            ``model_statuses`` holds one dict per model.
+        """
+        try:
+            # Evaluate the query here so a registry lookup failure is reported
+            # as DOWN instead of aborting the whole health report.
+            active_models = list(ai_model_repo.get_all().filter(status='ACTIVE'))
+        except Exception:
+            logger.exception("Could not load active AI models for health check")
+            return "DOWN", []
+
+        model_statuses = []
+        for model in active_models:
+            status, detail = self._check_single_ai_model(model)
+            model_statuses.append({
+                "name": f"{model.name} (v{model.model_version})",
+                # Sent as a string to prevent truncation of the id in JavaScript.
+                "id": str(model.id),
+                "status": status,
+                "endpoint": model.full_inference_url(),
+                "details": detail,
+            })
+
+        all_up = all(entry['status'] == "UP" for entry in model_statuses)
+        return ("UP" if all_up else "DOWN"), model_statuses
+
+    def get_system_health(self):
+        """Run every probe and combine the results into one report.
+
+        Returns:
+            A dict with ``status`` (``"Healthy"`` when every probe is UP, else
+            ``"Degraded"``), ``services`` (per-probe results) and
+            ``last_checked`` (UTC timestamp in ISO 8601 format).
+        """
+        ai_overall_status, model_details = self._check_all_ai_models()
+        results = {
+            'model_endpoints': {
+                "name": "AI Model Endpoints",
+                "status": ai_overall_status,
+                "models": model_details,
+            },
+        }
+
+        probes = (
+            ('database', "CockroachDB", self._check_database),
+            ('cache', "Redis Cache", self._check_cache),
+            ('storage', "MinIO S3", self._check_minio),
+            ('ai_service', "MONAI FastAPI", self._check_ai_service),
+        )
+        for key, display_name, probe in probes:
+            probe_status, probe_details = probe()
+            results[key] = {"name": display_name, "status": probe_status, "details": probe_details}
+
+        all_up = all(entry["status"] == 'UP' for entry in results.values())
+        return {
+            "status": "Healthy" if all_up else "Degraded",
+            "services": results,
+            "last_checked": datetime.now(timezone.utc).isoformat(),
+        }
\ No newline at end of file
diff --git a/backend/api/views/__init__.py b/backend/api/views/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/api/views/health_check_view.py b/backend/api/views/health_check_view.py
new file mode 100644
index 0000000..fc5a255
--- /dev/null
+++ b/backend/api/views/health_check_view.py
@@ -0,0 +1,44 @@
+import logging
+
+from rest_framework.views import APIView
+from rest_framework.response import Response
+from rest_framework import status, permissions
+from datetime import datetime, timezone
+
+# Service layer
+from api.services.health_service import HealthService
+
+logger = logging.getLogger(__name__)
+
+# Module-level service instance shared by all requests.
+health_service = HealthService()
+
+
+class SystemHealthCheck(APIView):
+    """
+    GET /api/v1/health/
+
+    Returns the aggregated system health report: HTTP 200 when the overall
+    status is "Healthy", HTTP 503 otherwise.
+    """
+    # NOTE(review): the endpoint is unauthenticated while the report carries
+    # per-service error text and endpoints; confirm that exposure is intended.
+    permission_classes = [permissions.AllowAny]
+
+    def get(self, request):
+        """Run the health checks and map the overall status to an HTTP status."""
+        try:
+            response_data = health_service.get_system_health()
+        except Exception:
+            # Keep exception details in the server log; this endpoint is public.
+            logger.exception("Unexpected error during health check")
+            return Response(
+                {"status": "Error", "detail": "Internal Server Error during health check."},
+                status=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            )
+
+        if response_data['status'] == "Healthy":
+            http_status = status.HTTP_200_OK
+        else:
+            http_status = status.HTTP_503_SERVICE_UNAVAILABLE
+        return Response(response_data, status=http_status)
\ No newline at end of file
diff --git a/backend/core/urls.py b/backend/core/urls.py
index c3be368..b7eaefe 100644
--- a/backend/core/urls.py
+++ b/backend/core/urls.py
@@ -21,6 +21,9 @@ from rest_framework.routers import DefaultRouter
 from model_registry.views.ai_model_viewset import AiModelRegistryViewSet
 from drf_spectacular.views import SpectacularAPIView, SpectacularSwaggerView, SpectacularRedocView
 
+# Import the health-check view from the `api` app
+from api.views.health_check_view import SystemHealthCheck
+
 # 1. กำหนดตัวแปร router ก่อนใช้งาน
 router = DefaultRouter()
 
@@ -49,6 +52,9 @@ urlpatterns = [
     path('api/v1/auth/', include('djoser.urls')), # /users/ (Register/Update/Me), /users/set_password
     path('api/v1/auth/', include('djoser.urls.jwt')), # /jwt/create (Login), /jwt/refresh (Refresh Token)
 
+    # Health check endpoint: /api/v1/health/
+    path('api/v1/health/', SystemHealthCheck.as_view(), name='system-health'),
+
     # 3. รวม Router API
     path('api/v1/', include(router.urls)),
 ]