It works now

Kalzu Rekku
2025-06-11 00:01:10 +03:00
parent 11a565732d
commit 94c988ee7b
9 changed files with 797 additions and 30 deletions


@@ -1,15 +1,14 @@
import os
import uuid
import json
import logging
from datetime import datetime
from datetime import datetime, timezone, timedelta
from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, Field, validator, constr, conlist
from typing import Dict, List
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field, validator
from typing import Dict, List, Annotated
import uuid as uuid_lib
from collections import deque
@@ -44,7 +43,6 @@ class BufferHandler(logging.Handler):
except Exception as e:
# Log the error to stderr, to avoid recursion or filling the buffer with errors
print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr)
# Optionally, you could log record.msg or record.exc_info here for more context
class LogBuffer:
@@ -60,7 +58,6 @@ class LogBuffer:
'message': record.get('message'),
'extra': {k: v for k, v in record.items()
if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']}
# Added more common LogRecord attributes to exclude from 'extra'
})
def get_logs(self, limit=100):
@@ -80,10 +77,8 @@ formatter = jsonlogger.JsonFormatter(
logHandler.setFormatter(formatter)
# Add handlers to the root logger
# Avoid adding handlers multiple times in a uvicorn environment
if not logger.handlers:
logger.addHandler(logHandler)
# Add buffer handler to logger ONLY ONCE
buffer_handler = BufferHandler()
logger.addHandler(buffer_handler)
@@ -97,15 +92,18 @@ app = FastAPI(
# Configure templates for the web interface
templates = Jinja2Templates(directory="app/web/templates")
# Mount static files directory
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
# --- Data Models (as defined in the project spec) ---
class NodeStatusModel(BaseModel):
uptime_seconds: int
load_avg: conlist(float, min_length=3, max_length=3)
load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
memory_usage_percent: float
class PingModel(BaseModel):
pings: Dict[constr(regex=r'^[0-9a-fA-F-]{36}$'), float]
pings: Dict[Annotated[str, Field(pattern=r'^[0-9a-fA-F-]{36}$')], float]
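A quick sketch of how the new Annotated-style constraints behave under Pydantic v2 (not part of the committed file; the model is copied from above so the snippet runs standalone, and the sample values are made up):

# Sketch: exercising the Annotated length constraint on load_avg (hypothetical sample data).
from typing import Annotated, List
from pydantic import BaseModel, Field, ValidationError

class NodeStatusModel(BaseModel):
    uptime_seconds: int
    load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
    memory_usage_percent: float

ok = NodeStatusModel(uptime_seconds=120, load_avg=[0.4, 0.6, 0.8], memory_usage_percent=42.5)
print(ok.load_avg)  # [0.4, 0.6, 0.8]

try:
    NodeStatusModel(uptime_seconds=120, load_avg=[0.4], memory_usage_percent=42.5)
except ValidationError as e:
    print(e.errors()[0]["type"])  # list-length violations report "too_short" in Pydantic v2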
class StatusUpdate(BaseModel):
node: str = Field(..., description="Node UUID")
@@ -131,15 +129,61 @@ class StatusUpdate(BaseModel):
return v
# A mock database of known nodes for the auto-discovery demo
# In a real app, this would be managed more dynamically
known_nodes_db = {}
# --- Node Management and Health Logic ---
# A mock database of known nodes, now storing more comprehensive data
known_nodes_db: Dict[str, Dict] = {}
# Health calculation constants (can be tuned)
LOAD_AVG_WARNING_THRESHOLD = 1.5
LOAD_AVG_CRITICAL_THRESHOLD = 3.0
MEMORY_WARNING_THRESHOLD = 75.0
MEMORY_CRITICAL_THRESHOLD = 90.0
# If a node hasn't reported in this many seconds, it's considered critical
LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30
def get_node_health(node_data: Dict) -> str:
"""Calculates the health status based on node metrics and last seen time."""
# Check for liveness first
last_seen_str = node_data.get("last_seen")
if last_seen_str:
last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc)
time_since_last_seen = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
return "critical" # Node has not reported recently
else:
return "unknown" # Should not happen if 'last_seen' is always set
status_model_data = node_data.get("status")
if not status_model_data:
# This could happen if a node is just discovered but hasn't sent a full status update yet
return "unknown"
try:
status = NodeStatusModel(**status_model_data)
except Exception:
logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True)
return "unknown" # Or critical if parsing fails
# Check load average (using 1-minute load for primary indicator)
load_1min = status.load_avg[0]
if load_1min >= LOAD_AVG_CRITICAL_THRESHOLD:
return "critical"
elif load_1min >= LOAD_AVG_WARNING_THRESHOLD:
return "warning"
# Check memory usage
if status.memory_usage_percent >= MEMORY_CRITICAL_THRESHOLD:
return "critical"
elif status.memory_usage_percent >= MEMORY_WARNING_THRESHOLD:
return "warning"
return "healthy"
# --- API Endpoints ---
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
"""Serves the main web page which displays the Service UUID."""
"""Serves the main web page which displays the Service UUID and the node grid."""
logger.info(
"Web root accessed",
extra={'client_ip': request.client.host, 'service_uuid': SERVICE_UUID}
@@ -149,7 +193,6 @@ async def read_root(request: Request):
{"request": request, "service_uuid": SERVICE_UUID}
)
# Add the logs endpoint
@app.get("/{service_uuid}/logs")
async def get_logs(service_uuid: str, limit: int = 100):
"""Get recent logs for the service."""
@@ -175,7 +218,6 @@ async def update_node_status(
request: Request
):
"""Receives status updates from a node and returns a list of peers."""
# Log the incoming status update with structured context
logger.info(
"Received node status update",
extra={
@@ -205,7 +247,7 @@ async def update_node_status(
)
# Update ping metrics
for target_uuid, latency in status_update.pings.pings.items():
for target_uuid, latency in status_update.pings.items():
database.update_ping_metrics(
node_uuid=node_uuid,
target_uuid=target_uuid,
@@ -214,20 +256,51 @@
)
except Exception as e:
logger.error(f"Database update failed: {e}")
logger.error(f"Database update failed: {e}", exc_info=True)
# Continue processing even if DB update fails
# Auto-discovery logic
if node_uuid not in known_nodes_db:
logger.info(f"New node discovered: {node_uuid}")
# A real app would need a strategy to handle node addresses
known_nodes_db[node_uuid] = {"last_seen": datetime.utcnow().isoformat(), "ip": request.client.host}
# Auto-discovery logic and update known_nodes_db with full status
current_time_utc = datetime.now(timezone.utc)
known_nodes_db[node_uuid] = {
"last_seen": current_time_utc.isoformat(),
"ip": request.client.host,
"status": status_update.status.dict(), # Store the dict representation
# Store direct values for convenience in /nodes/status endpoint
"uptime_seconds": status_update.status.uptime_seconds,
"load_avg": status_update.status.load_avg,
"memory_usage_percent": status_update.status.memory_usage_percent
}
# Calculate health for logging purposes (it will be recalculated for /nodes/status)
health_status_for_log = get_node_health(known_nodes_db[node_uuid])
logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")
# Respond with the list of other known peers
peer_list = {uuid: data for uuid, data in known_nodes_db.items() if uuid != node_uuid}
peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
for uuid, data in known_nodes_db.items() if uuid != node_uuid}
return {"message": "Status received", "peers": peer_list}
@app.get("/nodes/status")
async def get_all_nodes_status():
"""Returns the current status of all known nodes for the UI."""
logger.info("Fetching all nodes status for UI.")
response_nodes = []
for node_uuid, data in known_nodes_db.items():
# Dynamically calculate health for each node based on its current data
current_health = get_node_health(data)
response_nodes.append({
"uuid": node_uuid,
"last_seen": data["last_seen"],
"ip": data["ip"],
"health_status": current_health,
"uptime_seconds": data.get("uptime_seconds"),
"load_avg": data.get("load_avg"),
"memory_usage_percent": data.get("memory_usage_percent")
})
return {"nodes": response_nodes}
@app.get("/health")
async def health_check():