Gemini, trying to make this "done".

Kalzu Rekku
2025-06-11 22:27:46 +03:00
parent d93b0ee4ee
commit 1cbf9311e5
7 changed files with 300 additions and 135 deletions


@@ -2,8 +2,8 @@ import os
import uuid
import json
import logging
from datetime import datetime, timezone, timedelta
from fastapi import FastAPI, Request, status
from datetime import datetime, timezone
from fastapi import FastAPI, Request, status, Query
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
@@ -19,38 +19,33 @@ import sys
from .database import RRDDatabase
# --- Service Configuration ---
# Generate a unique Service UUID on startup, or get it from an environment variable
SERVICE_UUID = os.environ.get("SERVICE_UUID", str(uuid.uuid4()))
database = RRDDatabase()
# --- Logging Configuration ---
# Get the root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Custom handler to capture logs
class BufferHandler(logging.Handler):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Instantiate the formatter once for efficiency
self.formatter = jsonlogger.JsonFormatter()
def emit(self, record):
try:
# Format the record as JSON string, then parse to dict
# Format the record as a JSON string and then parse it back to a dict
# This ensures consistency with the jsonlogger's output format
log_entry = json.loads(self.formatter.format(record))
log_buffer.add_log(log_entry)
except Exception as e:
# Log the error to stderr to avoid recursion or filling the buffer with error records
print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr)
class LogBuffer:
def __init__(self, maxlen=1000):
self.buffer = deque(maxlen=maxlen)
def add_log(self, record):
# Ensure 'asctime' is present or handle its absence
# Assuming 'record' here is already a dictionary parsed from the JSON log string
timestamp = record.get('asctime') or datetime.now(timezone.utc).isoformat()
self.buffer.append({
'timestamp': timestamp,
@@ -60,28 +55,40 @@ class LogBuffer:
if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']}
})
def get_logs(self, limit=100):
return list(self.buffer)[-limit:]
def get_logs(self, limit=100, level=None, since=None):
logger.debug(f"Fetching logs with limit={limit}, level={level}, since={since}")
logs = list(self.buffer)
# Apply level filter
if level and level.strip():
level = level.upper()
valid_levels = {'INFO', 'WARNING', 'ERROR', 'DEBUG'} # Added DEBUG for completeness
if level in valid_levels:
logs = [log for log in logs if log['level'].upper() == level]
else:
logger.warning(f"Invalid log level: {level}")
# Apply since filter
if since:
try:
# Handle 'Z' for UTC and ensure timezone awareness for comparison
since_dt = datetime.fromisoformat(since.replace('Z', '+00:00')).astimezone(timezone.utc)
logs = [log for log in logs if
datetime.fromisoformat(log['timestamp'].replace('Z', '+00:00')).astimezone(timezone.utc) >= since_dt]
except ValueError:
logger.warning(f"Invalid 'since' timestamp: {since}")
logger.debug(f"Returning {len(logs[-limit:])} logs")
return logs[-limit:]
# Create global log buffer
log_buffer = LogBuffer()
# Use a handler that streams to stdout
logHandler = logging.StreamHandler(sys.stdout)  # send JSON logs to stdout (StreamHandler defaults to stderr)
# Create a JSON formatter and add it to the handler
# The format string adds default log attributes to the JSON output
formatter = jsonlogger.JsonFormatter(
'%(asctime)s %(name)s %(levelname)s %(message)s'
)
logHandler.setFormatter(formatter)
# Add handlers to the root logger
if not logger.handlers:
logger.addHandler(logHandler)
buffer_handler = BufferHandler()
logger.addHandler(buffer_handler)
logger.addHandler(BufferHandler())
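# Illustrative sketch, not part of this commit: how a record flows through the pipeline
# set up above. The stream handler writes each record to the console as one JSON line,
# while BufferHandler re-parses the same JSON into log_buffer, so the filters added to
# LogBuffer.get_logs() in this commit can be exercised like this:
def _demo_log_buffer_usage():
    logger.info("demo entry", extra={"component": "demo"})
    # Keep only ERROR records, newest last, at most 10 of them.
    errors = log_buffer.get_logs(limit=10, level="ERROR")
    # Keep records stamped at or after the given ISO-8601 instant (a trailing 'Z' is accepted).
    recent = log_buffer.get_logs(since="2025-06-11T00:00:00Z")
    return errors, recent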
# --- FastAPI Application ---
app = FastAPI(
@@ -89,14 +96,10 @@ app = FastAPI(
description=f"A distributed monitoring system. Service UUID: {SERVICE_UUID}"
)
# Configure templates for the web interface
templates = Jinja2Templates(directory="app/web/templates")
# Mount static files directory
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
# --- Data Models (as defined in the project spec) ---
# --- Data Models ---
class NodeStatusModel(BaseModel):
uptime_seconds: int
load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
@@ -128,50 +131,41 @@ class StatusUpdate(BaseModel):
raise ValueError(f'Invalid UUID format in pings: {key}')
return v
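# Illustrative sketch, not part of this commit: a request body the StatusUpdate model
# above should accept. The field names (timestamp, status, pings) are inferred from how
# the model is used in update_node_status() below; the UUID and numbers are made up, and
# the real model may define additional fields not visible in this diff.
_example_status_update = {
    "timestamp": "2025-06-11T19:27:46Z",
    "status": {
        "uptime_seconds": 3600,
        "load_avg": [0.42, 0.35, 0.28],
        "memory_usage_percent": 41.5,
    },
    "pings": {"2f1d9b6e-8c4a-4e2b-9f3d-7a5c1e0b4d6f": 12.7},  # peer UUID -> latency in ms
}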
# --- Node Management and Health Logic ---
# A mock database of known nodes, now storing more comprehensive data
known_nodes_db: Dict[str, Dict] = {}
# Health calculation constants (can be tuned)
LOAD_AVG_WARNING_THRESHOLD = 1.5
LOAD_AVG_CRITICAL_THRESHOLD = 3.0
MEMORY_WARNING_THRESHOLD = 75.0
MEMORY_CRITICAL_THRESHOLD = 90.0
# If a node hasn't reported in this many seconds, it's considered critical
LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30
def get_node_health(node_data: Dict) -> str:
"""Calculates the health status based on node metrics and last seen time."""
# Check for liveness first
last_seen_str = node_data.get("last_seen")
if last_seen_str:
last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc)
time_since_last_seen = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
return "critical" # Node has not reported recently
return "critical"
else:
return "unknown" # Should not happen if 'last_seen' is always set
return "unknown"
status_model_data = node_data.get("status")
if not status_model_data:
# This could happen if a node is just discovered but hasn't sent a full status update yet
return "unknown"
return "unknown"
try:
status = NodeStatusModel(**status_model_data)
except Exception:
logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True)
return "unknown" # Or critical if parsing fails
return "unknown"
# Check load average (using 1-minute load for primary indicator)
load_1min = status.load_avg[0]
if load_1min >= LOAD_AVG_CRITICAL_THRESHOLD:
return "critical"
elif load_1min >= LOAD_AVG_WARNING_THRESHOLD:
return "warning"
# Check memory usage
if status.memory_usage_percent >= MEMORY_CRITICAL_THRESHOLD:
return "critical"
elif status.memory_usage_percent >= MEMORY_WARNING_THRESHOLD:
@@ -179,53 +173,108 @@ def get_node_health(node_data: Dict) -> str:
return "healthy"
# --- API Endpoints ---
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
"""Serves the main web page which displays the Service UUID and the node grid."""
# Use X-Forwarded-For if available, otherwise client.host
client_ip = request.headers.get("x-forwarded-for", request.client.host).split(",")[0].strip()  # first hop if the header lists several proxies
logger.info(
"Web root accessed",
extra={'client_ip': request.client.host, 'service_uuid': SERVICE_UUID}
extra={'client_ip': client_ip, 'service_uuid': SERVICE_UUID}
)
return templates.TemplateResponse(
"index.html",
{"request": request, "service_uuid": SERVICE_UUID}
{
"request": request,
"service_uuid": SERVICE_UUID,
"url_for": request.url_for, # Pass url_for for dynamic URL generation
"root_path": request.scope.get('root_path', '') # Pass root_path for JS base URL
}
)
@app.get("/{service_uuid}/logs")
async def get_logs(
request: Request,
service_uuid: str,
limit: int = 100,
format: str = Query(None, description="Response format: 'json' for JSON, default is HTML"),
level: str = Query(None, description="Filter logs by level: INFO, WARNING, ERROR"),
since: str = Query(None, description="Fetch logs since ISO timestamp, e.g., 2025-06-11T13:32:00")
):
# Use X-Forwarded-For if available, otherwise client.host
client_ip = request.headers.get("x-forwarded-for", request.client.host).split(",")[0].strip()  # first hop if the header lists several proxies
logger.info(
"Logs endpoint accessed",
extra={
'service_uuid': service_uuid,
'format': format,
'level': level,
'since': since,
'limit': limit,
'client_ip': client_ip
}
)
@app.get("/{service_uuid}/logs", response_class=HTMLResponse)
async def get_logs(request: Request, service_uuid: str, limit: int = 100):
"""Serve the logs web page with recent logs for the service."""
if service_uuid != SERVICE_UUID:
logger.warning(f"Invalid service UUID: {service_uuid}")
return JSONResponse(
status_code=404,
content={"error": "Service UUID not found"}
)
logs = log_buffer.get_logs(limit)
return templates.TemplateResponse(
"logs.html",
{
"request": request,
"service_uuid": service_uuid,
"logs": logs,
"log_count": len(logs)
}
)
@app.put("/{service_uuid}/{node_uuid}/", status_code=status.HTTP_200_OK)
try:
logs = log_buffer.get_logs(limit=limit, level=level, since=since)
log_data = {
"service_uuid": service_uuid,
"log_count": len(logs),
"logs": logs
}
logger.debug(f"Fetched {len(logs)} logs for response")
except Exception as e:
logger.error(f"Error fetching logs: {e}", exc_info=True)
return JSONResponse(
status_code=500,
content={"error": "Failed to fetch logs"}
)
if format == "json":
logger.debug("Returning JSON response")
return JSONResponse(content=log_data)
logger.debug("Rendering logs.html template")
try:
return templates.TemplateResponse(
"logs.html",
{
"request": request,
"service_uuid": service_uuid,
"logs": logs,
"log_count": len(logs),
"url_for": request.url_for, # Pass url_for for dynamic URL generation
"root_path": request.scope.get('root_path', '') # Pass root_path for JS base URL
}
)
except Exception as e:
logger.error(f"Error rendering logs.html: {e}", exc_info=True)
return JSONResponse(
status_code=500,
content={"error": "Failed to render logs page"}
)
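# Illustrative sketch, not part of this commit: calling the logs endpoint above from a
# client. The base URL is an assumption; the query parameters mirror the Query(...)
# arguments of get_logs, and format=json selects the JSONResponse branch instead of the
# rendered logs.html page.
def _demo_fetch_logs_json():
    import requests  # assumed to be available in the client environment
    resp = requests.get(
        f"http://localhost:8000/{SERVICE_UUID}/logs",
        params={"format": "json", "level": "ERROR", "since": "2025-06-11T13:32:00Z", "limit": 50},
        timeout=5,
    )
    return resp.json()["log_count"]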
@app.put("/{service_uuid}/{node_uuid}/")
async def update_node_status(
service_uuid: str,
node_uuid: str,
status_update: StatusUpdate,
request: Request
):
"""Receives status updates from a node and returns a list of peers."""
# Use X-Forwarded-For if available, otherwise client.host
client_ip = request.headers.get("x-forwarded-for", request.client.host).split(",")[0].strip()  # first hop if the header lists several proxies
logger.info(
"Received node status update",
extra={
'event_type': 'node_status_update',
'client_ip': request.client.host,
'client_ip': client_ip,
'service_uuid': service_uuid,
'node_uuid': node_uuid,
'data': status_update.dict()
@@ -239,7 +288,6 @@ async def update_node_status(
)
return {"error": "Service UUID mismatch", "peers": []}
# Update RRD database with system metrics
try:
database.update_system_metrics(
node_uuid=node_uuid,
@@ -248,8 +296,7 @@ async def update_node_status(
load_avg=status_update.status.load_avg,
memory_usage_percent=status_update.status.memory_usage_percent
)
# Update ping metrics
for target_uuid, latency in status_update.pings.items():
database.update_ping_metrics(
node_uuid=node_uuid,
@@ -257,29 +304,24 @@ async def update_node_status(
timestamp=status_update.timestamp,
latency_ms=latency
)
except Exception as e:
logger.error(f"Database update failed: {e}", exc_info=True)
# Continue processing even if DB update fails
# Auto-discovery: record (or refresh) this node in known_nodes_db with its full status
current_time_utc = datetime.now(timezone.utc)
known_nodes_db[node_uuid] = {
"last_seen": current_time_utc.isoformat(),
"ip": request.client.host,
"status": status_update.status.dict(), # Store the dict representation
# Store direct values for convenience in /nodes/status endpoint
"ip": request.client.host, # Keep original client.host here as it's the direct connection
"status": status_update.status.dict(),
"uptime_seconds": status_update.status.uptime_seconds,
"load_avg": status_update.status.load_avg,
"memory_usage_percent": status_update.status.memory_usage_percent
}
# Calculate health for logging purposes (it will be recalculated for /nodes/status)
health_status_for_log = get_node_health(known_nodes_db[node_uuid])
logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")
# Respond with the list of other known peers
peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
for uuid, data in known_nodes_db.items() if uuid != node_uuid}
@@ -287,25 +329,21 @@ async def update_node_status(
@app.get("/nodes/status")
async def get_all_nodes_status():
"""Returns the current status of all known nodes for the UI, including ping latencies."""
logger.info("Fetching all nodes status for UI.")
response_nodes = []
for node_uuid, data in known_nodes_db.items():
# Dynamically calculate health for each node
current_health = get_node_health(data)
# Build connections dictionary with raw ping latencies
connections = {}
for target_uuid in known_nodes_db:
if target_uuid != node_uuid: # Exclude self
# Fetch recent ping data (last 5 minutes to account for RRD step=60s)
if target_uuid != node_uuid:
ping_data = database.get_ping_data(node_uuid, target_uuid, start_time="-300s")
latency_ms = None
if ping_data and ping_data['data']['latency']:
# Get the most recent non-null latency
# Get the most recent non-None latency
for latency in reversed(ping_data['data']['latency']):
if latency is not None:
if latency is not None and not (isinstance(latency, float) and latency == 0.0): # Exclude 0.0 which might be a default
latency_ms = float(latency)
break
connections[target_uuid] = latency_ms
@@ -324,5 +362,4 @@ async def get_all_nodes_status():
@app.get("/health")
async def health_check():
"""Health check endpoint for container orchestration."""
return {"status": "ok"}
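# Illustrative sketch, not part of this commit: what a node agent's report/poll cycle
# against the endpoints above could look like. The base URL, UUIDs and metric values are
# assumptions; the payload shape follows the StatusUpdate usage in update_node_status(),
# and the "peers" key is assumed from the error branch of that endpoint.
def _demo_node_agent_cycle(base_url: str, service_uuid: str, node_uuid: str):
    import requests  # assumed to be available on the node
    payload = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": {"uptime_seconds": 7200, "load_avg": [0.2, 0.3, 0.25], "memory_usage_percent": 37.0},
        "pings": {},  # peer UUID -> latency in ms, empty on a first report
    }
    # Report this node's status; the response is expected to include the current peer list.
    put_resp = requests.put(f"{base_url}/{service_uuid}/{node_uuid}/", json=payload, timeout=5)
    peers = put_resp.json().get("peers", {})
    # Poll the aggregate view the UI uses, including per-node health and ping connections.
    nodes = requests.get(f"{base_url}/nodes/status", timeout=5).json()
    return peers, nodes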