It works now
app/main.py
@@ -1,15 +1,14 @@
import os
import uuid
import json
import logging
from datetime import datetime

from datetime import datetime, timezone, timedelta
from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, Field, validator, constr, conlist
from typing import Dict, List
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field, validator
from typing import Dict, List, Annotated
import uuid as uuid_lib

from collections import deque
@@ -44,7 +43,6 @@ class BufferHandler(logging.Handler):
        except Exception as e:
            # Log the error to stderr, to avoid recursion or filling the buffer with errors
            print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr)
            # Optionally, you could log record.msg or record.exc_info here for more context


class LogBuffer:
@@ -60,7 +58,6 @@ class LogBuffer:
            'message': record.get('message'),
            'extra': {k: v for k, v in record.items()
                      if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']}
            # Added more common LogRecord attributes to exclude from 'extra'
        })

    def get_logs(self, limit=100):
@@ -80,10 +77,8 @@ formatter = jsonlogger.JsonFormatter(
logHandler.setFormatter(formatter)

# Add handlers to the root logger
# Avoid adding handlers multiple times in a uvicorn environment
if not logger.handlers:
    logger.addHandler(logHandler)
    # Add buffer handler to logger ONLY ONCE
    buffer_handler = BufferHandler()
    logger.addHandler(buffer_handler)

@@ -97,15 +92,18 @@ app = FastAPI(
# Configure templates for the web interface
templates = Jinja2Templates(directory="app/web/templates")

# Mount static files directory
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")


# --- Data Models (as defined in the project spec) ---
class NodeStatusModel(BaseModel):
    uptime_seconds: int
    load_avg: conlist(float, min_length=3, max_length=3)
    load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
    memory_usage_percent: float

class PingModel(BaseModel):
    pings: Dict[constr(regex=r'^[0-9a-fA-F-]{36}$'), float]
    pings: Dict[Annotated[str, Field(pattern=r'^[0-9a-fA-F-]{36}$')], float]

class StatusUpdate(BaseModel):
    node: str = Field(..., description="Node UUID")
@@ -131,15 +129,61 @@ class StatusUpdate(BaseModel):
        return v


# A mock database of known nodes for the auto-discovery demo
# In a real app, this would be managed more dynamically
known_nodes_db = {}
# --- Node Management and Health Logic ---
# A mock database of known nodes, now storing more comprehensive data
known_nodes_db: Dict[str, Dict] = {}

# Health calculation constants (can be tuned)
LOAD_AVG_WARNING_THRESHOLD = 1.5
LOAD_AVG_CRITICAL_THRESHOLD = 3.0
MEMORY_WARNING_THRESHOLD = 75.0
MEMORY_CRITICAL_THRESHOLD = 90.0
# If a node hasn't reported in this many seconds, it's considered critical
LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30

def get_node_health(node_data: Dict) -> str:
    """Calculates the health status based on node metrics and last seen time."""
    # Check for liveness first
    last_seen_str = node_data.get("last_seen")
    if last_seen_str:
        last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc)
        time_since_last_seen = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
        if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
            return "critical" # Node has not reported recently
    else:
        return "unknown" # Should not happen if 'last_seen' is always set

    status_model_data = node_data.get("status")
    if not status_model_data:
        # This could happen if a node is just discovered but hasn't sent a full status update yet
        return "unknown"

    try:
        status = NodeStatusModel(**status_model_data)
    except Exception:
        logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True)
        return "unknown" # Or critical if parsing fails

    # Check load average (using 1-minute load for primary indicator)
    load_1min = status.load_avg[0]
    if load_1min >= LOAD_AVG_CRITICAL_THRESHOLD:
        return "critical"
    elif load_1min >= LOAD_AVG_WARNING_THRESHOLD:
        return "warning"

    # Check memory usage
    if status.memory_usage_percent >= MEMORY_CRITICAL_THRESHOLD:
        return "critical"
    elif status.memory_usage_percent >= MEMORY_WARNING_THRESHOLD:
        return "warning"

    return "healthy"


# --- API Endpoints ---
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Serves the main web page which displays the Service UUID."""
    """Serves the main web page which displays the Service UUID and the node grid."""
    logger.info(
        "Web root accessed",
        extra={'client_ip': request.client.host, 'service_uuid': SERVICE_UUID}
@@ -149,7 +193,6 @@ async def read_root(request: Request):
        {"request": request, "service_uuid": SERVICE_UUID}
    )

# Add the logs endpoint
@app.get("/{service_uuid}/logs")
async def get_logs(service_uuid: str, limit: int = 100):
    """Get recent logs for the service."""
@@ -175,7 +218,6 @@ async def update_node_status(
    request: Request
):
    """Receives status updates from a node and returns a list of peers."""
    # Log the incoming status update with structured context
    logger.info(
        "Received node status update",
        extra={
@@ -205,7 +247,7 @@ async def update_node_status(
        )

        # Update ping metrics
        for target_uuid, latency in status_update.pings.pings.items():
        for target_uuid, latency in status_update.pings.items():
            database.update_ping_metrics(
                node_uuid=node_uuid,
                target_uuid=target_uuid,
@@ -214,20 +256,51 @@ async def update_node_status(
            )

    except Exception as e:
        logger.error(f"Database update failed: {e}")
        logger.error(f"Database update failed: {e}", exc_info=True)
        # Continue processing even if DB update fails

    # Auto-discovery logic
    if node_uuid not in known_nodes_db:
        logger.info(f"New node discovered: {node_uuid}")
        # A real app would need a strategy to handle node addresses
        known_nodes_db[node_uuid] = {"last_seen": datetime.utcnow().isoformat(), "ip": request.client.host}
    # Auto-discovery logic and update known_nodes_db with full status
    current_time_utc = datetime.now(timezone.utc)

    known_nodes_db[node_uuid] = {
        "last_seen": current_time_utc.isoformat(),
        "ip": request.client.host,
        "status": status_update.status.dict(), # Store the dict representation
        # Store direct values for convenience in /nodes/status endpoint
        "uptime_seconds": status_update.status.uptime_seconds,
        "load_avg": status_update.status.load_avg,
        "memory_usage_percent": status_update.status.memory_usage_percent
    }

    # Calculate health for logging purposes (it will be recalculated for /nodes/status)
    health_status_for_log = get_node_health(known_nodes_db[node_uuid])
    logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")

    # Respond with the list of other known peers
    peer_list = {uuid: data for uuid, data in known_nodes_db.items() if uuid != node_uuid}
    peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
                 for uuid, data in known_nodes_db.items() if uuid != node_uuid}

    return {"message": "Status received", "peers": peer_list}

@app.get("/nodes/status")
async def get_all_nodes_status():
    """Returns the current status of all known nodes for the UI."""
    logger.info("Fetching all nodes status for UI.")
    response_nodes = []
    for node_uuid, data in known_nodes_db.items():
        # Dynamically calculate health for each node based on its current data
        current_health = get_node_health(data)

        response_nodes.append({
            "uuid": node_uuid,
            "last_seen": data["last_seen"],
            "ip": data["ip"],
            "health_status": current_health,
            "uptime_seconds": data.get("uptime_seconds"),
            "load_avg": data.get("load_avg"),
            "memory_usage_percent": data.get("memory_usage_percent")
        })
    return {"nodes": response_nodes}

@app.get("/health")
async def health_check():
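
Note on the model changes in this diff: the Pydantic v1 helpers constr and conlist are replaced with Annotated types plus Field constraints, which is the Pydantic v2 style. Below is a minimal standalone sketch (not part of the commit) of how those constraints behave, reusing the field names from the models above; the unit comment on pings is an assumption.

from typing import Annotated, Dict, List
from pydantic import BaseModel, Field, ValidationError

class NodeStatusModel(BaseModel):
    uptime_seconds: int
    # Exactly three load-average values; the diff reads index 0 as the 1-minute load
    load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
    memory_usage_percent: float

class PingModel(BaseModel):
    # Keys must look like UUIDs; values are latencies (unit assumed, e.g. milliseconds)
    pings: Dict[Annotated[str, Field(pattern=r'^[0-9a-fA-F-]{36}$')], float]

# A well-formed payload validates cleanly...
NodeStatusModel(uptime_seconds=120, load_avg=[0.4, 0.6, 0.8], memory_usage_percent=42.0)

# ...while a load_avg of the wrong length raises a ValidationError.
try:
    NodeStatusModel(uptime_seconds=120, load_avg=[0.4], memory_usage_percent=42.0)
except ValidationError as exc:
    print(exc)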