It works now

Kalzu Rekku
2025-06-11 00:01:10 +03:00
parent 11a565732d
commit 94c988ee7b
9 changed files with 797 additions and 30 deletions


@@ -1,15 +1,14 @@
import os
import uuid
import json
import logging
from datetime import datetime
from datetime import datetime, timezone, timedelta
from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, Field, validator, constr, conlist
from typing import Dict, List
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field, validator
from typing import Dict, List, Annotated
import uuid as uuid_lib
from collections import deque
@@ -44,7 +43,6 @@ class BufferHandler(logging.Handler):
except Exception as e:
# Log the error to stderr, to avoid recursion or filling the buffer with errors
print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr)
# Optionally, you could log record.msg or record.exc_info here for more context
class LogBuffer:
@@ -60,7 +58,6 @@ class LogBuffer:
'message': record.get('message'),
'extra': {k: v for k, v in record.items()
if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']}
# Added more common LogRecord attributes to exclude from 'extra'
})
def get_logs(self, limit=100):
@@ -80,10 +77,8 @@ formatter = jsonlogger.JsonFormatter(
logHandler.setFormatter(formatter)
# Add handlers to the root logger
# Avoid adding handlers multiple times in a uvicorn environment
if not logger.handlers:
logger.addHandler(logHandler)
# Add buffer handler to logger ONLY ONCE
buffer_handler = BufferHandler()
logger.addHandler(buffer_handler)
@@ -97,15 +92,18 @@ app = FastAPI(
# Configure templates for the web interface
templates = Jinja2Templates(directory="app/web/templates")
# Mount static files directory
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
# --- Data Models (as defined in the project spec) ---
class NodeStatusModel(BaseModel):
uptime_seconds: int
load_avg: conlist(float, min_length=3, max_length=3)
load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
memory_usage_percent: float
class PingModel(BaseModel):
pings: Dict[constr(regex=r'^[0-9a-fA-F-]{36}$'), float]
pings: Dict[Annotated[str, Field(pattern=r'^[0-9a-fA-F-]{36}$')], float]
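A quick sketch of how the new Annotated-style constraints behave under Pydantic v2 (not part of the committed file; the model is copied from above so the snippet runs standalone, and the sample values are made up):

# Sketch: exercising the Annotated length constraint on load_avg (hypothetical sample data).
from typing import Annotated, List
from pydantic import BaseModel, Field, ValidationError

class NodeStatusModel(BaseModel):
    uptime_seconds: int
    load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
    memory_usage_percent: float

ok = NodeStatusModel(uptime_seconds=120, load_avg=[0.4, 0.6, 0.8], memory_usage_percent=42.5)
print(ok.load_avg)  # [0.4, 0.6, 0.8]

try:
    NodeStatusModel(uptime_seconds=120, load_avg=[0.4], memory_usage_percent=42.5)
except ValidationError as e:
    print(e.errors()[0]["type"])  # list-length violations report "too_short" in Pydantic v2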
class StatusUpdate(BaseModel):
node: str = Field(..., description="Node UUID")
@@ -131,15 +129,61 @@ class StatusUpdate(BaseModel):
return v
# A mock database of known nodes for the auto-discovery demo
# In a real app, this would be managed more dynamically
known_nodes_db = {}
# --- Node Management and Health Logic ---
# A mock database of known nodes, now storing more comprehensive data
known_nodes_db: Dict[str, Dict] = {}
# Health calculation constants (can be tuned)
LOAD_AVG_WARNING_THRESHOLD = 1.5
LOAD_AVG_CRITICAL_THRESHOLD = 3.0
MEMORY_WARNING_THRESHOLD = 75.0
MEMORY_CRITICAL_THRESHOLD = 90.0
# If a node hasn't reported in this many seconds, it's considered critical
LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30
def get_node_health(node_data: Dict) -> str:
"""Calculates the health status based on node metrics and last seen time."""
# Check for liveness first
last_seen_str = node_data.get("last_seen")
if last_seen_str:
last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc)
time_since_last_seen = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
return "critical" # Node has not reported recently
else:
return "unknown" # Should not happen if 'last_seen' is always set
status_model_data = node_data.get("status")
if not status_model_data:
# This could happen if a node is just discovered but hasn't sent a full status update yet
return "unknown"
try:
status = NodeStatusModel(**status_model_data)
except Exception:
logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True)
return "unknown" # Or critical if parsing fails
# Check load average (using 1-minute load for primary indicator)
load_1min = status.load_avg[0]
if load_1min >= LOAD_AVG_CRITICAL_THRESHOLD:
return "critical"
elif load_1min >= LOAD_AVG_WARNING_THRESHOLD:
return "warning"
# Check memory usage
if status.memory_usage_percent >= MEMORY_CRITICAL_THRESHOLD:
return "critical"
elif status.memory_usage_percent >= MEMORY_WARNING_THRESHOLD:
return "warning"
return "healthy"
# --- API Endpoints ---
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
"""Serves the main web page which displays the Service UUID."""
"""Serves the main web page which displays the Service UUID and the node grid."""
logger.info(
"Web root accessed",
extra={'client_ip': request.client.host, 'service_uuid': SERVICE_UUID}
@@ -149,7 +193,6 @@ async def read_root(request: Request):
{"request": request, "service_uuid": SERVICE_UUID}
)
# Add the logs endpoint
@app.get("/{service_uuid}/logs")
async def get_logs(service_uuid: str, limit: int = 100):
"""Get recent logs for the service."""
@@ -175,7 +218,6 @@ async def update_node_status(
request: Request
):
"""Receives status updates from a node and returns a list of peers."""
# Log the incoming status update with structured context
logger.info(
"Received node status update",
extra={
@@ -205,7 +247,7 @@ async def update_node_status(
)
# Update ping metrics
for target_uuid, latency in status_update.pings.pings.items():
for target_uuid, latency in status_update.pings.items():
database.update_ping_metrics(
node_uuid=node_uuid,
target_uuid=target_uuid,
@@ -214,20 +256,51 @@
)
except Exception as e:
logger.error(f"Database update failed: {e}")
logger.error(f"Database update failed: {e}", exc_info=True)
# Continue processing even if DB update fails
# Auto-discovery logic
if node_uuid not in known_nodes_db:
logger.info(f"New node discovered: {node_uuid}")
# A real app would need a strategy to handle node addresses
known_nodes_db[node_uuid] = {"last_seen": datetime.utcnow().isoformat(), "ip": request.client.host}
# Auto-discovery logic and update known_nodes_db with full status
current_time_utc = datetime.now(timezone.utc)
known_nodes_db[node_uuid] = {
"last_seen": current_time_utc.isoformat(),
"ip": request.client.host,
"status": status_update.status.dict(), # Store the dict representation
# Store direct values for convenience in /nodes/status endpoint
"uptime_seconds": status_update.status.uptime_seconds,
"load_avg": status_update.status.load_avg,
"memory_usage_percent": status_update.status.memory_usage_percent
}
# Calculate health for logging purposes (it will be recalculated for /nodes/status)
health_status_for_log = get_node_health(known_nodes_db[node_uuid])
logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")
# Respond with the list of other known peers
peer_list = {uuid: data for uuid, data in known_nodes_db.items() if uuid != node_uuid}
peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
for uuid, data in known_nodes_db.items() if uuid != node_uuid}
return {"message": "Status received", "peers": peer_list}
@app.get("/nodes/status")
async def get_all_nodes_status():
"""Returns the current status of all known nodes for the UI."""
logger.info("Fetching all nodes status for UI.")
response_nodes = []
for node_uuid, data in known_nodes_db.items():
# Dynamically calculate health for each node based on its current data
current_health = get_node_health(data)
response_nodes.append({
"uuid": node_uuid,
"last_seen": data["last_seen"],
"ip": data["ip"],
"health_status": current_health,
"uptime_seconds": data.get("uptime_seconds"),
"load_avg": data.get("load_avg"),
"memory_usage_percent": data.get("memory_usage_percent")
})
return {"nodes": response_nodes}
@app.get("/health")
async def health_check():