Gemini, trying to make this "done".
app/main.py
@@ -2,8 +2,8 @@ import os
 import uuid
 import json
 import logging
-from datetime import datetime, timezone, timedelta
-from fastapi import FastAPI, Request, status
+from datetime import datetime, timezone
+from fastapi import FastAPI, Request, status, Query
 from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
@@ -19,38 +19,33 @@ import sys
 from .database import RRDDatabase

 # --- Service Configuration ---
 # Generate a unique Service UUID on startup, or get it from an environment variable
 SERVICE_UUID = os.environ.get("SERVICE_UUID", str(uuid.uuid4()))
 database = RRDDatabase()

 # --- Logging Configuration ---
 # Get the root logger
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)

 # Custom handler to capture logs
 class BufferHandler(logging.Handler):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # Instantiate the formatter once for efficiency
         self.formatter = jsonlogger.JsonFormatter()

     def emit(self, record):
         try:
-            # Format the record as JSON string, then parse to dict
+            # Format the record as a JSON string and then parse it back to a dict
+            # This ensures consistency with the jsonlogger's output format
             log_entry = json.loads(self.formatter.format(record))
             log_buffer.add_log(log_entry)
         except Exception as e:
             # Log the error to stderr, to avoid recursion or filling the buffer with errors
             print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr)


 class LogBuffer:
     def __init__(self, maxlen=1000):
         self.buffer = deque(maxlen=maxlen)

     def add_log(self, record):
         # Ensure 'asctime' is present or handle its absence
         # Assuming 'record' here is already a dictionary parsed from the JSON log string
         timestamp = record.get('asctime') or datetime.utcnow().isoformat()
         self.buffer.append({
             'timestamp': timestamp,
@@ -60,28 +55,40 @@ class LogBuffer:
                if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']}
         })

-    def get_logs(self, limit=100):
-        return list(self.buffer)[-limit:]
+    def get_logs(self, limit=100, level=None, since=None):
+        logger.debug(f"Fetching logs with limit={limit}, level={level}, since={since}")
+        logs = list(self.buffer)
+        # Apply level filter
+        if level and level.strip():
+            level = level.upper()
+            valid_levels = {'INFO', 'WARNING', 'ERROR', 'DEBUG'} # Added DEBUG for completeness
+            if level in valid_levels:
+                logs = [log for log in logs if log['level'].upper() == level]
+            else:
+                logger.warning(f"Invalid log level: {level}")
+        # Apply since filter
+        if since:
+            try:
+                # Handle 'Z' for UTC and ensure timezone awareness for comparison
+                since_dt = datetime.fromisoformat(since.replace('Z', '+00:00')).astimezone(timezone.utc)
+                logs = [log for log in logs if
+                        datetime.fromisoformat(log['timestamp'].replace('Z', '+00:00')).astimezone(timezone.utc) >= since_dt]
+            except ValueError:
+                logger.warning(f"Invalid 'since' timestamp: {since}")
+        logger.debug(f"Returning {len(logs[-limit:])} logs")
+        return logs[-limit:]

 # Create global log buffer
 log_buffer = LogBuffer()

 # Use a handler that streams to stdout
 logHandler = logging.StreamHandler()

 # Create a JSON formatter and add it to the handler
 # The format string adds default log attributes to the JSON output
 formatter = jsonlogger.JsonFormatter(
     '%(asctime)s %(name)s %(levelname)s %(message)s'
 )
 logHandler.setFormatter(formatter)

 # Add handlers to the root logger
 if not logger.handlers:
     logger.addHandler(logHandler)
     buffer_handler = BufferHandler()
     logger.addHandler(buffer_handler)

 logger.addHandler(BufferHandler())

 # --- FastAPI Application ---
 app = FastAPI(
@@ -89,14 +96,10 @@ app = FastAPI(
     description=f"A distributed monitoring system. Service UUID: {SERVICE_UUID}"
 )

 # Configure templates for the web interface
 templates = Jinja2Templates(directory="app/web/templates")

 # Mount static files directory
 app.mount("/static", StaticFiles(directory="app/web/static"), name="static")


-# --- Data Models (as defined in the project spec) ---
+# --- Data Models ---
 class NodeStatusModel(BaseModel):
     uptime_seconds: int
     load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
@@ -128,50 +131,41 @@ class StatusUpdate(BaseModel):
             raise ValueError(f'Invalid UUID format in pings: {key}')
         return v


 # --- Node Management and Health Logic ---
 # A mock database of known nodes, now storing more comprehensive data
 known_nodes_db: Dict[str, Dict] = {}

 # Health calculation constants (can be tuned)
 LOAD_AVG_WARNING_THRESHOLD = 1.5
 LOAD_AVG_CRITICAL_THRESHOLD = 3.0
 MEMORY_WARNING_THRESHOLD = 75.0
 MEMORY_CRITICAL_THRESHOLD = 90.0
-# If a node hasn't reported in this many seconds, it's considered critical
-LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30
+LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30

 def get_node_health(node_data: Dict) -> str:
     """Calculates the health status based on node metrics and last seen time."""
     # Check for liveness first
     last_seen_str = node_data.get("last_seen")
     if last_seen_str:
         last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc)
         time_since_last_seen = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
         if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
-            return "critical" # Node has not reported recently
+            return "critical"
     else:
-        return "unknown" # Should not happen if 'last_seen' is always set
+        return "unknown"

     status_model_data = node_data.get("status")
     if not status_model_data:
-        # This could happen if a node is just discovered but hasn't sent a full status update yet
-        return "unknown"
+        return "unknown"

     try:
         status = NodeStatusModel(**status_model_data)
     except Exception:
         logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True)
-        return "unknown" # Or critical if parsing fails
+        return "unknown"

     # Check load average (using 1-minute load for primary indicator)
     load_1min = status.load_avg[0]
     if load_1min >= LOAD_AVG_CRITICAL_THRESHOLD:
         return "critical"
     elif load_1min >= LOAD_AVG_WARNING_THRESHOLD:
         return "warning"

     # Check memory usage
     if status.memory_usage_percent >= MEMORY_CRITICAL_THRESHOLD:
         return "critical"
     elif status.memory_usage_percent >= MEMORY_WARNING_THRESHOLD:
@@ -179,53 +173,108 @@ def get_node_health(node_data: Dict) -> str:

     return "healthy"


 # --- API Endpoints ---
 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
     """Serves the main web page which displays the Service UUID and the node grid."""
+    # Use X-Forwarded-For if available, otherwise client.host
+    client_ip = request.headers.get("x-forwarded-for", request.client.host)
     logger.info(
         "Web root accessed",
-        extra={'client_ip': request.client.host, 'service_uuid': SERVICE_UUID}
+        extra={'client_ip': client_ip, 'service_uuid': SERVICE_UUID}
     )
     return templates.TemplateResponse(
         "index.html",
-        {"request": request, "service_uuid": SERVICE_UUID}
+        {
+            "request": request,
+            "service_uuid": SERVICE_UUID,
+            "url_for": request.url_for, # Pass url_for for dynamic URL generation
+            "root_path": request.scope.get('root_path', '') # Pass root_path for JS base URL
+        }
     )

+@app.get("/{service_uuid}/logs")
+async def get_logs(
+    request: Request,
+    service_uuid: str,
+    limit: int = 100,
+    format: str = Query(None, description="Response format: 'json' for JSON, default is HTML"),
+    level: str = Query(None, description="Filter logs by level: INFO, WARNING, ERROR"),
+    since: str = Query(None, description="Fetch logs since ISO timestamp, e.g., 2025-06-11T13:32:00")
+):
+    # Use X-Forwarded-For if available, otherwise client.host
+    client_ip = request.headers.get("x-forwarded-for", request.client.host)
+    logger.info(
+        "Logs endpoint accessed",
+        extra={
+            'service_uuid': service_uuid,
+            'format': format,
+            'level': level,
+            'since': since,
+            'limit': limit,
+            'client_ip': client_ip
+        }
+    )

-@app.get("/{service_uuid}/logs", response_class=HTMLResponse)
-async def get_logs(request: Request, service_uuid: str, limit: int = 100):
     """Serve the logs web page with recent logs for the service."""
     if service_uuid != SERVICE_UUID:
         logger.warning(f"Invalid service UUID: {service_uuid}")
         return JSONResponse(
             status_code=404,
             content={"error": "Service UUID not found"}
         )

-    logs = log_buffer.get_logs(limit)
-    return templates.TemplateResponse(
-        "logs.html",
-        {
-            "request": request,
-            "service_uuid": service_uuid,
-            "logs": logs,
-            "log_count": len(logs)
-        }
-    )

-@app.put("/{service_uuid}/{node_uuid}/", status_code=status.HTTP_200_OK)
+    try:
+        logs = log_buffer.get_logs(limit=limit, level=level, since=since)
+        log_data = {
+            "service_uuid": service_uuid,
+            "log_count": len(logs),
+            "logs": logs
+        }
+        logger.debug(f"Fetched {len(logs)} logs for response")
+    except Exception as e:
+        logger.error(f"Error fetching logs: {e}", exc_info=True)
+        return JSONResponse(
+            status_code=500,
+            content={"error": "Failed to fetch logs"}
+        )

+    if format == "json":
+        logger.debug("Returning JSON response")
+        return JSONResponse(content=log_data)

+    logger.debug("Rendering logs.html template")
+    try:
+        return templates.TemplateResponse(
+            "logs.html",
+            {
+                "request": request,
+                "service_uuid": service_uuid,
+                "logs": logs,
+                "log_count": len(logs),
+                "url_for": request.url_for, # Pass url_for for dynamic URL generation
+                "root_path": request.scope.get('root_path', '') # Pass root_path for JS base URL
+            }
+        )
+    except Exception as e:
+        logger.error(f"Error rendering logs.html: {e}", exc_info=True)
+        return JSONResponse(
+            status_code=500,
+            content={"error": "Failed to render logs page"}
+        )

+@app.put("/{service_uuid}/{node_uuid}/")
 async def update_node_status(
     service_uuid: str,
     node_uuid: str,
     status_update: StatusUpdate,
     request: Request
 ):
     """Receives status updates from a node and returns a list of peers."""
+    # Use X-Forwarded-For if available, otherwise client.host
+    client_ip = request.headers.get("x-forwarded-for", request.client.host)
     logger.info(
         "Received node status update",
         extra={
             'event_type': 'node_status_update',
-            'client_ip': request.client.host,
+            'client_ip': client_ip,
             'service_uuid': service_uuid,
             'node_uuid': node_uuid,
             'data': status_update.dict()
@@ -239,7 +288,6 @@ async def update_node_status(
         )
         return {"error": "Service UUID mismatch", "peers": []}

     # Update RRD database with system metrics
     try:
         database.update_system_metrics(
             node_uuid=node_uuid,
@@ -248,8 +296,7 @@ async def update_node_status(
             load_avg=status_update.status.load_avg,
             memory_usage_percent=status_update.status.memory_usage_percent
         )

         # Update ping metrics

         for target_uuid, latency in status_update.pings.items():
             database.update_ping_metrics(
                 node_uuid=node_uuid,
@@ -257,29 +304,24 @@ async def update_node_status(
                 timestamp=status_update.timestamp,
                 latency_ms=latency
             )


     except Exception as e:
         logger.error(f"Database update failed: {e}", exc_info=True)
         # Continue processing even if DB update fails

     # Auto-discovery logic and update known_nodes_db with full status
     current_time_utc = datetime.now(timezone.utc)


     known_nodes_db[node_uuid] = {
         "last_seen": current_time_utc.isoformat(),
-        "ip": request.client.host,
-        "status": status_update.status.dict(), # Store the dict representation
-        # Store direct values for convenience in /nodes/status endpoint
+        "ip": request.client.host, # Keep original client.host here as it's the direct connection
+        "status": status_update.status.dict(),
         "uptime_seconds": status_update.status.uptime_seconds,
         "load_avg": status_update.status.load_avg,
         "memory_usage_percent": status_update.status.memory_usage_percent
     }

     # Calculate health for logging purposes (it will be recalculated for /nodes/status)

     health_status_for_log = get_node_health(known_nodes_db[node_uuid])
     logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")

     # Respond with the list of other known peers
     peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
                  for uuid, data in known_nodes_db.items() if uuid != node_uuid}

@@ -287,25 +329,21 @@ async def update_node_status(

 @app.get("/nodes/status")
 async def get_all_nodes_status():
     """Returns the current status of all known nodes for the UI, including ping latencies."""
     logger.info("Fetching all nodes status for UI.")
     response_nodes = []

     for node_uuid, data in known_nodes_db.items():
         # Dynamically calculate health for each node
         current_health = get_node_health(data)

         # Build connections dictionary with raw ping latencies
         connections = {}
         for target_uuid in known_nodes_db:
-            if target_uuid != node_uuid: # Exclude self
-                # Fetch recent ping data (last 5 minutes to account for RRD step=60s)
+            if target_uuid != node_uuid:
                 ping_data = database.get_ping_data(node_uuid, target_uuid, start_time="-300s")
                 latency_ms = None
                 if ping_data and ping_data['data']['latency']:
-                    # Get the most recent non-null latency
+                    # Get the most recent non-None latency
                     for latency in reversed(ping_data['data']['latency']):
-                        if latency is not None:
+                        if latency is not None and not (isinstance(latency, float) and latency == 0.0): # Exclude 0.0 which might be a default
                             latency_ms = float(latency)
                             break
                 connections[target_uuid] = latency_ms
@@ -324,5 +362,4 @@ async def get_all_nodes_status():

 @app.get("/health")
 async def health_check():
     """Health check endpoint for container orchestration."""
     return {"status": "ok"}
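
For reference, a minimal sketch of calling the reworked logs endpoint with its new query parameters (format, level, since, limit). The base URL and the SERVICE_UUID value below are illustrative assumptions; the real UUID is generated at startup or taken from the SERVICE_UUID environment variable.

# Minimal usage sketch (stdlib only); host, port, and the UUID are placeholders.
import json
import urllib.parse
import urllib.request

BASE_URL = "http://localhost:8000"                      # assumed deployment address
SERVICE_UUID = "00000000-0000-0000-0000-000000000000"   # replace with the service's real UUID

params = urllib.parse.urlencode({
    "format": "json",                  # JSON instead of the logs.html page
    "level": "ERROR",                  # INFO, WARNING, or ERROR
    "since": "2025-06-11T13:32:00Z",   # ISO timestamp lower bound
    "limit": 50,
})
with urllib.request.urlopen(f"{BASE_URL}/{SERVICE_UUID}/logs?{params}") as resp:
    payload = json.load(resp)

print(payload["log_count"], "entries")
for entry in payload["logs"]:
    print(entry.get("timestamp"), entry.get("level"), entry.get("message"))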
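
The 'since' filter in LogBuffer.get_logs relies on a small normalization step that is easy to miss: datetime.fromisoformat() only accepts a trailing 'Z' from Python 3.11 onward, so the code rewrites it to '+00:00' and compares everything in UTC. A standalone sketch of that behavior:

# Standalone sketch of the timestamp handling used by LogBuffer.get_logs.
from datetime import datetime, timezone

def to_utc(ts: str) -> datetime:
    # Rewrite a trailing 'Z' so fromisoformat() accepts it on Python < 3.11,
    # then normalize to UTC for a safe comparison.
    return datetime.fromisoformat(ts.replace("Z", "+00:00")).astimezone(timezone.utc)

since = to_utc("2025-06-11T13:32:00Z")
entry_ts = to_utc("2025-06-11T13:45:10+00:00")
print(entry_ts >= since)  # True: this entry would pass the 'since' filter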