Files
node-monitor/app/main.py
2025-06-10 22:25:48 +03:00

204 lines
6.1 KiB
Python

# app/main.py
import json
import logging
import os
import uuid
import uuid as uuid_lib
from collections import deque
from datetime import datetime, timezone
from typing import Dict, List

from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, Field, validator, constr, conlist
from pythonjsonlogger import jsonlogger
# --- Service Configuration ---
# Generate a unique Service UUID on startup, or get it from an environment variable
# (setting SERVICE_UUID in the environment keeps the id stable across restarts).
SERVICE_UUID = os.environ.get("SERVICE_UUID", str(uuid.uuid4()))

# --- Logging Configuration ---
# Configure the ROOT logger so records from all modules flow through it.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Stream handler for console output.
# NOTE(review): StreamHandler() with no argument writes to sys.stderr, not
# stdout — pass sys.stdout explicitly if stdout is actually required.
logHandler = logging.StreamHandler()
# Create a JSON formatter and add it to the handler.
# The format string selects which default LogRecord attributes appear as
# top-level keys in the JSON output.
formatter = jsonlogger.JsonFormatter(
    '%(asctime)s %(name)s %(levelname)s %(message)s'
)
logHandler.setFormatter(formatter)
# Add the handler to the root logger.
# Guard avoids stacking duplicate handlers when uvicorn (re)imports the module
# after having installed its own handlers; in that case this one is skipped.
if not logger.handlers:
    logger.addHandler(logHandler)
class LogBuffer:
    """Fixed-size in-memory ring buffer of structured log records."""

    # LogRecord keys promoted to top-level fields; everything else -> 'extra'.
    _CORE_KEYS = ('asctime', 'levelname', 'message', 'name')

    def __init__(self, maxlen=1000):
        # deque with maxlen silently evicts the oldest entry once full.
        self.buffer = deque(maxlen=maxlen)

    def add_log(self, record):
        """Append one parsed JSON log record, splitting core fields from extras."""
        extras = {}
        for key, value in record.items():
            if key not in self._CORE_KEYS:
                extras[key] = value
        entry = {
            'timestamp': record.get('asctime'),
            'level': record.get('levelname'),
            'message': record.get('message'),
            'extra': extras,
        }
        self.buffer.append(entry)

    def get_logs(self, limit=100):
        """Return up to `limit` most recent entries, oldest first."""
        snapshot = list(self.buffer)
        return snapshot[-limit:]
# Create global log buffer
log_buffer = LogBuffer()

# Custom handler to capture logs into the in-memory buffer.
# BUG FIX: in the original, `buffer_handler = BufferHandler()` ran BEFORE this
# class was defined, so importing the module raised NameError. The class must
# come first.
class BufferHandler(logging.Handler):
    """Mirrors every log record into the global `log_buffer` as a plain dict."""

    # One formatter for the handler's lifetime — emit() runs on every log call,
    # so avoid re-allocating a JsonFormatter per record.
    _formatter = jsonlogger.JsonFormatter()

    def emit(self, record):
        try:
            # Round-trip through JSON so the buffer stores plain dicts,
            # not LogRecord objects.
            log_entry = json.loads(self._formatter.format(record))
            log_buffer.add_log(log_entry)
        except Exception:
            # Deliberate best-effort: logging must never crash the app.
            pass

# Add buffer handler to logger (once — guard against double registration
# when the module is re-imported under a reloading server).
buffer_handler = BufferHandler()
if not any(isinstance(h, BufferHandler) for h in logger.handlers):
    logger.addHandler(buffer_handler)
# --- FastAPI Application ---
# The service UUID is baked into the OpenAPI description at startup.
app = FastAPI(
    title="Node Monitoring System",
    description=f"A distributed monitoring system. Service UUID: {SERVICE_UUID}"
)

# Configure templates for the web interface.
# NOTE(review): the directory is resolved relative to the process CWD, so the
# server must be started from the project root for templates to load.
templates = Jinja2Templates(directory="app/web/templates")
# --- Data Models (as defined in the project spec) ---
class NodeStatusModel(BaseModel):
    """Health metrics reported by a single node."""
    # Seconds since the node booted
    uptime_seconds: int
    # Exactly three floats — presumably the 1/5/15-minute load averages
    # (TODO confirm against the reporting agent)
    load_avg: conlist(float, min_length=3, max_length=3)
    memory_usage_percent: float
class PingModel(BaseModel):
    """Mapping of peer node UUID (string form) -> measured latency value."""
    # BUG FIX: constr(regex=...) is pydantic v1 syntax; the keyword was renamed
    # to `pattern` in pydantic v2. This file already uses v2-only
    # conlist(..., min_length=...) above, so the v1 keyword made this model
    # fail to load. The (loose) pattern itself is kept unchanged.
    pings: Dict[constr(pattern=r'^[0-9a-fA-F-]{36}$'), float]
class StatusUpdate(BaseModel):
    """Payload a node PUTs to report its own health plus peer latencies."""
    # UUID of the reporting node (string form; validated below)
    node: str = Field(..., description="Node UUID")
    # When the node produced this sample
    timestamp: datetime
    status: NodeStatusModel
    # Peer node UUID -> latency value; keys validated below
    pings: Dict[str, float]

    # NOTE(review): pydantic v1-style @validator — deprecated in v2 in favor of
    # @field_validator, but still functional; left as-is.
    @validator('node')
    def validate_node_uuid(cls, v):
        """Reject `node` values that do not parse as a UUID."""
        try:
            uuid_lib.UUID(v)
            return v
        except ValueError:
            raise ValueError('Invalid UUID format')

    @validator('pings')
    def validate_ping_uuids(cls, v):
        """Reject the payload if any `pings` key does not parse as a UUID."""
        for key in v.keys():
            try:
                uuid_lib.UUID(key)
            except ValueError:
                raise ValueError(f'Invalid UUID format in pings: {key}')
        return v
# A mock database of known nodes for the auto-discovery demo.
# In a real app, this would be managed more dynamically.
# Maps node UUID (str) -> {"last_seen": ISO-8601 timestamp, "ip": client host}.
known_nodes_db = {}
# --- API Endpoints ---
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Serves the main web page which displays the Service UUID."""
    # Structured access log entry with caller context.
    log_context = {'client_ip': request.client.host, 'service_uuid': SERVICE_UUID}
    logger.info("Web root accessed", extra=log_context)

    # Render the landing page with the service's identity.
    template_vars = {"request": request, "service_uuid": SERVICE_UUID}
    return templates.TemplateResponse("index.html", template_vars)
# Add the logs endpoint
@app.get("/{service_uuid}/logs")
async def get_logs(service_uuid: str, limit: int = 100):
    """Get recent logs for the service."""
    # Only serve logs when the caller addressed THIS service instance.
    if service_uuid == SERVICE_UUID:
        recent = log_buffer.get_logs(limit)
        return {
            "service_uuid": service_uuid,
            "log_count": len(recent),
            "logs": recent,
        }
    # Unknown service UUID -> explicit 404 body.
    return JSONResponse(
        status_code=404,
        content={"error": "Service UUID not found"},
    )
@app.put("/{service_uuid}/{node_uuid}/", status_code=status.HTTP_200_OK)
async def update_node_status(
    service_uuid: str,
    node_uuid: str,
    status_update: StatusUpdate,
    request: Request
):
    """Receives status updates from a node and returns a list of peers.

    Side effect: refreshes the node's entry in `known_nodes_db`
    (last_seen timestamp and client IP) on every accepted update.
    """
    # Log the incoming status update with structured context
    logger.info(
        "Received node status update",
        extra={
            'event_type': 'node_status_update',
            'client_ip': request.client.host,
            'service_uuid': service_uuid,
            'node_uuid': node_uuid,
            'data': status_update.dict()
        }
    )
    # In a real implementation, you would now update the RRD database here.
    # e.g., database.update(node_uuid, status_update)
    # For now, we simulate the logic.
    if service_uuid != SERVICE_UUID:
        logger.warning(
            "Node sent status to wrong service UUID",
            extra={'client_node_uuid': node_uuid, 'target_uuid': service_uuid}
        )
        return {"error": "Service UUID mismatch", "peers": []}

    # Auto-discovery logic: track this node and refresh its liveness record.
    if node_uuid not in known_nodes_db:
        logger.info(f"New node discovered: {node_uuid}")
    # BUG FIX: the original wrote last_seen only at first discovery, so the
    # timestamp (and IP) were never refreshed by later updates — every node
    # looked permanently "fresh as of first contact". Update on every request.
    # A real app would need a strategy to handle node addresses.
    known_nodes_db[node_uuid] = {
        # Timezone-aware UTC stamp (datetime.utcnow() is naive and deprecated)
        "last_seen": datetime.now(timezone.utc).isoformat(),
        "ip": request.client.host,
    }

    # Respond with the list of other known peers (exclude the sender).
    # Renamed the loop variable: the original used `uuid`, shadowing the module.
    peer_list = {uid: data for uid, data in known_nodes_db.items() if uid != node_uuid}
    return {"message": "Status received", "peers": peer_list}
@app.get("/health")
async def health_check():
    """Health check endpoint for container orchestration."""
    # Liveness only — reports nothing about downstream state.
    payload = {"status": "ok"}
    return payload