starting to be usable.

This commit is contained in:
Kalzu Rekku
2025-06-12 23:12:19 +03:00
parent 4e31fe7cce
commit a1f4fc556b
8 changed files with 263 additions and 153 deletions

View File

@@ -1,50 +1,78 @@
# Stage 1:
# This stage installs build dependencies and builds Python packages into wheels.
FROM python:3.13-slim-bookworm AS builder

# Install build dependencies for rrdtool and Python packages
RUN apt-get update && apt-get install -y --no-install-recommends \
    librrd-dev \
    build-essential \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements file
COPY requirements.txt .

# Install Python dependencies into a wheelhouse
# This builds source distributions (like rrdtool) into wheels
# We don't need a venv here as we're just creating wheels, not installing them
RUN pip install --no-cache-dir --upgrade pip && \
    pip wheel --no-cache-dir --wheel-dir /tmp/wheels -r requirements.txt

# Stage 2: Runtime
# This stage takes the minimal base image and copies only the necessary runtime artifacts.
FROM python:3.13-slim-bookworm

# Install runtime system dependencies for rrdtool and wget for healthcheck
# rrdtool and librrd8 are the runtime libraries for rrdtool (not librrd-dev)
RUN apt-get update && apt-get install -y --no-install-recommends \
    rrdtool \
    librrd8 \
    wget \
    # Final cleanup to reduce image size
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Set working directory
WORKDIR /app

# Create a non-root user for security (before creating venv in their home if desired, or in /opt)
RUN useradd --create-home --shell /bin/bash appuser

# Create a virtual environment for the application
# We'll put it in /opt/venv for consistency, and ensure appuser can access it
RUN python3 -m venv /opt/venv && \
    /opt/venv/bin/pip install --no-cache-dir --upgrade pip

# Copy the built Python wheels from the builder stage
COPY --from=builder /tmp/wheels /tmp/wheels/

# Install Python dependencies from the wheels into the virtual environment
RUN /opt/venv/bin/pip install --no-cache-dir /tmp/wheels/*.whl && \
    rm -rf /tmp/wheels  # Remove the wheels after installation to save space

# Copy application code
COPY app/ ./app/

# Set permissions for the appuser and data directory
RUN chown -R appuser:appuser /app && \
    chown -R appuser:appuser /opt/venv && \
    mkdir -p /data && \
    chown -R appuser:appuser /data && \
    chmod 777 /data  # Ensure volume mount has write permissions

# Switch to the non-root user
USER appuser

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1

# Run the application using the virtual environment's python interpreter
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -12,7 +12,7 @@ from typing import Dict, List, Annotated
import uuid as uuid_lib
from collections import deque
from dateutil.parser import isoparse  # Import isoparse for robust date parsing
from pythonjsonlogger import jsonlogger
import sys
@@ -27,19 +27,26 @@ database = RRDDatabase()
logger = logging.getLogger()
logger.setLevel(logging.INFO)


class BufferHandler(logging.Handler):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the same formatter string as the StreamHandler for consistency
        self.formatter = jsonlogger.JsonFormatter(
            "%(asctime)s %(name)s %(levelname)s %(message)s"
        )

    def emit(self, record):
        try:
            log_entry_str = self.formatter.format(record)
            log_entry = json.loads(log_entry_str)
            log_buffer.add_log(log_entry)
        except Exception as e:
            print(
                f"Error in BufferHandler: Could not process log record: {e}",
                file=sys.stderr,
            )


class LogBuffer:
    def __init__(self, maxlen=1000):
@@ -47,14 +54,35 @@ class LogBuffer:
    def add_log(self, record):
        # Assuming 'record' here is already a dictionary parsed from the JSON log string
        timestamp = record.get("asctime") or datetime.utcnow().isoformat()
        self.buffer.append(
            {
                "timestamp": timestamp,
                "level": record.get(
                    "levelname"
                ),  # This should now correctly get 'levelname'
                "message": record.get("message"),
                "extra": {
                    k: v
                    for k, v in record.items()
                    if k
                    not in [
                        "asctime",
                        "levelname",
                        "message",
                        "name",
                        "lineno",
                        "filename",
                        "pathname",
                        "funcName",
                        "process",
                        "processName",
                        "thread",
                        "threadName",
                    ]
                },
            }
        )

    def get_logs(self, limit=100, level=None, since=None):
        logger.debug(f"Fetching logs with limit={limit}, level={level}, since={since}")
@@ -62,9 +90,9 @@ class LogBuffer:
        # Apply level filter
        if level and level.strip():
            level = level.upper()
            valid_levels = {"INFO", "WARNING", "ERROR", "DEBUG"}
            if level in valid_levels:
                logs = [log for log in logs if log["level"].upper() == level]
            else:
                logger.warning(f"Invalid log level: {level}")

        # Apply since filter
@@ -72,7 +100,7 @@
            try:
                # Use isoparse for robust parsing of ISO 8601 strings
                since_dt = isoparse(since)
                # If the parsed datetime is naive (no timezone info), assume it's UTC
                if since_dt.tzinfo is None:
                    since_dt = since_dt.replace(tzinfo=timezone.utc)
@@ -80,19 +108,24 @@
                    # If it has timezone info, convert it to UTC for consistent comparison
                    since_dt = since_dt.astimezone(timezone.utc)

                logs = [
                    log
                    for log in logs
                    if datetime.fromisoformat(
                        log["timestamp"].replace("Z", "+00:00")
                    ).astimezone(timezone.utc)
                    >= since_dt
                ]
            except ValueError:
                logger.warning(f"Invalid 'since' timestamp: {since}")

        logger.debug(f"Returning {len(logs[-limit:])} logs")
        return logs[-limit:]


log_buffer = LogBuffer()

logHandler = logging.StreamHandler()
formatter = jsonlogger.JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s")
logHandler.setFormatter(formatter)

if not logger.handlers:
@@ -109,20 +142,23 @@ logging.getLogger("uvicorn.error").propagate = True
# --- FastAPI Application ---
app = FastAPI(
    title="Node Monitoring System",
    description=f"A distributed monitoring system. Service UUID: {SERVICE_UUID}",
)

templates = Jinja2Templates(directory="app/web/templates")
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")


# --- Data Models ---
class NodeStatusModel(BaseModel):
    uptime_seconds: int
    load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
    memory_usage_percent: float


class PingModel(BaseModel):
    pings: Dict[Annotated[str, Field(pattern=r"^[0-9a-fA-F-]{36}$")], float]


class StatusUpdate(BaseModel):
    node: str = Field(..., description="Node UUID")
@@ -130,23 +166,24 @@ class StatusUpdate(BaseModel):
    status: NodeStatusModel
    pings: Dict[str, float]

    @validator("node")
    def validate_node_uuid(cls, v):
        try:
            uuid_lib.UUID(v)
            return v
        except ValueError:
            raise ValueError("Invalid UUID format")

    @validator("pings")
    def validate_ping_uuids(cls, v):
        for key in v.keys():
            try:
                uuid_lib.UUID(key)
            except ValueError:
                raise ValueError(f"Invalid UUID format in pings: {key}")
        return v


# --- Node Management and Health Logic ---
known_nodes_db: Dict[str, Dict] = {}
@@ -155,13 +192,20 @@ LOAD_AVG_CRITICAL_THRESHOLD = 3.0
MEMORY_WARNING_THRESHOLD = 75.0
MEMORY_CRITICAL_THRESHOLD = 90.0
LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30
NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS = (
    300  # Remove node from UI after 5 minutes of inactivity
)


def get_node_health(node_data: Dict) -> str:
    last_seen_str = node_data.get("last_seen")
    if last_seen_str:
        last_seen_dt = datetime.fromisoformat(last_seen_str).replace(
            tzinfo=timezone.utc
        )
        time_since_last_seen = (
            datetime.now(timezone.utc) - last_seen_dt
        ).total_seconds()
        if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
            return "critical"
        else:
@@ -174,7 +218,10 @@ def get_node_health(node_data: Dict) -> str:
    try:
        status = NodeStatusModel(**status_model_data)
    except Exception:
        logger.error(
            f"Could not parse status data for node {node_data.get('uuid')}",
            exc_info=True,
        )
        return "unknown"

    load_1min = status.load_avg[0]
@@ -190,6 +237,7 @@ def get_node_health(node_data: Dict) -> str:
    return "healthy"


# --- API Endpoints ---
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
@@ -197,62 +245,61 @@ async def read_root(request: Request):
    client_ip = request.headers.get("x-forwarded-for", request.client.host)
    logger.info(
        "Web root accessed",
        extra={"client_ip": client_ip, "service_uuid": SERVICE_UUID},
    )
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "service_uuid": SERVICE_UUID,
            "url_for": request.url_for,  # Pass url_for for dynamic URL generation
            "root_path": request.scope.get(
                "root_path", ""
            ),  # Pass root_path for JS base URL
        },
    )


@app.get("/{service_uuid}/logs")
async def get_logs(
    request: Request,
    service_uuid: str,
    limit: int = 100,
    format: str = Query(
        None, description="Response format: 'json' for JSON, default is HTML"
    ),
    level: str = Query(None, description="Filter logs by level: INFO, WARNING, ERROR"),
    since: str = Query(
        None, description="Fetch logs since ISO timestamp, e.g., 2025-06-11T13:32:00"
    ),
):
    # Use X-Forwarded-For if available, otherwise client.host
    client_ip = request.headers.get("x-forwarded-for", request.client.host)
    logger.info(
        "Logs endpoint accessed",
        extra={
            "service_uuid": service_uuid,
            "format": format,
            "level": level,
            "since": since,
            "limit": limit,
            "client_ip": client_ip,
        },
    )

    if service_uuid != SERVICE_UUID:
        logger.warning(f"Invalid service UUID: {service_uuid}")
        return JSONResponse(
            status_code=404, content={"error": "Service UUID not found"}
        )

    try:
        logs = log_buffer.get_logs(limit=limit, level=level, since=since)
        log_data = {"service_uuid": service_uuid, "log_count": len(logs), "logs": logs}
        logger.debug(f"Fetched {len(logs)} logs for response")
    except Exception as e:
        logger.error(f"Error fetching logs: {e}", exc_info=True)
        return JSONResponse(status_code=500, content={"error": "Failed to fetch logs"})

    if format == "json":
        logger.debug("Returning JSON response")
@@ -267,41 +314,40 @@ async def get_logs(
                "service_uuid": service_uuid,
                "logs": logs,
                "log_count": len(logs),
                "url_for": request.url_for,  # Pass url_for for dynamic URL generation
                "root_path": request.scope.get(
                    "root_path", ""
                ),  # Pass root_path for JS base URL
            },
        )
    except Exception as e:
        logger.error(f"Error rendering logs.html: {e}", exc_info=True)
        return JSONResponse(
            status_code=500, content={"error": "Failed to render logs page"}
        )


@app.put("/{service_uuid}/{node_uuid}/")
async def update_node_status(
    service_uuid: str, node_uuid: str, status_update: StatusUpdate, request: Request
):
    # Use X-Forwarded-For if available, otherwise client.host
    client_ip = request.headers.get("x-forwarded-for", request.client.host)
    logger.info(
        "Received node status update",
        extra={
            "event_type": "node_status_update",
            "client_ip": client_ip,
            "service_uuid": service_uuid,
            "node_uuid": node_uuid,
            "data": status_update.dict(),
        },
    )

    if service_uuid != SERVICE_UUID:
        logger.warning(
            "Node sent status to wrong service UUID",
            extra={"client_node_uuid": node_uuid, "target_uuid": service_uuid},
        )
        return {"error": "Service UUID mismatch", "peers": []}
@@ -311,7 +357,7 @@ async def update_node_status(
            timestamp=status_update.timestamp,
            uptime_seconds=status_update.status.uptime_seconds,
            load_avg=status_update.status.load_avg,
            memory_usage_percent=status_update.status.memory_usage_percent,
        )

        for target_uuid, latency in status_update.pings.items():
@@ -319,7 +365,7 @@ async def update_node_status(
                node_uuid=node_uuid,
                target_uuid=target_uuid,
                timestamp=status_update.timestamp,
                latency_ms=latency,
            )

    except Exception as e:
@@ -329,31 +375,39 @@ async def update_node_status(
    known_nodes_db[node_uuid] = {
        "last_seen": current_time_utc.isoformat(),
        "ip": request.client.host,  # Keep original client.host here as it's the direct connection
        "status": status_update.status.dict(),
        "uptime_seconds": status_update.status.uptime_seconds,
        "load_avg": status_update.status.load_avg,
        "memory_usage_percent": status_update.status.memory_usage_percent,
    }

    health_status_for_log = get_node_health(known_nodes_db[node_uuid])
    logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")

    peer_list = {
        uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
        for uuid, data in known_nodes_db.items()
        if uuid != node_uuid
    }
    return {"message": "Status received", "peers": peer_list}


@app.get("/nodes/status")
async def get_all_nodes_status():
    logger.info("Fetching all nodes status for UI.")

    # Prune inactive nodes from known_nodes_db before processing
    current_time_utc = datetime.now(timezone.utc)
    nodes_to_remove = []

    for node_uuid, data in known_nodes_db.items():
        last_seen_dt = datetime.fromisoformat(data["last_seen"]).replace(
            tzinfo=timezone.utc
        )
        if (
            current_time_utc - last_seen_dt
        ).total_seconds() > NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS:
            nodes_to_remove.append(node_uuid)
            logger.info(f"Removing inactive node {node_uuid} from known_nodes_db.")
@@ -365,31 +419,37 @@ async def get_all_nodes_status():
        current_health = get_node_health(data)

        connections = {}
        for target_uuid in known_nodes_db:  # Only iterate over currently active nodes
            if target_uuid != node_uuid:
                ping_data = database.get_ping_data(
                    node_uuid, target_uuid, start_time="-300s"
                )
                latency_ms = None
                if ping_data and ping_data["data"]["latency"]:
                    # Get the most recent non-None latency
                    for latency in reversed(ping_data["data"]["latency"]):
                        if latency is not None and not (
                            isinstance(latency, float) and latency == 0.0
                        ):  # Exclude 0.0 which might be a default
                            latency_ms = float(latency)
                            break
                connections[target_uuid] = latency_ms

        response_nodes.append(
            {
                "uuid": node_uuid,
                "last_seen": data["last_seen"],
                "ip": data["ip"],
                "health_status": current_health,
                "uptime_seconds": data.get("uptime_seconds"),
                "load_avg": data.get("load_avg"),
                "memory_usage_percent": data.get("memory_usage_percent"),
                "connections": connections,
            }
        )

    return {"nodes": response_nodes}


@app.get("/health")
async def health_check():
    return {"status": "ok"}


# --- END OF FILE main.py ---
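
For reference, a rough sketch of what a reporting node could send to the PUT endpoint above. The field names follow the StatusUpdate and NodeStatusModel definitions in this file; the UUIDs, the timestamp format, and the use of the requests library are assumptions for illustration only:

import requests
from datetime import datetime, timezone

SERVER = "http://localhost:8000"
SERVICE_UUID = "ab73d00a-8169-46bb-997d-f13e5f760973"  # must match the server's SERVICE_UUID
NODE_UUID = "11111111-2222-3333-4444-555555555555"     # hypothetical node identity

payload = {
    "node": NODE_UUID,
    "timestamp": datetime.now(timezone.utc).isoformat(),  # assumed ISO 8601; adjust if the server expects another format
    "status": {
        "uptime_seconds": 3600,
        "load_avg": [0.10, 0.15, 0.20],  # exactly three values, per NodeStatusModel
        "memory_usage_percent": 42.5,
    },
    "pings": {},  # peer UUID -> measured latency in ms
}

resp = requests.put(f"{SERVER}/{SERVICE_UUID}/{NODE_UUID}/", json=payload, timeout=10)
print(resp.json())  # on success: {"message": "Status received", "peers": {...}}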

View File

@@ -24,9 +24,11 @@ document.addEventListener('DOMContentLoaded', () => {
            console.log('Fetch URL:', url);
            const response = await fetch(url);
            console.log('Response status:', response.status);
            console.log('Response Content-Type:', response.headers.get('Content-Type')); // NEW: Log Content-Type

            if (!response.ok) {
                const errorText = await response.text(); // Try to get response body as text
                console.error('Raw response text on error:', errorText.substring(0, 500) + (errorText.length > 500 ? '...' : '')); // Log first 500 chars
                // If the server returns a 404, it might be due to a stale UUID.
                // Log a more specific message.
                if (response.status === 404) {
@@ -39,7 +41,9 @@ document.addEventListener('DOMContentLoaded', () => {
                }
                return; // Stop further processing if error
            }

            // Attempt to parse JSON. This is where the error would occur if the content is HTML.
            const data = await response.json();
            console.log('Received logs:', data.logs.length);
            renderLogTable(data.logs);
            logCountSpan.textContent = data.log_count;
@@ -51,7 +55,7 @@ document.addEventListener('DOMContentLoaded', () => {
    function renderLogTable(logs) {
        console.log('Rendering logs:', logs.length);
        logTableContainer.innerHTML = ''; // Clear existing content before rendering

        if (logs.length === 0) {
            logTableContainer.innerHTML = '<p class="loading-message">No logs available.</p>';
@@ -86,7 +90,7 @@ document.addEventListener('DOMContentLoaded', () => {
            const row = document.createElement('tr');
            row.innerHTML = `
                <td>${new Date(log.timestamp).toLocaleString()}</td>
                <td class="log-level log-level-${(log.level || '').toLowerCase()}">${log.level || 'N/A'}</td>
                <td>${escapeHtml(log.message)}</td>
                <td>
                    ${log.extra ? `
@@ -158,6 +162,8 @@ document.addEventListener('DOMContentLoaded', () => {
    });

    console.log('Initializing logs page');
    // Call fetchLogs immediately on page load to populate the table with fresh data
    // and handle the initial refresh logic.
    fetchLogs();
    setInterval(fetchLogs, POLLING_INTERVAL_MS);
});
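
The endpoint that logs.js polls can also be queried directly, which is handy when checking whether the server is returning JSON or an HTML page. A short sketch, assuming the requests library and a placeholder service UUID:

import requests

BASE = "http://localhost:8000"
SERVICE_UUID = "ab73d00a-8169-46bb-997d-f13e5f760973"  # placeholder; use the running server's UUID

resp = requests.get(
    f"{BASE}/{SERVICE_UUID}/logs",
    params={"format": "json", "level": "ERROR", "limit": 50},
    timeout=10,
)
data = resp.json()
print(data["log_count"], "log entries")
for entry in data["logs"]:
    print(entry["timestamp"], entry["level"], entry["message"])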

View File

@@ -19,8 +19,8 @@ document.addEventListener('DOMContentLoaded', () => {
    }

    function renderNodeGrid(nodes) {
        nodeGridContainer.innerHTML = ''; // Clear existing content
        nodeCountSpan.textContent = nodes.length; // Update total node count

        if (nodes.length === 0) {
            nodeGridContainer.innerHTML = '<p class="loading-message">No nodes reporting yet. Start a client!</p>';
@@ -132,4 +132,3 @@ document.addEventListener('DOMContentLoaded', () => {
    fetchNodeData();
    setInterval(fetchNodeData, POLLING_INTERVAL_MS);
});

View File

@@ -38,10 +38,10 @@ body {
    border-radius: 8px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
    margin-bottom: 20px;
    width: 80vw; /* Keep this fixed width for the header */
    max-width: 1200px; /* Keep this max-width for the header */
    margin-left: auto; /* Center the header */
    margin-right: auto; /* Center the header */
}

h1 {
@@ -65,17 +65,18 @@ code {
}

#node-grid-container, #log-table-container {
    /* Adjusted width/max-width to allow dynamic resizing and scrolling */
    width: 95vw; /* Allow it to take up to 95% of viewport width */
    max-width: 1800px; /* Increased max-width to accommodate more columns */
    min-width: 400px; /* Keep a minimum width */
    padding: 20px;
    background-color: var(--nord3);
    border-radius: 8px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
    margin-bottom: 20px; /* Spacing below the container */
    margin-left: auto; /* Center the block */
    margin-right: auto; /* Center the block */
    overflow-x: auto; /* Enable horizontal scrolling if content overflows */
}

.connection-grid {
@@ -263,7 +264,7 @@ code {
    color: var(--nord11); /* Red */
}

.log-level-debug {
    color: var(--nord9); /* Blue */
}

View File

@@ -10,7 +10,7 @@
    <div class="header-container">
        <h1>Node Monitoring System</h1>
        <p>Total Nodes: <span id="node-count">0</span></p>
        <p>Service UUID: <code>{{ service_uuid }}</code></p> <!-- ALWAYS DISPLAYED -->
    </div>

    <div id="node-grid-container">

View File

@@ -24,6 +24,7 @@
        </div>
    </div>

    <div id="log-table-container" data-service-uuid="{{ service_uuid }}">
        {# The initial logs are rendered by Jinja2 here #}
        {% if logs %}
        <table class="log-table">
            <thead>
@@ -63,4 +64,3 @@
    <script src="{{ url_for('static', path='/logs.js') }}"></script>
</body>
</html>

View File

@@ -2,18 +2,34 @@ version: '3.8'
services:
  node-monitor:
    image: node-monitor:latest
    container_name: node-monitor-app
    ports:
      - "8000:8000"
    # Mount the 'data' directory for RRD files.
    # The left side './data' refers to a 'data' directory in the same location
    # as this docker-compose.yml file.
    # For Podman, if you encounter SELinux issues, you might need to append ':Z' or ':z'
    # to the host path, e.g., './data:/data:Z'
    volumes:
      - ../data:/data:Z
    # Environment variables for the application
    environment:
      # Set a fixed SERVICE_UUID here. Replace this with your desired UUID.
      # This UUID will be used by the FastAPI app and passed to the frontend.
      SERVICE_UUID: "ab73d00a-8169-46bb-997d-f13e5f760973"
      DATA_DIR: "/data" # Inform the application where its data volume is mounted
    # Restart the container if it stops for any reason, unless explicitly stopped
    restart: unless-stopped
    # Healthcheck to ensure the container is running and responsive
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      start_period: 5s
      retries: 3
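
The compose file now pins SERVICE_UUID instead of relying on an auto-generated value. A quick way to mint a fresh value to paste into docker-compose.yml (plain Python, not something the repo ships):

import uuid

print(uuid.uuid4())  # paste the output into SERVICE_UUID above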