It works now

This commit is contained in:
Kalzu Rekku
2025-06-11 00:01:10 +03:00
parent 11a565732d
commit 94c988ee7b
9 changed files with 797 additions and 30 deletions

View File

@ -28,15 +28,17 @@ RUN apt-get purge -y build-essential python3-dev && \
# Copy application code
COPY app/ ./app/
# Create directory for RRD data
RUN mkdir -p /app/data
# Create directory for RRD data at /data (will be volume mounted)
RUN mkdir -p /data
# Expose port
EXPOSE 8000
# Create non-root user for security
RUN useradd --create-home --shell /bin/bash appuser && \
chown -R appuser:appuser /app
chown -R appuser:appuser /app && \
chown -R appuser:appuser /data && \
chmod 777 /data
USER appuser
# Health check
@ -44,4 +46,5 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1
# Run the application
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

257
app/database.py Normal file
View File

@ -0,0 +1,257 @@
import os
import rrdtool
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
from pathlib import Path
logger = logging.getLogger(__name__)
class RRDDatabase:
    """Per-node storage of monitoring metrics in RRD (round-robin database) files.

    On-disk layout:
        <data_dir>/<node_uuid>/system.rrd          - uptime / load / memory
        <data_dir>/<node_uuid>/ping_<target>.rrd   - latency / loss to one peer

    Retention is handled automatically by RRD via the RRA definitions below,
    so old samples age out without explicit cleanup.
    """

    def __init__(self, data_dir: str = None):
        """Resolve and create the data directory.

        Args:
            data_dir: Root directory for RRD files. Defaults to the DATA_DIR
                environment variable, then to "/data".

        Raises:
            PermissionError: if the directory cannot be created.
        """
        if data_dir is None:
            data_dir = os.environ.get("DATA_DIR", "/data")
        self.data_dir = Path(data_dir)
        try:
            self.data_dir.mkdir(parents=True, exist_ok=True)
            logger.info(f"Using data directory: {self.data_dir}")
        except PermissionError:
            logger.error(f"Permission denied creating data directory: {self.data_dir}")
            raise
        except Exception as e:
            logger.error(f"Failed to create data directory {self.data_dir}: {e}")
            raise

        # RRD configuration
        self.step = 60        # 1-minute update interval
        self.heartbeat = 120  # 2-minute heartbeat (tolerates one missed update)
        # Retention policy (~6 months total)
        self.rra_config = [
            "RRA:AVERAGE:0.5:1:1440",    # 1-min avg for 24 hours (1440 points)
            "RRA:AVERAGE:0.5:60:744",    # 1-hour avg for 31 days (744 points)
            "RRA:AVERAGE:0.5:1440:180",  # 1-day avg for 6 months (180 points)
            "RRA:MAX:0.5:1:1440",        # 1-min max for 24 hours
            "RRA:MAX:0.5:60:744",        # 1-hour max for 31 days
            "RRA:MIN:0.5:60:744",        # 1-hour min for 31 days
        ]

    def _get_node_dir(self, node_uuid: str) -> Path:
        """Return (creating it if needed) the directory for a node's RRD files."""
        node_dir = self.data_dir / node_uuid
        node_dir.mkdir(exist_ok=True)
        return node_dir

    def _create_system_rrd(self, node_uuid: str) -> str:
        """Idempotently create the system-metrics RRD for a node; return its path."""
        rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
        if rrd_file.exists():
            return str(rrd_file)
        try:
            rrdtool.create(
                str(rrd_file),
                "--step", str(self.step),
                # Data sources
                f"DS:uptime:GAUGE:{self.heartbeat}:0:U",    # uptime in seconds
                f"DS:load1:GAUGE:{self.heartbeat}:0:100",   # 1-min load average
                f"DS:load5:GAUGE:{self.heartbeat}:0:100",   # 5-min load average
                f"DS:load15:GAUGE:{self.heartbeat}:0:100",  # 15-min load average
                f"DS:memory:GAUGE:{self.heartbeat}:0:100",  # memory usage %
                # Round Robin Archives
                *self.rra_config
            )
            logger.info(f"Created system RRD for node {node_uuid}")
            return str(rrd_file)
        except Exception as e:
            logger.error(f"Failed to create system RRD for {node_uuid}: {e}")
            raise

    def _create_ping_rrd(self, node_uuid: str, target_uuid: str) -> str:
        """Idempotently create the ping RRD for a node->target pair; return its path."""
        rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
        if rrd_file.exists():
            return str(rrd_file)
        try:
            rrdtool.create(
                str(rrd_file),
                "--step", str(self.step),
                # Data sources for ping metrics
                f"DS:latency:GAUGE:{self.heartbeat}:0:10000",  # ping latency in ms
                f"DS:loss:GAUGE:{self.heartbeat}:0:100",       # packet loss %
                # Round Robin Archives
                *self.rra_config
            )
            logger.info(f"Created ping RRD for {node_uuid} -> {target_uuid}")
            return str(rrd_file)
        except Exception as e:
            logger.error(f"Failed to create ping RRD for {node_uuid}->{target_uuid}: {e}")
            raise

    def update_system_metrics(self, node_uuid: str, timestamp: datetime,
                              uptime_seconds: int, load_avg: List[float],
                              memory_usage_percent: float):
        """Store one system-metrics sample for *node_uuid*.

        Raises on rrdtool failure (e.g. a sample older than the last update).
        """
        try:
            rrd_file = self._create_system_rrd(node_uuid)
            unix_time = int(timestamp.timestamp())
            # rrdtool update format: timestamp:uptime:load1:load5:load15:memory
            values = f"{unix_time}:{uptime_seconds}:{load_avg[0]}:{load_avg[1]}:{load_avg[2]}:{memory_usage_percent}"
            rrdtool.update(rrd_file, values)
            logger.debug(f"Updated system metrics for {node_uuid}: {values}")
        except Exception as e:
            logger.error(f"Failed to update system metrics for {node_uuid}: {e}")
            raise

    def update_ping_metrics(self, node_uuid: str, target_uuid: str,
                            timestamp: datetime, latency_ms: float):
        """Store one latency sample for the node->target pair.

        Loss is always recorded as 0 for now; missing updates show up as RRD
        gaps instead (could be enhanced to compute real loss).
        """
        try:
            rrd_file = self._create_ping_rrd(node_uuid, target_uuid)
            unix_time = int(timestamp.timestamp())
            values = f"{unix_time}:{latency_ms}:0"  # 0% loss (could be enhanced)
            rrdtool.update(rrd_file, values)
            logger.debug(f"Updated ping metrics {node_uuid}->{target_uuid}: {latency_ms}ms")
        except Exception as e:
            logger.error(f"Failed to update ping metrics {node_uuid}->{target_uuid}: {e}")
            raise

    def _fetch_series(self, rrd_file: Path, start_time: str, end_time: str) -> Optional[Dict]:
        """Fetch AVERAGE data from *rrd_file* and reshape it for the API.

        Returns None when the file does not exist, otherwise a dict with
        'timestamps', per-DS 'data' lists, and the archive 'step'.
        """
        if not rrd_file.exists():
            return None
        result = rrdtool.fetch(
            str(rrd_file),
            "AVERAGE",
            "--start", start_time,
            "--end", end_time
        )
        start, end, step = result[0]
        ds_names = result[1]
        data_points = result[2]
        timestamps = []
        data = {ds: [] for ds in ds_names}
        current_time = start
        for point in data_points:
            timestamps.append(current_time)
            for i, ds in enumerate(ds_names):
                # NOTE(review): gaps (None) are coerced to 0, which is
                # indistinguishable from a genuine zero reading downstream.
                value = point[i] if point[i] is not None else 0
                data[ds].append(value)
            current_time += step
        return {
            'timestamps': timestamps,
            'data': data,
            'step': step
        }

    def get_system_data(self, node_uuid: str, start_time: str = "-24h",
                        end_time: str = "now") -> Optional[Dict]:
        """Retrieve system metrics (uptime/load1/load5/load15/memory) for a node."""
        try:
            rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
            return self._fetch_series(rrd_file, start_time, end_time)
        except Exception as e:
            logger.error(f"Failed to get system data for {node_uuid}: {e}")
            return None

    def get_ping_data(self, node_uuid: str, target_uuid: str,
                      start_time: str = "-24h", end_time: str = "now") -> Optional[Dict]:
        """Retrieve ping metrics (latency/loss) between two nodes."""
        try:
            rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
            return self._fetch_series(rrd_file, start_time, end_time)
        except Exception as e:
            logger.error(f"Failed to get ping data {node_uuid}->{target_uuid}: {e}")
            return None

    def list_nodes(self) -> List[str]:
        """Return the UUIDs of all nodes that have a system.rrd on disk."""
        try:
            nodes = []
            for item in self.data_dir.iterdir():
                if item.is_dir() and (item / "system.rrd").exists():
                    nodes.append(item.name)
            return nodes
        except Exception as e:
            logger.error(f"Failed to list nodes: {e}")
            return []

    def cleanup_old_data(self):
        """Log nodes whose RRD files have all been idle for >6 months.

        RRD itself handles sample retention via the RRA configuration; this
        only detects (and currently just logs) entirely inactive node dirs.
        """
        cutoff_date = datetime.now() - timedelta(days=190)  # 6+ months
        try:
            for node_dir in self.data_dir.iterdir():
                if not node_dir.is_dir():
                    continue
                rrd_files = list(node_dir.glob("*.rrd"))
                if not rrd_files:
                    continue
                # If every RRD file is stale, the node is probably dead.
                all_old = all(
                    datetime.fromtimestamp(f.stat().st_mtime) < cutoff_date
                    for f in rrd_files
                )
                if all_old:
                    logger.info(f"Node {node_dir.name} appears inactive for >6 months")
                    # Could optionally remove the directory here.
        except Exception as e:
            logger.error(f"Failed during cleanup: {e}")

View File

@ -1,15 +1,14 @@
import os
import uuid
import json
import logging
from datetime import datetime
from datetime import datetime, timezone, timedelta
from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, Field, validator, constr, conlist
from typing import Dict, List
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field, validator
from typing import Dict, List, Annotated
import uuid as uuid_lib
from collections import deque
@ -44,7 +43,6 @@ class BufferHandler(logging.Handler):
except Exception as e:
# Log the error to stderr, to avoid recursion or filling the buffer with errors
print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr)
# Optionally, you could log record.msg or record.exc_info here for more context
class LogBuffer:
@ -60,7 +58,6 @@ class LogBuffer:
'message': record.get('message'),
'extra': {k: v for k, v in record.items()
if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']}
# Added more common LogRecord attributes to exclude from 'extra'
})
def get_logs(self, limit=100):
@ -80,10 +77,8 @@ formatter = jsonlogger.JsonFormatter(
logHandler.setFormatter(formatter)
# Add handlers to the root logger
# Avoid adding handlers multiple times in a uvicorn environment
if not logger.handlers:
logger.addHandler(logHandler)
# Add buffer handler to logger ONLY ONCE
buffer_handler = BufferHandler()
logger.addHandler(buffer_handler)
@ -97,15 +92,18 @@ app = FastAPI(
# Configure templates for the web interface
templates = Jinja2Templates(directory="app/web/templates")
# Mount static files directory
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
# --- Data Models (as defined in the project spec) ---
class NodeStatusModel(BaseModel):
uptime_seconds: int
load_avg: conlist(float, min_length=3, max_length=3)
load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
memory_usage_percent: float
class PingModel(BaseModel):
pings: Dict[constr(regex=r'^[0-9a-fA-F-]{36}$'), float]
pings: Dict[Annotated[str, Field(pattern=r'^[0-9a-fA-F-]{36}$')], float]
class StatusUpdate(BaseModel):
node: str = Field(..., description="Node UUID")
@ -131,15 +129,61 @@ class StatusUpdate(BaseModel):
return v
# A mock database of known nodes for the auto-discovery demo
# In a real app, this would be managed more dynamically
known_nodes_db = {}
# --- Node Management and Health Logic ---
# A mock database of known nodes, now storing more comprehensive data
known_nodes_db: Dict[str, Dict] = {}
# Health calculation constants (can be tuned)
LOAD_AVG_WARNING_THRESHOLD = 1.5
LOAD_AVG_CRITICAL_THRESHOLD = 3.0
MEMORY_WARNING_THRESHOLD = 75.0
MEMORY_CRITICAL_THRESHOLD = 90.0
# If a node hasn't reported in this many seconds, it's considered critical
LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30
def get_node_health(node_data: Dict) -> str:
    """Return 'healthy' | 'warning' | 'critical' | 'unknown' for one node.

    Liveness wins: a node that has not reported within
    LAST_SEEN_CRITICAL_THRESHOLD_SECONDS is 'critical' regardless of its last
    metrics. Otherwise 1-minute load and memory usage are compared against
    the module-level warning/critical thresholds.
    """
    # Liveness check first.
    last_seen_str = node_data.get("last_seen")
    if last_seen_str:
        # last_seen is stored as an ISO-8601 timestamp; .replace() upgrades a
        # naive value to UTC and is a no-op for an already-aware one.
        last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc)
        time_since_last_seen = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
        if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
            return "critical"  # Node has not reported recently
    else:
        return "unknown"  # Should not happen if 'last_seen' is always set

    status_model_data = node_data.get("status")
    if not status_model_data:
        # Node was discovered but has not sent a full status update yet.
        return "unknown"
    try:
        status = NodeStatusModel(**status_model_data)
    except Exception:
        logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True)
        return "unknown"  # Or critical if parsing fails

    # 1-minute load is the primary load indicator.
    load_1min = status.load_avg[0]
    if load_1min >= LOAD_AVG_CRITICAL_THRESHOLD:
        return "critical"
    elif load_1min >= LOAD_AVG_WARNING_THRESHOLD:
        return "warning"

    # Memory usage thresholds.
    if status.memory_usage_percent >= MEMORY_CRITICAL_THRESHOLD:
        return "critical"
    elif status.memory_usage_percent >= MEMORY_WARNING_THRESHOLD:
        return "warning"

    return "healthy"
# --- API Endpoints ---
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
"""Serves the main web page which displays the Service UUID."""
"""Serves the main web page which displays the Service UUID and the node grid."""
logger.info(
"Web root accessed",
extra={'client_ip': request.client.host, 'service_uuid': SERVICE_UUID}
@ -149,7 +193,6 @@ async def read_root(request: Request):
{"request": request, "service_uuid": SERVICE_UUID}
)
# Add the logs endpoint
@app.get("/{service_uuid}/logs")
async def get_logs(service_uuid: str, limit: int = 100):
"""Get recent logs for the service."""
@ -175,7 +218,6 @@ async def update_node_status(
request: Request
):
"""Receives status updates from a node and returns a list of peers."""
# Log the incoming status update with structured context
logger.info(
"Received node status update",
extra={
@ -205,7 +247,7 @@ async def update_node_status(
)
# Update ping metrics
for target_uuid, latency in status_update.pings.pings.items():
for target_uuid, latency in status_update.pings.items():
database.update_ping_metrics(
node_uuid=node_uuid,
target_uuid=target_uuid,
@ -214,20 +256,51 @@ async def update_node_status(
)
except Exception as e:
logger.error(f"Database update failed: {e}")
logger.error(f"Database update failed: {e}", exc_info=True)
# Continue processing even if DB update fails
# Auto-discovery logic
if node_uuid not in known_nodes_db:
logger.info(f"New node discovered: {node_uuid}")
# A real app would need a strategy to handle node addresses
known_nodes_db[node_uuid] = {"last_seen": datetime.utcnow().isoformat(), "ip": request.client.host}
# Auto-discovery logic and update known_nodes_db with full status
current_time_utc = datetime.now(timezone.utc)
known_nodes_db[node_uuid] = {
"last_seen": current_time_utc.isoformat(),
"ip": request.client.host,
"status": status_update.status.dict(), # Store the dict representation
# Store direct values for convenience in /nodes/status endpoint
"uptime_seconds": status_update.status.uptime_seconds,
"load_avg": status_update.status.load_avg,
"memory_usage_percent": status_update.status.memory_usage_percent
}
# Calculate health for logging purposes (it will be recalculated for /nodes/status)
health_status_for_log = get_node_health(known_nodes_db[node_uuid])
logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")
# Respond with the list of other known peers
peer_list = {uuid: data for uuid, data in known_nodes_db.items() if uuid != node_uuid}
peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
for uuid, data in known_nodes_db.items() if uuid != node_uuid}
return {"message": "Status received", "peers": peer_list}
@app.get("/nodes/status")
async def get_all_nodes_status():
    """Return the current status of all known nodes for the web UI.

    Health is recomputed on every request so a node that stops reporting
    flips to 'critical' even though no new update arrives for it.
    """
    logger.info("Fetching all nodes status for UI.")
    response_nodes = []
    for node_uuid, data in known_nodes_db.items():
        current_health = get_node_health(data)
        response_nodes.append({
            "uuid": node_uuid,
            "last_seen": data["last_seen"],
            "ip": data["ip"],
            "health_status": current_health,
            # .get() because a freshly discovered node may lack metrics.
            "uptime_seconds": data.get("uptime_seconds"),
            "load_avg": data.get("load_avg"),
            "memory_usage_percent": data.get("memory_usage_percent")
        })
    return {"nodes": response_nodes}
@app.get("/health")
async def health_check():

73
app/web/static/script.js Normal file
View File

@ -0,0 +1,73 @@
document.addEventListener('DOMContentLoaded', () => {
    const nodeGridContainer = document.getElementById('node-grid-container');
    const nodeCountSpan = document.getElementById('node-count');
    const POLLING_INTERVAL_MS = 3000; // Poll every 3 seconds

    // Fetch /nodes/status and re-render the grid; on failure replace the
    // grid with an error message rather than showing stale data.
    async function fetchNodeData() {
        try {
            const response = await fetch('/nodes/status');
            if (!response.ok) {
                throw new Error(`HTTP error! status: ${response.status}`);
            }
            const data = await response.json();
            renderNodeGrid(data.nodes);
        } catch (error) {
            console.error("Error fetching node data:", error);
            nodeGridContainer.innerHTML = '<p class="loading-message">Error loading node data. Please check server connection.</p>';
        }
    }

    // Rebuild the grid of node cells from the server's node list.
    function renderNodeGrid(nodes) {
        nodeGridContainer.innerHTML = ''; // Clear existing nodes
        nodeCountSpan.textContent = nodes.length; // Update total node count

        if (nodes.length === 0) {
            nodeGridContainer.innerHTML = '<p class="loading-message">No nodes reporting yet. Start a client!</p>';
            return;
        }

        nodes.forEach(node => {
            const nodeCell = document.createElement('div');
            nodeCell.classList.add('node-cell');
            nodeCell.classList.add(`node-${node.health_status}`); // Apply health color class

            // Truncate UUID for display
            const displayUuid = node.uuid.substring(0, 8) + '...';
            // BUGFIX: use explicit null checks (!= null) instead of truthiness
            // so a legitimate 0 uptime or 0% memory renders as 0, not 'N/A'.
            // NOTE(review): values are interpolated into innerHTML; they come
            // from our own server — confirm server-side validation before
            // exposing this endpoint to untrusted writers.
            nodeCell.innerHTML = `
                <div class="node-uuid" title="${node.uuid}">${displayUuid}</div>
                <div class="node-status-text">Status: ${node.health_status.toUpperCase()}</div>
                <div class="node-tooltip">
                    <p><strong>UUID:</strong> ${node.uuid}</p>
                    <p><strong>IP:</strong> ${node.ip}</p>
                    <p><strong>Last Seen:</strong> ${new Date(node.last_seen).toLocaleTimeString()}</p>
                    <p><strong>Uptime:</strong> ${node.uptime_seconds != null ? formatUptime(node.uptime_seconds) : 'N/A'}</p>
                    <p><strong>Load Avg (1m, 5m, 15m):</strong> ${node.load_avg ? node.load_avg.join(', ') : 'N/A'}</p>
                    <p><strong>Memory Usage:</strong> ${node.memory_usage_percent != null ? node.memory_usage_percent.toFixed(2) + '%' : 'N/A'}</p>
                </div>
            `;
            nodeGridContainer.appendChild(nodeCell);
        });
    }

    // Render a second count as "1d 2h 3m 4s", omitting zero leading units.
    function formatUptime(seconds) {
        const days = Math.floor(seconds / (3600 * 24));
        seconds %= (3600 * 24);
        const hours = Math.floor(seconds / 3600);
        seconds %= 3600;
        const minutes = Math.floor(seconds / 60);
        const remainingSeconds = Math.floor(seconds % 60);

        let parts = [];
        if (days > 0) parts.push(`${days}d`);
        if (hours > 0) parts.push(`${hours}h`);
        if (minutes > 0) parts.push(`${minutes}m`);
        if (remainingSeconds > 0 || parts.length === 0) parts.push(`${remainingSeconds}s`); // Ensure at least seconds are shown
        return parts.join(' ');
    }

    // Initial fetch and then set up polling
    fetchNodeData();
    setInterval(fetchNodeData, POLLING_INTERVAL_MS);
});

156
app/web/static/style.css Normal file
View File

@ -0,0 +1,156 @@
/* Styles for the node-monitor dashboard: a centered header card above a
   responsive grid of colored node cells with hover tooltips. */
body {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
    display: flex;
    flex-direction: column; /* Changed to column for header + grid */
    align-items: center;
    min-height: 100vh; /* Use min-height to allow content to push body height */
    margin: 0;
    background-color: #f4f7f6;
    color: #333;
    padding: 20px; /* Add some padding */
    box-sizing: border-box; /* Include padding in element's total width and height */
}

/* --- Header card --- */
.header-container {
    text-align: center;
    padding: 20px;
    background-color: white;
    border-radius: 8px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.1);
    margin-bottom: 20px;
    width: 90%; /* Adjust width */
    max-width: 800px; /* Max width for header */
}
h1 {
    color: #0b2d48;
    margin-bottom: 10px;
}
p {
    font-size: 1rem;
    color: #555;
    margin: 5px 0;
}
code {
    background-color: #e8e8e8;
    padding: 3px 8px;
    border-radius: 4px;
    font-family: "Courier New", Courier, monospace;
    font-size: 0.9rem;
}

/* --- Node grid --- */
#node-grid-container {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(150px, 1fr)); /* Responsive grid */
    gap: 15px; /* Space between grid items */
    width: 90%; /* Adjust width */
    max-width: 1200px; /* Max width for grid */
    padding: 20px;
    background-color: white;
    border-radius: 8px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.1);
}
.loading-message {
    grid-column: 1 / -1; /* Span across all columns */
    text-align: center;
    font-style: italic;
    color: #888;
}
.node-cell {
    border: 1px solid #ddd;
    border-radius: 6px;
    padding: 15px;
    text-align: center;
    font-size: 0.9rem;
    box-shadow: 0 2px 5px rgba(0,0,0,0.05);
    transition: background-color 0.3s ease, border-color 0.3s ease, transform 0.1s ease;
    cursor: pointer;
    position: relative; /* For tooltip positioning */
    overflow: hidden; /* Hide overflow for truncated UUID */
}
.node-cell:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
.node-uuid {
    font-weight: bold;
    margin-bottom: 5px;
    color: #333;
    white-space: nowrap;
    overflow: hidden;
    text-overflow: ellipsis; /* Truncate long UUIDs */
}
.node-status-text {
    font-size: 0.8rem;
    color: #666;
}

/* --- Health status colors (class set by script.js as node-<health>) --- */
.node-healthy {
    background-color: #e6ffe6; /* Light green */
    border-color: #4CAF50; /* Green */
}
.node-warning {
    background-color: #fffacd; /* Light yellow */
    border-color: #FFC107; /* Orange */
}
.node-critical {
    background-color: #ffe6e6; /* Light red */
    border-color: #F44336; /* Red */
}
.node-unknown {
    background-color: #f0f0f0; /* Light gray */
    border-color: #9E9E9E; /* Gray */
}

/* --- Tooltip shown on cell hover --- */
.node-tooltip {
    visibility: hidden;
    opacity: 0;
    width: 200px;
    background-color: #333;
    color: #fff;
    text-align: left;
    border-radius: 6px;
    padding: 10px;
    position: absolute;
    z-index: 1;
    bottom: 100%; /* Position above the node cell */
    left: 50%;
    margin-left: -100px; /* Center the tooltip */
    transition: opacity 0.3s;
    font-size: 0.8rem;
    white-space: normal; /* Allow text to wrap */
    box-shadow: 0 2px 10px rgba(0,0,0,0.2);
}
.node-tooltip::after {
    content: " ";
    position: absolute;
    top: 100%; /* At the bottom of the tooltip */
    left: 50%;
    margin-left: -5px;
    border-width: 5px;
    border-style: solid;
    border-color: #333 transparent transparent transparent; /* Downward arrow */
}
.node-cell:hover .node-tooltip {
    visibility: visible;
    opacity: 1;
}
.node-tooltip p {
    margin: 2px 0;
    color: #eee;
}

View File

@ -0,0 +1,23 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Node Monitor</title>
    <link rel="stylesheet" href="/static/style.css"> <!-- grid + tooltip styles -->
</head>
<body>
    <div class="header-container">
        <h1>Distributed Node Monitoring System</h1>
        <p>Service ID: <code>{{ service_uuid }}</code></p>
        <p>Total Nodes: <span id="node-count">0</span></p>
    </div>
    <div id="node-grid-container" class="node-grid">
        <!-- Node cells are dynamically inserted here by script.js -->
        <p class="loading-message">Loading node data...</p>
    </div>
    <script src="/static/script.js"></script> <!-- polls /nodes/status and renders the grid -->
</body>
</html>

163
client.py Normal file
View File

@ -0,0 +1,163 @@
import os
import uuid
import time
import requests
import random
import json
import logging
from datetime import datetime, timezone
# --- Client Configuration ---
# The UUID of THIS client node. Generated on startup.
# Can be overridden by an environment variable for persistent client identity.
NODE_UUID = os.environ.get("NODE_UUID", str(uuid.uuid4()))

# The UUID of the target monitoring service (the main.py server).
# IMPORTANT: This MUST match the SERVICE_UUID of your running FastAPI server.
# You can get this from the server's initial console output or by accessing
# its root endpoint ('/').
TARGET_SERVICE_UUID = os.environ.get(
    "TARGET_SERVICE_UUID", "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID"
)

# The base URL of the FastAPI monitoring service
SERVER_BASE_URL = os.environ.get("SERVER_URL", "http://localhost:8000")

# How often to send status updates (in seconds)
UPDATE_INTERVAL_SECONDS = int(os.environ.get("UPDATE_INTERVAL_SECONDS", 5))

# --- Logging Configuration ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("NodeClient")

# --- Global state for simulation ---
# Simulated uptime counter, advanced by generate_node_status_data each cycle.
uptime_seconds = 0
# Peers learned from the server's responses.
# Format: { "node_uuid_str": { "last_seen": "iso_timestamp", "ip": "..." } }
known_peers = {}
# --- Data Generation Functions ---
def generate_node_status_data():
    """Return simulated system metrics for this node.

    Advances the module-level uptime counter by roughly one update interval
    (with a little jitter) and fabricates load averages and memory usage.
    """
    global uptime_seconds
    uptime_seconds += UPDATE_INTERVAL_SECONDS + random.randint(0, 2)  # slight variation

    # 1-min, 5-min and 15-min load averages.
    load_avg = [
        round(random.uniform(0.1, 2.0), 2),
        round(random.uniform(0.1, 1.8), 2),
        round(random.uniform(0.1, 1.5), 2)
    ]
    # Simulated memory usage percentage.
    memory_usage_percent = round(random.uniform(30.0, 90.0), 2)

    return {
        "uptime_seconds": uptime_seconds,
        "load_avg": load_avg,
        "memory_usage_percent": memory_usage_percent
    }
def generate_ping_data():
    """Return simulated ping latencies (ms) keyed by peer UUID.

    Always includes a near-zero self-ping; every other known peer gets a
    random latency in the 10-200 ms range.
    """
    pings = {}
    # Self-ping (loopback) — always very low latency.
    pings[str(NODE_UUID)] = round(random.uniform(0.1, 1.0), 2)
    # Pings to the peers learned from the server.
    for peer_uuid in known_peers.keys():
        if peer_uuid != str(NODE_UUID):  # don't ping self twice
            pings[peer_uuid] = round(random.uniform(10.0, 200.0), 2)
    return pings
# --- Main Client Logic ---
def run_client():
    """Main loop: periodically PUT this node's status to the server.

    Runs forever; any failure in one iteration is logged and the loop
    continues after the normal sleep interval. Returns early (None) when
    TARGET_SERVICE_UUID is still the placeholder.
    """
    global known_peers
    logger.info(f"Starting Node Client {NODE_UUID}")
    logger.info(f"Target Service UUID: {TARGET_SERVICE_UUID}")
    logger.info(f"Server URL: {SERVER_BASE_URL}")
    logger.info(f"Update Interval: {UPDATE_INTERVAL_SECONDS} seconds")

    # Refuse to run against the placeholder service UUID.
    if TARGET_SERVICE_UUID == "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID":
        logger.error("-" * 50)
        logger.error("ERROR: TARGET_SERVICE_UUID is not set correctly!")
        logger.error("Please replace 'REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID' in client.py")
        logger.error("or set the environment variable TARGET_SERVICE_UUID.")
        logger.error("You can find the server's UUID by running main.py and checking its console output")
        logger.error("or by visiting 'http://localhost:8000/' in your browser.")
        logger.error("-" * 50)
        return

    while True:
        try:
            # 1. Generate simulated metrics.
            status_data = generate_node_status_data()
            ping_data = generate_ping_data()

            # 2. Payload matching the server's StatusUpdate model
            #    (timezone-aware UTC timestamp).
            payload = {
                "node": str(NODE_UUID),
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "status": status_data,
                "pings": ping_data
            }

            # 3. Endpoint: /{service_uuid}/{node_uuid}/
            endpoint_url = f"{SERVER_BASE_URL}/{TARGET_SERVICE_UUID}/{NODE_UUID}/"

            # 4. Send the PUT request.
            logger.info(f"Sending update to {endpoint_url}. Uptime: {status_data['uptime_seconds']}s, Load: {status_data['load_avg']}, Pings: {len(ping_data)}")
            response = requests.put(endpoint_url, json=payload, timeout=10)  # 10-second timeout

            # 5. Process the response.
            if response.status_code == 200:
                response_data = response.json()
                logger.info(f"Successfully sent update. Server message: '{response_data.get('message')}'")
                if "peers" in response_data and isinstance(response_data["peers"], dict):
                    new_peers = {k: v for k, v in response_data["peers"].items()}
                    # Log any peers we have not seen before.
                    newly_discovered = set(new_peers.keys()) - set(known_peers.keys())
                    if newly_discovered:
                        logger.info(f"Discovered new peer(s): {', '.join(newly_discovered)}")
                    known_peers = new_peers
                    logger.info(f"Total known peers (including self if returned by server): {len(known_peers)}")
                else:
                    logger.warning("Server response did not contain a valid 'peers' field or it was empty.")
            else:
                logger.error(f"Failed to send update. Status code: {response.status_code}, Response: {response.text}")
                if response.status_code == 404:
                    logger.error("Hint: The TARGET_SERVICE_UUID might be incorrect, or the server isn't running at this endpoint.")
                elif response.status_code == 422:  # Pydantic validation error
                    logger.error(f"Server validation error (422 Unprocessable Entity): {response.json()}")
        except requests.exceptions.Timeout:
            logger.error(f"Request timed out after {10} seconds. Is the server running and responsive?")
        except requests.exceptions.ConnectionError as e:
            logger.error(f"Connection error: {e}. Is the server running at {SERVER_BASE_URL}?")
        except requests.exceptions.RequestException as e:
            logger.error(f"An unexpected request error occurred: {e}", exc_info=True)
        except json.JSONDecodeError:
            # NOTE(review): requests' JSON errors subclass RequestException and
            # are caught above, so this branch is likely unreachable — confirm.
            logger.error(f"Failed to decode JSON response: {response.text}. Is the server returning valid JSON?")
        except Exception as e:
            logger.error(f"An unexpected error occurred in the client loop: {e}", exc_info=True)

        # 6. Wait for the next update.
        time.sleep(UPDATE_INTERVAL_SECONDS)


if __name__ == "__main__":
    run_client()

19
docker-compose.yml Normal file
View File

@ -0,0 +1,19 @@
# Compose definition for the node-monitor service (FastAPI + RRD storage).
# NOTE(review): the top-level `version` key is obsolete in Compose V2 and is
# ignored there — confirm before removing it.
version: '3.8'

services:
  node-monitor:
    build: .
    ports:
      - "8000:8000"      # uvicorn listens on 8000 (see Dockerfile CMD)
    volumes:
      - ./data:/data     # persist RRD files on the host
    environment:
      - DATA_DIR=/data   # read by RRDDatabase as the data root
      - SERVICE_UUID=${SERVICE_UUID:-auto-generated}
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 5s

View File

@ -1,6 +1,6 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
python-rrdtool==1.4.7
rrdtool==0.1.16
jinja2==3.1.2
python-multipart==0.0.6
python-json-logger==2.0.7