From 94c988ee7bfd1f7170992722bdd9f3cf0a014b1a Mon Sep 17 00:00:00 2001 From: Kalzu Rekku Date: Wed, 11 Jun 2025 00:01:10 +0300 Subject: [PATCH] It works now --- Dockerfile | 11 +- app/database.py | 257 +++++++++++++++++++++++++++++++++++ app/main.py | 123 +++++++++++++---- app/web/static/script.js | 73 ++++++++++ app/web/static/style.css | 156 +++++++++++++++++++++ app/web/templates/index.html | 23 ++++ client.py | 163 ++++++++++++++++++++++ docker-compose.yml | 19 +++ requirements.txt | 2 +- 9 files changed, 797 insertions(+), 30 deletions(-) create mode 100644 app/database.py create mode 100644 app/web/static/script.js create mode 100644 app/web/static/style.css create mode 100644 client.py create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile index 6414bbc..af2af21 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,15 +28,17 @@ RUN apt-get purge -y build-essential python3-dev && \ # Copy application code COPY app/ ./app/ -# Create directory for RRD data -RUN mkdir -p /app/data +# Create directory for RRD data at /data (will be volume mounted) +RUN mkdir -p /data # Expose port EXPOSE 8000 # Create non-root user for security RUN useradd --create-home --shell /bin/bash appuser && \ - chown -R appuser:appuser /app + chown -R appuser:appuser /app && \ + chown -R appuser:appuser /data && \ + chmod 777 /data USER appuser # Health check @@ -44,4 +46,5 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1 # Run the application -CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] + diff --git a/app/database.py b/app/database.py new file mode 100644 index 0000000..e5d11d6 --- /dev/null +++ b/app/database.py @@ -0,0 +1,257 @@ +import os +import rrdtool +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple +from pathlib import Path + +logger = logging.getLogger(__name__) + +class RRDDatabase: + def __init__(self, data_dir: str = None): + # Use environment variable or default to /data + if data_dir is None: + data_dir = os.environ.get("DATA_DIR", "/data") + + self.data_dir = Path(data_dir) + + # Create data directory if it doesn't exist + try: + self.data_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Using data directory: {self.data_dir}") + except PermissionError: + logger.error(f"Permission denied creating data directory: {self.data_dir}") + raise + except Exception as e: + logger.error(f"Failed to create data directory {self.data_dir}: {e}") + raise + + # RRD configuration + self.step = 60 # 1-minute intervals + self.heartbeat = 120 # 2-minute heartbeat (allow 1 missed update) + + # Retention policy (6 months total) + self.rra_config = [ + "RRA:AVERAGE:0.5:1:1440", # 1-min avg for 24 hours (1440 points) + "RRA:AVERAGE:0.5:60:744", # 1-hour avg for 31 days (744 points) + "RRA:AVERAGE:0.5:1440:180", # 1-day avg for 6 months (180 points) + "RRA:MAX:0.5:1:1440", # 1-min max for 24 hours + "RRA:MAX:0.5:60:744", # 1-hour max for 31 days + "RRA:MIN:0.5:60:744", # 1-hour min for 31 days + ] + + def _get_node_dir(self, node_uuid: str) -> Path: + """Get the directory path for a specific node's RRD files.""" + node_dir = self.data_dir / node_uuid + node_dir.mkdir(exist_ok=True) + return node_dir + + def _create_system_rrd(self, node_uuid: str) -> str: + """Create RRD file for system 
metrics (uptime, load, memory).""" + rrd_file = self._get_node_dir(node_uuid) / "system.rrd" + + if rrd_file.exists(): + return str(rrd_file) + + try: + rrdtool.create( + str(rrd_file), + "--step", str(self.step), + # Data sources + f"DS:uptime:GAUGE:{self.heartbeat}:0:U", # Uptime in seconds + f"DS:load1:GAUGE:{self.heartbeat}:0:100", # 1-min load average + f"DS:load5:GAUGE:{self.heartbeat}:0:100", # 5-min load average + f"DS:load15:GAUGE:{self.heartbeat}:0:100", # 15-min load average + f"DS:memory:GAUGE:{self.heartbeat}:0:100", # Memory usage % + # Round Robin Archives + *self.rra_config + ) + logger.info(f"Created system RRD for node {node_uuid}") + return str(rrd_file) + except Exception as e: + logger.error(f"Failed to create system RRD for {node_uuid}: {e}") + raise + + def _create_ping_rrd(self, node_uuid: str, target_uuid: str) -> str: + """Create RRD file for ping metrics between two nodes.""" + rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd" + + if rrd_file.exists(): + return str(rrd_file) + + try: + rrdtool.create( + str(rrd_file), + "--step", str(self.step), + # Data sources for ping metrics + f"DS:latency:GAUGE:{self.heartbeat}:0:10000", # Ping latency in ms + f"DS:loss:GAUGE:{self.heartbeat}:0:100", # Packet loss % + # Round Robin Archives + *self.rra_config + ) + logger.info(f"Created ping RRD for {node_uuid} -> {target_uuid}") + return str(rrd_file) + except Exception as e: + logger.error(f"Failed to create ping RRD for {node_uuid}->{target_uuid}: {e}") + raise + + def update_system_metrics(self, node_uuid: str, timestamp: datetime, + uptime_seconds: int, load_avg: List[float], + memory_usage_percent: float): + """Update system metrics for a node.""" + try: + rrd_file = self._create_system_rrd(node_uuid) + + # Convert datetime to Unix timestamp + unix_time = int(timestamp.timestamp()) + + # Format: timestamp:uptime:load1:load5:load15:memory + values = f"{unix_time}:{uptime_seconds}:{load_avg[0]}:{load_avg[1]}:{load_avg[2]}:{memory_usage_percent}" + + rrdtool.update(rrd_file, values) + logger.debug(f"Updated system metrics for {node_uuid}: {values}") + + except Exception as e: + logger.error(f"Failed to update system metrics for {node_uuid}: {e}") + raise + + def update_ping_metrics(self, node_uuid: str, target_uuid: str, + timestamp: datetime, latency_ms: float): + """Update ping metrics between two nodes.""" + try: + rrd_file = self._create_ping_rrd(node_uuid, target_uuid) + + unix_time = int(timestamp.timestamp()) + + # For now, we only track latency. 
Loss can be calculated from missing updates + values = f"{unix_time}:{latency_ms}:0" # 0% loss (could be enhanced) + + rrdtool.update(rrd_file, values) + logger.debug(f"Updated ping metrics {node_uuid}->{target_uuid}: {latency_ms}ms") + + except Exception as e: + logger.error(f"Failed to update ping metrics {node_uuid}->{target_uuid}: {e}") + raise + + def get_system_data(self, node_uuid: str, start_time: str = "-24h", + end_time: str = "now") -> Optional[Dict]: + """Retrieve system metrics data for a node.""" + try: + rrd_file = self._get_node_dir(node_uuid) / "system.rrd" + if not rrd_file.exists(): + return None + + result = rrdtool.fetch( + str(rrd_file), + "AVERAGE", + "--start", start_time, + "--end", end_time + ) + + # Parse RRDtool fetch result + start, end, step = result[0] + ds_names = result[1] # ['uptime', 'load1', 'load5', 'load15', 'memory'] + data_points = result[2] + + # Convert to more usable format + timestamps = [] + data = {ds: [] for ds in ds_names} + + current_time = start + for point in data_points: + timestamps.append(current_time) + for i, ds in enumerate(ds_names): + value = point[i] if point[i] is not None else 0 + data[ds].append(value) + current_time += step + + return { + 'timestamps': timestamps, + 'data': data, + 'step': step + } + + except Exception as e: + logger.error(f"Failed to get system data for {node_uuid}: {e}") + return None + + def get_ping_data(self, node_uuid: str, target_uuid: str, + start_time: str = "-24h", end_time: str = "now") -> Optional[Dict]: + """Retrieve ping metrics between two nodes.""" + try: + rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd" + if not rrd_file.exists(): + return None + + result = rrdtool.fetch( + str(rrd_file), + "AVERAGE", + "--start", start_time, + "--end", end_time + ) + + start, end, step = result[0] + ds_names = result[1] # ['latency', 'loss'] + data_points = result[2] + + timestamps = [] + data = {ds: [] for ds in ds_names} + + current_time = start + for point in data_points: + timestamps.append(current_time) + for i, ds in enumerate(ds_names): + value = point[i] if point[i] is not None else 0 + data[ds].append(value) + current_time += step + + return { + 'timestamps': timestamps, + 'data': data, + 'step': step + } + + except Exception as e: + logger.error(f"Failed to get ping data {node_uuid}->{target_uuid}: {e}") + return None + + def list_nodes(self) -> List[str]: + """Get list of all nodes with RRD data.""" + try: + nodes = [] + for item in self.data_dir.iterdir(): + if item.is_dir() and (item / "system.rrd").exists(): + nodes.append(item.name) + return nodes + except Exception as e: + logger.error(f"Failed to list nodes: {e}") + return [] + + def cleanup_old_data(self): + """Clean up RRD files older than 6 months (handled automatically by RRD retention).""" + # RRD automatically handles data retention based on RRA configuration + # This method could be used for cleaning up orphaned files + cutoff_date = datetime.now() - timedelta(days=190) # 6+ months + + try: + for node_dir in self.data_dir.iterdir(): + if not node_dir.is_dir(): + continue + + # Check if any RRD files have been modified recently + rrd_files = list(node_dir.glob("*.rrd")) + if not rrd_files: + continue + + # If all RRD files are old, the node is probably dead + all_old = all( + datetime.fromtimestamp(f.stat().st_mtime) < cutoff_date + for f in rrd_files + ) + + if all_old: + logger.info(f"Node {node_dir.name} appears inactive for >6 months") + # Could optionally remove the directory here + + except Exception as e: + 
logger.error(f"Failed during cleanup: {e}") diff --git a/app/main.py b/app/main.py index a55c0bc..c8b021d 100644 --- a/app/main.py +++ b/app/main.py @@ -1,15 +1,14 @@ - import os import uuid import json import logging -from datetime import datetime - +from datetime import datetime, timezone, timedelta from fastapi import FastAPI, Request, status from fastapi.responses import HTMLResponse, JSONResponse from fastapi.templating import Jinja2Templates -from pydantic import BaseModel, Field, validator, constr, conlist -from typing import Dict, List +from fastapi.staticfiles import StaticFiles +from pydantic import BaseModel, Field, validator +from typing import Dict, List, Annotated import uuid as uuid_lib from collections import deque @@ -44,7 +43,6 @@ class BufferHandler(logging.Handler): except Exception as e: # Log the error to stderr, to avoid recursion or filling the buffer with errors print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr) - # Optionally, you could log record.msg or record.exc_info here for more context class LogBuffer: @@ -60,7 +58,6 @@ class LogBuffer: 'message': record.get('message'), 'extra': {k: v for k, v in record.items() if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']} - # Added more common LogRecord attributes to exclude from 'extra' }) def get_logs(self, limit=100): @@ -80,10 +77,8 @@ formatter = jsonlogger.JsonFormatter( logHandler.setFormatter(formatter) # Add handlers to the root logger -# Avoid adding handlers multiple times in a uvicorn environment if not logger.handlers: logger.addHandler(logHandler) - # Add buffer handler to logger ONLY ONCE buffer_handler = BufferHandler() logger.addHandler(buffer_handler) @@ -97,15 +92,18 @@ app = FastAPI( # Configure templates for the web interface templates = Jinja2Templates(directory="app/web/templates") +# Mount static files directory +app.mount("/static", StaticFiles(directory="app/web/static"), name="static") + # --- Data Models (as defined in the project spec) --- class NodeStatusModel(BaseModel): uptime_seconds: int - load_avg: conlist(float, min_length=3, max_length=3) + load_avg: Annotated[List[float], Field(min_length=3, max_length=3)] memory_usage_percent: float class PingModel(BaseModel): - pings: Dict[constr(regex=r'^[0-9a-fA-F-]{36}$'), float] + pings: Dict[Annotated[str, Field(pattern=r'^[0-9a-fA-F-]{36}$')], float] class StatusUpdate(BaseModel): node: str = Field(..., description="Node UUID") @@ -131,15 +129,61 @@ class StatusUpdate(BaseModel): return v -# A mock database of known nodes for the auto-discovery demo -# In a real app, this would be managed more dynamically -known_nodes_db = {} +# --- Node Management and Health Logic --- +# A mock database of known nodes, now storing more comprehensive data +known_nodes_db: Dict[str, Dict] = {} + +# Health calculation constants (can be tuned) +LOAD_AVG_WARNING_THRESHOLD = 1.5 +LOAD_AVG_CRITICAL_THRESHOLD = 3.0 +MEMORY_WARNING_THRESHOLD = 75.0 +MEMORY_CRITICAL_THRESHOLD = 90.0 +# If a node hasn't reported in this many seconds, it's considered critical +LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30 + +def get_node_health(node_data: Dict) -> str: + """Calculates the health status based on node metrics and last seen time.""" + # Check for liveness first + last_seen_str = node_data.get("last_seen") + if last_seen_str: + last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc) + time_since_last_seen = 
(datetime.now(timezone.utc) - last_seen_dt).total_seconds() + if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS: + return "critical" # Node has not reported recently + else: + return "unknown" # Should not happen if 'last_seen' is always set + + status_model_data = node_data.get("status") + if not status_model_data: + # This could happen if a node is just discovered but hasn't sent a full status update yet + return "unknown" + + try: + status = NodeStatusModel(**status_model_data) + except Exception: + logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True) + return "unknown" # Or critical if parsing fails + + # Check load average (using 1-minute load for primary indicator) + load_1min = status.load_avg[0] + if load_1min >= LOAD_AVG_CRITICAL_THRESHOLD: + return "critical" + elif load_1min >= LOAD_AVG_WARNING_THRESHOLD: + return "warning" + + # Check memory usage + if status.memory_usage_percent >= MEMORY_CRITICAL_THRESHOLD: + return "critical" + elif status.memory_usage_percent >= MEMORY_WARNING_THRESHOLD: + return "warning" + + return "healthy" # --- API Endpoints --- @app.get("/", response_class=HTMLResponse) async def read_root(request: Request): - """Serves the main web page which displays the Service UUID.""" + """Serves the main web page which displays the Service UUID and the node grid.""" logger.info( "Web root accessed", extra={'client_ip': request.client.host, 'service_uuid': SERVICE_UUID} @@ -149,7 +193,6 @@ async def read_root(request: Request): {"request": request, "service_uuid": SERVICE_UUID} ) -# Add the logs endpoint @app.get("/{service_uuid}/logs") async def get_logs(service_uuid: str, limit: int = 100): """Get recent logs for the service.""" @@ -175,7 +218,6 @@ async def update_node_status( request: Request ): """Receives status updates from a node and returns a list of peers.""" - # Log the incoming status update with structured context logger.info( "Received node status update", extra={ @@ -205,7 +247,7 @@ async def update_node_status( ) # Update ping metrics - for target_uuid, latency in status_update.pings.pings.items(): + for target_uuid, latency in status_update.pings.items(): database.update_ping_metrics( node_uuid=node_uuid, target_uuid=target_uuid, @@ -214,20 +256,51 @@ async def update_node_status( ) except Exception as e: - logger.error(f"Database update failed: {e}") + logger.error(f"Database update failed: {e}", exc_info=True) # Continue processing even if DB update fails - # Auto-discovery logic - if node_uuid not in known_nodes_db: - logger.info(f"New node discovered: {node_uuid}") - # A real app would need a strategy to handle node addresses - known_nodes_db[node_uuid] = {"last_seen": datetime.utcnow().isoformat(), "ip": request.client.host} + # Auto-discovery logic and update known_nodes_db with full status + current_time_utc = datetime.now(timezone.utc) + + known_nodes_db[node_uuid] = { + "last_seen": current_time_utc.isoformat(), + "ip": request.client.host, + "status": status_update.status.dict(), # Store the dict representation + # Store direct values for convenience in /nodes/status endpoint + "uptime_seconds": status_update.status.uptime_seconds, + "load_avg": status_update.status.load_avg, + "memory_usage_percent": status_update.status.memory_usage_percent + } + + # Calculate health for logging purposes (it will be recalculated for /nodes/status) + health_status_for_log = get_node_health(known_nodes_db[node_uuid]) + logger.info(f"Node {node_uuid} updated. 
Health: {health_status_for_log}") # Respond with the list of other known peers - peer_list = {uuid: data for uuid, data in known_nodes_db.items() if uuid != node_uuid} + peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]} + for uuid, data in known_nodes_db.items() if uuid != node_uuid} return {"message": "Status received", "peers": peer_list} +@app.get("/nodes/status") +async def get_all_nodes_status(): + """Returns the current status of all known nodes for the UI.""" + logger.info("Fetching all nodes status for UI.") + response_nodes = [] + for node_uuid, data in known_nodes_db.items(): + # Dynamically calculate health for each node based on its current data + current_health = get_node_health(data) + + response_nodes.append({ + "uuid": node_uuid, + "last_seen": data["last_seen"], + "ip": data["ip"], + "health_status": current_health, + "uptime_seconds": data.get("uptime_seconds"), + "load_avg": data.get("load_avg"), + "memory_usage_percent": data.get("memory_usage_percent") + }) + return {"nodes": response_nodes} @app.get("/health") async def health_check(): diff --git a/app/web/static/script.js b/app/web/static/script.js new file mode 100644 index 0000000..74d50ab --- /dev/null +++ b/app/web/static/script.js @@ -0,0 +1,73 @@ +document.addEventListener('DOMContentLoaded', () => { + const nodeGridContainer = document.getElementById('node-grid-container'); + const nodeCountSpan = document.getElementById('node-count'); + const POLLING_INTERVAL_MS = 3000; // Poll every 3 seconds + + async function fetchNodeData() { + try { + const response = await fetch('/nodes/status'); + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + const data = await response.json(); + renderNodeGrid(data.nodes); + } catch (error) { + console.error("Error fetching node data:", error); + nodeGridContainer.innerHTML = '
<div class="loading-message">Error loading node data. Please check server connection.</div>
'; + } + } + + function renderNodeGrid(nodes) { + nodeGridContainer.innerHTML = ''; // Clear existing nodes + nodeCountSpan.textContent = nodes.length; // Update total node count + + if (nodes.length === 0) { + nodeGridContainer.innerHTML = '
<div class="loading-message">No nodes reporting yet. Start a client!</div>
'; + return; + } + + nodes.forEach(node => { + const nodeCell = document.createElement('div'); + nodeCell.classList.add('node-cell'); + nodeCell.classList.add(`node-${node.health_status}`); // Apply health color class + + // Truncate UUID for display + const displayUuid = node.uuid.substring(0, 8) + '...'; + + nodeCell.innerHTML = ` +
+                <div class="node-uuid">${displayUuid}</div>
+                <div class="node-status-text">Status: ${node.health_status.toUpperCase()}</div>
+                <div class="node-tooltip">
+                    <p>UUID: ${node.uuid}</p>
+                    <p>IP: ${node.ip}</p>
+                    <p>Last Seen: ${new Date(node.last_seen).toLocaleTimeString()}</p>
+                    <p>Uptime: ${node.uptime_seconds ? formatUptime(node.uptime_seconds) : 'N/A'}</p>
+                    <p>Load Avg (1m, 5m, 15m): ${node.load_avg ? node.load_avg.join(', ') : 'N/A'}</p>
+                    <p>Memory Usage: ${node.memory_usage_percent ? node.memory_usage_percent.toFixed(2) + '%' : 'N/A'}</p>
+                </div>
+ `; + nodeGridContainer.appendChild(nodeCell); + }); + } + + function formatUptime(seconds) { + const days = Math.floor(seconds / (3600 * 24)); + seconds %= (3600 * 24); + const hours = Math.floor(seconds / 3600); + seconds %= 3600; + const minutes = Math.floor(seconds / 60); + const remainingSeconds = Math.floor(seconds % 60); + + let parts = []; + if (days > 0) parts.push(`${days}d`); + if (hours > 0) parts.push(`${hours}h`); + if (minutes > 0) parts.push(`${minutes}m`); + if (remainingSeconds > 0 || parts.length === 0) parts.push(`${remainingSeconds}s`); // Ensure at least seconds are shown + + return parts.join(' '); + } + + // Initial fetch and then set up polling + fetchNodeData(); + setInterval(fetchNodeData, POLLING_INTERVAL_MS); +}); \ No newline at end of file diff --git a/app/web/static/style.css b/app/web/static/style.css new file mode 100644 index 0000000..532d97d --- /dev/null +++ b/app/web/static/style.css @@ -0,0 +1,156 @@ +body { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + display: flex; + flex-direction: column; /* Changed to column for header + grid */ + align-items: center; + min-height: 100vh; /* Use min-height to allow content to push body height */ + margin: 0; + background-color: #f4f7f6; + color: #333; + padding: 20px; /* Add some padding */ + box-sizing: border-box; /* Include padding in element's total width and height */ +} + +.header-container { + text-align: center; + padding: 20px; + background-color: white; + border-radius: 8px; + box-shadow: 0 4px 12px rgba(0,0,0,0.1); + margin-bottom: 20px; + width: 90%; /* Adjust width */ + max-width: 800px; /* Max width for header */ +} + +h1 { + color: #0b2d48; + margin-bottom: 10px; +} + +p { + font-size: 1rem; + color: #555; + margin: 5px 0; +} + +code { + background-color: #e8e8e8; + padding: 3px 8px; + border-radius: 4px; + font-family: "Courier New", Courier, monospace; + font-size: 0.9rem; +} + +#node-grid-container { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(150px, 1fr)); /* Responsive grid */ + gap: 15px; /* Space between grid items */ + width: 90%; /* Adjust width */ + max-width: 1200px; /* Max width for grid */ + padding: 20px; + background-color: white; + border-radius: 8px; + box-shadow: 0 4px 12px rgba(0,0,0,0.1); +} + +.loading-message { + grid-column: 1 / -1; /* Span across all columns */ + text-align: center; + font-style: italic; + color: #888; +} + +.node-cell { + border: 1px solid #ddd; + border-radius: 6px; + padding: 15px; + text-align: center; + font-size: 0.9rem; + box-shadow: 0 2px 5px rgba(0,0,0,0.05); + transition: background-color 0.3s ease, border-color 0.3s ease, transform 0.1s ease; + cursor: pointer; + position: relative; /* For tooltip positioning */ + overflow: hidden; /* Hide overflow for truncated UUID */ +} + +.node-cell:hover { + transform: translateY(-2px); + box-shadow: 0 4px 8px rgba(0,0,0,0.1); +} + +.node-uuid { + font-weight: bold; + margin-bottom: 5px; + color: #333; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; /* Truncate long UUIDs */ +} + +.node-status-text { + font-size: 0.8rem; + color: #666; +} + +/* Health Status Colors */ +.node-healthy { + background-color: #e6ffe6; /* Light green */ + border-color: #4CAF50; /* Green */ +} + +.node-warning { + background-color: #fffacd; /* Light yellow */ + border-color: #FFC107; /* Orange */ +} + +.node-critical { + background-color: #ffe6e6; /* Light red */ + border-color: #F44336; /* Red */ +} + +.node-unknown { + 
background-color: #f0f0f0; /* Light gray */
+    border-color: #9E9E9E; /* Gray */
+}
+
+/* Tooltip styles */
+.node-tooltip {
+    visibility: hidden;
+    opacity: 0;
+    width: 200px;
+    background-color: #333;
+    color: #fff;
+    text-align: left;
+    border-radius: 6px;
+    padding: 10px;
+    position: absolute;
+    z-index: 1;
+    bottom: 100%; /* Position above the node cell */
+    left: 50%;
+    margin-left: -100px; /* Center the tooltip */
+    transition: opacity 0.3s;
+    font-size: 0.8rem;
+    white-space: normal; /* Allow text to wrap */
+    box-shadow: 0 2px 10px rgba(0,0,0,0.2);
+}
+
+.node-tooltip::after {
+    content: " ";
+    position: absolute;
+    top: 100%; /* At the bottom of the tooltip */
+    left: 50%;
+    margin-left: -5px;
+    border-width: 5px;
+    border-style: solid;
+    border-color: #333 transparent transparent transparent;
+}
+
+.node-cell:hover .node-tooltip {
+    visibility: visible;
+    opacity: 1;
+}
+
+.node-tooltip p {
+    margin: 2px 0;
+    color: #eee;
+}
\ No newline at end of file
diff --git a/app/web/templates/index.html b/app/web/templates/index.html
index e69de29..1c96a5f 100644
--- a/app/web/templates/index.html
+++ b/app/web/templates/index.html
@@ -0,0 +1,23 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Node Monitor</title>
+    <link rel="stylesheet" href="/static/style.css">
+</head>
+<body>
+    <div class="header-container">
+        <h1>Distributed Node Monitoring System</h1>
+        <p>Service ID: <code>{{ service_uuid }}</code></p>
+        <p>Total Nodes: <span id="node-count">0</span></p>
+    </div>
+
+    <div id="node-grid-container">
+        <div class="loading-message">Loading node data...</div>
+    </div>
+
+
+    <script src="/static/script.js"></script>
+</body>
+</html>
\ No newline at end of file
diff --git a/client.py b/client.py
new file mode 100644
index 0000000..6e40753
--- /dev/null
+++ b/client.py
@@ -0,0 +1,163 @@
+import os
+import uuid
+import time
+import requests
+import random
+import json
+import logging
+from datetime import datetime, timezone
+
+# --- Client Configuration ---
+# The UUID of THIS client node. Generated on startup.
+# Can be overridden by an environment variable for persistent client identity.
+NODE_UUID = os.environ.get("NODE_UUID", str(uuid.uuid4()))
+
+# The UUID of the target monitoring service (the main.py server).
+# IMPORTANT: This MUST match the SERVICE_UUID of your running FastAPI server.
+# You can get this from the server's initial console output or by accessing its root endpoint ('/').
+# Replace the placeholder string below with your actual server's SERVICE_UUID.
+# For example: TARGET_SERVICE_UUID = "a1b2c3d4-e5f6-7890-1234-567890abcdef"
+TARGET_SERVICE_UUID = os.environ.get(
+    "TARGET_SERVICE_UUID", "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID"
+)
+
+# The base URL of the FastAPI monitoring service
+SERVER_BASE_URL = os.environ.get("SERVER_URL", "http://localhost:8000")
+
+# How often to send status updates (in seconds)
+UPDATE_INTERVAL_SECONDS = int(os.environ.get("UPDATE_INTERVAL_SECONDS", 5))
+
+# --- Logging Configuration ---
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger("NodeClient")
+
+# --- Global state for simulation ---
+uptime_seconds = 0
+# Dictionary to store UUIDs of other nodes received from the server
+# Format: { "node_uuid_str": { "last_seen": "iso_timestamp", "ip": "..." } }
+known_peers = {}
+
+# --- Data Generation Functions ---
+
+def generate_node_status_data():
+    """Generates simulated node status metrics."""
+    global uptime_seconds
+    uptime_seconds += UPDATE_INTERVAL_SECONDS + random.randint(0, 2)  # Simulate slight variation
+
+    # Simulate load average (3 values: 1-min, 5-min, 15-min)
+    # Load averages will fluctuate.
+    load_avg = [
+        round(random.uniform(0.1, 2.0), 2),
+        round(random.uniform(0.1, 1.8), 2),
+        round(random.uniform(0.1, 1.5), 2)
+    ]
+
+    # Simulate memory usage percentage
+    memory_usage_percent = round(random.uniform(30.0, 90.0), 2)
+
+    return {
+        "uptime_seconds": uptime_seconds,
+        "load_avg": load_avg,
+        "memory_usage_percent": memory_usage_percent
+    }
+
+def generate_ping_data():
+    """Generates simulated ping latencies to known peers."""
+    pings = {}
+
+    # Simulate ping to self (loopback) - always very low latency
+    pings[str(NODE_UUID)] = round(random.uniform(0.1, 1.0), 2)
+
+    # Simulate pings to other known peers
+    for peer_uuid in known_peers.keys():
+        if peer_uuid != str(NODE_UUID):  # Don't ping self twice
+            # Varying latency for external peers
+            pings[peer_uuid] = round(random.uniform(10.0, 200.0), 2)
+    return pings
+
+# --- Main Client Logic ---
+
+def run_client():
+    global known_peers
+    logger.info(f"Starting Node Client {NODE_UUID}")
+    logger.info(f"Target Service UUID: {TARGET_SERVICE_UUID}")
+    logger.info(f"Server URL: {SERVER_BASE_URL}")
+    logger.info(f"Update Interval: {UPDATE_INTERVAL_SECONDS} seconds")
+
+    if TARGET_SERVICE_UUID == "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID":
+        logger.error("-" * 50)
+        logger.error("ERROR: TARGET_SERVICE_UUID is not set correctly!")
+        logger.error("Please replace 'REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID' in client.py")
+        logger.error("or set the environment variable TARGET_SERVICE_UUID.")
+        logger.error("You can find the server's UUID by running main.py and checking its console output")
+        logger.error("or by visiting 'http://localhost:8000/' in your browser.")
+        logger.error("-" * 50)
+        return
+
+    while True:
+        try:
+            # 1. Generate status data
+            status_data = generate_node_status_data()
+            ping_data = generate_ping_data()
+
+            # 2. Construct the payload matching the StatusUpdate model
+            # Use datetime.now(timezone.utc) for a timezone-aware UTC timestamp
+            payload = {
+                "node": str(NODE_UUID),
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+                "status": status_data,
+                "pings": ping_data
+            }
+
+            # 3. Define the endpoint URL
+            endpoint_url = f"{SERVER_BASE_URL}/{TARGET_SERVICE_UUID}/{NODE_UUID}/"
+
+            # 4. Send the PUT request
+            logger.info(f"Sending update to {endpoint_url}. Uptime: {status_data['uptime_seconds']}s, Load: {status_data['load_avg']}, Pings: {len(ping_data)}")
+
+            response = requests.put(endpoint_url, json=payload, timeout=10)  # 10-second timeout
+
+            # 5. Process the response
+            if response.status_code == 200:
+                response_data = response.json()
+                logger.info(f"Successfully sent update. Server message: '{response_data.get('message')}'")
+
+                if "peers" in response_data and isinstance(response_data["peers"], dict):
+                    # Replace our peer map with the server's latest view (keys are UUID strings)
+                    new_peers = dict(response_data["peers"])
+
+                    # Log if new peers are discovered
+                    newly_discovered = set(new_peers.keys()) - set(known_peers.keys())
+                    if newly_discovered:
+                        logger.info(f"Discovered new peer(s): {', '.join(newly_discovered)}")
+
+                    known_peers = new_peers
+                    logger.info(f"Total known peers (including self if returned by server): {len(known_peers)}")
+                else:
+                    logger.warning("Server response did not contain a valid 'peers' field or it was empty.")
+            else:
+                logger.error(f"Failed to send update. Status code: {response.status_code}, Response: {response.text}")
+                if response.status_code == 404:
+                    logger.error("Hint: The TARGET_SERVICE_UUID might be incorrect, or the server isn't running at this endpoint.")
+                elif response.status_code == 422:  # Pydantic validation error
+                    logger.error(f"Server validation error (422 Unprocessable Entity): {response.json()}")
+
+        except requests.exceptions.Timeout:
+            logger.error("Request timed out after 10 seconds. Is the server running and responsive?")
+        except requests.exceptions.ConnectionError as e:
+            logger.error(f"Connection error: {e}. Is the server running at {SERVER_BASE_URL}?")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"An unexpected request error occurred: {e}", exc_info=True)
+        except json.JSONDecodeError:
+            logger.error(f"Failed to decode JSON response: {response.text}. Is the server returning valid JSON?")
+        except Exception as e:
+            logger.error(f"An unexpected error occurred in the client loop: {e}", exc_info=True)
+
+        # 6. Wait for the next update
+        time.sleep(UPDATE_INTERVAL_SECONDS)
+
+if __name__ == "__main__":
+    run_client()
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..cc9f478
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,19 @@
+version: '3.8'
+
+services:
+  node-monitor:
+    build: .
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./data:/data
+    environment:
+      - DATA_DIR=/data
+      - SERVICE_UUID=${SERVICE_UUID:-auto-generated}
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 5s
diff --git a/requirements.txt b/requirements.txt
index 5e62eb6..1a89758 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 fastapi==0.104.1
 uvicorn[standard]==0.24.0
-python-rrdtool==1.4.7
+rrdtool==0.1.16
 jinja2==3.1.2
 python-multipart==0.0.6
 python-json-logger==2.0.7
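
A few trailing notes with sanity checks against this patch (illustrative sketches, not part of the commit; any name marked as a placeholder must be substituted).

The retention policy in app/database.py claims 24 hours of 1-minute averages, 31 days of hourly averages, and 6 months of daily averages. With the 60-second step, the RRA row counts work out to exactly those windows:

    # Coverage of an RRA = step * steps_per_row * rows (values copied from app/database.py)
    step = 60  # seconds per primary data point

    for name, steps_per_row, rows in [
        ("1-min AVERAGE", 1, 1440),
        ("1-hour AVERAGE", 60, 744),
        ("1-day AVERAGE", 1440, 180),
    ]:
        days = step * steps_per_row * rows / 86400
        print(f"{name}: {days:g} days")  # -> 1, 31, 180 days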
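The PUT contract in app/main.py can be smoke-tested without running the full simulated client. A minimal sketch, assuming the server is up on localhost:8000; SERVICE_UUID is a placeholder for the value the server prints at startup:

    import uuid
    from datetime import datetime, timezone

    import requests

    SERVICE_UUID = "REPLACE_WITH_SERVER_SERVICE_UUID"  # placeholder
    node_uuid = str(uuid.uuid4())

    payload = {
        "node": node_uuid,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": {
            "uptime_seconds": 120,
            "load_avg": [0.2, 0.3, 0.4],  # exactly three values, per NodeStatusModel
            "memory_usage_percent": 42.0,
        },
        "pings": {node_uuid: 0.5},  # UUID-keyed latency map in ms
    }

    r = requests.put(f"http://localhost:8000/{SERVICE_UUID}/{node_uuid}/", json=payload, timeout=10)
    print(r.status_code, r.json().get("peers"))

A 200 response carries the peer list used for auto-discovery; a 422 means the payload no longer matches the StatusUpdate model.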
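Once a node has reported, its RRD files under /data can be read back with the same rrdtool binding pinned in requirements.txt, mirroring what RRDDatabase.get_system_data() does internally; the node UUID below is a placeholder:

    import rrdtool

    node_uuid = "REPLACE_WITH_A_REPORTING_NODE_UUID"  # placeholder
    (start, end, step), ds_names, rows = rrdtool.fetch(
        f"/data/{node_uuid}/system.rrd",
        "AVERAGE",
        "--start", "-1h",
        "--end", "now",
    )
    print(ds_names)   # ('uptime', 'load1', 'load5', 'load15', 'memory')
    print(rows[-5:])  # newest consolidated points; None means no data yet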