starting to be usable.
Dockerfile (88 lines changed)
@@ -1,50 +1,78 @@
-# Use Debian-based Python 3.13 slim image
-FROM python:3.13-slim-bookworm
+# Stage 1: Builder
+# This stage installs build dependencies and builds Python packages into wheels.
+FROM python:3.13-slim-bookworm AS builder
+
+# Install build dependencies for rrdtool and Python packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    librrd-dev \
+    build-essential \
+    python3-dev \
+    && rm -rf /var/lib/apt/lists/*
 
 # Set working directory
 WORKDIR /app
 
-# Install system dependencies for rrdtool
-RUN apt-get update && apt-get install -y \
-    rrdtool \
-    librrd-dev \
-    build-essential \
-    python3-dev \
-    wget \
-    && rm -rf /var/lib/apt/lists/*
-
-# Copy requirements first for better Docker layer caching
+# Copy requirements file
 COPY requirements.txt .
 
-# Install Python dependencies
+# Install Python dependencies into a wheelhouse
+# This builds source distributions (like rrdtool) into wheels
+# We don't need a venv here as we're just creating wheels, not installing them
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r requirements.txt
+    pip wheel --no-cache-dir --wheel-dir /tmp/wheels -r requirements.txt
 
-# Remove build dependencies to reduce image size
-RUN apt-get purge -y build-essential python3-dev && \
-    apt-get autoremove -y && \
-    apt-get clean
+# Stage 2: Runtime
+# This stage takes the minimal base image and copies only the necessary runtime artifacts.
+FROM python:3.13-slim-bookworm
+
+# Install runtime system dependencies for rrdtool and wget for healthcheck
+# rrdtool and librrd8 are the runtime libraries for rrdtool (not librrd-dev)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    rrdtool \
+    librrd8 \
+    wget \
+    # Final cleanup to reduce image size
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+# Set working directory
+WORKDIR /app
+
+# Create a non-root user for security (before creating venv in their home if desired, or in /opt)
+RUN useradd --create-home --shell /bin/bash appuser
+
+# Create a virtual environment for the application
+# We'll put it in /opt/venv for consistency, and ensure appuser can access it
+RUN python3 -m venv /opt/venv && \
+    /opt/venv/bin/pip install --no-cache-dir --upgrade pip
+
+# Copy the built Python wheels from the builder stage
+COPY --from=builder /tmp/wheels /tmp/wheels/
+
+# Install Python dependencies from the wheels into the virtual environment
+RUN /opt/venv/bin/pip install --no-cache-dir /tmp/wheels/*.whl && \
+    rm -rf /tmp/wheels  # Remove the wheels after installation to save space
 
 # Copy application code
 COPY app/ ./app/
 
-# Create directory for RRD data at /data (will be volume mounted)
-RUN mkdir -p /data
+# Set permissions for the appuser and data directory
+RUN chown -R appuser:appuser /app && \
+    chown -R appuser:appuser /opt/venv && \
+    mkdir -p /data && \
+    chown -R appuser:appuser /data && \
+    chmod 777 /data  # Ensure volume mount has write permissions
+
+# Switch to the non-root user
+USER appuser
 
 # Expose port
 EXPOSE 8000
 
-# Create non-root user for security
-RUN useradd --create-home --shell /bin/bash appuser && \
-    chown -R appuser:appuser /app && \
-    chown -R appuser:appuser /data && \
-    chmod 777 /data
-USER appuser
-
 # Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1
 
-# Run the application
-CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+# Run the application using the virtual environment's python interpreter
+CMD ["/opt/venv/bin/python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
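A note on verifying the wheel-based install: the runtime stage ships no compilers, so every import must resolve from the wheelhouse packages installed into /opt/venv. A minimal smoke test, as a sketch to run with /opt/venv/bin/python inside the container (it assumes the rrdtool Python bindings expose lib_version(), as the python-rrdtool package does):

    import sys

    print(sys.executable)  # expected: /opt/venv/bin/python when started via the CMD above

    import rrdtool  # must import against the runtime librrd8, with no build tools present

    print(rrdtool.lib_version())  # fails loudly if the shared library is missing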
app/main.py (254 lines changed)
@@ -12,7 +12,7 @@ from typing import Dict, List, Annotated
 import uuid as uuid_lib
 
 from collections import deque
-from dateutil.parser import isoparse
+from dateutil.parser import isoparse  # Import isoparse for robust date parsing
 
 from pythonjsonlogger import jsonlogger
 import sys
@@ -27,19 +27,26 @@ database = RRDDatabase()
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
 
 class BufferHandler(logging.Handler):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.formatter = jsonlogger.JsonFormatter()
+        # Use the same formatter string as the StreamHandler for consistency
+        self.formatter = jsonlogger.JsonFormatter(
+            "%(asctime)s %(name)s %(levelname)s %(message)s"
+        )
 
     def emit(self, record):
         try:
-            # Format the record as a JSON string and then parse it back to a dict
-            # This ensures consistency with the jsonlogger's output format
-            log_entry = json.loads(self.formatter.format(record))
+            log_entry_str = self.formatter.format(record)
+            log_entry = json.loads(log_entry_str)
             log_buffer.add_log(log_entry)
         except Exception as e:
-            print(f"Error in BufferHandler: Could not process log record: {e}", file=sys.stderr)
+            print(
+                f"Error in BufferHandler: Could not process log record: {e}",
+                file=sys.stderr,
+            )
 
 
 class LogBuffer:
     def __init__(self, maxlen=1000):
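The emit() change means the buffered dict is produced by the exact same formatter as the console output, so keys like asctime and levelname are guaranteed to be present. A minimal sketch of that round-trip (assumes python-json-logger, as imported above):

    import json
    import logging

    from pythonjsonlogger import jsonlogger

    formatter = jsonlogger.JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s")
    record = logging.LogRecord("demo", logging.INFO, __file__, 1, "hello", None, None)
    entry = json.loads(formatter.format(record))  # same round-trip as BufferHandler.emit
    print(entry["levelname"], entry["message"])   # -> INFO hello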
@@ -47,14 +54,35 @@ class LogBuffer:
 
     def add_log(self, record):
         # Assuming 'record' here is already a dictionary parsed from the JSON log string
-        timestamp = record.get('asctime') or datetime.utcnow().isoformat()
-        self.buffer.append({
-            'timestamp': timestamp,
-            'level': record.get('levelname'),
-            'message': record.get('message'),
-            'extra': {k: v for k, v in record.items()
-                      if k not in ['asctime', 'levelname', 'message', 'name', 'lineno', 'filename', 'pathname', 'funcName', 'process', 'processName', 'thread', 'threadName']}
-        })
+        timestamp = record.get("asctime") or datetime.utcnow().isoformat()
+        self.buffer.append(
+            {
+                "timestamp": timestamp,
+                "level": record.get(
+                    "levelname"
+                ),  # This should now correctly get 'levelname'
+                "message": record.get("message"),
+                "extra": {
+                    k: v
+                    for k, v in record.items()
+                    if k
+                    not in [
+                        "asctime",
+                        "levelname",
+                        "message",
+                        "name",
+                        "lineno",
+                        "filename",
+                        "pathname",
+                        "funcName",
+                        "process",
+                        "processName",
+                        "thread",
+                        "threadName",
+                    ]
+                },
+            }
+        )
 
     def get_logs(self, limit=100, level=None, since=None):
         logger.debug(f"Fetching logs with limit={limit}, level={level}, since={since}")
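What survives into the "extra" field is everything the formatter emitted minus the standard LogRecord attributes. A small sketch with a hypothetical record dict (the skip-list matches the one in add_log above):

    record = {
        "asctime": "2025-06-11 13:32:00,000",
        "levelname": "INFO",
        "message": "Web root accessed",
        "client_ip": "127.0.0.1",  # custom field, e.g. passed via logger.info(..., extra=...)
    }
    skip = {"asctime", "levelname", "message", "name", "lineno", "filename",
            "pathname", "funcName", "process", "processName", "thread", "threadName"}
    extra = {k: v for k, v in record.items() if k not in skip}
    print(extra)  # -> {'client_ip': '127.0.0.1'}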
@@ -62,9 +90,9 @@ class LogBuffer:
         # Apply level filter
         if level and level.strip():
             level = level.upper()
-            valid_levels = {'INFO', 'WARNING', 'ERROR', 'DEBUG'}
+            valid_levels = {"INFO", "WARNING", "ERROR", "DEBUG"}
             if level in valid_levels:
-                logs = [log for log in logs if log['level'].upper() == level]
+                logs = [log for log in logs if log["level"].upper() == level]
             else:
                 logger.warning(f"Invalid log level: {level}")
         # Apply since filter
@@ -72,7 +100,7 @@ class LogBuffer:
             try:
                 # Use isoparse for robust parsing of ISO 8601 strings
                 since_dt = isoparse(since)
 
                 # If the parsed datetime is naive (no timezone info), assume it's UTC
                 if since_dt.tzinfo is None:
                     since_dt = since_dt.replace(tzinfo=timezone.utc)
@@ -80,19 +108,24 @@ class LogBuffer:
                 # If it has timezone info, convert it to UTC for consistent comparison
                 since_dt = since_dt.astimezone(timezone.utc)
 
-                logs = [log for log in logs if
-                        datetime.fromisoformat(log['timestamp'].replace('Z', '+00:00')).astimezone(timezone.utc) >= since_dt]
+                logs = [
+                    log
+                    for log in logs
+                    if datetime.fromisoformat(
+                        log["timestamp"].replace("Z", "+00:00")
+                    ).astimezone(timezone.utc)
+                    >= since_dt
+                ]
             except ValueError:
                 logger.warning(f"Invalid 'since' timestamp: {since}")
         logger.debug(f"Returning {len(logs[-limit:])} logs")
         return logs[-limit:]
 
 
 log_buffer = LogBuffer()
 
 logHandler = logging.StreamHandler()
-formatter = jsonlogger.JsonFormatter(
-    '%(asctime)s %(name)s %(levelname)s %(message)s'
-)
+formatter = jsonlogger.JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s")
 logHandler.setFormatter(formatter)
 
 if not logger.handlers:
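The naive-versus-aware handling above is the core of the since filter: a timestamp without an offset is assumed to be UTC, while an aware one is normalized to UTC before comparison. A sketch of just that step (assumes python-dateutil, already imported in this file):

    from datetime import timezone

    from dateutil.parser import isoparse

    since_dt = isoparse("2025-06-11T13:32:00")  # naive: tzinfo is None
    if since_dt.tzinfo is None:
        since_dt = since_dt.replace(tzinfo=timezone.utc)  # assume UTC
    else:
        since_dt = since_dt.astimezone(timezone.utc)  # normalize to UTC
    print(since_dt.isoformat())  # -> 2025-06-11T13:32:00+00:00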
@@ -109,20 +142,23 @@ logging.getLogger("uvicorn.error").propagate = True
 # --- FastAPI Application ---
 app = FastAPI(
     title="Node Monitoring System",
-    description=f"A distributed monitoring system. Service UUID: {SERVICE_UUID}"
+    description=f"A distributed monitoring system. Service UUID: {SERVICE_UUID}",
 )
 
 templates = Jinja2Templates(directory="app/web/templates")
 app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
 
 
 # --- Data Models ---
 class NodeStatusModel(BaseModel):
     uptime_seconds: int
     load_avg: Annotated[List[float], Field(min_length=3, max_length=3)]
     memory_usage_percent: float
 
 
 class PingModel(BaseModel):
-    pings: Dict[Annotated[str, Field(pattern=r'^[0-9a-fA-F-]{36}$')], float]
+    pings: Dict[Annotated[str, Field(pattern=r"^[0-9a-fA-F-]{36}$")], float]
 
 
 class StatusUpdate(BaseModel):
     node: str = Field(..., description="Node UUID")
@@ -130,23 +166,24 @@ class StatusUpdate(BaseModel):
     status: NodeStatusModel
     pings: Dict[str, float]
 
-    @validator('node')
+    @validator("node")
     def validate_node_uuid(cls, v):
         try:
             uuid_lib.UUID(v)
             return v
         except ValueError:
-            raise ValueError('Invalid UUID format')
+            raise ValueError("Invalid UUID format")
 
-    @validator('pings')
+    @validator("pings")
     def validate_ping_uuids(cls, v):
         for key in v.keys():
             try:
                 uuid_lib.UUID(key)
             except ValueError:
-                raise ValueError(f'Invalid UUID format in pings: {key}')
+                raise ValueError(f"Invalid UUID format in pings: {key}")
         return v
 
 
 # --- Node Management and Health Logic ---
 known_nodes_db: Dict[str, Dict] = {}
 
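Both validators lean on uuid.UUID as the real parser; the Field pattern on PingModel is only a cheap first-pass shape check. A sketch of what gets accepted or rejected:

    import uuid as uuid_lib

    uuid_lib.UUID("ab73d00a-8169-46bb-997d-f13e5f760973")  # parses: a well-formed UUID
    try:
        uuid_lib.UUID("not-a-uuid")
    except ValueError:
        print("rejected, as in validate_node_uuid")  # the validators re-raise with their own message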
@@ -155,13 +192,20 @@ LOAD_AVG_CRITICAL_THRESHOLD = 3.0
 MEMORY_WARNING_THRESHOLD = 75.0
 MEMORY_CRITICAL_THRESHOLD = 90.0
 LAST_SEEN_CRITICAL_THRESHOLD_SECONDS = 30
-NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS = 300  # Remove node from UI after 5 minutes of inactivity
+NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS = (
+    300  # Remove node from UI after 5 minutes of inactivity
+)
 
 
 def get_node_health(node_data: Dict) -> str:
     last_seen_str = node_data.get("last_seen")
     if last_seen_str:
-        last_seen_dt = datetime.fromisoformat(last_seen_str).replace(tzinfo=timezone.utc)
-        time_since_last_seen = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
+        last_seen_dt = datetime.fromisoformat(last_seen_str).replace(
+            tzinfo=timezone.utc
+        )
+        time_since_last_seen = (
+            datetime.now(timezone.utc) - last_seen_dt
+        ).total_seconds()
         if time_since_last_seen > LAST_SEEN_CRITICAL_THRESHOLD_SECONDS:
             return "critical"
         else:
@@ -174,7 +218,10 @@ def get_node_health(node_data: Dict) -> str:
     try:
         status = NodeStatusModel(**status_model_data)
     except Exception:
-        logger.error(f"Could not parse status data for node {node_data.get('uuid')}", exc_info=True)
+        logger.error(
+            f"Could not parse status data for node {node_data.get('uuid')}",
+            exc_info=True,
+        )
         return "unknown"
 
     load_1min = status.load_avg[0]
@@ -190,6 +237,7 @@ def get_node_health(node_data: Dict) -> str:
 
     return "healthy"
 
+
 # --- API Endpoints ---
 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
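The staleness check reformatted above is the first gate in get_node_health: a node whose report is older than LAST_SEEN_CRITICAL_THRESHOLD_SECONDS is critical regardless of its metrics. A condensed sketch of just that computation:

    from datetime import datetime, timezone

    last_seen_dt = datetime.fromisoformat("2025-06-11T13:32:00").replace(tzinfo=timezone.utc)
    age_seconds = (datetime.now(timezone.utc) - last_seen_dt).total_seconds()
    print("critical" if age_seconds > 30 else "recent")  # 30 = LAST_SEEN_CRITICAL_THRESHOLD_SECONDS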
@@ -197,62 +245,61 @@ async def read_root(request: Request):
     client_ip = request.headers.get("x-forwarded-for", request.client.host)
     logger.info(
         "Web root accessed",
-        extra={'client_ip': client_ip, 'service_uuid': SERVICE_UUID}
+        extra={"client_ip": client_ip, "service_uuid": SERVICE_UUID},
     )
     return templates.TemplateResponse(
         "index.html",
         {
             "request": request,
             "service_uuid": SERVICE_UUID,
             "url_for": request.url_for,  # Pass url_for for dynamic URL generation
-            "root_path": request.scope.get('root_path', '')  # Pass root_path for JS base URL
-        }
+            "root_path": request.scope.get(
+                "root_path", ""
+            ),  # Pass root_path for JS base URL
+        },
     )
 
 
 @app.get("/{service_uuid}/logs")
 async def get_logs(
     request: Request,
     service_uuid: str,
     limit: int = 100,
-    format: str = Query(None, description="Response format: 'json' for JSON, default is HTML"),
+    format: str = Query(
+        None, description="Response format: 'json' for JSON, default is HTML"
+    ),
     level: str = Query(None, description="Filter logs by level: INFO, WARNING, ERROR"),
-    since: str = Query(None, description="Fetch logs since ISO timestamp, e.g., 2025-06-11T13:32:00")
+    since: str = Query(
+        None, description="Fetch logs since ISO timestamp, e.g., 2025-06-11T13:32:00"
+    ),
 ):
     # Use X-Forwarded-For if available, otherwise client.host
     client_ip = request.headers.get("x-forwarded-for", request.client.host)
     logger.info(
         "Logs endpoint accessed",
         extra={
-            'service_uuid': service_uuid,
-            'format': format,
-            'level': level,
-            'since': since,
-            'limit': limit,
-            'client_ip': client_ip
-        }
+            "service_uuid": service_uuid,
+            "format": format,
+            "level": level,
+            "since": since,
+            "limit": limit,
+            "client_ip": client_ip,
+        },
     )
 
     if service_uuid != SERVICE_UUID:
         logger.warning(f"Invalid service UUID: {service_uuid}")
         return JSONResponse(
-            status_code=404,
-            content={"error": "Service UUID not found"}
+            status_code=404, content={"error": "Service UUID not found"}
         )
 
     try:
         logs = log_buffer.get_logs(limit=limit, level=level, since=since)
-        log_data = {
-            "service_uuid": service_uuid,
-            "log_count": len(logs),
-            "logs": logs
-        }
+        log_data = {"service_uuid": service_uuid, "log_count": len(logs), "logs": logs}
         logger.debug(f"Fetched {len(logs)} logs for response")
     except Exception as e:
         logger.error(f"Error fetching logs: {e}", exc_info=True)
-        return JSONResponse(
-            status_code=500,
-            content={"error": "Failed to fetch logs"}
-        )
+        return JSONResponse(status_code=500, content={"error": "Failed to fetch logs"})
 
     if format == "json":
         logger.debug("Returning JSON response")
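For reference, the endpoint above can be exercised like this (a sketch; it assumes the requests package and a server running on localhost, with the fixed SERVICE_UUID from the compose file at the end of this commit):

    import requests

    resp = requests.get(
        "http://localhost:8000/ab73d00a-8169-46bb-997d-f13e5f760973/logs",
        params={"format": "json", "level": "INFO", "limit": 10},
    )
    print(resp.status_code, resp.json()["log_count"])  # the JSON branch returns log_data as built above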
@@ -267,41 +314,40 @@ async def get_logs(
                 "service_uuid": service_uuid,
                 "logs": logs,
                 "log_count": len(logs),
                 "url_for": request.url_for,  # Pass url_for for dynamic URL generation
-                "root_path": request.scope.get('root_path', '')  # Pass root_path for JS base URL
-            }
+                "root_path": request.scope.get(
+                    "root_path", ""
+                ),  # Pass root_path for JS base URL
+            },
         )
     except Exception as e:
         logger.error(f"Error rendering logs.html: {e}", exc_info=True)
         return JSONResponse(
-            status_code=500,
-            content={"error": "Failed to render logs page"}
+            status_code=500, content={"error": "Failed to render logs page"}
         )
 
 
 @app.put("/{service_uuid}/{node_uuid}/")
 async def update_node_status(
-    service_uuid: str,
-    node_uuid: str,
-    status_update: StatusUpdate,
-    request: Request
+    service_uuid: str, node_uuid: str, status_update: StatusUpdate, request: Request
 ):
     # Use X-Forwarded-For if available, otherwise client.host
     client_ip = request.headers.get("x-forwarded-for", request.client.host)
     logger.info(
         "Received node status update",
         extra={
-            'event_type': 'node_status_update',
-            'client_ip': client_ip,
-            'service_uuid': service_uuid,
-            'node_uuid': node_uuid,
-            'data': status_update.dict()
-        }
+            "event_type": "node_status_update",
+            "client_ip": client_ip,
+            "service_uuid": service_uuid,
+            "node_uuid": node_uuid,
+            "data": status_update.dict(),
+        },
     )
 
     if service_uuid != SERVICE_UUID:
         logger.warning(
             "Node sent status to wrong service UUID",
-            extra={'client_node_uuid': node_uuid, 'target_uuid': service_uuid}
+            extra={"client_node_uuid": node_uuid, "target_uuid": service_uuid},
         )
         return {"error": "Service UUID mismatch", "peers": []}
 
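The reflowed signature does not change the contract; a client still PUTs a StatusUpdate body like the following (a sketch; the two UUIDs are hypothetical placeholders):

    import json

    payload = {
        "node": "6f9619ff-8b86-d011-b42d-00c04fc964ff",  # must be a valid UUID per validate_node_uuid
        "timestamp": "2025-06-11T13:32:00+00:00",
        "status": {
            "uptime_seconds": 3600,
            "load_avg": [0.10, 0.20, 0.30],  # exactly three values per NodeStatusModel
            "memory_usage_percent": 42.0,
        },
        "pings": {"7a9619ff-8b86-d011-b42d-00c04fc964aa": 12.5},  # target UUID -> latency ms
    }
    print(json.dumps(payload, indent=2))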
@@ -311,7 +357,7 @@ async def update_node_status(
             timestamp=status_update.timestamp,
             uptime_seconds=status_update.status.uptime_seconds,
             load_avg=status_update.status.load_avg,
-            memory_usage_percent=status_update.status.memory_usage_percent
+            memory_usage_percent=status_update.status.memory_usage_percent,
         )
 
         for target_uuid, latency in status_update.pings.items():
@@ -319,7 +365,7 @@ async def update_node_status(
                 node_uuid=node_uuid,
                 target_uuid=target_uuid,
                 timestamp=status_update.timestamp,
-                latency_ms=latency
+                latency_ms=latency,
             )
 
     except Exception as e:
@@ -329,31 +375,39 @@ async def update_node_status(
 
     known_nodes_db[node_uuid] = {
         "last_seen": current_time_utc.isoformat(),
         "ip": request.client.host,  # Keep original client.host here as it's the direct connection
         "status": status_update.status.dict(),
         "uptime_seconds": status_update.status.uptime_seconds,
         "load_avg": status_update.status.load_avg,
-        "memory_usage_percent": status_update.status.memory_usage_percent
+        "memory_usage_percent": status_update.status.memory_usage_percent,
     }
 
     health_status_for_log = get_node_health(known_nodes_db[node_uuid])
     logger.info(f"Node {node_uuid} updated. Health: {health_status_for_log}")
 
-    peer_list = {uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
-                 for uuid, data in known_nodes_db.items() if uuid != node_uuid}
+    peer_list = {
+        uuid: {"last_seen": data["last_seen"], "ip": data["ip"]}
+        for uuid, data in known_nodes_db.items()
+        if uuid != node_uuid
+    }
 
     return {"message": "Status received", "peers": peer_list}
 
 
 @app.get("/nodes/status")
 async def get_all_nodes_status():
     logger.info("Fetching all nodes status for UI.")
 
     # Prune inactive nodes from known_nodes_db before processing
     current_time_utc = datetime.now(timezone.utc)
     nodes_to_remove = []
     for node_uuid, data in known_nodes_db.items():
-        last_seen_dt = datetime.fromisoformat(data["last_seen"]).replace(tzinfo=timezone.utc)
-        if (current_time_utc - last_seen_dt).total_seconds() > NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS:
+        last_seen_dt = datetime.fromisoformat(data["last_seen"]).replace(
+            tzinfo=timezone.utc
+        )
+        if (
+            current_time_utc - last_seen_dt
+        ).total_seconds() > NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS:
             nodes_to_remove.append(node_uuid)
             logger.info(f"Removing inactive node {node_uuid} from known_nodes_db.")
 
@@ -365,31 +419,37 @@ async def get_all_nodes_status():
         current_health = get_node_health(data)
 
         connections = {}
         for target_uuid in known_nodes_db:  # Only iterate over currently active nodes
             if target_uuid != node_uuid:
-                ping_data = database.get_ping_data(node_uuid, target_uuid, start_time="-300s")
+                ping_data = database.get_ping_data(
+                    node_uuid, target_uuid, start_time="-300s"
+                )
                 latency_ms = None
-                if ping_data and ping_data['data']['latency']:
+                if ping_data and ping_data["data"]["latency"]:
                     # Get the most recent non-None latency
-                    for latency in reversed(ping_data['data']['latency']):
-                        if latency is not None and not (isinstance(latency, float) and latency == 0.0):  # Exclude 0.0 which might be a default
+                    for latency in reversed(ping_data["data"]["latency"]):
+                        if latency is not None and not (
+                            isinstance(latency, float) and latency == 0.0
+                        ):  # Exclude 0.0 which might be a default
                             latency_ms = float(latency)
                             break
                 connections[target_uuid] = latency_ms
 
-        response_nodes.append({
-            "uuid": node_uuid,
-            "last_seen": data["last_seen"],
-            "ip": data["ip"],
-            "health_status": current_health,
-            "uptime_seconds": data.get("uptime_seconds"),
-            "load_avg": data.get("load_avg"),
-            "memory_usage_percent": data.get("memory_usage_percent"),
-            "connections": connections
-        })
+        response_nodes.append(
+            {
+                "uuid": node_uuid,
+                "last_seen": data["last_seen"],
+                "ip": data["ip"],
+                "health_status": current_health,
+                "uptime_seconds": data.get("uptime_seconds"),
+                "load_avg": data.get("load_avg"),
+                "memory_usage_percent": data.get("memory_usage_percent"),
+                "connections": connections,
+            }
+        )
     return {"nodes": response_nodes}
 
 
 @app.get("/health")
 async def health_check():
     return {"status": "ok"}
-# --- END OF FILE main.py ---
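Each entry assembled above ends up shaped like this in the /nodes/status response (a sketch with hypothetical values):

    node_entry = {
        "uuid": "6f9619ff-8b86-d011-b42d-00c04fc964ff",
        "last_seen": "2025-06-11T13:32:00+00:00",
        "ip": "10.0.0.12",
        "health_status": "healthy",  # healthy / critical / unknown per get_node_health
        "uptime_seconds": 3600,
        "load_avg": [0.10, 0.20, 0.30],
        "memory_usage_percent": 42.0,
        "connections": {"7a9619ff-8b86-d011-b42d-00c04fc964aa": 12.5},  # peer UUID -> latency ms
    }
    print(node_entry)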
app/web/static/logs.js
@@ -24,9 +24,11 @@ document.addEventListener('DOMContentLoaded', () => {
         console.log('Fetch URL:', url);
         const response = await fetch(url);
         console.log('Response status:', response.status);
+        console.log('Response Content-Type:', response.headers.get('Content-Type')); // NEW: Log Content-Type
+
         if (!response.ok) {
             const errorText = await response.text(); // Try to get response body as text
-            console.error('Response text on error:', errorText); // Log it
+            console.error('Raw response text on error:', errorText.substring(0, 500) + (errorText.length > 500 ? '...' : '')); // Log first 500 chars
             // If the server returns a 404, it might be due to a stale UUID.
             // Log a more specific message.
             if (response.status === 404) {
@@ -39,7 +41,9 @@ document.addEventListener('DOMContentLoaded', () => {
             }
             return; // Stop further processing if error
         }
-        const data = await response.json();
+
+        // Attempt to parse JSON. This is where the error would occur if the content is HTML.
+        const data = await response.json();
         console.log('Received logs:', data.logs.length);
         renderLogTable(data.logs);
         logCountSpan.textContent = data.log_count;
@@ -51,7 +55,7 @@ document.addEventListener('DOMContentLoaded', () => {
 
     function renderLogTable(logs) {
         console.log('Rendering logs:', logs.length);
-        logTableContainer.innerHTML = '';
+        logTableContainer.innerHTML = ''; // Clear existing content before rendering
 
         if (logs.length === 0) {
             logTableContainer.innerHTML = '<p class="loading-message">No logs available.</p>';
@@ -86,7 +90,7 @@ document.addEventListener('DOMContentLoaded', () => {
             const row = document.createElement('tr');
             row.innerHTML = `
                 <td>${new Date(log.timestamp).toLocaleString()}</td>
-                <td class="log-level log-level-${log.level.toLowerCase()}">${log.level}</td>
+                <td class="log-level log-level-${(log.level || '').toLowerCase()}">${log.level || 'N/A'}</td>
                 <td>${escapeHtml(log.message)}</td>
                 <td>
                     ${log.extra ? `
@@ -158,6 +162,8 @@ document.addEventListener('DOMContentLoaded', () => {
     });
 
     console.log('Initializing logs page');
+    // Call fetchLogs immediately on page load to populate the table with fresh data
+    // and handle the initial refresh logic.
    fetchLogs();
     setInterval(fetchLogs, POLLING_INTERVAL_MS);
 });
@@ -19,8 +19,8 @@ document.addEventListener('DOMContentLoaded', () => {
     }
 
     function renderNodeGrid(nodes) {
-        nodeGridContainer.innerHTML = '';
-        nodeCountSpan.textContent = nodes.length;
+        nodeGridContainer.innerHTML = ''; // Clear existing content
+        nodeCountSpan.textContent = nodes.length; // Update total node count
 
         if (nodes.length === 0) {
             nodeGridContainer.innerHTML = '<p class="loading-message">No nodes reporting yet. Start a client!</p>';
@@ -132,4 +132,3 @@ document.addEventListener('DOMContentLoaded', () => {
     fetchNodeData();
     setInterval(fetchNodeData, POLLING_INTERVAL_MS);
 });
-
@@ -38,10 +38,10 @@ body {
     border-radius: 8px;
     box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
     margin-bottom: 20px;
-    width: 80vw;
-    max-width: 1200px;
-    margin-left: auto;
-    margin-right: auto;
+    width: 80vw; /* Keep this fixed width for the header */
+    max-width: 1200px; /* Keep this max-width for the header */
+    margin-left: auto; /* Center the header */
+    margin-right: auto; /* Center the header */
 }
 
 h1 {
@@ -65,17 +65,18 @@ code {
 }
 
 #node-grid-container, #log-table-container {
-    width: 95vw;
-    max-width: 1600px;
-    min-width: 400px;
+    /* Adjusted width/max-width to allow dynamic resizing and scrolling */
+    width: 95vw; /* Allow it to take up to 95% of viewport width */
+    max-width: 1800px; /* Increased max-width to accommodate more columns */
+    min-width: 400px; /* Keep a minimum width */
     padding: 20px;
     background-color: var(--nord3);
     border-radius: 8px;
     box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
-    margin-bottom: 20px;
-    margin-left: auto;
-    margin-right: auto;
-    overflow-x: auto;
+    margin-bottom: 20px; /* Spacing below the container */
+    margin-left: auto; /* Center the block */
+    margin-right: auto; /* Center the block */
+    overflow-x: auto; /* Enable horizontal scrolling if content overflows */
 }
 
 .connection-grid {
@@ -263,7 +264,7 @@ code {
     color: var(--nord11); /* Red */
 }
 
-.log-level-debug { /* Added for potential debug logs */
+.log-level-debug {
     color: var(--nord9); /* Blue */
 }
 
@@ -10,7 +10,7 @@
     <div class="header-container">
         <h1>Node Monitoring System</h1>
         <p>Total Nodes: <span id="node-count">0</span></p>
-        <p>Service UUID: <code>{{ service_uuid }}</code></p>
+        <p>Service UUID: <code>{{ service_uuid }}</code></p> <!-- ALWAYS DISPLAYED -->
     </div>
 
     <div id="node-grid-container">
@@ -24,6 +24,7 @@
     </div>
     </div>
     <div id="log-table-container" data-service-uuid="{{ service_uuid }}">
+        {# The initial logs are rendered by Jinja2 here #}
         {% if logs %}
         <table class="log-table">
             <thead>
@@ -63,4 +64,3 @@
     <script src="{{ url_for('static', path='/logs.js') }}"></script>
 </body>
 </html>
-
docker-compose.yml
@@ -2,18 +2,34 @@ version: '3.8'
 
 services:
   node-monitor:
-    build: .
+    image: node-monitor:latest
+    container_name: node-monitor-app
+
     ports:
       - "8000:8000"
+
+    # Mount the 'data' directory for RRD files.
+    # The left side './data' refers to a 'data' directory in the same location
+    # as this docker-compose.yml file.
+    # For Podman, if you encounter SELinux issues, you might need to append ':Z' or ':z'
+    # to the host path, e.g., './data:/data:Z'
     volumes:
-      - ./data:/data
+      - ../data:/data:Z
+
+    # Environment variables for the application
     environment:
-      - DATA_DIR=/data
-      - SERVICE_UUID=${SERVICE_UUID:-auto-generated}
+      # Set a fixed SERVICE_UUID here. Replace this with your desired UUID.
+      # This UUID will be used by the FastAPI app and passed to the frontend.
+      SERVICE_UUID: "ab73d00a-8169-46bb-997d-f13e5f760973"
+      DATA_DIR: "/data"  # Inform the application where its data volume is mounted
+
+    # Restart the container if it stops for any reason, unless explicitly stopped
     restart: unless-stopped
+
+    # Healthcheck to ensure the container is running and responsive
     healthcheck:
       test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/health"]
       interval: 30s
       timeout: 10s
-      retries: 3
       start_period: 5s
+      retries: 3
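On the application side, these settings only take effect if the app reads them from the environment; a sketch of the expected consumption (an assumption, since this commit does not show the config module):

    import os

    SERVICE_UUID = os.environ.get("SERVICE_UUID")   # "ab73d00a-8169-46bb-997d-f13e5f760973" under this compose file
    DATA_DIR = os.environ.get("DATA_DIR", "/data")  # where the RRD volume is mounted in the container
    print(SERVICE_UUID, DATA_DIR)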