Moving basic grid stuff to jinja2 from JS. Gemini fixed the logs display it seems.
This commit is contained in:
@ -75,4 +75,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1
|
||||
|
||||
# Run the application using the virtual environment's python interpreter
|
||||
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers", "--forwarded-allow-ips", "*"]
|
||||
|
@ -166,9 +166,9 @@ class RRDDatabase:
|
||||
current_time += step
|
||||
|
||||
return {
|
||||
'timestamps': timestamps,
|
||||
'data': data,
|
||||
'step': step
|
||||
'timestamps': timestamps,
|
||||
'data': data,
|
||||
'step': step
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@ -206,9 +206,9 @@ class RRDDatabase:
|
||||
current_time += step
|
||||
|
||||
return {
|
||||
'timestamps': timestamps,
|
||||
'data': data,
|
||||
'step': step
|
||||
'timestamps': timestamps,
|
||||
'data': data,
|
||||
'step': step
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
|
132
app/main.py
132
app/main.py
@ -8,7 +8,7 @@ from fastapi.responses import HTMLResponse, JSONResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel, Field, validator
|
||||
from typing import Dict, List, Annotated
|
||||
from typing import Dict, List, Annotated, Optional
|
||||
import uuid as uuid_lib
|
||||
|
||||
from collections import deque
|
||||
@ -32,14 +32,17 @@ class BufferHandler(logging.Handler):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# Use the same formatter string as the StreamHandler for consistency
|
||||
# Ensure asctime is formatted as ISO 8601 UTC with milliseconds and 'Z'
|
||||
self.formatter = jsonlogger.JsonFormatter(
|
||||
"%(asctime)s %(name)s %(levelname)s %(message)s"
|
||||
"%(asctime)s %(name)s %(levelname)s %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S.%fZ" # ISO 8601 format with milliseconds and Z for UTC
|
||||
)
|
||||
|
||||
def emit(self, record):
|
||||
try:
|
||||
log_entry_str = self.formatter.format(record)
|
||||
log_entry = json.loads(log_entry_str)
|
||||
# The 'asctime' field in log_entry is now guaranteed to be ISO 8601
|
||||
log_buffer.add_log(log_entry)
|
||||
except Exception as e:
|
||||
print(
|
||||
@ -53,14 +56,30 @@ class LogBuffer:
|
||||
self.buffer = deque(maxlen=maxlen)
|
||||
|
||||
def add_log(self, record):
|
||||
# Assuming 'record' here is already a dictionary parsed from the JSON log string
|
||||
timestamp = record.get("asctime") or datetime.utcnow().isoformat()
|
||||
# 'record' is a dictionary parsed from the JSON log string.
|
||||
# 'asctime' should now be in ISO 8601 format due to BufferHandler's formatter.
|
||||
timestamp_str = record.get("asctime")
|
||||
if timestamp_str:
|
||||
try:
|
||||
# Use isoparse for robust parsing, then convert to UTC and store as ISO 8601 with 'Z'
|
||||
dt_obj = isoparse(timestamp_str)
|
||||
if dt_obj.tzinfo is None:
|
||||
# Assume UTC if naive (common for logs without explicit timezone info)
|
||||
dt_obj = dt_obj.replace(tzinfo=timezone.utc)
|
||||
else:
|
||||
# Convert to UTC for consistent storage
|
||||
dt_obj = dt_obj.astimezone(timezone.utc)
|
||||
timestamp_to_store = dt_obj.isoformat(timespec='milliseconds').replace('+00:00', 'Z')
|
||||
except ValueError:
|
||||
logger.warning(f"Failed to parse log timestamp '{timestamp_str}' from formatter. Using current UTC time.")
|
||||
timestamp_to_store = datetime.utcnow().isoformat(timespec='milliseconds') + 'Z'
|
||||
else:
|
||||
timestamp_to_store = datetime.utcnow().isoformat(timespec='milliseconds') + 'Z'
|
||||
|
||||
self.buffer.append(
|
||||
{
|
||||
"timestamp": timestamp,
|
||||
"level": record.get(
|
||||
"levelname"
|
||||
), # This should now correctly get 'levelname'
|
||||
"timestamp": timestamp_to_store,
|
||||
"level": record.get("levelname"),
|
||||
"message": record.get("message"),
|
||||
"extra": {
|
||||
k: v
|
||||
@ -111,9 +130,8 @@ class LogBuffer:
|
||||
logs = [
|
||||
log
|
||||
for log in logs
|
||||
if datetime.fromisoformat(
|
||||
log["timestamp"].replace("Z", "+00:00")
|
||||
).astimezone(timezone.utc)
|
||||
# log["timestamp"] is now guaranteed to be ISO 8601 with 'Z'
|
||||
if isoparse(log["timestamp"]).astimezone(timezone.utc)
|
||||
>= since_dt
|
||||
]
|
||||
except ValueError:
|
||||
@ -125,7 +143,11 @@ class LogBuffer:
|
||||
log_buffer = LogBuffer()
|
||||
|
||||
logHandler = logging.StreamHandler()
|
||||
formatter = jsonlogger.JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s")
|
||||
# Ensure StreamHandler also formats asctime into ISO 8601 UTC
|
||||
formatter = jsonlogger.JsonFormatter(
|
||||
"%(asctime)s %(name)s %(levelname)s %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S.%fZ" # ISO 8601 format with milliseconds and Z for UTC
|
||||
)
|
||||
logHandler.setFormatter(formatter)
|
||||
|
||||
if not logger.handlers:
|
||||
@ -148,6 +170,10 @@ app = FastAPI(
|
||||
templates = Jinja2Templates(directory="app/web/templates")
|
||||
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
|
||||
|
||||
# To correctly handle HTTPS behind a reverse proxy, ensure your Uvicorn server
|
||||
# is run with --proxy-headers and --forwarded-allow-ips.
|
||||
# e.g., uvicorn main:app --host 0.0.0.0 --port 8000 --proxy-headers --forwarded-allow-ips '*'
|
||||
|
||||
|
||||
# --- Data Models ---
|
||||
class NodeStatusModel(BaseModel):
|
||||
@ -238,6 +264,30 @@ def get_node_health(node_data: Dict) -> str:
|
||||
return "healthy"
|
||||
|
||||
|
||||
def format_uptime(seconds: Optional[int]) -> str:
|
||||
"""Formats uptime in seconds into a human-readable string (e.g., "1d 2h 3m 4s")."""
|
||||
if seconds is None:
|
||||
return "N/A"
|
||||
days = seconds // (3600 * 24)
|
||||
seconds %= (3600 * 24)
|
||||
hours = seconds // 3600
|
||||
seconds %= 3600
|
||||
minutes = seconds // 60
|
||||
remaining_seconds = seconds % 60
|
||||
|
||||
parts = []
|
||||
if days > 0:
|
||||
parts.append(f"{days}d")
|
||||
if hours > 0:
|
||||
parts.append(f"{hours}h")
|
||||
if minutes > 0:
|
||||
parts.append(f"{minutes}m")
|
||||
# Always include seconds if no other parts, or if there are remaining seconds
|
||||
if remaining_seconds > 0 or not parts:
|
||||
parts.append(f"{remaining_seconds}s")
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
# --- API Endpoints ---
|
||||
@app.get("/", response_class=HTMLResponse)
|
||||
async def read_root(request: Request):
|
||||
@ -247,15 +297,71 @@ async def read_root(request: Request):
|
||||
"Web root accessed",
|
||||
extra={"client_ip": client_ip, "service_uuid": SERVICE_UUID},
|
||||
)
|
||||
|
||||
# --- Prepare initial node data for server-side rendering ---
|
||||
current_time_utc = datetime.now(timezone.utc)
|
||||
nodes_to_remove = []
|
||||
for node_uuid, data in known_nodes_db.items():
|
||||
last_seen_dt = datetime.fromisoformat(data["last_seen"]).replace(
|
||||
tzinfo=timezone.utc
|
||||
)
|
||||
if (
|
||||
current_time_utc - last_seen_dt
|
||||
).total_seconds() > NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS:
|
||||
nodes_to_remove.append(node_uuid)
|
||||
logger.info(f"Node {node_uuid} inactive for >{NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS}s. Will not render initially.")
|
||||
|
||||
# Filter out inactive nodes for the initial render
|
||||
active_known_nodes_db = {
|
||||
k: v for k, v in known_nodes_db.items()
|
||||
if k not in nodes_to_remove
|
||||
}
|
||||
|
||||
initial_nodes_data = []
|
||||
for node_uuid, data in active_known_nodes_db.items():
|
||||
current_health = get_node_health(data)
|
||||
|
||||
connections = {}
|
||||
for target_uuid in active_known_nodes_db: # Only iterate over currently active nodes
|
||||
if target_uuid != node_uuid:
|
||||
ping_data = database.get_ping_data(
|
||||
node_uuid, target_uuid, start_time="-300s"
|
||||
)
|
||||
latency_ms = None
|
||||
if ping_data and ping_data["data"]["latency"]:
|
||||
# Get the most recent non-None, non-zero latency
|
||||
for latency in reversed(ping_data["data"]["latency"]):
|
||||
if latency is not None and not (isinstance(latency, float) and latency == 0.0):
|
||||
latency_ms = float(latency)
|
||||
break
|
||||
connections[target_uuid] = latency_ms
|
||||
|
||||
initial_nodes_data.append(
|
||||
{
|
||||
"uuid": node_uuid,
|
||||
"last_seen": data["last_seen"], # Keep original for JS
|
||||
"formatted_last_seen": datetime.fromisoformat(data["last_seen"]).strftime("%Y-%m-%d %H:%M:%S UTC"),
|
||||
"ip": data["ip"],
|
||||
"health_status": current_health,
|
||||
"uptime_seconds": data.get("uptime_seconds"),
|
||||
"formatted_uptime": format_uptime(data.get("uptime_seconds")), # Pre-format uptime for HTML
|
||||
"load_avg": data.get("load_avg"),
|
||||
"memory_usage_percent": data.get("memory_usage_percent"),
|
||||
"connections": connections,
|
||||
}
|
||||
)
|
||||
# --- End initial node data preparation ---
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"index.html",
|
||||
{
|
||||
"request": request,
|
||||
"service_uuid": SERVICE_UUID,
|
||||
"url_for": request.url_for, # Pass url_for for dynamic URL generation
|
||||
"url_for": request.url_for,
|
||||
"root_path": request.scope.get(
|
||||
"root_path", ""
|
||||
), # Pass root_path for JS base URL
|
||||
"nodes": initial_nodes_data, # Pass initial node data for server-side rendering
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -9,12 +9,79 @@
|
||||
<body>
|
||||
<div class="header-container">
|
||||
<h1>Node Monitoring System</h1>
|
||||
<p>Total Nodes: <span id="node-count">0</span></p>
|
||||
<p>Service UUID: <code>{{ service_uuid }}</code></p> <!-- ALWAYS DISPLAYED -->
|
||||
<p>Total Nodes: <span id="node-count">{{ nodes|length }}</span></p>
|
||||
<p>Service UUID: <code>{{ service_uuid }}</code></p>
|
||||
</div>
|
||||
|
||||
<div id="node-grid-container">
|
||||
<p class="loading-message">Loading node data...</p>
|
||||
{% if nodes %}
|
||||
<div class="connection-grid" style="grid-template-columns: minmax(100px, 1fr) repeat({{ nodes|length }}, minmax(100px, 1fr));">
|
||||
<!-- Header Row (Column UUIDs) -->
|
||||
<div class="grid-row header-row">
|
||||
<div class="grid-cell empty-cell"></div> {# Top-left corner #}
|
||||
{% for node in nodes %}
|
||||
<div class="grid-cell header-cell">
|
||||
<div class="node-uuid" title="{{ node.uuid }}">{{ node.uuid[:8] }}...</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
<!-- Data Rows -->
|
||||
{% for row_node in nodes %}
|
||||
<div class="grid-row">
|
||||
<!-- Row Header (UUID) -->
|
||||
<div class="grid-cell header-cell">
|
||||
<div class="node-uuid" title="{{ row_node.uuid }}">{{ row_node.uuid[:8] }}...</div>
|
||||
</div>
|
||||
|
||||
<!-- Cells for connections/status -->
|
||||
{% for col_node in nodes %}
|
||||
<div class="grid-cell
|
||||
{% if row_node.uuid == col_node.uuid %}
|
||||
node-{{ row_node.health_status }}
|
||||
{% else %}
|
||||
{% set latency = row_node.connections[col_node.uuid] if col_node.uuid in row_node.connections else None %}
|
||||
{% if latency is not none and latency is not equalto 0.0 %}
|
||||
{% if latency <= 200 %}latency-low
|
||||
{% elif latency <= 1000 %}latency-medium
|
||||
{% else %}latency-high
|
||||
{% endif %}
|
||||
{% else %}
|
||||
latency-unavailable
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
">
|
||||
{% if row_node.uuid == col_node.uuid %}
|
||||
<div class="node-status-text">Status: {{ row_node.health_status.upper() }}</div>
|
||||
<div class="node-tooltip">
|
||||
<p><strong>UUID:</strong> {{ row_node.uuid }}</p>
|
||||
<p><strong>IP:</strong> {{ row_node.ip }}</p>
|
||||
<p><strong>Last Seen:</strong> {{ row_node.formatted_last_seen }}</p>
|
||||
<p><strong>Uptime:</strong> {{ row_node.formatted_uptime }}</p>
|
||||
<p><strong>Load Avg (1m, 5m, 15m):</strong> {{ row_node.load_avg | map('%.2f' | format) | join(', ') if row_node.load_avg else 'N/A' }}</p>
|
||||
<p><strong>Memory Usage:</strong> {{ '%.2f' | format(row_node.memory_usage_percent) + '%' if row_node.memory_usage_percent is not none else 'N/A' }}</p>
|
||||
</div>
|
||||
{% else %}
|
||||
{% set latency = row_node.connections[col_node.uuid] if col_node.uuid in row_node.connections else None %}
|
||||
{% set display_latency = 'N/A' %}
|
||||
{% if latency is not none and latency is not equalto 0.0 %}
|
||||
{% set display_latency = '%.1f ms' | format(latency) %}
|
||||
{% endif %}
|
||||
<div class="conn-status-text">Ping: {{ display_latency }}</div>
|
||||
<div class="node-tooltip">
|
||||
<p><strong>From:</strong> {{ row_node.uuid[:8] }}...</p>
|
||||
<p><strong>to:</strong> {{ col_node.uuid[:8] }}...</p>
|
||||
<p><strong>Latency:</strong> {{ display_latency }}</p>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% else %}
|
||||
<p class="loading-message">No nodes reporting yet. Start a client!</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<script>
|
||||
|
832
test-client-multi-node-flux.py
Normal file
832
test-client-multi-node-flux.py
Normal file
@ -0,0 +1,832 @@
|
||||
import os
|
||||
import uuid
|
||||
import time
|
||||
import requests
|
||||
import random
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import socket
|
||||
from datetime import datetime, timezone
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import argparse
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.connection import create_connection
|
||||
|
||||
# --- Multi-Node Client Configuration ---
|
||||
TARGET_SERVICE_UUID = os.environ.get(
|
||||
"TARGET_SERVICE_UUID", "ab73d00a-8169-46bb-997d-f13e5f760973"
|
||||
)
|
||||
|
||||
SERVER_BASE_URL = os.environ.get("SERVER_URL", "http://localhost:8000")
|
||||
UPDATE_INTERVAL_SECONDS = int(os.environ.get("UPDATE_INTERVAL_SECONDS", 5))
|
||||
NUM_NODES = int(os.environ.get("NUM_NODES", 3))
|
||||
|
||||
# Base IP for loopback binding (127.0.0.x where x starts from this base)
|
||||
LOOPBACK_IP_BASE = int(os.environ.get("LOOPBACK_IP_BASE", 2)) # Start from 127.0.0.2
|
||||
|
||||
# Dynamic node management settings
|
||||
DYNAMIC_MIN_NODES = int(os.environ.get("DYNAMIC_MIN_NODES", 3))
|
||||
DYNAMIC_MAX_NODES = int(os.environ.get("DYNAMIC_MAX_NODES", 7))
|
||||
NODE_CHANGE_INTERVAL = int(os.environ.get("NODE_CHANGE_INTERVAL", 30)) # seconds
|
||||
NODE_LIFECYCLE_VARIANCE = int(os.environ.get("NODE_LIFECYCLE_VARIANCE", 10)) # seconds
|
||||
|
||||
# --- Logging Configuration ---
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - [%(thread)d] - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("MultiNodeClient")
|
||||
|
||||
|
||||
# --- Custom HTTP Adapter for Source IP Binding ---
|
||||
class SourceIPHTTPAdapter(HTTPAdapter):
|
||||
def __init__(self, source_ip, *args, **kwargs):
|
||||
self.source_ip = source_ip
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def init_poolmanager(self, *args, **kwargs):
|
||||
# Override the socket creation to bind to specific source IP
|
||||
def custom_create_connection(
|
||||
address,
|
||||
timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
|
||||
source_address=None,
|
||||
socket_options=None,
|
||||
):
|
||||
# Force our custom source address
|
||||
return create_connection(
|
||||
address,
|
||||
timeout,
|
||||
source_address=(self.source_ip, 0), # 0 = any available port
|
||||
socket_options=socket_options,
|
||||
)
|
||||
|
||||
# Monkey patch the connection creation
|
||||
original_create_connection = socket.create_connection
|
||||
socket.create_connection = custom_create_connection
|
||||
|
||||
try:
|
||||
result = super().init_poolmanager(*args, **kwargs)
|
||||
finally:
|
||||
# Restore original function
|
||||
socket.create_connection = original_create_connection
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# --- Enhanced Node Class with IP Binding ---
|
||||
class SimulatedNode:
|
||||
def __init__(
|
||||
self,
|
||||
node_id: int,
|
||||
total_nodes: int,
|
||||
server_url: str,
|
||||
service_uuid: str,
|
||||
update_interval: int,
|
||||
ip_base: int,
|
||||
persistent_uuid: str = None, # Allow reusing UUIDs for returning nodes
|
||||
):
|
||||
self.node_id = node_id
|
||||
self.node_uuid = persistent_uuid or str(uuid.uuid4())
|
||||
self.server_url = server_url # Store server URL
|
||||
self.service_uuid = service_uuid # Store service UUID
|
||||
self.update_interval = update_interval # Store update interval
|
||||
self.uptime_seconds = 0
|
||||
self.known_peers = {}
|
||||
self.total_nodes = total_nodes
|
||||
self.running = False
|
||||
self.is_persistent = (
|
||||
persistent_uuid is not None
|
||||
) # Track if this is a returning node
|
||||
|
||||
# Assign unique loopback IP to this node using the passed ip_base
|
||||
self.source_ip = f"127.0.0.{ip_base + node_id - 1}"
|
||||
|
||||
# Create requests session with custom adapter for IP binding
|
||||
self.session = requests.Session()
|
||||
adapter = SourceIPHTTPAdapter(self.source_ip)
|
||||
self.session.mount("http://", adapter)
|
||||
self.session.mount("https://", adapter)
|
||||
|
||||
# Each node gets slightly different characteristics
|
||||
if not self.is_persistent: # Only randomize for new nodes
|
||||
self.base_load = random.uniform(0.2, 1.0)
|
||||
self.base_memory = random.uniform(40.0, 70.0)
|
||||
self.load_variance = random.uniform(0.1, 0.5)
|
||||
self.memory_variance = random.uniform(5.0, 15.0)
|
||||
|
||||
# Some nodes might be "problematic" (higher load/memory)
|
||||
if random.random() < 0.2: # 20% chance of being a "problematic" node
|
||||
self.base_load *= 2.0
|
||||
self.base_memory += 20.0
|
||||
logger.info(
|
||||
f"Node {self.node_id} ({self.node_uuid[:8]}) will simulate high resource usage (IP: {self.source_ip})"
|
||||
)
|
||||
else:
|
||||
# Returning nodes keep some consistency but with variation
|
||||
self.base_load = random.uniform(0.3, 0.8)
|
||||
self.base_memory = random.uniform(45.0, 65.0)
|
||||
self.load_variance = random.uniform(0.1, 0.4)
|
||||
self.memory_variance = random.uniform(5.0, 12.0)
|
||||
|
||||
logger.info(
|
||||
f"Node {self.node_id} ({'returning' if self.is_persistent else 'new'}) will bind to source IP: {self.source_ip}"
|
||||
)
|
||||
|
||||
def test_ip_binding(self):
|
||||
"""Test if we can bind to the assigned IP address."""
|
||||
try:
|
||||
# Try to create a socket and bind to the IP
|
||||
test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
test_socket.bind((self.source_ip, 0)) # Bind to any available port
|
||||
test_socket.close()
|
||||
logger.debug(
|
||||
f"Node {self.node_id} successfully tested binding to {self.source_ip}"
|
||||
)
|
||||
return True
|
||||
except OSError as e:
|
||||
logger.error(f"Node {self.node_id} cannot bind to {self.source_ip}: {e}")
|
||||
logger.error(
|
||||
"Make sure the IP address is available on your loopback interface."
|
||||
)
|
||||
logger.error(
|
||||
f"You might need to add it with: sudo ifconfig lo0 alias {self.source_ip} (macOS)"
|
||||
)
|
||||
logger.error(f"Or: sudo ip addr add {self.source_ip}/8 dev lo (Linux)")
|
||||
return False
|
||||
|
||||
def generate_node_status_data(self):
|
||||
"""Generates simulated node status metrics with per-node characteristics."""
|
||||
|
||||
self.uptime_seconds += self.update_interval + random.randint(-1, 2)
|
||||
|
||||
# Generate load with some randomness but consistent per-node baseline
|
||||
load_1min = max(
|
||||
0.1,
|
||||
self.base_load + random.uniform(-self.load_variance, self.load_variance),
|
||||
)
|
||||
load_5min = max(0.1, load_1min * random.uniform(0.8, 1.0))
|
||||
load_15min = max(0.1, load_5min * random.uniform(0.8, 1.0))
|
||||
|
||||
load_avg = [round(load_1min, 2), round(load_5min, 2), round(load_15min, 2)]
|
||||
|
||||
# Generate memory usage with baseline + variance
|
||||
memory_usage = max(
|
||||
10.0,
|
||||
min(
|
||||
95.0,
|
||||
self.base_memory
|
||||
+ random.uniform(-self.memory_variance, self.memory_variance),
|
||||
),
|
||||
)
|
||||
|
||||
return {
|
||||
"uptime_seconds": self.uptime_seconds,
|
||||
"load_avg": load_avg,
|
||||
"memory_usage_percent": round(memory_usage, 2),
|
||||
}
|
||||
|
||||
def generate_ping_data(self):
|
||||
"""Generates simulated ping latencies to known peers."""
|
||||
pings = {}
|
||||
|
||||
# Ping to self (loopback)
|
||||
pings[self.node_uuid] = round(random.uniform(0.1, 1.5), 2)
|
||||
|
||||
# Ping to known peers
|
||||
for peer_uuid in self.known_peers.keys():
|
||||
if peer_uuid != self.node_uuid:
|
||||
# Simulate network latency with some consistency per peer
|
||||
base_latency = random.uniform(5.0, 100.0)
|
||||
variation = random.uniform(-10.0, 10.0)
|
||||
latency = max(0.1, base_latency + variation)
|
||||
pings[peer_uuid] = round(latency, 2)
|
||||
|
||||
return pings
|
||||
|
||||
def send_update(self):
|
||||
"""Sends a single status update to the server using bound IP."""
|
||||
try:
|
||||
status_data = self.generate_node_status_data()
|
||||
ping_data = self.generate_ping_data()
|
||||
|
||||
payload = {
|
||||
"node": self.node_uuid,
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"status": status_data,
|
||||
"pings": ping_data,
|
||||
}
|
||||
endpoint_url = f"{self.server_url}/{self.service_uuid}/{self.node_uuid}/"
|
||||
|
||||
logger.debug(
|
||||
f"Node {self.node_id} ({self.source_ip}) sending update. "
|
||||
f"Uptime: {status_data['uptime_seconds']}s, "
|
||||
f"Load: {status_data['load_avg'][0]}, Memory: {status_data['memory_usage_percent']}%, "
|
||||
f"Pings: {len(ping_data)}"
|
||||
)
|
||||
|
||||
# Use the custom session with IP binding
|
||||
response = self.session.put(endpoint_url, json=payload, timeout=10)
|
||||
|
||||
if response.status_code == 200:
|
||||
response_data = response.json()
|
||||
|
||||
if "peers" in response_data and isinstance(
|
||||
response_data["peers"], dict
|
||||
):
|
||||
new_peers = {k: v for k, v in response_data["peers"].items()}
|
||||
|
||||
# Log new peer discoveries
|
||||
newly_discovered = set(new_peers.keys()) - set(
|
||||
self.known_peers.keys()
|
||||
)
|
||||
if newly_discovered:
|
||||
logger.info(
|
||||
f"Node {self.node_id} ({self.source_ip}) discovered {len(newly_discovered)} new peer(s)"
|
||||
)
|
||||
|
||||
self.known_peers = new_peers
|
||||
|
||||
if (
|
||||
len(newly_discovered) > 0
|
||||
or len(self.known_peers) != self.total_nodes - 1
|
||||
):
|
||||
logger.debug(
|
||||
f"Node {self.node_id} ({self.source_ip}) knows {len(self.known_peers)} peers "
|
||||
f"(expected {self.total_nodes - 1})"
|
||||
)
|
||||
|
||||
return True
|
||||
else:
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) failed to send update. "
|
||||
f"Status: {response.status_code}, Response: {response.text}"
|
||||
)
|
||||
return False
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
logger.error(f"Node {self.node_id} ({self.source_ip}) request timed out")
|
||||
return False
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) connection error: {e}"
|
||||
)
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) unexpected error: {e}"
|
||||
)
|
||||
return False
|
||||
|
||||
def run(self):
|
||||
"""Main loop for this simulated node."""
|
||||
# Test IP binding before starting
|
||||
if not self.test_ip_binding():
|
||||
logger.error(f"Node {self.node_id} cannot start due to IP binding failure")
|
||||
return
|
||||
|
||||
self.running = True
|
||||
logger.info(
|
||||
f"Starting Node {self.node_id} ({'returning' if self.is_persistent else 'new'}) with UUID: {self.node_uuid[:8]}... (IP: {self.source_ip})"
|
||||
)
|
||||
|
||||
# Add some initial delay to stagger node starts
|
||||
initial_delay = self.node_id * 0.5
|
||||
time.sleep(initial_delay)
|
||||
|
||||
consecutive_failures = 0
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
success = self.send_update()
|
||||
|
||||
if success:
|
||||
consecutive_failures = 0
|
||||
else:
|
||||
consecutive_failures += 1
|
||||
if consecutive_failures >= 3:
|
||||
logger.warning(
|
||||
f"Node {self.node_id} ({self.source_ip}) has failed {consecutive_failures} consecutive updates"
|
||||
)
|
||||
|
||||
# Add some jitter to prevent thundering herd
|
||||
jitter = random.uniform(-1.0, 1.0)
|
||||
sleep_time = max(1.0, self.update_interval + jitter)
|
||||
time.sleep(sleep_time)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info(
|
||||
f"Node {self.node_id} ({self.source_ip}) received interrupt signal"
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) unexpected error in main loop: {e}"
|
||||
)
|
||||
time.sleep(self.update_interval)
|
||||
|
||||
logger.info(f"Node {self.node_id} ({self.source_ip}) stopped")
|
||||
|
||||
def stop(self):
|
||||
"""Stop the node."""
|
||||
self.running = False
|
||||
|
||||
|
||||
# --- Dynamic Multi-Node Manager ---
|
||||
class DynamicMultiNodeManager:
|
||||
def __init__(
|
||||
self,
|
||||
min_nodes: int,
|
||||
max_nodes: int,
|
||||
server_url: str,
|
||||
service_uuid: str,
|
||||
update_interval: int,
|
||||
ip_base: int,
|
||||
change_interval: int,
|
||||
lifecycle_variance: int,
|
||||
):
|
||||
self.min_nodes = min_nodes
|
||||
self.max_nodes = max_nodes
|
||||
self.server_url = server_url
|
||||
self.service_uuid = service_uuid
|
||||
self.update_interval = update_interval
|
||||
self.ip_base = ip_base
|
||||
self.change_interval = change_interval
|
||||
self.lifecycle_variance = lifecycle_variance
|
||||
|
||||
# Track active nodes and their threads
|
||||
self.active_nodes = {} # node_id -> SimulatedNode
|
||||
self.node_threads = {} # node_id -> Thread
|
||||
self.node_counter = 1 # For assigning unique node IDs
|
||||
|
||||
# Track node history for potential returns
|
||||
self.departed_nodes = {} # UUID -> {'characteristics', 'last_seen'}
|
||||
|
||||
self.running = False
|
||||
self.manager_thread = None
|
||||
|
||||
def get_available_ip_addresses(self):
|
||||
"""Get list of available IP addresses for new nodes."""
|
||||
max_possible_ips = self.max_nodes * 2 # Allow some extra IPs
|
||||
available_ips = []
|
||||
used_ips = {node.source_ip for node in self.active_nodes.values()}
|
||||
|
||||
for i in range(max_possible_ips):
|
||||
ip = f"127.0.0.{self.ip_base + i}"
|
||||
if ip not in used_ips:
|
||||
available_ips.append((i + 1, ip)) # (node_id_offset, ip)
|
||||
|
||||
return available_ips
|
||||
|
||||
def check_ip_availability(self, num_ips_needed):
|
||||
"""Check if required number of IP addresses are available."""
|
||||
logger.info(f"Checking availability of {num_ips_needed} IP addresses...")
|
||||
available_ips = self.get_available_ip_addresses()
|
||||
|
||||
if len(available_ips) < num_ips_needed:
|
||||
logger.error(
|
||||
f"Only {len(available_ips)} IP addresses available, need {num_ips_needed}"
|
||||
)
|
||||
return False
|
||||
|
||||
# Test the first few IPs we'd actually use
|
||||
test_ips = available_ips[:num_ips_needed]
|
||||
all_available = True
|
||||
|
||||
for node_id_offset, ip in test_ips:
|
||||
try:
|
||||
test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
test_socket.bind((ip, 0))
|
||||
test_socket.close()
|
||||
except OSError as e:
|
||||
logger.error(f"Cannot bind to {ip}: {e}")
|
||||
all_available = False
|
||||
|
||||
if not all_available:
|
||||
logger.error("Some IP addresses are not available.")
|
||||
self.show_setup_commands()
|
||||
return False
|
||||
|
||||
logger.info(f"All {num_ips_needed} IP addresses are available!")
|
||||
return True
|
||||
|
||||
def show_setup_commands(self):
|
||||
"""Show commands for setting up loopback IPs."""
|
||||
logger.info("=== Loopback IP Setup Commands ===")
|
||||
logger.info("Run these commands to add the required loopback IP addresses:")
|
||||
|
||||
import platform
|
||||
|
||||
system = platform.system().lower()
|
||||
|
||||
max_ips_needed = self.max_nodes * 2
|
||||
for i in range(max_ips_needed):
|
||||
ip = f"127.0.0.{self.ip_base + i}"
|
||||
if system == "linux":
|
||||
logger.info(f"sudo ip addr add {ip}/8 dev lo")
|
||||
elif system == "darwin": # macOS
|
||||
logger.info(f"sudo ifconfig lo0 alias {ip}")
|
||||
else:
|
||||
logger.info(f"Add {ip} to loopback interface (OS: {system})")
|
||||
|
||||
logger.info("=" * 40)
|
||||
|
||||
def create_new_node(self, return_existing=False):
|
||||
"""Create a new node, optionally bringing back a departed node."""
|
||||
available_ips = self.get_available_ip_addresses()
|
||||
if not available_ips:
|
||||
logger.warning("No available IP addresses for new nodes")
|
||||
return None
|
||||
|
||||
node_id_offset, source_ip = available_ips[0]
|
||||
node_id = self.node_counter
|
||||
self.node_counter += 1
|
||||
|
||||
# Decide if we should bring back an old node (30% chance if we have departed nodes)
|
||||
persistent_uuid = None
|
||||
if return_existing and self.departed_nodes and random.random() < 0.3:
|
||||
# Pick a random departed node to bring back
|
||||
persistent_uuid = random.choice(list(self.departed_nodes.keys()))
|
||||
logger.info(
|
||||
f"Bringing back departed node {persistent_uuid[:8]}... as Node {node_id}"
|
||||
)
|
||||
# Remove from departed list since it's returning
|
||||
del self.departed_nodes[persistent_uuid]
|
||||
|
||||
node = SimulatedNode(
|
||||
node_id=node_id,
|
||||
total_nodes=len(self.active_nodes) + 1, # +1 for this new node
|
||||
server_url=self.server_url,
|
||||
service_uuid=self.service_uuid,
|
||||
update_interval=self.update_interval,
|
||||
ip_base=self.ip_base + node_id_offset - 1, # Adjust IP calculation
|
||||
persistent_uuid=persistent_uuid,
|
||||
)
|
||||
|
||||
return node
|
||||
|
||||
def start_node(self, node):
|
||||
"""Start a single node in its own thread."""
|
||||
thread = threading.Thread(target=node.run, name=f"Node-{node.node_id}")
|
||||
thread.daemon = True
|
||||
thread.start()
|
||||
|
||||
self.active_nodes[node.node_id] = node
|
||||
self.node_threads[node.node_id] = thread
|
||||
|
||||
logger.info(
|
||||
f"✅ Started Node {node.node_id} ({node.node_uuid[:8]}...) on {node.source_ip}"
|
||||
)
|
||||
|
||||
def stop_node(self, node_id, permanently=False):
|
||||
"""Stop a specific node."""
|
||||
if node_id not in self.active_nodes:
|
||||
return False
|
||||
|
||||
node = self.active_nodes[node_id]
|
||||
thread = self.node_threads[node_id]
|
||||
|
||||
# Store node info for potential return (unless it's permanently leaving)
|
||||
if not permanently and random.random() < 0.7: # 70% chance node might return
|
||||
self.departed_nodes[node.node_uuid] = {
|
||||
"last_seen": datetime.now(),
|
||||
"characteristics": {
|
||||
"base_load": node.base_load,
|
||||
"base_memory": node.base_memory,
|
||||
"load_variance": node.load_variance,
|
||||
"memory_variance": node.memory_variance,
|
||||
},
|
||||
}
|
||||
logger.info(
|
||||
f"⏸️ Node {node_id} ({node.node_uuid[:8]}...) departing temporarily"
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
f"❌ Node {node_id} ({node.node_uuid[:8]}...) leaving permanently"
|
||||
)
|
||||
|
||||
node.stop()
|
||||
thread.join(timeout=5.0)
|
||||
|
||||
del self.active_nodes[node_id]
|
||||
del self.node_threads[node_id]
|
||||
|
||||
return True
|
||||
|
||||
def adjust_node_count(self):
|
||||
"""Randomly adjust the number of active nodes within the specified range."""
|
||||
current_count = len(self.active_nodes)
|
||||
|
||||
# Decide on target count
|
||||
target_count = random.randint(self.min_nodes, self.max_nodes)
|
||||
|
||||
if target_count == current_count:
|
||||
logger.debug(f"Node count staying at {current_count}")
|
||||
return
|
||||
|
||||
logger.info(f"🔄 Adjusting node count: {current_count} → {target_count}")
|
||||
|
||||
if target_count > current_count:
|
||||
# Add nodes
|
||||
nodes_to_add = target_count - current_count
|
||||
for _ in range(nodes_to_add):
|
||||
node = self.create_new_node(return_existing=True)
|
||||
if node:
|
||||
self.start_node(node)
|
||||
time.sleep(random.uniform(1, 3)) # Stagger starts
|
||||
|
||||
elif target_count < current_count:
|
||||
# Remove nodes
|
||||
nodes_to_remove = current_count - target_count
|
||||
active_node_ids = list(self.active_nodes.keys())
|
||||
nodes_to_stop = random.sample(active_node_ids, nodes_to_remove)
|
||||
|
||||
for node_id in nodes_to_stop:
|
||||
# 20% chance node leaves permanently
|
||||
permanently = random.random() < 0.2
|
||||
self.stop_node(node_id, permanently=permanently)
|
||||
time.sleep(random.uniform(0.5, 2)) # Stagger stops
|
||||
|
||||
def manage_node_lifecycle(self):
|
||||
"""Main loop for managing node lifecycle changes."""
|
||||
logger.info("🚀 Starting dynamic node lifecycle management")
|
||||
|
||||
# Start with minimum nodes
|
||||
logger.info(f"Initializing with {self.min_nodes} nodes...")
|
||||
for _ in range(self.min_nodes):
|
||||
node = self.create_new_node()
|
||||
if node:
|
||||
self.start_node(node)
|
||||
time.sleep(1) # Brief delay between starts
|
||||
|
||||
# Main management loop
|
||||
while self.running:
|
||||
try:
|
||||
# Wait for the change interval (with some randomness)
|
||||
variance = random.randint(
|
||||
-self.lifecycle_variance, self.lifecycle_variance
|
||||
)
|
||||
sleep_time = max(
|
||||
10, self.change_interval + variance
|
||||
) # Minimum 10 seconds
|
||||
|
||||
logger.debug(
|
||||
f"Waiting {sleep_time} seconds until next potential change..."
|
||||
)
|
||||
time.sleep(sleep_time)
|
||||
|
||||
if not self.running:
|
||||
break
|
||||
|
||||
# Randomly decide if we should make a change (70% chance)
|
||||
if random.random() < 0.7:
|
||||
self.adjust_node_count()
|
||||
else:
|
||||
logger.debug("Skipping this change cycle")
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Node lifecycle manager received interrupt signal")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Error in node lifecycle management: {e}", exc_info=True)
|
||||
time.sleep(5) # Brief pause before continuing
|
||||
|
||||
def start_dynamic_management(self):
|
||||
"""Start the dynamic node management system."""
|
||||
if not self.check_ip_availability(self.max_nodes * 2):
|
||||
logger.error(
|
||||
"Cannot start dynamic management due to IP availability issues"
|
||||
)
|
||||
return False
|
||||
|
||||
self.running = True
|
||||
self.manager_thread = threading.Thread(
|
||||
target=self.manage_node_lifecycle, name="NodeLifecycleManager"
|
||||
)
|
||||
self.manager_thread.daemon = True
|
||||
self.manager_thread.start()
|
||||
|
||||
return True
|
||||
|
||||
def stop_all_nodes(self):
|
||||
"""Stop all nodes and the management system."""
|
||||
logger.info("🛑 Stopping dynamic node management...")
|
||||
self.running = False
|
||||
|
||||
# Stop the manager thread
|
||||
if self.manager_thread and self.manager_thread.is_alive():
|
||||
self.manager_thread.join(timeout=5.0)
|
||||
|
||||
# Stop all active nodes
|
||||
for node_id in list(self.active_nodes.keys()):
|
||||
self.stop_node(node_id, permanently=True)
|
||||
|
||||
logger.info("All nodes stopped")
|
||||
|
||||
def print_status(self):
|
||||
"""Print current status of the dynamic system."""
|
||||
active_count = len(self.active_nodes)
|
||||
departed_count = len(self.departed_nodes)
|
||||
|
||||
logger.info(f"=== Dynamic Multi-Node Status ===")
|
||||
logger.info(
|
||||
f"Active nodes: {active_count} (range: {self.min_nodes}-{self.max_nodes})"
|
||||
)
|
||||
logger.info(f"Departed nodes (may return): {departed_count}")
|
||||
|
||||
for node_id, node in self.active_nodes.items():
|
||||
status = "returning" if node.is_persistent else "new"
|
||||
logger.info(
|
||||
f" Node {node_id}: {status}, IP={node.source_ip}, UUID={node.node_uuid[:8]}..., Uptime={node.uptime_seconds}s"
|
||||
)
|
||||
|
||||
if departed_count > 0:
|
||||
logger.info("Departed nodes that might return:")
|
||||
for uuid_val in list(self.departed_nodes.keys())[:5]: # Show first 5
|
||||
logger.info(f" UUID={uuid_val[:8]}...")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Dynamic multi-node test client with fluctuating node counts"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-nodes",
|
||||
type=int,
|
||||
default=DYNAMIC_MIN_NODES,
|
||||
help=f"Minimum number of nodes (default: {DYNAMIC_MIN_NODES})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-nodes",
|
||||
type=int,
|
||||
default=DYNAMIC_MAX_NODES,
|
||||
help=f"Maximum number of nodes (default: {DYNAMIC_MAX_NODES})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--change-interval",
|
||||
type=int,
|
||||
default=NODE_CHANGE_INTERVAL,
|
||||
help=f"Average seconds between node changes (default: {NODE_CHANGE_INTERVAL})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lifecycle-variance",
|
||||
type=int,
|
||||
default=NODE_LIFECYCLE_VARIANCE,
|
||||
help=f"Random variance in change timing (default: {NODE_LIFECYCLE_VARIANCE})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--interval",
|
||||
type=int,
|
||||
default=UPDATE_INTERVAL_SECONDS,
|
||||
help=f"Update interval in seconds (default: {UPDATE_INTERVAL_SECONDS})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--server",
|
||||
type=str,
|
||||
default=SERVER_BASE_URL,
|
||||
help=f"Server URL (default: {SERVER_BASE_URL})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--service-uuid",
|
||||
type=str,
|
||||
default=TARGET_SERVICE_UUID,
|
||||
help="Target service UUID",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ip-base",
|
||||
type=int,
|
||||
default=LOOPBACK_IP_BASE,
|
||||
help=f"Starting IP for 127.0.0.X (default: {LOOPBACK_IP_BASE})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--setup-help",
|
||||
action="store_true",
|
||||
help="Show commands to set up loopback IP addresses",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose", "-v", action="store_true", help="Enable verbose logging"
|
||||
)
|
||||
# Keep the old --nodes argument for compatibility, but make it set max nodes
|
||||
parser.add_argument(
|
||||
"--nodes",
|
||||
type=int,
|
||||
help="Set max nodes (compatibility mode, same as --max-nodes)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Handle compatibility
|
||||
if args.nodes:
|
||||
args.max_nodes = args.nodes
|
||||
if (
|
||||
args.min_nodes == DYNAMIC_MIN_NODES
|
||||
): # Only adjust min if it wasn't explicitly set
|
||||
args.min_nodes = max(
|
||||
1, args.nodes - 2
|
||||
) # Set min to a reasonable lower bound
|
||||
|
||||
min_nodes = args.min_nodes
|
||||
max_nodes = args.max_nodes
|
||||
change_interval = args.change_interval
|
||||
lifecycle_variance = args.lifecycle_variance
|
||||
update_interval = args.interval
|
||||
server_url = args.server
|
||||
service_uuid = args.service_uuid
|
||||
ip_base = args.ip_base
|
||||
|
||||
if args.verbose:
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
|
||||
# Validate configuration
|
||||
if min_nodes >= max_nodes:
|
||||
logger.error("Minimum nodes must be less than maximum nodes")
|
||||
return
|
||||
|
||||
if max_nodes < 1:
|
||||
logger.error("Maximum nodes must be at least 1")
|
||||
return
|
||||
|
||||
if args.setup_help:
|
||||
# Create a temporary manager just to show setup commands
|
||||
temp_manager = DynamicMultiNodeManager(
|
||||
min_nodes,
|
||||
max_nodes,
|
||||
server_url,
|
||||
service_uuid,
|
||||
update_interval,
|
||||
ip_base,
|
||||
change_interval,
|
||||
lifecycle_variance,
|
||||
)
|
||||
temp_manager.show_setup_commands()
|
||||
return
|
||||
|
||||
if service_uuid == "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID":
|
||||
logger.error("=" * 60)
|
||||
logger.error("ERROR: TARGET_SERVICE_UUID is not set correctly!")
|
||||
logger.error(
|
||||
"Please set it via --service-uuid argument or TARGET_SERVICE_UUID environment variable."
|
||||
)
|
||||
logger.error("=" * 60)
|
||||
return
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("Dynamic Multi-Node Test Client Configuration:")
|
||||
logger.info(f" Node range: {min_nodes} - {max_nodes} nodes")
|
||||
logger.info(
|
||||
f" Change interval: {change_interval}s (±{lifecycle_variance}s variance)"
|
||||
)
|
||||
logger.info(f" Update interval: {update_interval} seconds per node")
|
||||
logger.info(f" Server URL: {server_url}")
|
||||
logger.info(f" Target Service UUID: {service_uuid}")
|
||||
logger.info(
|
||||
f" IP range: 127.0.0.{ip_base} - 127.0.0.{ip_base + max_nodes * 2 - 1}"
|
||||
)
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Create and start the dynamic multi-node manager
|
||||
manager = DynamicMultiNodeManager(
|
||||
min_nodes,
|
||||
max_nodes,
|
||||
server_url,
|
||||
service_uuid,
|
||||
update_interval,
|
||||
ip_base,
|
||||
change_interval,
|
||||
lifecycle_variance,
|
||||
)
|
||||
|
||||
try:
|
||||
if not manager.start_dynamic_management():
|
||||
logger.error("Failed to start dynamic node management")
|
||||
return
|
||||
|
||||
# Main monitoring loop
|
||||
status_interval = 30 # Print status every 30 seconds
|
||||
last_status_time = time.time()
|
||||
|
||||
logger.info(
|
||||
"🎯 Dynamic node management started! Nodes will fluctuate between {} and {} over time.".format(
|
||||
min_nodes, max_nodes
|
||||
)
|
||||
)
|
||||
logger.info("Press Ctrl+C to stop...")
|
||||
|
||||
while True:
|
||||
time.sleep(5) # Check every 5 seconds
|
||||
|
||||
current_time = time.time()
|
||||
if current_time - last_status_time >= status_interval:
|
||||
manager.print_status()
|
||||
last_status_time = current_time
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Received interrupt signal, shutting down...")
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error in main: {e}", exc_info=True)
|
||||
finally:
|
||||
manager.stop_all_nodes()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -15,7 +15,7 @@ from urllib3.util.connection import create_connection
|
||||
|
||||
# --- Multi-Node Client Configuration ---
|
||||
TARGET_SERVICE_UUID = os.environ.get(
|
||||
"TARGET_SERVICE_UUID", "c7c883fd-46f3-4b14-a727-d805ae0a6ec0"
|
||||
"TARGET_SERVICE_UUID", "ab73d00a-8169-46bb-997d-f13e5f760973"
|
||||
)
|
||||
|
||||
SERVER_BASE_URL = os.environ.get("SERVER_URL", "http://localhost:8000")
|
||||
@ -28,10 +28,11 @@ LOOPBACK_IP_BASE = int(os.environ.get("LOOPBACK_IP_BASE", 2)) # Start from 127.
|
||||
# --- Logging Configuration ---
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - [%(thread)d] - %(message)s'
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - [%(thread)d] - %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("MultiNodeClient")
|
||||
|
||||
|
||||
# --- Custom HTTP Adapter for Source IP Binding ---
|
||||
class SourceIPHTTPAdapter(HTTPAdapter):
|
||||
def __init__(self, source_ip, *args, **kwargs):
|
||||
@ -40,14 +41,18 @@ class SourceIPHTTPAdapter(HTTPAdapter):
|
||||
|
||||
def init_poolmanager(self, *args, **kwargs):
|
||||
# Override the socket creation to bind to specific source IP
|
||||
def custom_create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
|
||||
source_address=None, socket_options=None):
|
||||
def custom_create_connection(
|
||||
address,
|
||||
timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
|
||||
source_address=None,
|
||||
socket_options=None,
|
||||
):
|
||||
# Force our custom source address
|
||||
return create_connection(
|
||||
address,
|
||||
timeout,
|
||||
source_address=(self.source_ip, 0), # 0 = any available port
|
||||
socket_options=socket_options
|
||||
socket_options=socket_options,
|
||||
)
|
||||
|
||||
# Monkey patch the connection creation
|
||||
@ -62,14 +67,23 @@ class SourceIPHTTPAdapter(HTTPAdapter):
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# --- Enhanced Node Class with IP Binding ---
|
||||
class SimulatedNode:
|
||||
def __init__(self, node_id: int, total_nodes: int, server_url: str, service_uuid: str, update_interval: int, ip_base: int):
|
||||
def __init__(
|
||||
self,
|
||||
node_id: int,
|
||||
total_nodes: int,
|
||||
server_url: str,
|
||||
service_uuid: str,
|
||||
update_interval: int,
|
||||
ip_base: int,
|
||||
):
|
||||
self.node_id = node_id
|
||||
self.node_uuid = str(uuid.uuid4())
|
||||
self.server_url = server_url # Store server URL
|
||||
self.service_uuid = service_uuid # Store service UUID
|
||||
self.update_interval = update_interval # Store update interval
|
||||
self.server_url = server_url # Store server URL
|
||||
self.service_uuid = service_uuid # Store service UUID
|
||||
self.update_interval = update_interval # Store update interval
|
||||
self.uptime_seconds = 0
|
||||
self.known_peers = {}
|
||||
self.total_nodes = total_nodes
|
||||
@ -81,8 +95,8 @@ class SimulatedNode:
|
||||
# Create requests session with custom adapter for IP binding
|
||||
self.session = requests.Session()
|
||||
adapter = SourceIPHTTPAdapter(self.source_ip)
|
||||
self.session.mount('http://', adapter)
|
||||
self.session.mount('https://', adapter)
|
||||
self.session.mount("http://", adapter)
|
||||
self.session.mount("https://", adapter)
|
||||
|
||||
# Each node gets slightly different characteristics
|
||||
self.base_load = random.uniform(0.2, 1.0)
|
||||
@ -94,7 +108,9 @@ class SimulatedNode:
|
||||
if random.random() < 0.2: # 20% chance of being a "problematic" node
|
||||
self.base_load *= 2.0
|
||||
self.base_memory += 20.0
|
||||
logger.info(f"Node {self.node_id} ({self.node_uuid[:8]}) will simulate high resource usage (IP: {self.source_ip})")
|
||||
logger.info(
|
||||
f"Node {self.node_id} ({self.node_uuid[:8]}) will simulate high resource usage (IP: {self.source_ip})"
|
||||
)
|
||||
|
||||
logger.info(f"Node {self.node_id} will bind to source IP: {self.source_ip}")
|
||||
|
||||
@ -105,12 +121,18 @@ class SimulatedNode:
|
||||
test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
test_socket.bind((self.source_ip, 0)) # Bind to any available port
|
||||
test_socket.close()
|
||||
logger.debug(f"Node {self.node_id} successfully tested binding to {self.source_ip}")
|
||||
logger.debug(
|
||||
f"Node {self.node_id} successfully tested binding to {self.source_ip}"
|
||||
)
|
||||
return True
|
||||
except OSError as e:
|
||||
logger.error(f"Node {self.node_id} cannot bind to {self.source_ip}: {e}")
|
||||
logger.error("Make sure the IP address is available on your loopback interface.")
|
||||
logger.error("You might need to add it with: sudo ifconfig lo0 alias {self.source_ip} (macOS)")
|
||||
logger.error(
|
||||
"Make sure the IP address is available on your loopback interface."
|
||||
)
|
||||
logger.error(
|
||||
"You might need to add it with: sudo ifconfig lo0 alias {self.source_ip} (macOS)"
|
||||
)
|
||||
logger.error("Or: sudo ip addr add {self.source_ip}/8 dev lo (Linux)")
|
||||
return False
|
||||
|
||||
@ -120,24 +142,29 @@ class SimulatedNode:
|
||||
self.uptime_seconds += self.update_interval + random.randint(-1, 2)
|
||||
|
||||
# Generate load with some randomness but consistent per-node baseline
|
||||
load_1min = max(0.1, self.base_load + random.uniform(-self.load_variance, self.load_variance))
|
||||
load_1min = max(
|
||||
0.1,
|
||||
self.base_load + random.uniform(-self.load_variance, self.load_variance),
|
||||
)
|
||||
load_5min = max(0.1, load_1min * random.uniform(0.8, 1.0))
|
||||
load_15min = max(0.1, load_5min * random.uniform(0.8, 1.0))
|
||||
|
||||
load_avg = [
|
||||
round(load_1min, 2),
|
||||
round(load_5min, 2),
|
||||
round(load_15min, 2)
|
||||
]
|
||||
load_avg = [round(load_1min, 2), round(load_5min, 2), round(load_15min, 2)]
|
||||
|
||||
# Generate memory usage with baseline + variance
|
||||
memory_usage = max(10.0, min(95.0,
|
||||
self.base_memory + random.uniform(-self.memory_variance, self.memory_variance)))
|
||||
memory_usage = max(
|
||||
10.0,
|
||||
min(
|
||||
95.0,
|
||||
self.base_memory
|
||||
+ random.uniform(-self.memory_variance, self.memory_variance),
|
||||
),
|
||||
)
|
||||
|
||||
return {
|
||||
"uptime_seconds": self.uptime_seconds,
|
||||
"load_avg": load_avg,
|
||||
"memory_usage_percent": round(memory_usage, 2)
|
||||
"memory_usage_percent": round(memory_usage, 2),
|
||||
}
|
||||
|
||||
def generate_ping_data(self):
|
||||
@ -168,14 +195,16 @@ class SimulatedNode:
|
||||
"node": self.node_uuid,
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"status": status_data,
|
||||
"pings": ping_data
|
||||
"pings": ping_data,
|
||||
}
|
||||
endpoint_url = f"{self.server_url}/{self.service_uuid}/{self.node_uuid}/"
|
||||
|
||||
logger.debug(f"Node {self.node_id} ({self.source_ip}) sending update. "
|
||||
f"Uptime: {status_data['uptime_seconds']}s, "
|
||||
f"Load: {status_data['load_avg'][0]}, Memory: {status_data['memory_usage_percent']}%, "
|
||||
f"Pings: {len(ping_data)}")
|
||||
logger.debug(
|
||||
f"Node {self.node_id} ({self.source_ip}) sending update. "
|
||||
f"Uptime: {status_data['uptime_seconds']}s, "
|
||||
f"Load: {status_data['load_avg'][0]}, Memory: {status_data['memory_usage_percent']}%, "
|
||||
f"Pings: {len(ping_data)}"
|
||||
)
|
||||
|
||||
# Use the custom session with IP binding
|
||||
response = self.session.put(endpoint_url, json=payload, timeout=10)
|
||||
@ -183,34 +212,51 @@ class SimulatedNode:
|
||||
if response.status_code == 200:
|
||||
response_data = response.json()
|
||||
|
||||
if "peers" in response_data and isinstance(response_data["peers"], dict):
|
||||
if "peers" in response_data and isinstance(
|
||||
response_data["peers"], dict
|
||||
):
|
||||
new_peers = {k: v for k, v in response_data["peers"].items()}
|
||||
|
||||
# Log new peer discoveries
|
||||
newly_discovered = set(new_peers.keys()) - set(self.known_peers.keys())
|
||||
newly_discovered = set(new_peers.keys()) - set(
|
||||
self.known_peers.keys()
|
||||
)
|
||||
if newly_discovered:
|
||||
logger.info(f"Node {self.node_id} ({self.source_ip}) discovered {len(newly_discovered)} new peer(s)")
|
||||
logger.info(
|
||||
f"Node {self.node_id} ({self.source_ip}) discovered {len(newly_discovered)} new peer(s)"
|
||||
)
|
||||
|
||||
self.known_peers = new_peers
|
||||
|
||||
if len(newly_discovered) > 0 or len(self.known_peers) != self.total_nodes - 1:
|
||||
logger.debug(f"Node {self.node_id} ({self.source_ip}) knows {len(self.known_peers)} peers "
|
||||
f"(expected {self.total_nodes - 1})")
|
||||
if (
|
||||
len(newly_discovered) > 0
|
||||
or len(self.known_peers) != self.total_nodes - 1
|
||||
):
|
||||
logger.debug(
|
||||
f"Node {self.node_id} ({self.source_ip}) knows {len(self.known_peers)} peers "
|
||||
f"(expected {self.total_nodes - 1})"
|
||||
)
|
||||
|
||||
return True
|
||||
else:
|
||||
logger.error(f"Node {self.node_id} ({self.source_ip}) failed to send update. "
|
||||
f"Status: {response.status_code}, Response: {response.text}")
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) failed to send update. "
|
||||
f"Status: {response.status_code}, Response: {response.text}"
|
||||
)
|
||||
return False
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
logger.error(f"Node {self.node_id} ({self.source_ip}) request timed out")
|
||||
return False
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
logger.error(f"Node {self.node_id} ({self.source_ip}) connection error: {e}")
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) connection error: {e}"
|
||||
)
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"Node {self.node_id} ({self.source_ip}) unexpected error: {e}")
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) unexpected error: {e}"
|
||||
)
|
||||
return False
|
||||
|
||||
def run(self):
|
||||
@ -221,7 +267,9 @@ class SimulatedNode:
|
||||
return
|
||||
|
||||
self.running = True
|
||||
logger.info(f"Starting Node {self.node_id} with UUID: {self.node_uuid} (IP: {self.source_ip})")
|
||||
logger.info(
|
||||
f"Starting Node {self.node_id} with UUID: {self.node_uuid} (IP: {self.source_ip})"
|
||||
)
|
||||
|
||||
# Add some initial delay to stagger node starts
|
||||
initial_delay = self.node_id * 0.5
|
||||
@ -238,7 +286,9 @@ class SimulatedNode:
|
||||
else:
|
||||
consecutive_failures += 1
|
||||
if consecutive_failures >= 3:
|
||||
logger.warning(f"Node {self.node_id} ({self.source_ip}) has failed {consecutive_failures} consecutive updates")
|
||||
logger.warning(
|
||||
f"Node {self.node_id} ({self.source_ip}) has failed {consecutive_failures} consecutive updates"
|
||||
)
|
||||
|
||||
# Add some jitter to prevent thundering herd
|
||||
jitter = random.uniform(-1.0, 1.0)
|
||||
@ -246,10 +296,14 @@ class SimulatedNode:
|
||||
time.sleep(sleep_time)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info(f"Node {self.node_id} ({self.source_ip}) received interrupt signal")
|
||||
logger.info(
|
||||
f"Node {self.node_id} ({self.source_ip}) received interrupt signal"
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Node {self.node_id} ({self.source_ip}) unexpected error in main loop: {e}")
|
||||
logger.error(
|
||||
f"Node {self.node_id} ({self.source_ip}) unexpected error in main loop: {e}"
|
||||
)
|
||||
time.sleep(self.update_interval)
|
||||
|
||||
logger.info(f"Node {self.node_id} ({self.source_ip}) stopped")
|
||||
@ -258,9 +312,17 @@ class SimulatedNode:
|
||||
"""Stop the node."""
|
||||
self.running = False
|
||||
|
||||
|
||||
# --- Multi-Node Manager ---
|
||||
class MultiNodeManager:
|
||||
def __init__(self, num_nodes: int, server_url: str, service_uuid: str, update_interval: int, ip_base: int):
|
||||
def __init__(
|
||||
self,
|
||||
num_nodes: int,
|
||||
server_url: str,
|
||||
service_uuid: str,
|
||||
update_interval: int,
|
||||
ip_base: int,
|
||||
):
|
||||
self.num_nodes = num_nodes
|
||||
self.server_url = server_url
|
||||
self.service_uuid = service_uuid
|
||||
@ -272,7 +334,9 @@ class MultiNodeManager:
|
||||
|
||||
# Create simulated nodes
|
||||
for i in range(num_nodes):
|
||||
node = SimulatedNode(i + 1, num_nodes, server_url, service_uuid, update_interval, ip_base)
|
||||
node = SimulatedNode(
|
||||
i + 1, num_nodes, server_url, service_uuid, update_interval, ip_base
|
||||
)
|
||||
self.nodes.append(node)
|
||||
|
||||
def check_ip_availability(self):
|
||||
@ -285,12 +349,16 @@ class MultiNodeManager:
|
||||
all_available = False
|
||||
|
||||
if not all_available:
|
||||
logger.error("Some IP addresses are not available. See individual node errors above.")
|
||||
logger.error(
|
||||
"Some IP addresses are not available. See individual node errors above."
|
||||
)
|
||||
logger.info("To add loopback IP addresses:")
|
||||
logger.info(" Linux: sudo ip addr add 127.0.0.X/8 dev lo")
|
||||
logger.info(" macOS: sudo ifconfig lo0 alias 127.0.0.X")
|
||||
# Use self.ip_base for the range
|
||||
logger.info(f" Where X ranges from {self.ip_base} to {self.ip_base + self.num_nodes - 1}")
|
||||
logger.info(
|
||||
f" Where X ranges from {self.ip_base} to {self.ip_base + self.num_nodes - 1}"
|
||||
)
|
||||
return False
|
||||
|
||||
logger.info("All IP addresses are available!")
|
||||
@ -301,7 +369,9 @@ class MultiNodeManager:
|
||||
if not self.check_ip_availability():
|
||||
return False
|
||||
|
||||
logger.info(f"Starting {self.num_nodes} simulated nodes with unique IP addresses...")
|
||||
logger.info(
|
||||
f"Starting {self.num_nodes} simulated nodes with unique IP addresses..."
|
||||
)
|
||||
self.running = True
|
||||
|
||||
for node in self.nodes:
|
||||
@ -331,8 +401,11 @@ class MultiNodeManager:
|
||||
"""Print current status of all nodes."""
|
||||
logger.info(f"=== Multi-Node Status ({self.num_nodes} nodes) ===")
|
||||
for node in self.nodes:
|
||||
logger.info(f"Node {node.node_id}: IP={node.source_ip}, UUID={node.node_uuid[:8]}..., "
|
||||
f"Uptime={node.uptime_seconds}s, Peers={len(node.known_peers)}")
|
||||
logger.info(
|
||||
f"Node {node.node_id}: IP={node.source_ip}, UUID={node.node_uuid[:8]}..., "
|
||||
f"Uptime={node.uptime_seconds}s, Peers={len(node.known_peers)}"
|
||||
)
|
||||
|
||||
|
||||
def setup_loopback_ips(num_nodes, base_ip):
|
||||
"""Helper function to show commands for setting up loopback IPs."""
|
||||
@ -342,6 +415,7 @@ def setup_loopback_ips(num_nodes, base_ip):
|
||||
|
||||
# Detect OS and show appropriate commands
|
||||
import platform
|
||||
|
||||
system = platform.system().lower()
|
||||
|
||||
for i in range(num_nodes):
|
||||
@ -364,22 +438,49 @@ def setup_loopback_ips(num_nodes, base_ip):
|
||||
|
||||
logger.info("=" * 40)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Multi-node test client with unique IP binding')
|
||||
parser.add_argument('--nodes', type=int, default=NUM_NODES,
|
||||
help=f'Number of simulated nodes (default: {NUM_NODES})')
|
||||
parser.add_argument('--interval', type=int, default=UPDATE_INTERVAL_SECONDS,
|
||||
help=f'Update interval in seconds (default: {UPDATE_INTERVAL_SECONDS})')
|
||||
parser.add_argument('--server', type=str, default=SERVER_BASE_URL,
|
||||
help=f'Server URL (default: {SERVER_BASE_URL})')
|
||||
parser.add_argument('--service-uuid', type=str, default=TARGET_SERVICE_UUID,
|
||||
help='Target service UUID')
|
||||
parser.add_argument('--ip-base', type=int, default=LOOPBACK_IP_BASE,
|
||||
help=f'Starting IP for 127.0.0.X (default: {LOOPBACK_IP_BASE})')
|
||||
parser.add_argument('--setup-help', action='store_true',
|
||||
help='Show commands to set up loopback IP addresses')
|
||||
parser.add_argument('--verbose', '-v', action='store_true',
|
||||
help='Enable verbose logging')
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Multi-node test client with unique IP binding"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--nodes",
|
||||
type=int,
|
||||
default=NUM_NODES,
|
||||
help=f"Number of simulated nodes (default: {NUM_NODES})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--interval",
|
||||
type=int,
|
||||
default=UPDATE_INTERVAL_SECONDS,
|
||||
help=f"Update interval in seconds (default: {UPDATE_INTERVAL_SECONDS})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--server",
|
||||
type=str,
|
||||
default=SERVER_BASE_URL,
|
||||
help=f"Server URL (default: {SERVER_BASE_URL})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--service-uuid",
|
||||
type=str,
|
||||
default=TARGET_SERVICE_UUID,
|
||||
help="Target service UUID",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ip-base",
|
||||
type=int,
|
||||
default=LOOPBACK_IP_BASE,
|
||||
help=f"Starting IP for 127.0.0.X (default: {LOOPBACK_IP_BASE})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--setup-help",
|
||||
action="store_true",
|
||||
help="Show commands to set up loopback IP addresses",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose", "-v", action="store_true", help="Enable verbose logging"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -400,8 +501,12 @@ def main():
|
||||
if service_uuid == "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID":
|
||||
logger.error("=" * 60)
|
||||
logger.error("ERROR: TARGET_SERVICE_UUID is not set correctly!")
|
||||
logger.error("Please set it via --service-uuid argument or TARGET_SERVICE_UUID environment variable.")
|
||||
logger.error("You can find the server's UUID by running main.py and checking its console output")
|
||||
logger.error(
|
||||
"Please set it via --service-uuid argument or TARGET_SERVICE_UUID environment variable."
|
||||
)
|
||||
logger.error(
|
||||
"You can find the server's UUID by running main.py and checking its console output"
|
||||
)
|
||||
logger.error("or by visiting the server's root endpoint in your browser.")
|
||||
logger.error("=" * 60)
|
||||
return
|
||||
@ -416,7 +521,9 @@ def main():
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Create and start the multi-node manager
|
||||
manager = MultiNodeManager(num_nodes, server_url, service_uuid, update_interval, ip_base)
|
||||
manager = MultiNodeManager(
|
||||
num_nodes, server_url, service_uuid, update_interval, ip_base
|
||||
)
|
||||
|
||||
try:
|
||||
if not manager.start_all_nodes():
|
||||
@ -436,5 +543,6 @@ def main():
|
||||
finally:
|
||||
manager.stop_all_nodes()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
Reference in New Issue
Block a user