Moving basic grid stuff to jinja2 from JS. Gemini fixed the logs display it seems.
This commit is contained in:
@ -75,4 +75,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
|||||||
CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1
|
CMD wget --no-verbose --tries=1 --spider http://localhost:8000/health || exit 1
|
||||||
|
|
||||||
# Run the application using the virtual environment's python interpreter
|
# Run the application using the virtual environment's python interpreter
|
||||||
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
CMD ["/opt/venv/bin/python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers", "--forwarded-allow-ips", "*"]
|
||||||
|
@ -12,9 +12,9 @@ class RRDDatabase:
|
|||||||
# Use environment variable or default to /data
|
# Use environment variable or default to /data
|
||||||
if data_dir is None:
|
if data_dir is None:
|
||||||
data_dir = os.environ.get("DATA_DIR", "/data")
|
data_dir = os.environ.get("DATA_DIR", "/data")
|
||||||
|
|
||||||
self.data_dir = Path(data_dir)
|
self.data_dir = Path(data_dir)
|
||||||
|
|
||||||
# Create data directory if it doesn't exist
|
# Create data directory if it doesn't exist
|
||||||
try:
|
try:
|
||||||
self.data_dir.mkdir(parents=True, exist_ok=True)
|
self.data_dir.mkdir(parents=True, exist_ok=True)
|
||||||
@ -29,11 +29,11 @@ class RRDDatabase:
|
|||||||
# RRD configuration
|
# RRD configuration
|
||||||
self.step = 60 # 1-minute intervals
|
self.step = 60 # 1-minute intervals
|
||||||
self.heartbeat = 120 # 2-minute heartbeat (allow 1 missed update)
|
self.heartbeat = 120 # 2-minute heartbeat (allow 1 missed update)
|
||||||
|
|
||||||
# Retention policy (6 months total)
|
# Retention policy (6 months total)
|
||||||
self.rra_config = [
|
self.rra_config = [
|
||||||
"RRA:AVERAGE:0.5:1:1440", # 1-min avg for 24 hours (1440 points)
|
"RRA:AVERAGE:0.5:1:1440", # 1-min avg for 24 hours (1440 points)
|
||||||
"RRA:AVERAGE:0.5:60:744", # 1-hour avg for 31 days (744 points)
|
"RRA:AVERAGE:0.5:60:744", # 1-hour avg for 31 days (744 points)
|
||||||
"RRA:AVERAGE:0.5:1440:180", # 1-day avg for 6 months (180 points)
|
"RRA:AVERAGE:0.5:1440:180", # 1-day avg for 6 months (180 points)
|
||||||
"RRA:MAX:0.5:1:1440", # 1-min max for 24 hours
|
"RRA:MAX:0.5:1:1440", # 1-min max for 24 hours
|
||||||
"RRA:MAX:0.5:60:744", # 1-hour max for 31 days
|
"RRA:MAX:0.5:60:744", # 1-hour max for 31 days
|
||||||
@ -49,10 +49,10 @@ class RRDDatabase:
|
|||||||
def _create_system_rrd(self, node_uuid: str) -> str:
|
def _create_system_rrd(self, node_uuid: str) -> str:
|
||||||
"""Create RRD file for system metrics (uptime, load, memory)."""
|
"""Create RRD file for system metrics (uptime, load, memory)."""
|
||||||
rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
|
rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
|
||||||
|
|
||||||
if rrd_file.exists():
|
if rrd_file.exists():
|
||||||
return str(rrd_file)
|
return str(rrd_file)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rrdtool.create(
|
rrdtool.create(
|
||||||
str(rrd_file),
|
str(rrd_file),
|
||||||
@ -60,7 +60,7 @@ class RRDDatabase:
|
|||||||
# Data sources
|
# Data sources
|
||||||
f"DS:uptime:GAUGE:{self.heartbeat}:0:U", # Uptime in seconds
|
f"DS:uptime:GAUGE:{self.heartbeat}:0:U", # Uptime in seconds
|
||||||
f"DS:load1:GAUGE:{self.heartbeat}:0:100", # 1-min load average
|
f"DS:load1:GAUGE:{self.heartbeat}:0:100", # 1-min load average
|
||||||
f"DS:load5:GAUGE:{self.heartbeat}:0:100", # 5-min load average
|
f"DS:load5:GAUGE:{self.heartbeat}:0:100", # 5-min load average
|
||||||
f"DS:load15:GAUGE:{self.heartbeat}:0:100", # 15-min load average
|
f"DS:load15:GAUGE:{self.heartbeat}:0:100", # 15-min load average
|
||||||
f"DS:memory:GAUGE:{self.heartbeat}:0:100", # Memory usage %
|
f"DS:memory:GAUGE:{self.heartbeat}:0:100", # Memory usage %
|
||||||
# Round Robin Archives
|
# Round Robin Archives
|
||||||
@ -75,10 +75,10 @@ class RRDDatabase:
|
|||||||
def _create_ping_rrd(self, node_uuid: str, target_uuid: str) -> str:
|
def _create_ping_rrd(self, node_uuid: str, target_uuid: str) -> str:
|
||||||
"""Create RRD file for ping metrics between two nodes."""
|
"""Create RRD file for ping metrics between two nodes."""
|
||||||
rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
|
rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
|
||||||
|
|
||||||
if rrd_file.exists():
|
if rrd_file.exists():
|
||||||
return str(rrd_file)
|
return str(rrd_file)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rrdtool.create(
|
rrdtool.create(
|
||||||
str(rrd_file),
|
str(rrd_file),
|
||||||
@ -95,68 +95,68 @@ class RRDDatabase:
|
|||||||
logger.error(f"Failed to create ping RRD for {node_uuid}->{target_uuid}: {e}")
|
logger.error(f"Failed to create ping RRD for {node_uuid}->{target_uuid}: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def update_system_metrics(self, node_uuid: str, timestamp: datetime,
|
def update_system_metrics(self, node_uuid: str, timestamp: datetime,
|
||||||
uptime_seconds: int, load_avg: List[float],
|
uptime_seconds: int, load_avg: List[float],
|
||||||
memory_usage_percent: float):
|
memory_usage_percent: float):
|
||||||
"""Update system metrics for a node."""
|
"""Update system metrics for a node."""
|
||||||
try:
|
try:
|
||||||
rrd_file = self._create_system_rrd(node_uuid)
|
rrd_file = self._create_system_rrd(node_uuid)
|
||||||
|
|
||||||
# Convert datetime to Unix timestamp
|
# Convert datetime to Unix timestamp
|
||||||
unix_time = int(timestamp.timestamp())
|
unix_time = int(timestamp.timestamp())
|
||||||
|
|
||||||
# Format: timestamp:uptime:load1:load5:load15:memory
|
# Format: timestamp:uptime:load1:load5:load15:memory
|
||||||
values = f"{unix_time}:{uptime_seconds}:{load_avg[0]}:{load_avg[1]}:{load_avg[2]}:{memory_usage_percent}"
|
values = f"{unix_time}:{uptime_seconds}:{load_avg[0]}:{load_avg[1]}:{load_avg[2]}:{memory_usage_percent}"
|
||||||
|
|
||||||
rrdtool.update(rrd_file, values)
|
rrdtool.update(rrd_file, values)
|
||||||
logger.debug(f"Updated system metrics for {node_uuid}: {values}")
|
logger.debug(f"Updated system metrics for {node_uuid}: {values}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to update system metrics for {node_uuid}: {e}")
|
logger.error(f"Failed to update system metrics for {node_uuid}: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def update_ping_metrics(self, node_uuid: str, target_uuid: str,
|
def update_ping_metrics(self, node_uuid: str, target_uuid: str,
|
||||||
timestamp: datetime, latency_ms: float):
|
timestamp: datetime, latency_ms: float):
|
||||||
"""Update ping metrics between two nodes."""
|
"""Update ping metrics between two nodes."""
|
||||||
try:
|
try:
|
||||||
rrd_file = self._create_ping_rrd(node_uuid, target_uuid)
|
rrd_file = self._create_ping_rrd(node_uuid, target_uuid)
|
||||||
|
|
||||||
unix_time = int(timestamp.timestamp())
|
unix_time = int(timestamp.timestamp())
|
||||||
|
|
||||||
# For now, we only track latency. Loss can be calculated from missing updates
|
# For now, we only track latency. Loss can be calculated from missing updates
|
||||||
values = f"{unix_time}:{latency_ms}:0" # 0% loss (could be enhanced)
|
values = f"{unix_time}:{latency_ms}:0" # 0% loss (could be enhanced)
|
||||||
|
|
||||||
rrdtool.update(rrd_file, values)
|
rrdtool.update(rrd_file, values)
|
||||||
logger.debug(f"Updated ping metrics {node_uuid}->{target_uuid}: {latency_ms}ms")
|
logger.debug(f"Updated ping metrics {node_uuid}->{target_uuid}: {latency_ms}ms")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to update ping metrics {node_uuid}->{target_uuid}: {e}")
|
logger.error(f"Failed to update ping metrics {node_uuid}->{target_uuid}: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def get_system_data(self, node_uuid: str, start_time: str = "-24h",
|
def get_system_data(self, node_uuid: str, start_time: str = "-24h",
|
||||||
end_time: str = "now") -> Optional[Dict]:
|
end_time: str = "now") -> Optional[Dict]:
|
||||||
"""Retrieve system metrics data for a node."""
|
"""Retrieve system metrics data for a node."""
|
||||||
try:
|
try:
|
||||||
rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
|
rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
|
||||||
if not rrd_file.exists():
|
if not rrd_file.exists():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
result = rrdtool.fetch(
|
result = rrdtool.fetch(
|
||||||
str(rrd_file),
|
str(rrd_file),
|
||||||
"AVERAGE",
|
"AVERAGE",
|
||||||
"--start", start_time,
|
"--start", start_time,
|
||||||
"--end", end_time
|
"--end", end_time
|
||||||
)
|
)
|
||||||
|
|
||||||
# Parse RRDtool fetch result
|
# Parse RRDtool fetch result
|
||||||
start, end, step = result[0]
|
start, end, step = result[0]
|
||||||
ds_names = result[1] # ['uptime', 'load1', 'load5', 'load15', 'memory']
|
ds_names = result[1] # ['uptime', 'load1', 'load5', 'load15', 'memory']
|
||||||
data_points = result[2]
|
data_points = result[2]
|
||||||
|
|
||||||
# Convert to more usable format
|
# Convert to more usable format
|
||||||
timestamps = []
|
timestamps = []
|
||||||
data = {ds: [] for ds in ds_names}
|
data = {ds: [] for ds in ds_names}
|
||||||
|
|
||||||
current_time = start
|
current_time = start
|
||||||
for point in data_points:
|
for point in data_points:
|
||||||
timestamps.append(current_time)
|
timestamps.append(current_time)
|
||||||
@ -164,39 +164,39 @@ class RRDDatabase:
|
|||||||
value = point[i] if point[i] is not None else None # Changed 0 to None for better representation
|
value = point[i] if point[i] is not None else None # Changed 0 to None for better representation
|
||||||
data[ds].append(value)
|
data[ds].append(value)
|
||||||
current_time += step
|
current_time += step
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'timestamps': timestamps,
|
'timestamps': timestamps,
|
||||||
'data': data,
|
'data': data,
|
||||||
'step': step
|
'step': step
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to get system data for {node_uuid}: {e}")
|
logger.error(f"Failed to get system data for {node_uuid}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_ping_data(self, node_uuid: str, target_uuid: str,
|
def get_ping_data(self, node_uuid: str, target_uuid: str,
|
||||||
start_time: str = "-24h", end_time: str = "now") -> Optional[Dict]:
|
start_time: str = "-24h", end_time: str = "now") -> Optional[Dict]:
|
||||||
"""Retrieve ping metrics between two nodes."""
|
"""Retrieve ping metrics between two nodes."""
|
||||||
try:
|
try:
|
||||||
rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
|
rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
|
||||||
if not rrd_file.exists():
|
if not rrd_file.exists():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
result = rrdtool.fetch(
|
result = rrdtool.fetch(
|
||||||
str(rrd_file),
|
str(rrd_file),
|
||||||
"AVERAGE",
|
"AVERAGE",
|
||||||
"--start", start_time,
|
"--start", start_time,
|
||||||
"--end", end_time
|
"--end", end_time
|
||||||
)
|
)
|
||||||
|
|
||||||
start, end, step = result[0]
|
start, end, step = result[0]
|
||||||
ds_names = result[1] # ['latency', 'loss']
|
ds_names = result[1] # ['latency', 'loss']
|
||||||
data_points = result[2]
|
data_points = result[2]
|
||||||
|
|
||||||
timestamps = []
|
timestamps = []
|
||||||
data = {ds: [] for ds in ds_names}
|
data = {ds: [] for ds in ds_names}
|
||||||
|
|
||||||
current_time = start
|
current_time = start
|
||||||
for point in data_points:
|
for point in data_points:
|
||||||
timestamps.append(current_time)
|
timestamps.append(current_time)
|
||||||
@ -204,13 +204,13 @@ class RRDDatabase:
|
|||||||
value = point[i] if point[i] is not None else None # Changed 0 to None for better representation
|
value = point[i] if point[i] is not None else None # Changed 0 to None for better representation
|
||||||
data[ds].append(value)
|
data[ds].append(value)
|
||||||
current_time += step
|
current_time += step
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'timestamps': timestamps,
|
'timestamps': timestamps,
|
||||||
'data': data,
|
'data': data,
|
||||||
'step': step
|
'step': step
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to get ping data {node_uuid}->{target_uuid}: {e}")
|
logger.error(f"Failed to get ping data {node_uuid}->{target_uuid}: {e}")
|
||||||
return None
|
return None
|
||||||
@ -232,26 +232,26 @@ class RRDDatabase:
|
|||||||
# RRD automatically handles data retention based on RRA configuration
|
# RRD automatically handles data retention based on RRA configuration
|
||||||
# This method could be used for cleaning up orphaned files
|
# This method could be used for cleaning up orphaned files
|
||||||
cutoff_date = datetime.now() - timedelta(days=190) # 6+ months
|
cutoff_date = datetime.now() - timedelta(days=190) # 6+ months
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for node_dir in self.data_dir.iterdir():
|
for node_dir in self.data_dir.iterdir():
|
||||||
if not node_dir.is_dir():
|
if not node_dir.is_dir():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if any RRD files have been modified recently
|
# Check if any RRD files have been modified recently
|
||||||
rrd_files = list(node_dir.glob("*.rrd"))
|
rrd_files = list(node_dir.glob("*.rrd"))
|
||||||
if not rrd_files:
|
if not rrd_files:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# If all RRD files are old, the node is probably dead
|
# If all RRD files are old, the node is probably dead
|
||||||
all_old = all(
|
all_old = all(
|
||||||
datetime.fromtimestamp(f.stat().st_mtime) < cutoff_date
|
datetime.fromtimestamp(f.stat().st_mtime) < cutoff_date
|
||||||
for f in rrd_files
|
for f in rrd_files
|
||||||
)
|
)
|
||||||
|
|
||||||
if all_old:
|
if all_old:
|
||||||
logger.info(f"Node {node_dir.name} appears inactive for >6 months")
|
logger.info(f"Node {node_dir.name} appears inactive for >6 months")
|
||||||
# Could optionally remove the directory here
|
# Could optionally remove the directory here
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed during cleanup: {e}")
|
logger.error(f"Failed during cleanup: {e}")
|
||||||
|
132
app/main.py
132
app/main.py
@ -8,7 +8,7 @@ from fastapi.responses import HTMLResponse, JSONResponse
|
|||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from pydantic import BaseModel, Field, validator
|
from pydantic import BaseModel, Field, validator
|
||||||
from typing import Dict, List, Annotated
|
from typing import Dict, List, Annotated, Optional
|
||||||
import uuid as uuid_lib
|
import uuid as uuid_lib
|
||||||
|
|
||||||
from collections import deque
|
from collections import deque
|
||||||
@ -32,14 +32,17 @@ class BufferHandler(logging.Handler):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
# Use the same formatter string as the StreamHandler for consistency
|
# Use the same formatter string as the StreamHandler for consistency
|
||||||
|
# Ensure asctime is formatted as ISO 8601 UTC with milliseconds and 'Z'
|
||||||
self.formatter = jsonlogger.JsonFormatter(
|
self.formatter = jsonlogger.JsonFormatter(
|
||||||
"%(asctime)s %(name)s %(levelname)s %(message)s"
|
"%(asctime)s %(name)s %(levelname)s %(message)s",
|
||||||
|
datefmt="%Y-%m-%dT%H:%M:%S.%fZ" # ISO 8601 format with milliseconds and Z for UTC
|
||||||
)
|
)
|
||||||
|
|
||||||
def emit(self, record):
|
def emit(self, record):
|
||||||
try:
|
try:
|
||||||
log_entry_str = self.formatter.format(record)
|
log_entry_str = self.formatter.format(record)
|
||||||
log_entry = json.loads(log_entry_str)
|
log_entry = json.loads(log_entry_str)
|
||||||
|
# The 'asctime' field in log_entry is now guaranteed to be ISO 8601
|
||||||
log_buffer.add_log(log_entry)
|
log_buffer.add_log(log_entry)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(
|
print(
|
||||||
@ -53,14 +56,30 @@ class LogBuffer:
|
|||||||
self.buffer = deque(maxlen=maxlen)
|
self.buffer = deque(maxlen=maxlen)
|
||||||
|
|
||||||
def add_log(self, record):
|
def add_log(self, record):
|
||||||
# Assuming 'record' here is already a dictionary parsed from the JSON log string
|
# 'record' is a dictionary parsed from the JSON log string.
|
||||||
timestamp = record.get("asctime") or datetime.utcnow().isoformat()
|
# 'asctime' should now be in ISO 8601 format due to BufferHandler's formatter.
|
||||||
|
timestamp_str = record.get("asctime")
|
||||||
|
if timestamp_str:
|
||||||
|
try:
|
||||||
|
# Use isoparse for robust parsing, then convert to UTC and store as ISO 8601 with 'Z'
|
||||||
|
dt_obj = isoparse(timestamp_str)
|
||||||
|
if dt_obj.tzinfo is None:
|
||||||
|
# Assume UTC if naive (common for logs without explicit timezone info)
|
||||||
|
dt_obj = dt_obj.replace(tzinfo=timezone.utc)
|
||||||
|
else:
|
||||||
|
# Convert to UTC for consistent storage
|
||||||
|
dt_obj = dt_obj.astimezone(timezone.utc)
|
||||||
|
timestamp_to_store = dt_obj.isoformat(timespec='milliseconds').replace('+00:00', 'Z')
|
||||||
|
except ValueError:
|
||||||
|
logger.warning(f"Failed to parse log timestamp '{timestamp_str}' from formatter. Using current UTC time.")
|
||||||
|
timestamp_to_store = datetime.utcnow().isoformat(timespec='milliseconds') + 'Z'
|
||||||
|
else:
|
||||||
|
timestamp_to_store = datetime.utcnow().isoformat(timespec='milliseconds') + 'Z'
|
||||||
|
|
||||||
self.buffer.append(
|
self.buffer.append(
|
||||||
{
|
{
|
||||||
"timestamp": timestamp,
|
"timestamp": timestamp_to_store,
|
||||||
"level": record.get(
|
"level": record.get("levelname"),
|
||||||
"levelname"
|
|
||||||
), # This should now correctly get 'levelname'
|
|
||||||
"message": record.get("message"),
|
"message": record.get("message"),
|
||||||
"extra": {
|
"extra": {
|
||||||
k: v
|
k: v
|
||||||
@ -111,9 +130,8 @@ class LogBuffer:
|
|||||||
logs = [
|
logs = [
|
||||||
log
|
log
|
||||||
for log in logs
|
for log in logs
|
||||||
if datetime.fromisoformat(
|
# log["timestamp"] is now guaranteed to be ISO 8601 with 'Z'
|
||||||
log["timestamp"].replace("Z", "+00:00")
|
if isoparse(log["timestamp"]).astimezone(timezone.utc)
|
||||||
).astimezone(timezone.utc)
|
|
||||||
>= since_dt
|
>= since_dt
|
||||||
]
|
]
|
||||||
except ValueError:
|
except ValueError:
|
||||||
@ -125,7 +143,11 @@ class LogBuffer:
|
|||||||
log_buffer = LogBuffer()
|
log_buffer = LogBuffer()
|
||||||
|
|
||||||
logHandler = logging.StreamHandler()
|
logHandler = logging.StreamHandler()
|
||||||
formatter = jsonlogger.JsonFormatter("%(asctime)s %(name)s %(levelname)s %(message)s")
|
# Ensure StreamHandler also formats asctime into ISO 8601 UTC
|
||||||
|
formatter = jsonlogger.JsonFormatter(
|
||||||
|
"%(asctime)s %(name)s %(levelname)s %(message)s",
|
||||||
|
datefmt="%Y-%m-%dT%H:%M:%S.%fZ" # ISO 8601 format with milliseconds and Z for UTC
|
||||||
|
)
|
||||||
logHandler.setFormatter(formatter)
|
logHandler.setFormatter(formatter)
|
||||||
|
|
||||||
if not logger.handlers:
|
if not logger.handlers:
|
||||||
@ -148,6 +170,10 @@ app = FastAPI(
|
|||||||
templates = Jinja2Templates(directory="app/web/templates")
|
templates = Jinja2Templates(directory="app/web/templates")
|
||||||
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
|
app.mount("/static", StaticFiles(directory="app/web/static"), name="static")
|
||||||
|
|
||||||
|
# To correctly handle HTTPS behind a reverse proxy, ensure your Uvicorn server
|
||||||
|
# is run with --proxy-headers and --forwarded-allow-ips.
|
||||||
|
# e.g., uvicorn main:app --host 0.0.0.0 --port 8000 --proxy-headers --forwarded-allow-ips '*'
|
||||||
|
|
||||||
|
|
||||||
# --- Data Models ---
|
# --- Data Models ---
|
||||||
class NodeStatusModel(BaseModel):
|
class NodeStatusModel(BaseModel):
|
||||||
@ -238,6 +264,30 @@ def get_node_health(node_data: Dict) -> str:
|
|||||||
return "healthy"
|
return "healthy"
|
||||||
|
|
||||||
|
|
||||||
|
def format_uptime(seconds: Optional[int]) -> str:
|
||||||
|
"""Formats uptime in seconds into a human-readable string (e.g., "1d 2h 3m 4s")."""
|
||||||
|
if seconds is None:
|
||||||
|
return "N/A"
|
||||||
|
days = seconds // (3600 * 24)
|
||||||
|
seconds %= (3600 * 24)
|
||||||
|
hours = seconds // 3600
|
||||||
|
seconds %= 3600
|
||||||
|
minutes = seconds // 60
|
||||||
|
remaining_seconds = seconds % 60
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
if days > 0:
|
||||||
|
parts.append(f"{days}d")
|
||||||
|
if hours > 0:
|
||||||
|
parts.append(f"{hours}h")
|
||||||
|
if minutes > 0:
|
||||||
|
parts.append(f"{minutes}m")
|
||||||
|
# Always include seconds if no other parts, or if there are remaining seconds
|
||||||
|
if remaining_seconds > 0 or not parts:
|
||||||
|
parts.append(f"{remaining_seconds}s")
|
||||||
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
# --- API Endpoints ---
|
# --- API Endpoints ---
|
||||||
@app.get("/", response_class=HTMLResponse)
|
@app.get("/", response_class=HTMLResponse)
|
||||||
async def read_root(request: Request):
|
async def read_root(request: Request):
|
||||||
@ -247,15 +297,71 @@ async def read_root(request: Request):
|
|||||||
"Web root accessed",
|
"Web root accessed",
|
||||||
extra={"client_ip": client_ip, "service_uuid": SERVICE_UUID},
|
extra={"client_ip": client_ip, "service_uuid": SERVICE_UUID},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --- Prepare initial node data for server-side rendering ---
|
||||||
|
current_time_utc = datetime.now(timezone.utc)
|
||||||
|
nodes_to_remove = []
|
||||||
|
for node_uuid, data in known_nodes_db.items():
|
||||||
|
last_seen_dt = datetime.fromisoformat(data["last_seen"]).replace(
|
||||||
|
tzinfo=timezone.utc
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
current_time_utc - last_seen_dt
|
||||||
|
).total_seconds() > NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS:
|
||||||
|
nodes_to_remove.append(node_uuid)
|
||||||
|
logger.info(f"Node {node_uuid} inactive for >{NODE_INACTIVE_REMOVAL_THRESHOLD_SECONDS}s. Will not render initially.")
|
||||||
|
|
||||||
|
# Filter out inactive nodes for the initial render
|
||||||
|
active_known_nodes_db = {
|
||||||
|
k: v for k, v in known_nodes_db.items()
|
||||||
|
if k not in nodes_to_remove
|
||||||
|
}
|
||||||
|
|
||||||
|
initial_nodes_data = []
|
||||||
|
for node_uuid, data in active_known_nodes_db.items():
|
||||||
|
current_health = get_node_health(data)
|
||||||
|
|
||||||
|
connections = {}
|
||||||
|
for target_uuid in active_known_nodes_db: # Only iterate over currently active nodes
|
||||||
|
if target_uuid != node_uuid:
|
||||||
|
ping_data = database.get_ping_data(
|
||||||
|
node_uuid, target_uuid, start_time="-300s"
|
||||||
|
)
|
||||||
|
latency_ms = None
|
||||||
|
if ping_data and ping_data["data"]["latency"]:
|
||||||
|
# Get the most recent non-None, non-zero latency
|
||||||
|
for latency in reversed(ping_data["data"]["latency"]):
|
||||||
|
if latency is not None and not (isinstance(latency, float) and latency == 0.0):
|
||||||
|
latency_ms = float(latency)
|
||||||
|
break
|
||||||
|
connections[target_uuid] = latency_ms
|
||||||
|
|
||||||
|
initial_nodes_data.append(
|
||||||
|
{
|
||||||
|
"uuid": node_uuid,
|
||||||
|
"last_seen": data["last_seen"], # Keep original for JS
|
||||||
|
"formatted_last_seen": datetime.fromisoformat(data["last_seen"]).strftime("%Y-%m-%d %H:%M:%S UTC"),
|
||||||
|
"ip": data["ip"],
|
||||||
|
"health_status": current_health,
|
||||||
|
"uptime_seconds": data.get("uptime_seconds"),
|
||||||
|
"formatted_uptime": format_uptime(data.get("uptime_seconds")), # Pre-format uptime for HTML
|
||||||
|
"load_avg": data.get("load_avg"),
|
||||||
|
"memory_usage_percent": data.get("memory_usage_percent"),
|
||||||
|
"connections": connections,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# --- End initial node data preparation ---
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
"index.html",
|
"index.html",
|
||||||
{
|
{
|
||||||
"request": request,
|
"request": request,
|
||||||
"service_uuid": SERVICE_UUID,
|
"service_uuid": SERVICE_UUID,
|
||||||
"url_for": request.url_for, # Pass url_for for dynamic URL generation
|
"url_for": request.url_for,
|
||||||
"root_path": request.scope.get(
|
"root_path": request.scope.get(
|
||||||
"root_path", ""
|
"root_path", ""
|
||||||
), # Pass root_path for JS base URL
|
), # Pass root_path for JS base URL
|
||||||
|
"nodes": initial_nodes_data, # Pass initial node data for server-side rendering
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ document.addEventListener('DOMContentLoaded', () => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Attempt to parse JSON. This is where the error would occur if the content is HTML.
|
// Attempt to parse JSON. This is where the error would occur if the content is HTML.
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
console.log('Received logs:', data.logs.length);
|
console.log('Received logs:', data.logs.length);
|
||||||
renderLogTable(data.logs);
|
renderLogTable(data.logs);
|
||||||
logCountSpan.textContent = data.log_count;
|
logCountSpan.textContent = data.log_count;
|
||||||
|
@ -9,12 +9,79 @@
|
|||||||
<body>
|
<body>
|
||||||
<div class="header-container">
|
<div class="header-container">
|
||||||
<h1>Node Monitoring System</h1>
|
<h1>Node Monitoring System</h1>
|
||||||
<p>Total Nodes: <span id="node-count">0</span></p>
|
<p>Total Nodes: <span id="node-count">{{ nodes|length }}</span></p>
|
||||||
<p>Service UUID: <code>{{ service_uuid }}</code></p> <!-- ALWAYS DISPLAYED -->
|
<p>Service UUID: <code>{{ service_uuid }}</code></p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="node-grid-container">
|
<div id="node-grid-container">
|
||||||
<p class="loading-message">Loading node data...</p>
|
{% if nodes %}
|
||||||
|
<div class="connection-grid" style="grid-template-columns: minmax(100px, 1fr) repeat({{ nodes|length }}, minmax(100px, 1fr));">
|
||||||
|
<!-- Header Row (Column UUIDs) -->
|
||||||
|
<div class="grid-row header-row">
|
||||||
|
<div class="grid-cell empty-cell"></div> {# Top-left corner #}
|
||||||
|
{% for node in nodes %}
|
||||||
|
<div class="grid-cell header-cell">
|
||||||
|
<div class="node-uuid" title="{{ node.uuid }}">{{ node.uuid[:8] }}...</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Data Rows -->
|
||||||
|
{% for row_node in nodes %}
|
||||||
|
<div class="grid-row">
|
||||||
|
<!-- Row Header (UUID) -->
|
||||||
|
<div class="grid-cell header-cell">
|
||||||
|
<div class="node-uuid" title="{{ row_node.uuid }}">{{ row_node.uuid[:8] }}...</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Cells for connections/status -->
|
||||||
|
{% for col_node in nodes %}
|
||||||
|
<div class="grid-cell
|
||||||
|
{% if row_node.uuid == col_node.uuid %}
|
||||||
|
node-{{ row_node.health_status }}
|
||||||
|
{% else %}
|
||||||
|
{% set latency = row_node.connections[col_node.uuid] if col_node.uuid in row_node.connections else None %}
|
||||||
|
{% if latency is not none and latency is not equalto 0.0 %}
|
||||||
|
{% if latency <= 200 %}latency-low
|
||||||
|
{% elif latency <= 1000 %}latency-medium
|
||||||
|
{% else %}latency-high
|
||||||
|
{% endif %}
|
||||||
|
{% else %}
|
||||||
|
latency-unavailable
|
||||||
|
{% endif %}
|
||||||
|
{% endif %}
|
||||||
|
">
|
||||||
|
{% if row_node.uuid == col_node.uuid %}
|
||||||
|
<div class="node-status-text">Status: {{ row_node.health_status.upper() }}</div>
|
||||||
|
<div class="node-tooltip">
|
||||||
|
<p><strong>UUID:</strong> {{ row_node.uuid }}</p>
|
||||||
|
<p><strong>IP:</strong> {{ row_node.ip }}</p>
|
||||||
|
<p><strong>Last Seen:</strong> {{ row_node.formatted_last_seen }}</p>
|
||||||
|
<p><strong>Uptime:</strong> {{ row_node.formatted_uptime }}</p>
|
||||||
|
<p><strong>Load Avg (1m, 5m, 15m):</strong> {{ row_node.load_avg | map('%.2f' | format) | join(', ') if row_node.load_avg else 'N/A' }}</p>
|
||||||
|
<p><strong>Memory Usage:</strong> {{ '%.2f' | format(row_node.memory_usage_percent) + '%' if row_node.memory_usage_percent is not none else 'N/A' }}</p>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
{% set latency = row_node.connections[col_node.uuid] if col_node.uuid in row_node.connections else None %}
|
||||||
|
{% set display_latency = 'N/A' %}
|
||||||
|
{% if latency is not none and latency is not equalto 0.0 %}
|
||||||
|
{% set display_latency = '%.1f ms' | format(latency) %}
|
||||||
|
{% endif %}
|
||||||
|
<div class="conn-status-text">Ping: {{ display_latency }}</div>
|
||||||
|
<div class="node-tooltip">
|
||||||
|
<p><strong>From:</strong> {{ row_node.uuid[:8] }}...</p>
|
||||||
|
<p><strong>to:</strong> {{ col_node.uuid[:8] }}...</p>
|
||||||
|
<p><strong>Latency:</strong> {{ display_latency }}</p>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<p class="loading-message">No nodes reporting yet. Start a client!</p>
|
||||||
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
|
832
test-client-multi-node-flux.py
Normal file
832
test-client-multi-node-flux.py
Normal file
@ -0,0 +1,832 @@
|
|||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
import time
|
||||||
|
import requests
|
||||||
|
import random
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import socket
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
import argparse
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from urllib3.util.connection import create_connection
|
||||||
|
|
||||||
|
# --- Multi-Node Client Configuration ---
|
||||||
|
TARGET_SERVICE_UUID = os.environ.get(
|
||||||
|
"TARGET_SERVICE_UUID", "ab73d00a-8169-46bb-997d-f13e5f760973"
|
||||||
|
)
|
||||||
|
|
||||||
|
SERVER_BASE_URL = os.environ.get("SERVER_URL", "http://localhost:8000")
|
||||||
|
UPDATE_INTERVAL_SECONDS = int(os.environ.get("UPDATE_INTERVAL_SECONDS", 5))
|
||||||
|
NUM_NODES = int(os.environ.get("NUM_NODES", 3))
|
||||||
|
|
||||||
|
# Base IP for loopback binding (127.0.0.x where x starts from this base)
|
||||||
|
LOOPBACK_IP_BASE = int(os.environ.get("LOOPBACK_IP_BASE", 2)) # Start from 127.0.0.2
|
||||||
|
|
||||||
|
# Dynamic node management settings
|
||||||
|
DYNAMIC_MIN_NODES = int(os.environ.get("DYNAMIC_MIN_NODES", 3))
|
||||||
|
DYNAMIC_MAX_NODES = int(os.environ.get("DYNAMIC_MAX_NODES", 7))
|
||||||
|
NODE_CHANGE_INTERVAL = int(os.environ.get("NODE_CHANGE_INTERVAL", 30)) # seconds
|
||||||
|
NODE_LIFECYCLE_VARIANCE = int(os.environ.get("NODE_LIFECYCLE_VARIANCE", 10)) # seconds
|
||||||
|
|
||||||
|
# --- Logging Configuration ---
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(name)s - %(levelname)s - [%(thread)d] - %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger("MultiNodeClient")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Custom HTTP Adapter for Source IP Binding ---
class SourceIPHTTPAdapter(HTTPAdapter):
    """requests transport adapter intended to bind outgoing connections to a
    specific local source IP address.

    It temporarily monkey-patches ``socket.create_connection`` while urllib3's
    pool manager is initialized, routing socket creation through a wrapper
    that forces ``source_address=(source_ip, 0)``.

    NOTE(review): the patch replaces the *process-global*
    ``socket.create_connection``, so concurrent construction of adapters from
    multiple threads could race on the patch/restore — confirm adapters are
    only created from one thread at a time.
    NOTE(review): ``init_poolmanager`` sets up the pool but connections are
    typically opened lazily at request time, after this patch has been
    restored — verify the source-IP binding actually takes effect end to end.
    """

    def __init__(self, source_ip, *args, **kwargs):
        # Local IP (e.g. a loopback alias such as 127.0.0.2) to bind to.
        self.source_ip = source_ip
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, *args, **kwargs):
        """Initialize urllib3's pool manager with socket creation patched to
        bind to ``self.source_ip``."""
        # Override the socket creation to bind to specific source IP
        def custom_create_connection(
            address,
            timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
            source_address=None,
            socket_options=None,
        ):
            # Force our custom source address; note the caller-supplied
            # source_address is deliberately ignored.
            return create_connection(
                address,
                timeout,
                source_address=(self.source_ip, 0),  # 0 = any available port
                socket_options=socket_options,
            )

        # Monkey patch the connection creation
        original_create_connection = socket.create_connection
        socket.create_connection = custom_create_connection

        try:
            result = super().init_poolmanager(*args, **kwargs)
        finally:
            # Restore original function
            socket.create_connection = original_create_connection

        return result
|
|
||||||
|
# --- Enhanced Node Class with IP Binding ---
class SimulatedNode:
    """A single simulated cluster node that periodically reports status and
    peer-ping data to the server.

    Each node binds its HTTP traffic to a dedicated loopback source IP (via
    SourceIPHTTPAdapter) so the server sees every node as a distinct host.
    A node may be "new" (fresh UUID, fresh characteristics) or "returning"
    (reuses a previously seen UUID via ``persistent_uuid``).
    """

    def __init__(
        self,
        node_id: int,
        total_nodes: int,
        server_url: str,
        service_uuid: str,
        update_interval: int,
        ip_base: int,
        persistent_uuid: str = None,  # Allow reusing UUIDs for returning nodes
    ):
        self.node_id = node_id
        self.node_uuid = persistent_uuid or str(uuid.uuid4())
        self.server_url = server_url  # Store server URL
        self.service_uuid = service_uuid  # Store service UUID
        self.update_interval = update_interval  # Store update interval
        self.uptime_seconds = 0
        self.known_peers = {}  # peer UUID -> peer info from server responses
        self.total_nodes = total_nodes
        self.running = False
        self.is_persistent = (
            persistent_uuid is not None
        )  # Track if this is a returning node

        # Assign unique loopback IP to this node using the passed ip_base
        self.source_ip = f"127.0.0.{ip_base + node_id - 1}"

        # Create requests session with custom adapter for IP binding
        self.session = requests.Session()
        adapter = SourceIPHTTPAdapter(self.source_ip)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Each node gets slightly different characteristics
        if not self.is_persistent:  # Only randomize for new nodes
            self.base_load = random.uniform(0.2, 1.0)
            self.base_memory = random.uniform(40.0, 70.0)
            self.load_variance = random.uniform(0.1, 0.5)
            self.memory_variance = random.uniform(5.0, 15.0)

            # Some nodes might be "problematic" (higher load/memory)
            if random.random() < 0.2:  # 20% chance of being a "problematic" node
                self.base_load *= 2.0
                self.base_memory += 20.0
                logger.info(
                    f"Node {self.node_id} ({self.node_uuid[:8]}) will simulate high resource usage (IP: {self.source_ip})"
                )
        else:
            # Returning nodes keep some consistency but with variation
            self.base_load = random.uniform(0.3, 0.8)
            self.base_memory = random.uniform(45.0, 65.0)
            self.load_variance = random.uniform(0.1, 0.4)
            self.memory_variance = random.uniform(5.0, 12.0)

        logger.info(
            f"Node {self.node_id} ({'returning' if self.is_persistent else 'new'}) will bind to source IP: {self.source_ip}"
        )

    def test_ip_binding(self):
        """Test if we can bind to the assigned IP address.

        Returns True when a throwaway socket can bind to ``self.source_ip``;
        logs setup hints and returns False otherwise.
        """
        test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            test_socket.bind((self.source_ip, 0))  # Bind to any available port
            logger.debug(
                f"Node {self.node_id} successfully tested binding to {self.source_ip}"
            )
            return True
        except OSError as e:
            logger.error(f"Node {self.node_id} cannot bind to {self.source_ip}: {e}")
            logger.error(
                "Make sure the IP address is available on your loopback interface."
            )
            logger.error(
                f"You might need to add it with: sudo ifconfig lo0 alias {self.source_ip} (macOS)"
            )
            logger.error(f"Or: sudo ip addr add {self.source_ip}/8 dev lo (Linux)")
            return False
        finally:
            # Fix: always release the probe socket — it was previously leaked
            # when bind() raised OSError.
            test_socket.close()

    def generate_node_status_data(self):
        """Generates simulated node status metrics with per-node characteristics.

        Advances the simulated uptime and returns a dict with
        ``uptime_seconds``, ``load_avg`` (1/5/15-min) and
        ``memory_usage_percent`` (clamped to 10.0–95.0).
        """
        self.uptime_seconds += self.update_interval + random.randint(-1, 2)

        # Generate load with some randomness but consistent per-node baseline
        load_1min = max(
            0.1,
            self.base_load + random.uniform(-self.load_variance, self.load_variance),
        )
        load_5min = max(0.1, load_1min * random.uniform(0.8, 1.0))
        load_15min = max(0.1, load_5min * random.uniform(0.8, 1.0))

        load_avg = [round(load_1min, 2), round(load_5min, 2), round(load_15min, 2)]

        # Generate memory usage with baseline + variance
        memory_usage = max(
            10.0,
            min(
                95.0,
                self.base_memory
                + random.uniform(-self.memory_variance, self.memory_variance),
            ),
        )

        return {
            "uptime_seconds": self.uptime_seconds,
            "load_avg": load_avg,
            "memory_usage_percent": round(memory_usage, 2),
        }

    def generate_ping_data(self):
        """Generates simulated ping latencies (ms) to self and all known peers."""
        pings = {}

        # Ping to self (loopback)
        pings[self.node_uuid] = round(random.uniform(0.1, 1.5), 2)

        # Ping to known peers
        for peer_uuid in self.known_peers.keys():
            if peer_uuid != self.node_uuid:
                # Simulate network latency with some consistency per peer
                base_latency = random.uniform(5.0, 100.0)
                variation = random.uniform(-10.0, 10.0)
                latency = max(0.1, base_latency + variation)
                pings[peer_uuid] = round(latency, 2)

        return pings

    def send_update(self):
        """Sends a single status update to the server using the bound IP.

        PUTs status + ping data to ``{server_url}/{service_uuid}/{node_uuid}/``
        and refreshes ``known_peers`` from the server's response. Returns True
        on HTTP 200, False on any failure (timeout, connection error, non-200).
        """
        try:
            status_data = self.generate_node_status_data()
            ping_data = self.generate_ping_data()

            payload = {
                "node": self.node_uuid,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "status": status_data,
                "pings": ping_data,
            }
            endpoint_url = f"{self.server_url}/{self.service_uuid}/{self.node_uuid}/"

            logger.debug(
                f"Node {self.node_id} ({self.source_ip}) sending update. "
                f"Uptime: {status_data['uptime_seconds']}s, "
                f"Load: {status_data['load_avg'][0]}, Memory: {status_data['memory_usage_percent']}%, "
                f"Pings: {len(ping_data)}"
            )

            # Use the custom session with IP binding
            response = self.session.put(endpoint_url, json=payload, timeout=10)

            if response.status_code == 200:
                response_data = response.json()

                if "peers" in response_data and isinstance(
                    response_data["peers"], dict
                ):
                    # Shallow-copy the peer map from the response.
                    new_peers = dict(response_data["peers"])

                    # Log new peer discoveries
                    newly_discovered = set(new_peers.keys()) - set(
                        self.known_peers.keys()
                    )
                    if newly_discovered:
                        logger.info(
                            f"Node {self.node_id} ({self.source_ip}) discovered {len(newly_discovered)} new peer(s)"
                        )

                    self.known_peers = new_peers

                    if (
                        len(newly_discovered) > 0
                        or len(self.known_peers) != self.total_nodes - 1
                    ):
                        logger.debug(
                            f"Node {self.node_id} ({self.source_ip}) knows {len(self.known_peers)} peers "
                            f"(expected {self.total_nodes - 1})"
                        )

                return True
            else:
                logger.error(
                    f"Node {self.node_id} ({self.source_ip}) failed to send update. "
                    f"Status: {response.status_code}, Response: {response.text}"
                )
                return False

        except requests.exceptions.Timeout:
            logger.error(f"Node {self.node_id} ({self.source_ip}) request timed out")
            return False
        except requests.exceptions.ConnectionError as e:
            logger.error(
                f"Node {self.node_id} ({self.source_ip}) connection error: {e}"
            )
            return False
        except Exception as e:
            logger.error(
                f"Node {self.node_id} ({self.source_ip}) unexpected error: {e}"
            )
            return False

    def run(self):
        """Main loop for this simulated node.

        Verifies IP binding, then sends updates every ``update_interval``
        seconds (with jitter) until :meth:`stop` is called.
        """
        # Test IP binding before starting
        if not self.test_ip_binding():
            logger.error(f"Node {self.node_id} cannot start due to IP binding failure")
            return

        self.running = True
        logger.info(
            f"Starting Node {self.node_id} ({'returning' if self.is_persistent else 'new'}) with UUID: {self.node_uuid[:8]}... (IP: {self.source_ip})"
        )

        # Add some initial delay to stagger node starts
        initial_delay = self.node_id * 0.5
        time.sleep(initial_delay)

        consecutive_failures = 0

        while self.running:
            try:
                success = self.send_update()

                if success:
                    consecutive_failures = 0
                else:
                    consecutive_failures += 1
                    if consecutive_failures >= 3:
                        logger.warning(
                            f"Node {self.node_id} ({self.source_ip}) has failed {consecutive_failures} consecutive updates"
                        )

                # Add some jitter to prevent thundering herd
                jitter = random.uniform(-1.0, 1.0)
                sleep_time = max(1.0, self.update_interval + jitter)
                time.sleep(sleep_time)

            except KeyboardInterrupt:
                logger.info(
                    f"Node {self.node_id} ({self.source_ip}) received interrupt signal"
                )
                break
            except Exception as e:
                logger.error(
                    f"Node {self.node_id} ({self.source_ip}) unexpected error in main loop: {e}"
                )
                time.sleep(self.update_interval)

        logger.info(f"Node {self.node_id} ({self.source_ip}) stopped")

    def stop(self):
        """Signal the run loop to exit after its current iteration."""
        self.running = False
||||||
|
|
||||||
|
# --- Dynamic Multi-Node Manager ---
class DynamicMultiNodeManager:
    """Manages a fluctuating population of :class:`SimulatedNode` instances.

    Starts with ``min_nodes`` nodes and then, on a jittered timer, randomly
    grows or shrinks the population within [min_nodes, max_nodes]. Departing
    nodes may be remembered (``departed_nodes``) and later brought back with
    the same UUID to simulate nodes rejoining the cluster.
    """

    def __init__(
        self,
        min_nodes: int,
        max_nodes: int,
        server_url: str,
        service_uuid: str,
        update_interval: int,
        ip_base: int,
        change_interval: int,
        lifecycle_variance: int,
    ):
        self.min_nodes = min_nodes
        self.max_nodes = max_nodes
        self.server_url = server_url
        self.service_uuid = service_uuid
        self.update_interval = update_interval
        self.ip_base = ip_base
        self.change_interval = change_interval
        self.lifecycle_variance = lifecycle_variance

        # Track active nodes and their threads
        self.active_nodes = {}  # node_id -> SimulatedNode
        self.node_threads = {}  # node_id -> Thread
        self.node_counter = 1  # For assigning unique node IDs

        # Track node history for potential returns.
        # NOTE(review): 'characteristics' are stored here but not currently
        # re-applied when a node returns (SimulatedNode re-randomizes them).
        self.departed_nodes = {}  # UUID -> {'characteristics', 'last_seen'}

        self.running = False
        self.manager_thread = None

    def get_available_ip_addresses(self):
        """Return [(node_id_offset, ip), ...] for loopback IPs in the managed
        range (ip_base .. ip_base + 2*max_nodes - 1) not used by active nodes."""
        max_possible_ips = self.max_nodes * 2  # Allow some extra IPs
        available_ips = []
        used_ips = {node.source_ip for node in self.active_nodes.values()}

        for i in range(max_possible_ips):
            ip = f"127.0.0.{self.ip_base + i}"
            if ip not in used_ips:
                available_ips.append((i + 1, ip))  # (node_id_offset, ip)

        return available_ips

    def check_ip_availability(self, num_ips_needed):
        """Check if the required number of loopback IPs exist and are bindable.

        Returns True when ``num_ips_needed`` addresses are both free in the
        managed range and accept a test bind; otherwise logs setup help and
        returns False.
        """
        logger.info(f"Checking availability of {num_ips_needed} IP addresses...")
        available_ips = self.get_available_ip_addresses()

        if len(available_ips) < num_ips_needed:
            logger.error(
                f"Only {len(available_ips)} IP addresses available, need {num_ips_needed}"
            )
            return False

        # Test the first few IPs we'd actually use
        test_ips = available_ips[:num_ips_needed]
        all_available = True

        for node_id_offset, ip in test_ips:
            test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                test_socket.bind((ip, 0))
            except OSError as e:
                logger.error(f"Cannot bind to {ip}: {e}")
                all_available = False
            finally:
                # Fix: always release the probe socket — it was previously
                # leaked when bind() raised OSError.
                test_socket.close()

        if not all_available:
            logger.error("Some IP addresses are not available.")
            self.show_setup_commands()
            return False

        logger.info(f"All {num_ips_needed} IP addresses are available!")
        return True

    def show_setup_commands(self):
        """Log the OS-specific commands needed to alias the loopback IPs."""
        logger.info("=== Loopback IP Setup Commands ===")
        logger.info("Run these commands to add the required loopback IP addresses:")

        import platform

        system = platform.system().lower()

        max_ips_needed = self.max_nodes * 2
        for i in range(max_ips_needed):
            ip = f"127.0.0.{self.ip_base + i}"
            if system == "linux":
                logger.info(f"sudo ip addr add {ip}/8 dev lo")
            elif system == "darwin":  # macOS
                logger.info(f"sudo ifconfig lo0 alias {ip}")
            else:
                logger.info(f"Add {ip} to loopback interface (OS: {system})")

        logger.info("=" * 40)

    def create_new_node(self, return_existing=False):
        """Create (but do not start) a new node on a free loopback IP.

        When ``return_existing`` is True and departed nodes exist, there is a
        30% chance the new node reuses a departed node's UUID. Returns the
        node, or None when no IP is free.
        """
        available_ips = self.get_available_ip_addresses()
        if not available_ips:
            logger.warning("No available IP addresses for new nodes")
            return None

        node_id_offset, source_ip = available_ips[0]
        node_id = self.node_counter
        self.node_counter += 1

        # Decide if we should bring back an old node (30% chance if we have departed nodes)
        persistent_uuid = None
        if return_existing and self.departed_nodes and random.random() < 0.3:
            # Pick a random departed node to bring back
            persistent_uuid = random.choice(list(self.departed_nodes.keys()))
            logger.info(
                f"Bringing back departed node {persistent_uuid[:8]}... as Node {node_id}"
            )
            # Remove from departed list since it's returning
            del self.departed_nodes[persistent_uuid]

        node = SimulatedNode(
            node_id=node_id,
            total_nodes=len(self.active_nodes) + 1,  # +1 for this new node
            server_url=self.server_url,
            service_uuid=self.service_uuid,
            update_interval=self.update_interval,
            # Fix: SimulatedNode derives its IP as ip_base + node_id - 1, so
            # cancel node_id out here so the node lands exactly on the
            # reserved IP (self.ip_base + node_id_offset - 1 == source_ip).
            # The previous `self.ip_base + node_id_offset - 1` drifted upward
            # by (node_id - 1) as node_counter grew, eventually binding IPs
            # outside the tested range.
            ip_base=self.ip_base + node_id_offset - node_id,
            persistent_uuid=persistent_uuid,
        )

        return node

    def start_node(self, node):
        """Start a single node in its own daemon thread and register it."""
        thread = threading.Thread(target=node.run, name=f"Node-{node.node_id}")
        thread.daemon = True
        thread.start()

        self.active_nodes[node.node_id] = node
        self.node_threads[node.node_id] = thread

        logger.info(
            f"✅ Started Node {node.node_id} ({node.node_uuid[:8]}...) on {node.source_ip}"
        )

    def stop_node(self, node_id, permanently=False):
        """Stop a specific node and join its thread.

        Unless ``permanently`` is set, the node has a 70% chance of being
        remembered in ``departed_nodes`` for a possible later return.
        Returns False when ``node_id`` is not active.
        """
        if node_id not in self.active_nodes:
            return False

        node = self.active_nodes[node_id]
        thread = self.node_threads[node_id]

        # Store node info for potential return (unless it's permanently leaving)
        if not permanently and random.random() < 0.7:  # 70% chance node might return
            self.departed_nodes[node.node_uuid] = {
                "last_seen": datetime.now(),
                "characteristics": {
                    "base_load": node.base_load,
                    "base_memory": node.base_memory,
                    "load_variance": node.load_variance,
                    "memory_variance": node.memory_variance,
                },
            }
            logger.info(
                f"⏸️ Node {node_id} ({node.node_uuid[:8]}...) departing temporarily"
            )
        else:
            logger.info(
                f"❌ Node {node_id} ({node.node_uuid[:8]}...) leaving permanently"
            )

        node.stop()
        thread.join(timeout=5.0)

        del self.active_nodes[node_id]
        del self.node_threads[node_id]

        return True

    def adjust_node_count(self):
        """Randomly adjust the number of active nodes within [min, max]."""
        current_count = len(self.active_nodes)

        # Decide on target count
        target_count = random.randint(self.min_nodes, self.max_nodes)

        if target_count == current_count:
            logger.debug(f"Node count staying at {current_count}")
            return

        logger.info(f"🔄 Adjusting node count: {current_count} → {target_count}")

        if target_count > current_count:
            # Add nodes
            nodes_to_add = target_count - current_count
            for _ in range(nodes_to_add):
                node = self.create_new_node(return_existing=True)
                if node:
                    self.start_node(node)
                    time.sleep(random.uniform(1, 3))  # Stagger starts

        elif target_count < current_count:
            # Remove nodes
            nodes_to_remove = current_count - target_count
            active_node_ids = list(self.active_nodes.keys())
            nodes_to_stop = random.sample(active_node_ids, nodes_to_remove)

            for node_id in nodes_to_stop:
                # 20% chance node leaves permanently
                permanently = random.random() < 0.2
                self.stop_node(node_id, permanently=permanently)
                time.sleep(random.uniform(0.5, 2))  # Stagger stops

    def manage_node_lifecycle(self):
        """Main loop for managing node lifecycle changes.

        Boots ``min_nodes`` nodes, then on each (jittered) change interval
        has a 70% chance of re-targeting the node count.
        """
        logger.info("🚀 Starting dynamic node lifecycle management")

        # Start with minimum nodes
        logger.info(f"Initializing with {self.min_nodes} nodes...")
        for _ in range(self.min_nodes):
            node = self.create_new_node()
            if node:
                self.start_node(node)
                time.sleep(1)  # Brief delay between starts

        # Main management loop
        while self.running:
            try:
                # Wait for the change interval (with some randomness)
                variance = random.randint(
                    -self.lifecycle_variance, self.lifecycle_variance
                )
                sleep_time = max(
                    10, self.change_interval + variance
                )  # Minimum 10 seconds

                logger.debug(
                    f"Waiting {sleep_time} seconds until next potential change..."
                )
                time.sleep(sleep_time)

                if not self.running:
                    break

                # Randomly decide if we should make a change (70% chance)
                if random.random() < 0.7:
                    self.adjust_node_count()
                else:
                    logger.debug("Skipping this change cycle")

            except KeyboardInterrupt:
                logger.info("Node lifecycle manager received interrupt signal")
                break
            except Exception as e:
                logger.error(f"Error in node lifecycle management: {e}", exc_info=True)
                time.sleep(5)  # Brief pause before continuing

    def start_dynamic_management(self):
        """Verify IP availability and start the lifecycle manager thread.

        Returns True on success, False when the IP range is not usable.
        """
        if not self.check_ip_availability(self.max_nodes * 2):
            logger.error(
                "Cannot start dynamic management due to IP availability issues"
            )
            return False

        self.running = True
        self.manager_thread = threading.Thread(
            target=self.manage_node_lifecycle, name="NodeLifecycleManager"
        )
        self.manager_thread.daemon = True
        self.manager_thread.start()

        return True

    def stop_all_nodes(self):
        """Stop the manager thread, then permanently stop every active node."""
        logger.info("🛑 Stopping dynamic node management...")
        self.running = False

        # Stop the manager thread
        if self.manager_thread and self.manager_thread.is_alive():
            self.manager_thread.join(timeout=5.0)

        # Stop all active nodes
        for node_id in list(self.active_nodes.keys()):
            self.stop_node(node_id, permanently=True)

        logger.info("All nodes stopped")

    def print_status(self):
        """Log a snapshot of active and departed nodes."""
        active_count = len(self.active_nodes)
        departed_count = len(self.departed_nodes)

        logger.info("=== Dynamic Multi-Node Status ===")
        logger.info(
            f"Active nodes: {active_count} (range: {self.min_nodes}-{self.max_nodes})"
        )
        logger.info(f"Departed nodes (may return): {departed_count}")

        for node_id, node in self.active_nodes.items():
            status = "returning" if node.is_persistent else "new"
            logger.info(
                f" Node {node_id}: {status}, IP={node.source_ip}, UUID={node.node_uuid[:8]}..., Uptime={node.uptime_seconds}s"
            )

        if departed_count > 0:
            logger.info("Departed nodes that might return:")
            for uuid_val in list(self.departed_nodes.keys())[:5]:  # Show first 5
                logger.info(f" UUID={uuid_val[:8]}...")
|
||||||
|
def main():
    """CLI entry point: parse arguments, validate the configuration, and run
    the dynamic multi-node manager until interrupted.

    Prints a periodic status report every 30 seconds; Ctrl+C triggers a clean
    shutdown of all nodes via the ``finally`` block.
    """
    parser = argparse.ArgumentParser(
        description="Dynamic multi-node test client with fluctuating node counts"
    )
    parser.add_argument(
        "--min-nodes",
        type=int,
        default=DYNAMIC_MIN_NODES,
        help=f"Minimum number of nodes (default: {DYNAMIC_MIN_NODES})",
    )
    parser.add_argument(
        "--max-nodes",
        type=int,
        default=DYNAMIC_MAX_NODES,
        help=f"Maximum number of nodes (default: {DYNAMIC_MAX_NODES})",
    )
    parser.add_argument(
        "--change-interval",
        type=int,
        default=NODE_CHANGE_INTERVAL,
        help=f"Average seconds between node changes (default: {NODE_CHANGE_INTERVAL})",
    )
    parser.add_argument(
        "--lifecycle-variance",
        type=int,
        default=NODE_LIFECYCLE_VARIANCE,
        help=f"Random variance in change timing (default: {NODE_LIFECYCLE_VARIANCE})",
    )
    parser.add_argument(
        "--interval",
        type=int,
        default=UPDATE_INTERVAL_SECONDS,
        help=f"Update interval in seconds (default: {UPDATE_INTERVAL_SECONDS})",
    )
    parser.add_argument(
        "--server",
        type=str,
        default=SERVER_BASE_URL,
        help=f"Server URL (default: {SERVER_BASE_URL})",
    )
    parser.add_argument(
        "--service-uuid",
        type=str,
        default=TARGET_SERVICE_UUID,
        help="Target service UUID",
    )
    parser.add_argument(
        "--ip-base",
        type=int,
        default=LOOPBACK_IP_BASE,
        help=f"Starting IP for 127.0.0.X (default: {LOOPBACK_IP_BASE})",
    )
    parser.add_argument(
        "--setup-help",
        action="store_true",
        help="Show commands to set up loopback IP addresses",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Enable verbose logging"
    )
    # Keep the old --nodes argument for compatibility, but make it set max nodes
    parser.add_argument(
        "--nodes",
        type=int,
        help="Set max nodes (compatibility mode, same as --max-nodes)",
    )

    args = parser.parse_args()

    # Handle compatibility
    # NOTE(review): truthiness test means `--nodes 0` is treated as "not
    # given"; probably harmless here, but `args.nodes is not None` would be
    # more precise.
    if args.nodes:
        args.max_nodes = args.nodes
        if (
            args.min_nodes == DYNAMIC_MIN_NODES
        ):  # Only adjust min if it wasn't explicitly set
            args.min_nodes = max(
                1, args.nodes - 2
            )  # Set min to a reasonable lower bound

    min_nodes = args.min_nodes
    max_nodes = args.max_nodes
    change_interval = args.change_interval
    lifecycle_variance = args.lifecycle_variance
    update_interval = args.interval
    server_url = args.server
    service_uuid = args.service_uuid
    ip_base = args.ip_base

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Validate configuration
    # NOTE(review): this also rejects min == max (a fixed node count) —
    # confirm that is intended.
    if min_nodes >= max_nodes:
        logger.error("Minimum nodes must be less than maximum nodes")
        return

    if max_nodes < 1:
        logger.error("Maximum nodes must be at least 1")
        return

    if args.setup_help:
        # Create a temporary manager just to show setup commands
        temp_manager = DynamicMultiNodeManager(
            min_nodes,
            max_nodes,
            server_url,
            service_uuid,
            update_interval,
            ip_base,
            change_interval,
            lifecycle_variance,
        )
        temp_manager.show_setup_commands()
        return

    if service_uuid == "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID":
        logger.error("=" * 60)
        logger.error("ERROR: TARGET_SERVICE_UUID is not set correctly!")
        logger.error(
            "Please set it via --service-uuid argument or TARGET_SERVICE_UUID environment variable."
        )
        logger.error("=" * 60)
        return

    # Log the effective configuration banner before starting.
    logger.info("=" * 60)
    logger.info("Dynamic Multi-Node Test Client Configuration:")
    logger.info(f" Node range: {min_nodes} - {max_nodes} nodes")
    logger.info(
        f" Change interval: {change_interval}s (±{lifecycle_variance}s variance)"
    )
    logger.info(f" Update interval: {update_interval} seconds per node")
    logger.info(f" Server URL: {server_url}")
    logger.info(f" Target Service UUID: {service_uuid}")
    logger.info(
        f" IP range: 127.0.0.{ip_base} - 127.0.0.{ip_base + max_nodes * 2 - 1}"
    )
    logger.info("=" * 60)

    # Create and start the dynamic multi-node manager
    manager = DynamicMultiNodeManager(
        min_nodes,
        max_nodes,
        server_url,
        service_uuid,
        update_interval,
        ip_base,
        change_interval,
        lifecycle_variance,
    )

    try:
        if not manager.start_dynamic_management():
            logger.error("Failed to start dynamic node management")
            return

        # Main monitoring loop
        status_interval = 30  # Print status every 30 seconds
        last_status_time = time.time()

        logger.info(
            "🎯 Dynamic node management started! Nodes will fluctuate between {} and {} over time.".format(
                min_nodes, max_nodes
            )
        )
        logger.info("Press Ctrl+C to stop...")

        while True:
            time.sleep(5)  # Check every 5 seconds

            current_time = time.time()
            if current_time - last_status_time >= status_interval:
                manager.print_status()
                last_status_time = current_time

    except KeyboardInterrupt:
        logger.info("Received interrupt signal, shutting down...")
    except Exception as e:
        logger.error(f"Unexpected error in main: {e}", exc_info=True)
    finally:
        # Always tear down node threads, even on unexpected errors.
        manager.stop_all_nodes()
||||||
|
|
||||||
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
@ -15,7 +15,7 @@ from urllib3.util.connection import create_connection
|
|||||||
|
|
||||||
# --- Multi-Node Client Configuration ---
|
# --- Multi-Node Client Configuration ---
|
||||||
TARGET_SERVICE_UUID = os.environ.get(
|
TARGET_SERVICE_UUID = os.environ.get(
|
||||||
"TARGET_SERVICE_UUID", "c7c883fd-46f3-4b14-a727-d805ae0a6ec0"
|
"TARGET_SERVICE_UUID", "ab73d00a-8169-46bb-997d-f13e5f760973"
|
||||||
)
|
)
|
||||||
|
|
||||||
SERVER_BASE_URL = os.environ.get("SERVER_URL", "http://localhost:8000")
|
SERVER_BASE_URL = os.environ.get("SERVER_URL", "http://localhost:8000")
|
||||||
@ -28,48 +28,62 @@ LOOPBACK_IP_BASE = int(os.environ.get("LOOPBACK_IP_BASE", 2)) # Start from 127.
|
|||||||
# --- Logging Configuration ---
|
# --- Logging Configuration ---
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - [%(thread)d] - %(message)s'
|
format="%(asctime)s - %(name)s - %(levelname)s - [%(thread)d] - %(message)s",
|
||||||
)
|
)
|
||||||
logger = logging.getLogger("MultiNodeClient")
|
logger = logging.getLogger("MultiNodeClient")
|
||||||
|
|
||||||
|
|
||||||
# --- Custom HTTP Adapter for Source IP Binding ---
|
# --- Custom HTTP Adapter for Source IP Binding ---
|
||||||
class SourceIPHTTPAdapter(HTTPAdapter):
|
class SourceIPHTTPAdapter(HTTPAdapter):
|
||||||
def __init__(self, source_ip, *args, **kwargs):
|
def __init__(self, source_ip, *args, **kwargs):
|
||||||
self.source_ip = source_ip
|
self.source_ip = source_ip
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
def init_poolmanager(self, *args, **kwargs):
|
def init_poolmanager(self, *args, **kwargs):
|
||||||
# Override the socket creation to bind to specific source IP
|
# Override the socket creation to bind to specific source IP
|
||||||
def custom_create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
|
def custom_create_connection(
|
||||||
source_address=None, socket_options=None):
|
address,
|
||||||
|
timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
|
||||||
|
source_address=None,
|
||||||
|
socket_options=None,
|
||||||
|
):
|
||||||
# Force our custom source address
|
# Force our custom source address
|
||||||
return create_connection(
|
return create_connection(
|
||||||
address,
|
address,
|
||||||
timeout,
|
timeout,
|
||||||
source_address=(self.source_ip, 0), # 0 = any available port
|
source_address=(self.source_ip, 0), # 0 = any available port
|
||||||
socket_options=socket_options
|
socket_options=socket_options,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Monkey patch the connection creation
|
# Monkey patch the connection creation
|
||||||
original_create_connection = socket.create_connection
|
original_create_connection = socket.create_connection
|
||||||
socket.create_connection = custom_create_connection
|
socket.create_connection = custom_create_connection
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = super().init_poolmanager(*args, **kwargs)
|
result = super().init_poolmanager(*args, **kwargs)
|
||||||
finally:
|
finally:
|
||||||
# Restore original function
|
# Restore original function
|
||||||
socket.create_connection = original_create_connection
|
socket.create_connection = original_create_connection
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
# --- Enhanced Node Class with IP Binding ---
|
# --- Enhanced Node Class with IP Binding ---
|
||||||
class SimulatedNode:
|
class SimulatedNode:
|
||||||
def __init__(self, node_id: int, total_nodes: int, server_url: str, service_uuid: str, update_interval: int, ip_base: int):
|
def __init__(
|
||||||
|
self,
|
||||||
|
node_id: int,
|
||||||
|
total_nodes: int,
|
||||||
|
server_url: str,
|
||||||
|
service_uuid: str,
|
||||||
|
update_interval: int,
|
||||||
|
ip_base: int,
|
||||||
|
):
|
||||||
self.node_id = node_id
|
self.node_id = node_id
|
||||||
self.node_uuid = str(uuid.uuid4())
|
self.node_uuid = str(uuid.uuid4())
|
||||||
self.server_url = server_url # Store server URL
|
self.server_url = server_url # Store server URL
|
||||||
self.service_uuid = service_uuid # Store service UUID
|
self.service_uuid = service_uuid # Store service UUID
|
||||||
self.update_interval = update_interval # Store update interval
|
self.update_interval = update_interval # Store update interval
|
||||||
self.uptime_seconds = 0
|
self.uptime_seconds = 0
|
||||||
self.known_peers = {}
|
self.known_peers = {}
|
||||||
self.total_nodes = total_nodes
|
self.total_nodes = total_nodes
|
||||||
@ -81,21 +95,23 @@ class SimulatedNode:
|
|||||||
# Create requests session with custom adapter for IP binding
|
# Create requests session with custom adapter for IP binding
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
adapter = SourceIPHTTPAdapter(self.source_ip)
|
adapter = SourceIPHTTPAdapter(self.source_ip)
|
||||||
self.session.mount('http://', adapter)
|
self.session.mount("http://", adapter)
|
||||||
self.session.mount('https://', adapter)
|
self.session.mount("https://", adapter)
|
||||||
|
|
||||||
# Each node gets slightly different characteristics
|
# Each node gets slightly different characteristics
|
||||||
self.base_load = random.uniform(0.2, 1.0)
|
self.base_load = random.uniform(0.2, 1.0)
|
||||||
self.base_memory = random.uniform(40.0, 70.0)
|
self.base_memory = random.uniform(40.0, 70.0)
|
||||||
self.load_variance = random.uniform(0.1, 0.5)
|
self.load_variance = random.uniform(0.1, 0.5)
|
||||||
self.memory_variance = random.uniform(5.0, 15.0)
|
self.memory_variance = random.uniform(5.0, 15.0)
|
||||||
|
|
||||||
# Some nodes might be "problematic" (higher load/memory)
|
# Some nodes might be "problematic" (higher load/memory)
|
||||||
if random.random() < 0.2: # 20% chance of being a "problematic" node
|
if random.random() < 0.2: # 20% chance of being a "problematic" node
|
||||||
self.base_load *= 2.0
|
self.base_load *= 2.0
|
||||||
self.base_memory += 20.0
|
self.base_memory += 20.0
|
||||||
logger.info(f"Node {self.node_id} ({self.node_uuid[:8]}) will simulate high resource usage (IP: {self.source_ip})")
|
logger.info(
|
||||||
|
f"Node {self.node_id} ({self.node_uuid[:8]}) will simulate high resource usage (IP: {self.source_ip})"
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(f"Node {self.node_id} will bind to source IP: {self.source_ip}")
|
logger.info(f"Node {self.node_id} will bind to source IP: {self.source_ip}")
|
||||||
|
|
||||||
def test_ip_binding(self):
|
def test_ip_binding(self):
|
||||||
@ -105,12 +121,18 @@ class SimulatedNode:
|
|||||||
test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
test_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||||
test_socket.bind((self.source_ip, 0)) # Bind to any available port
|
test_socket.bind((self.source_ip, 0)) # Bind to any available port
|
||||||
test_socket.close()
|
test_socket.close()
|
||||||
logger.debug(f"Node {self.node_id} successfully tested binding to {self.source_ip}")
|
logger.debug(
|
||||||
|
f"Node {self.node_id} successfully tested binding to {self.source_ip}"
|
||||||
|
)
|
||||||
return True
|
return True
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
logger.error(f"Node {self.node_id} cannot bind to {self.source_ip}: {e}")
|
logger.error(f"Node {self.node_id} cannot bind to {self.source_ip}: {e}")
|
||||||
logger.error("Make sure the IP address is available on your loopback interface.")
|
logger.error(
|
||||||
logger.error("You might need to add it with: sudo ifconfig lo0 alias {self.source_ip} (macOS)")
|
"Make sure the IP address is available on your loopback interface."
|
||||||
|
)
|
||||||
|
logger.error(
|
||||||
|
"You might need to add it with: sudo ifconfig lo0 alias {self.source_ip} (macOS)"
|
||||||
|
)
|
||||||
logger.error("Or: sudo ip addr add {self.source_ip}/8 dev lo (Linux)")
|
logger.error("Or: sudo ip addr add {self.source_ip}/8 dev lo (Linux)")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -118,35 +140,40 @@ class SimulatedNode:
|
|||||||
"""Generates simulated node status metrics with per-node characteristics."""
|
"""Generates simulated node status metrics with per-node characteristics."""
|
||||||
|
|
||||||
self.uptime_seconds += self.update_interval + random.randint(-1, 2)
|
self.uptime_seconds += self.update_interval + random.randint(-1, 2)
|
||||||
|
|
||||||
# Generate load with some randomness but consistent per-node baseline
|
# Generate load with some randomness but consistent per-node baseline
|
||||||
load_1min = max(0.1, self.base_load + random.uniform(-self.load_variance, self.load_variance))
|
load_1min = max(
|
||||||
|
0.1,
|
||||||
|
self.base_load + random.uniform(-self.load_variance, self.load_variance),
|
||||||
|
)
|
||||||
load_5min = max(0.1, load_1min * random.uniform(0.8, 1.0))
|
load_5min = max(0.1, load_1min * random.uniform(0.8, 1.0))
|
||||||
load_15min = max(0.1, load_5min * random.uniform(0.8, 1.0))
|
load_15min = max(0.1, load_5min * random.uniform(0.8, 1.0))
|
||||||
|
|
||||||
load_avg = [
|
load_avg = [round(load_1min, 2), round(load_5min, 2), round(load_15min, 2)]
|
||||||
round(load_1min, 2),
|
|
||||||
round(load_5min, 2),
|
|
||||||
round(load_15min, 2)
|
|
||||||
]
|
|
||||||
|
|
||||||
# Generate memory usage with baseline + variance
|
# Generate memory usage with baseline + variance
|
||||||
memory_usage = max(10.0, min(95.0,
|
memory_usage = max(
|
||||||
self.base_memory + random.uniform(-self.memory_variance, self.memory_variance)))
|
10.0,
|
||||||
|
min(
|
||||||
|
95.0,
|
||||||
|
self.base_memory
|
||||||
|
+ random.uniform(-self.memory_variance, self.memory_variance),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"uptime_seconds": self.uptime_seconds,
|
"uptime_seconds": self.uptime_seconds,
|
||||||
"load_avg": load_avg,
|
"load_avg": load_avg,
|
||||||
"memory_usage_percent": round(memory_usage, 2)
|
"memory_usage_percent": round(memory_usage, 2),
|
||||||
}
|
}
|
||||||
|
|
||||||
def generate_ping_data(self):
|
def generate_ping_data(self):
|
||||||
"""Generates simulated ping latencies to known peers."""
|
"""Generates simulated ping latencies to known peers."""
|
||||||
pings = {}
|
pings = {}
|
||||||
|
|
||||||
# Ping to self (loopback)
|
# Ping to self (loopback)
|
||||||
pings[self.node_uuid] = round(random.uniform(0.1, 1.5), 2)
|
pings[self.node_uuid] = round(random.uniform(0.1, 1.5), 2)
|
||||||
|
|
||||||
# Ping to known peers
|
# Ping to known peers
|
||||||
for peer_uuid in self.known_peers.keys():
|
for peer_uuid in self.known_peers.keys():
|
||||||
if peer_uuid != self.node_uuid:
|
if peer_uuid != self.node_uuid:
|
||||||
@ -155,7 +182,7 @@ class SimulatedNode:
|
|||||||
variation = random.uniform(-10.0, 10.0)
|
variation = random.uniform(-10.0, 10.0)
|
||||||
latency = max(0.1, base_latency + variation)
|
latency = max(0.1, base_latency + variation)
|
||||||
pings[peer_uuid] = round(latency, 2)
|
pings[peer_uuid] = round(latency, 2)
|
||||||
|
|
||||||
return pings
|
return pings
|
||||||
|
|
||||||
def send_update(self):
|
def send_update(self):
|
||||||
@ -168,49 +195,68 @@ class SimulatedNode:
|
|||||||
"node": self.node_uuid,
|
"node": self.node_uuid,
|
||||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
"status": status_data,
|
"status": status_data,
|
||||||
"pings": ping_data
|
"pings": ping_data,
|
||||||
}
|
}
|
||||||
endpoint_url = f"{self.server_url}/{self.service_uuid}/{self.node_uuid}/"
|
endpoint_url = f"{self.server_url}/{self.service_uuid}/{self.node_uuid}/"
|
||||||
|
|
||||||
logger.debug(f"Node {self.node_id} ({self.source_ip}) sending update. "
|
logger.debug(
|
||||||
f"Uptime: {status_data['uptime_seconds']}s, "
|
f"Node {self.node_id} ({self.source_ip}) sending update. "
|
||||||
f"Load: {status_data['load_avg'][0]}, Memory: {status_data['memory_usage_percent']}%, "
|
f"Uptime: {status_data['uptime_seconds']}s, "
|
||||||
f"Pings: {len(ping_data)}")
|
f"Load: {status_data['load_avg'][0]}, Memory: {status_data['memory_usage_percent']}%, "
|
||||||
|
f"Pings: {len(ping_data)}"
|
||||||
|
)
|
||||||
|
|
||||||
# Use the custom session with IP binding
|
# Use the custom session with IP binding
|
||||||
response = self.session.put(endpoint_url, json=payload, timeout=10)
|
response = self.session.put(endpoint_url, json=payload, timeout=10)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
response_data = response.json()
|
response_data = response.json()
|
||||||
|
|
||||||
if "peers" in response_data and isinstance(response_data["peers"], dict):
|
if "peers" in response_data and isinstance(
|
||||||
|
response_data["peers"], dict
|
||||||
|
):
|
||||||
new_peers = {k: v for k, v in response_data["peers"].items()}
|
new_peers = {k: v for k, v in response_data["peers"].items()}
|
||||||
|
|
||||||
# Log new peer discoveries
|
# Log new peer discoveries
|
||||||
newly_discovered = set(new_peers.keys()) - set(self.known_peers.keys())
|
newly_discovered = set(new_peers.keys()) - set(
|
||||||
|
self.known_peers.keys()
|
||||||
|
)
|
||||||
if newly_discovered:
|
if newly_discovered:
|
||||||
logger.info(f"Node {self.node_id} ({self.source_ip}) discovered {len(newly_discovered)} new peer(s)")
|
logger.info(
|
||||||
|
f"Node {self.node_id} ({self.source_ip}) discovered {len(newly_discovered)} new peer(s)"
|
||||||
|
)
|
||||||
|
|
||||||
self.known_peers = new_peers
|
self.known_peers = new_peers
|
||||||
|
|
||||||
if len(newly_discovered) > 0 or len(self.known_peers) != self.total_nodes - 1:
|
if (
|
||||||
logger.debug(f"Node {self.node_id} ({self.source_ip}) knows {len(self.known_peers)} peers "
|
len(newly_discovered) > 0
|
||||||
f"(expected {self.total_nodes - 1})")
|
or len(self.known_peers) != self.total_nodes - 1
|
||||||
|
):
|
||||||
|
logger.debug(
|
||||||
|
f"Node {self.node_id} ({self.source_ip}) knows {len(self.known_peers)} peers "
|
||||||
|
f"(expected {self.total_nodes - 1})"
|
||||||
|
)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
logger.error(f"Node {self.node_id} ({self.source_ip}) failed to send update. "
|
logger.error(
|
||||||
f"Status: {response.status_code}, Response: {response.text}")
|
f"Node {self.node_id} ({self.source_ip}) failed to send update. "
|
||||||
|
f"Status: {response.status_code}, Response: {response.text}"
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except requests.exceptions.Timeout:
|
except requests.exceptions.Timeout:
|
||||||
logger.error(f"Node {self.node_id} ({self.source_ip}) request timed out")
|
logger.error(f"Node {self.node_id} ({self.source_ip}) request timed out")
|
||||||
return False
|
return False
|
||||||
except requests.exceptions.ConnectionError as e:
|
except requests.exceptions.ConnectionError as e:
|
||||||
logger.error(f"Node {self.node_id} ({self.source_ip}) connection error: {e}")
|
logger.error(
|
||||||
|
f"Node {self.node_id} ({self.source_ip}) connection error: {e}"
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Node {self.node_id} ({self.source_ip}) unexpected error: {e}")
|
logger.error(
|
||||||
|
f"Node {self.node_id} ({self.source_ip}) unexpected error: {e}"
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
@ -219,37 +265,45 @@ class SimulatedNode:
|
|||||||
if not self.test_ip_binding():
|
if not self.test_ip_binding():
|
||||||
logger.error(f"Node {self.node_id} cannot start due to IP binding failure")
|
logger.error(f"Node {self.node_id} cannot start due to IP binding failure")
|
||||||
return
|
return
|
||||||
|
|
||||||
self.running = True
|
self.running = True
|
||||||
logger.info(f"Starting Node {self.node_id} with UUID: {self.node_uuid} (IP: {self.source_ip})")
|
logger.info(
|
||||||
|
f"Starting Node {self.node_id} with UUID: {self.node_uuid} (IP: {self.source_ip})"
|
||||||
|
)
|
||||||
|
|
||||||
# Add some initial delay to stagger node starts
|
# Add some initial delay to stagger node starts
|
||||||
initial_delay = self.node_id * 0.5
|
initial_delay = self.node_id * 0.5
|
||||||
time.sleep(initial_delay)
|
time.sleep(initial_delay)
|
||||||
|
|
||||||
consecutive_failures = 0
|
consecutive_failures = 0
|
||||||
|
|
||||||
while self.running:
|
while self.running:
|
||||||
try:
|
try:
|
||||||
success = self.send_update()
|
success = self.send_update()
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
consecutive_failures = 0
|
consecutive_failures = 0
|
||||||
else:
|
else:
|
||||||
consecutive_failures += 1
|
consecutive_failures += 1
|
||||||
if consecutive_failures >= 3:
|
if consecutive_failures >= 3:
|
||||||
logger.warning(f"Node {self.node_id} ({self.source_ip}) has failed {consecutive_failures} consecutive updates")
|
logger.warning(
|
||||||
|
f"Node {self.node_id} ({self.source_ip}) has failed {consecutive_failures} consecutive updates"
|
||||||
|
)
|
||||||
|
|
||||||
# Add some jitter to prevent thundering herd
|
# Add some jitter to prevent thundering herd
|
||||||
jitter = random.uniform(-1.0, 1.0)
|
jitter = random.uniform(-1.0, 1.0)
|
||||||
sleep_time = max(1.0, self.update_interval + jitter)
|
sleep_time = max(1.0, self.update_interval + jitter)
|
||||||
time.sleep(sleep_time)
|
time.sleep(sleep_time)
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
logger.info(f"Node {self.node_id} ({self.source_ip}) received interrupt signal")
|
logger.info(
|
||||||
|
f"Node {self.node_id} ({self.source_ip}) received interrupt signal"
|
||||||
|
)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Node {self.node_id} ({self.source_ip}) unexpected error in main loop: {e}")
|
logger.error(
|
||||||
|
f"Node {self.node_id} ({self.source_ip}) unexpected error in main loop: {e}"
|
||||||
|
)
|
||||||
time.sleep(self.update_interval)
|
time.sleep(self.update_interval)
|
||||||
|
|
||||||
logger.info(f"Node {self.node_id} ({self.source_ip}) stopped")
|
logger.info(f"Node {self.node_id} ({self.source_ip}) stopped")
|
||||||
@ -258,9 +312,17 @@ class SimulatedNode:
|
|||||||
"""Stop the node."""
|
"""Stop the node."""
|
||||||
self.running = False
|
self.running = False
|
||||||
|
|
||||||
|
|
||||||
# --- Multi-Node Manager ---
|
# --- Multi-Node Manager ---
|
||||||
class MultiNodeManager:
|
class MultiNodeManager:
|
||||||
def __init__(self, num_nodes: int, server_url: str, service_uuid: str, update_interval: int, ip_base: int):
|
def __init__(
|
||||||
|
self,
|
||||||
|
num_nodes: int,
|
||||||
|
server_url: str,
|
||||||
|
service_uuid: str,
|
||||||
|
update_interval: int,
|
||||||
|
ip_base: int,
|
||||||
|
):
|
||||||
self.num_nodes = num_nodes
|
self.num_nodes = num_nodes
|
||||||
self.server_url = server_url
|
self.server_url = server_url
|
||||||
self.service_uuid = service_uuid
|
self.service_uuid = service_uuid
|
||||||
@ -269,30 +331,36 @@ class MultiNodeManager:
|
|||||||
self.nodes = []
|
self.nodes = []
|
||||||
self.threads = []
|
self.threads = []
|
||||||
self.running = False
|
self.running = False
|
||||||
|
|
||||||
# Create simulated nodes
|
# Create simulated nodes
|
||||||
for i in range(num_nodes):
|
for i in range(num_nodes):
|
||||||
node = SimulatedNode(i + 1, num_nodes, server_url, service_uuid, update_interval, ip_base)
|
node = SimulatedNode(
|
||||||
|
i + 1, num_nodes, server_url, service_uuid, update_interval, ip_base
|
||||||
|
)
|
||||||
self.nodes.append(node)
|
self.nodes.append(node)
|
||||||
|
|
||||||
def check_ip_availability(self):
|
def check_ip_availability(self):
|
||||||
"""Check if all required IP addresses are available."""
|
"""Check if all required IP addresses are available."""
|
||||||
logger.info("Checking IP address availability...")
|
logger.info("Checking IP address availability...")
|
||||||
all_available = True
|
all_available = True
|
||||||
|
|
||||||
for node in self.nodes:
|
for node in self.nodes:
|
||||||
if not node.test_ip_binding():
|
if not node.test_ip_binding():
|
||||||
all_available = False
|
all_available = False
|
||||||
|
|
||||||
if not all_available:
|
if not all_available:
|
||||||
logger.error("Some IP addresses are not available. See individual node errors above.")
|
logger.error(
|
||||||
|
"Some IP addresses are not available. See individual node errors above."
|
||||||
|
)
|
||||||
logger.info("To add loopback IP addresses:")
|
logger.info("To add loopback IP addresses:")
|
||||||
logger.info(" Linux: sudo ip addr add 127.0.0.X/8 dev lo")
|
logger.info(" Linux: sudo ip addr add 127.0.0.X/8 dev lo")
|
||||||
logger.info(" macOS: sudo ifconfig lo0 alias 127.0.0.X")
|
logger.info(" macOS: sudo ifconfig lo0 alias 127.0.0.X")
|
||||||
# Use self.ip_base for the range
|
# Use self.ip_base for the range
|
||||||
logger.info(f" Where X ranges from {self.ip_base} to {self.ip_base + self.num_nodes - 1}")
|
logger.info(
|
||||||
|
f" Where X ranges from {self.ip_base} to {self.ip_base + self.num_nodes - 1}"
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
logger.info("All IP addresses are available!")
|
logger.info("All IP addresses are available!")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -300,16 +368,18 @@ class MultiNodeManager:
|
|||||||
"""Start all simulated nodes in separate threads."""
|
"""Start all simulated nodes in separate threads."""
|
||||||
if not self.check_ip_availability():
|
if not self.check_ip_availability():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
logger.info(f"Starting {self.num_nodes} simulated nodes with unique IP addresses...")
|
logger.info(
|
||||||
|
f"Starting {self.num_nodes} simulated nodes with unique IP addresses..."
|
||||||
|
)
|
||||||
self.running = True
|
self.running = True
|
||||||
|
|
||||||
for node in self.nodes:
|
for node in self.nodes:
|
||||||
thread = threading.Thread(target=node.run, name=f"Node-{node.node_id}")
|
thread = threading.Thread(target=node.run, name=f"Node-{node.node_id}")
|
||||||
thread.daemon = True
|
thread.daemon = True
|
||||||
self.threads.append(thread)
|
self.threads.append(thread)
|
||||||
thread.start()
|
thread.start()
|
||||||
|
|
||||||
logger.info(f"All {self.num_nodes} nodes started")
|
logger.info(f"All {self.num_nodes} nodes started")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -317,33 +387,37 @@ class MultiNodeManager:
|
|||||||
"""Stop all simulated nodes."""
|
"""Stop all simulated nodes."""
|
||||||
logger.info("Stopping all nodes...")
|
logger.info("Stopping all nodes...")
|
||||||
self.running = False
|
self.running = False
|
||||||
|
|
||||||
for node in self.nodes:
|
for node in self.nodes:
|
||||||
node.stop()
|
node.stop()
|
||||||
|
|
||||||
# Wait for threads to finish
|
# Wait for threads to finish
|
||||||
for thread in self.threads:
|
for thread in self.threads:
|
||||||
thread.join(timeout=5.0)
|
thread.join(timeout=5.0)
|
||||||
|
|
||||||
logger.info("All nodes stopped")
|
logger.info("All nodes stopped")
|
||||||
|
|
||||||
def print_status(self):
|
def print_status(self):
|
||||||
"""Print current status of all nodes."""
|
"""Print current status of all nodes."""
|
||||||
logger.info(f"=== Multi-Node Status ({self.num_nodes} nodes) ===")
|
logger.info(f"=== Multi-Node Status ({self.num_nodes} nodes) ===")
|
||||||
for node in self.nodes:
|
for node in self.nodes:
|
||||||
logger.info(f"Node {node.node_id}: IP={node.source_ip}, UUID={node.node_uuid[:8]}..., "
|
logger.info(
|
||||||
f"Uptime={node.uptime_seconds}s, Peers={len(node.known_peers)}")
|
f"Node {node.node_id}: IP={node.source_ip}, UUID={node.node_uuid[:8]}..., "
|
||||||
|
f"Uptime={node.uptime_seconds}s, Peers={len(node.known_peers)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def setup_loopback_ips(num_nodes, base_ip):
|
def setup_loopback_ips(num_nodes, base_ip):
|
||||||
"""Helper function to show commands for setting up loopback IPs."""
|
"""Helper function to show commands for setting up loopback IPs."""
|
||||||
logger.info("=== Loopback IP Setup Commands ===")
|
logger.info("=== Loopback IP Setup Commands ===")
|
||||||
logger.info("Run these commands to add the required loopback IP addresses:")
|
logger.info("Run these commands to add the required loopback IP addresses:")
|
||||||
logger.info("")
|
logger.info("")
|
||||||
|
|
||||||
# Detect OS and show appropriate commands
|
# Detect OS and show appropriate commands
|
||||||
import platform
|
import platform
|
||||||
|
|
||||||
system = platform.system().lower()
|
system = platform.system().lower()
|
||||||
|
|
||||||
for i in range(num_nodes):
|
for i in range(num_nodes):
|
||||||
ip = f"127.0.0.{base_ip + i}"
|
ip = f"127.0.0.{base_ip + i}"
|
||||||
if system == "linux":
|
if system == "linux":
|
||||||
@ -352,7 +426,7 @@ def setup_loopback_ips(num_nodes, base_ip):
|
|||||||
logger.info(f"sudo ifconfig lo0 alias {ip}")
|
logger.info(f"sudo ifconfig lo0 alias {ip}")
|
||||||
else:
|
else:
|
||||||
logger.info(f"Add {ip} to loopback interface (OS: {system})")
|
logger.info(f"Add {ip} to loopback interface (OS: {system})")
|
||||||
|
|
||||||
logger.info("")
|
logger.info("")
|
||||||
logger.info("To remove them later:")
|
logger.info("To remove them later:")
|
||||||
for i in range(num_nodes):
|
for i in range(num_nodes):
|
||||||
@ -361,51 +435,82 @@ def setup_loopback_ips(num_nodes, base_ip):
|
|||||||
logger.info(f"sudo ip addr del {ip}/8 dev lo")
|
logger.info(f"sudo ip addr del {ip}/8 dev lo")
|
||||||
elif system == "darwin": # macOS
|
elif system == "darwin": # macOS
|
||||||
logger.info(f"sudo ifconfig lo0 -alias {ip}")
|
logger.info(f"sudo ifconfig lo0 -alias {ip}")
|
||||||
|
|
||||||
logger.info("=" * 40)
|
logger.info("=" * 40)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description='Multi-node test client with unique IP binding')
|
parser = argparse.ArgumentParser(
|
||||||
parser.add_argument('--nodes', type=int, default=NUM_NODES,
|
description="Multi-node test client with unique IP binding"
|
||||||
help=f'Number of simulated nodes (default: {NUM_NODES})')
|
)
|
||||||
parser.add_argument('--interval', type=int, default=UPDATE_INTERVAL_SECONDS,
|
parser.add_argument(
|
||||||
help=f'Update interval in seconds (default: {UPDATE_INTERVAL_SECONDS})')
|
"--nodes",
|
||||||
parser.add_argument('--server', type=str, default=SERVER_BASE_URL,
|
type=int,
|
||||||
help=f'Server URL (default: {SERVER_BASE_URL})')
|
default=NUM_NODES,
|
||||||
parser.add_argument('--service-uuid', type=str, default=TARGET_SERVICE_UUID,
|
help=f"Number of simulated nodes (default: {NUM_NODES})",
|
||||||
help='Target service UUID')
|
)
|
||||||
parser.add_argument('--ip-base', type=int, default=LOOPBACK_IP_BASE,
|
parser.add_argument(
|
||||||
help=f'Starting IP for 127.0.0.X (default: {LOOPBACK_IP_BASE})')
|
"--interval",
|
||||||
parser.add_argument('--setup-help', action='store_true',
|
type=int,
|
||||||
help='Show commands to set up loopback IP addresses')
|
default=UPDATE_INTERVAL_SECONDS,
|
||||||
parser.add_argument('--verbose', '-v', action='store_true',
|
help=f"Update interval in seconds (default: {UPDATE_INTERVAL_SECONDS})",
|
||||||
help='Enable verbose logging')
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--server",
|
||||||
|
type=str,
|
||||||
|
default=SERVER_BASE_URL,
|
||||||
|
help=f"Server URL (default: {SERVER_BASE_URL})",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--service-uuid",
|
||||||
|
type=str,
|
||||||
|
default=TARGET_SERVICE_UUID,
|
||||||
|
help="Target service UUID",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ip-base",
|
||||||
|
type=int,
|
||||||
|
default=LOOPBACK_IP_BASE,
|
||||||
|
help=f"Starting IP for 127.0.0.X (default: {LOOPBACK_IP_BASE})",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--setup-help",
|
||||||
|
action="store_true",
|
||||||
|
help="Show commands to set up loopback IP addresses",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--verbose", "-v", action="store_true", help="Enable verbose logging"
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
num_nodes = args.nodes
|
num_nodes = args.nodes
|
||||||
update_interval = args.interval
|
update_interval = args.interval
|
||||||
server_url = args.server
|
server_url = args.server
|
||||||
service_uuid = args.service_uuid
|
service_uuid = args.service_uuid
|
||||||
ip_base = args.ip_base
|
ip_base = args.ip_base
|
||||||
|
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
logging.getLogger().setLevel(logging.DEBUG)
|
logging.getLogger().setLevel(logging.DEBUG)
|
||||||
|
|
||||||
if args.setup_help:
|
if args.setup_help:
|
||||||
setup_loopback_ips(num_nodes, ip_base)
|
setup_loopback_ips(num_nodes, ip_base)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Validate configuration
|
# Validate configuration
|
||||||
if service_uuid == "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID":
|
if service_uuid == "REPLACE_ME_WITH_YOUR_SERVER_SERVICE_UUID":
|
||||||
logger.error("=" * 60)
|
logger.error("=" * 60)
|
||||||
logger.error("ERROR: TARGET_SERVICE_UUID is not set correctly!")
|
logger.error("ERROR: TARGET_SERVICE_UUID is not set correctly!")
|
||||||
logger.error("Please set it via --service-uuid argument or TARGET_SERVICE_UUID environment variable.")
|
logger.error(
|
||||||
logger.error("You can find the server's UUID by running main.py and checking its console output")
|
"Please set it via --service-uuid argument or TARGET_SERVICE_UUID environment variable."
|
||||||
|
)
|
||||||
|
logger.error(
|
||||||
|
"You can find the server's UUID by running main.py and checking its console output"
|
||||||
|
)
|
||||||
logger.error("or by visiting the server's root endpoint in your browser.")
|
logger.error("or by visiting the server's root endpoint in your browser.")
|
||||||
logger.error("=" * 60)
|
logger.error("=" * 60)
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("Multi-Node Test Client Configuration:")
|
logger.info("Multi-Node Test Client Configuration:")
|
||||||
logger.info(f" Number of nodes: {num_nodes}")
|
logger.info(f" Number of nodes: {num_nodes}")
|
||||||
@ -414,21 +519,23 @@ def main():
|
|||||||
logger.info(f" Target Service UUID: {service_uuid}")
|
logger.info(f" Target Service UUID: {service_uuid}")
|
||||||
logger.info(f" IP range: 127.0.0.{ip_base} - 127.0.0.{ip_base + num_nodes - 1}")
|
logger.info(f" IP range: 127.0.0.{ip_base} - 127.0.0.{ip_base + num_nodes - 1}")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
|
|
||||||
# Create and start the multi-node manager
|
# Create and start the multi-node manager
|
||||||
manager = MultiNodeManager(num_nodes, server_url, service_uuid, update_interval, ip_base)
|
manager = MultiNodeManager(
|
||||||
|
num_nodes, server_url, service_uuid, update_interval, ip_base
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not manager.start_all_nodes():
|
if not manager.start_all_nodes():
|
||||||
logger.error("Failed to start nodes. Check IP availability.")
|
logger.error("Failed to start nodes. Check IP availability.")
|
||||||
setup_loopback_ips(num_nodes, ip_base)
|
setup_loopback_ips(num_nodes, ip_base)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Main monitoring loop
|
# Main monitoring loop
|
||||||
while True:
|
while True:
|
||||||
time.sleep(30) # Print status every 30 seconds
|
time.sleep(30) # Print status every 30 seconds
|
||||||
manager.print_status()
|
manager.print_status()
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
logger.info("Received interrupt signal, shutting down...")
|
logger.info("Received interrupt signal, shutting down...")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -436,5 +543,6 @@ def main():
|
|||||||
finally:
|
finally:
|
||||||
manager.stop_all_nodes()
|
manager.stop_all_nodes()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Reference in New Issue
Block a user