Move basic grid rendering from JS to Jinja2. Gemini seems to have fixed the logs display.

Kalzu Rekku
2025-06-13 23:10:39 +03:00
parent a1f4fc556b
commit 44c13c16df
7 changed files with 1300 additions and 187 deletions
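The grid rendering itself moves into the template files among these seven; only the RRD database module is shown below. As a rough sketch of the approach, server-side grid rendering with Jinja2 might look like this (template and variable names are hypothetical, not taken from the commit):

    from jinja2 import Template

    # Hypothetical stand-in for the real template file, which is not shown here.
    grid = Template(
        '<table class="node-grid">'
        '{% for node in nodes %}'
        '<tr><td>{{ node.uuid }}</td><td>{{ node.load1 }}</td>'
        '<td>{{ node.memory }}%</td></tr>'
        '{% endfor %}'
        '</table>'
    )
    html = grid.render(nodes=[{"uuid": "abc123", "load1": 0.42, "memory": 63.5}])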


@@ -12,9 +12,9 @@ class RRDDatabase:
        # Use environment variable or default to /data
        if data_dir is None:
            data_dir = os.environ.get("DATA_DIR", "/data")
        self.data_dir = Path(data_dir)
        # Create data directory if it doesn't exist
        try:
            self.data_dir.mkdir(parents=True, exist_ok=True)
@@ -29,11 +29,11 @@ class RRDDatabase:
        # RRD configuration
        self.step = 60  # 1-minute intervals
        self.heartbeat = 120  # 2-minute heartbeat (allow 1 missed update)
        # Retention policy (6 months total)
        self.rra_config = [
            "RRA:AVERAGE:0.5:1:1440",    # 1-min avg for 24 hours (1440 points)
            "RRA:AVERAGE:0.5:60:744",    # 1-hour avg for 31 days (744 points)
            "RRA:AVERAGE:0.5:1440:180",  # 1-day avg for 6 months (180 points)
            "RRA:MAX:0.5:1:1440",        # 1-min max for 24 hours
            "RRA:MAX:0.5:60:744",        # 1-hour max for 31 days
@@ -49,10 +49,10 @@ class RRDDatabase:
    def _create_system_rrd(self, node_uuid: str) -> str:
        """Create RRD file for system metrics (uptime, load, memory)."""
        rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
        if rrd_file.exists():
            return str(rrd_file)
        try:
            rrdtool.create(
                str(rrd_file),
@@ -60,7 +60,7 @@ class RRDDatabase:
                # Data sources
                f"DS:uptime:GAUGE:{self.heartbeat}:0:U",    # Uptime in seconds
                f"DS:load1:GAUGE:{self.heartbeat}:0:100",   # 1-min load average
                f"DS:load5:GAUGE:{self.heartbeat}:0:100",   # 5-min load average
                f"DS:load15:GAUGE:{self.heartbeat}:0:100",  # 15-min load average
                f"DS:memory:GAUGE:{self.heartbeat}:0:100",  # Memory usage %
                # Round Robin Archives
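For reference, each archive in the retention policy above covers rows × steps-per-row × the 60 s base step. A quick arithmetic check (plain Python, not part of the commit):

    base_step = 60  # seconds, matches self.step
    for rra in ("RRA:AVERAGE:0.5:1:1440",
                "RRA:AVERAGE:0.5:60:744",
                "RRA:AVERAGE:0.5:1440:180"):
        _, cf, _, steps, rows = rra.split(":")
        days = int(steps) * int(rows) * base_step / 86400
        print(f"{cf} at {steps}-step resolution: {days:g} days")
    # -> 1 day, 31 days and 180 days: the 6-month total claimed above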
@@ -75,10 +75,10 @@ class RRDDatabase:
    def _create_ping_rrd(self, node_uuid: str, target_uuid: str) -> str:
        """Create RRD file for ping metrics between two nodes."""
        rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
        if rrd_file.exists():
            return str(rrd_file)
        try:
            rrdtool.create(
                str(rrd_file),
@@ -95,68 +95,68 @@ class RRDDatabase:
            logger.error(f"Failed to create ping RRD for {node_uuid}->{target_uuid}: {e}")
            raise

    def update_system_metrics(self, node_uuid: str, timestamp: datetime,
                              uptime_seconds: int, load_avg: List[float],
                              memory_usage_percent: float):
        """Update system metrics for a node."""
        try:
            rrd_file = self._create_system_rrd(node_uuid)
            # Convert datetime to Unix timestamp
            unix_time = int(timestamp.timestamp())
            # Format: timestamp:uptime:load1:load5:load15:memory
            values = f"{unix_time}:{uptime_seconds}:{load_avg[0]}:{load_avg[1]}:{load_avg[2]}:{memory_usage_percent}"
            rrdtool.update(rrd_file, values)
            logger.debug(f"Updated system metrics for {node_uuid}: {values}")
        except Exception as e:
            logger.error(f"Failed to update system metrics for {node_uuid}: {e}")
            raise
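A hedged usage sketch for the method above; the real call site is elsewhere in the codebase and the UUID is made up:

    from datetime import datetime, timezone

    db = RRDDatabase()
    db.update_system_metrics(
        node_uuid="0f3e8a52-0000-0000-0000-000000000000",  # hypothetical node
        timestamp=datetime.now(timezone.utc),
        uptime_seconds=86400,            # 1 day
        load_avg=[0.15, 0.10, 0.05],     # 1/5/15-minute averages
        memory_usage_percent=42.0,
    )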

    def update_ping_metrics(self, node_uuid: str, target_uuid: str,
                            timestamp: datetime, latency_ms: float):
        """Update ping metrics between two nodes."""
        try:
            rrd_file = self._create_ping_rrd(node_uuid, target_uuid)
            unix_time = int(timestamp.timestamp())
            # For now, we only track latency. Loss can be calculated from missing updates
            values = f"{unix_time}:{latency_ms}:0"  # 0% loss (could be enhanced)
            rrdtool.update(rrd_file, values)
            logger.debug(f"Updated ping metrics {node_uuid}->{target_uuid}: {latency_ms}ms")
        except Exception as e:
            logger.error(f"Failed to update ping metrics {node_uuid}->{target_uuid}: {e}")
            raise
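The comment above hard-codes loss at 0 and suggests deriving it from missing updates instead. One way that could look, sketched under that assumption (not part of this commit):

    # Treat None rows returned by get_ping_data() as missed heartbeats.
    def estimated_loss_percent(latencies: list) -> float:
        if not latencies:
            return 0.0
        missed = sum(1 for v in latencies if v is None)
        return 100.0 * missed / len(latencies)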

    def get_system_data(self, node_uuid: str, start_time: str = "-24h",
                        end_time: str = "now") -> Optional[Dict]:
        """Retrieve system metrics data for a node."""
        try:
            rrd_file = self._get_node_dir(node_uuid) / "system.rrd"
            if not rrd_file.exists():
                return None
            result = rrdtool.fetch(
                str(rrd_file),
                "AVERAGE",
                "--start", start_time,
                "--end", end_time
            )
            # Parse RRDtool fetch result
            start, end, step = result[0]
            ds_names = result[1]  # ['uptime', 'load1', 'load5', 'load15', 'memory']
            data_points = result[2]
            # Convert to more usable format
            timestamps = []
            data = {ds: [] for ds in ds_names}
            current_time = start
            for point in data_points:
                timestamps.append(current_time)
@@ -164,39 +164,39 @@ class RRDDatabase:
                    value = point[i] if point[i] is not None else None  # Changed 0 to None for better representation
                    data[ds].append(value)
                current_time += step
            return {
                'timestamps': timestamps,
                'data': data,
                'step': step
            }
        except Exception as e:
            logger.error(f"Failed to get system data for {node_uuid}: {e}")
            return None
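To show the shape of the returned structure, a small hypothetical consumer (UUID made up; None entries mark intervals with no update):

    series = db.get_system_data("0f3e8a52-0000-0000-0000-000000000000", start_time="-6h")
    if series is not None:
        for ts, load1 in zip(series["timestamps"], series["data"]["load1"]):
            if load1 is not None:
                print(ts, round(load1, 2))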

    def get_ping_data(self, node_uuid: str, target_uuid: str,
                      start_time: str = "-24h", end_time: str = "now") -> Optional[Dict]:
        """Retrieve ping metrics between two nodes."""
        try:
            rrd_file = self._get_node_dir(node_uuid) / f"ping_{target_uuid}.rrd"
            if not rrd_file.exists():
                return None
            result = rrdtool.fetch(
                str(rrd_file),
                "AVERAGE",
                "--start", start_time,
                "--end", end_time
            )
            start, end, step = result[0]
            ds_names = result[1]  # ['latency', 'loss']
            data_points = result[2]
            timestamps = []
            data = {ds: [] for ds in ds_names}
            current_time = start
            for point in data_points:
                timestamps.append(current_time)
@@ -204,13 +204,13 @@ class RRDDatabase:
                    value = point[i] if point[i] is not None else None  # Changed 0 to None for better representation
                    data[ds].append(value)
                current_time += step
            return {
                'timestamps': timestamps,
                'data': data,
                'step': step
            }
        except Exception as e:
            logger.error(f"Failed to get ping data {node_uuid}->{target_uuid}: {e}")
            return None
@@ -232,26 +232,26 @@ class RRDDatabase:
        # RRD automatically handles data retention based on RRA configuration
        # This method could be used for cleaning up orphaned files
        cutoff_date = datetime.now() - timedelta(days=190)  # 6+ months
        try:
            for node_dir in self.data_dir.iterdir():
                if not node_dir.is_dir():
                    continue
                # Check if any RRD files have been modified recently
                rrd_files = list(node_dir.glob("*.rrd"))
                if not rrd_files:
                    continue
                # If all RRD files are old, the node is probably dead
                all_old = all(
                    datetime.fromtimestamp(f.stat().st_mtime) < cutoff_date
                    for f in rrd_files
                )
                if all_old:
                    logger.info(f"Node {node_dir.name} appears inactive for >6 months")
                    # Could optionally remove the directory here
        except Exception as e:
            logger.error(f"Failed during cleanup: {e}")