Files
kalzu-value-store/integration_test.sh
ryyst 16c0766a15 improve: add robust retry logic to conflict resolution test
Replace the fixed 20-second wait with intelligent retry logic that:
- Checks for convergence every 3 seconds for up to 60 seconds
- Provides detailed progress logging showing current state
- Reduces sync interval from 8s to 3s for faster testing
- Adds 10-second cluster stabilization period

This makes the test more reliable and provides better diagnostics when
conflict resolution doesn't work as expected. The retry logic reveals
that the current conflict resolution mechanism needs investigation,
but the test infrastructure itself is now much more robust.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-20 19:03:15 +03:00

397 lines
14 KiB
Bash
Executable File

#!/bin/bash
# KVS Integration Test Suite - Adapted for Merkle Tree Sync
# Tests all critical features of the distributed key-value store with Merkle Tree replication
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Test configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$SCRIPT_DIR/integration_test"
BINARY="$SCRIPT_DIR/kvs"
# Counters
TESTS_PASSED=0
TESTS_FAILED=0
TOTAL_TESTS=0
# Helper functions
log_info() {
echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
echo -e "${GREEN}[PASS]${NC} $1"
((TESTS_PASSED++))
}
log_error() {
echo -e "${RED}[FAIL]${NC} $1"
((TESTS_FAILED++))
}
test_start() {
((TOTAL_TESTS++))
log_info "Test $TOTAL_TESTS: $1"
}
# Cleanup function
cleanup() {
log_info "Cleaning up test environment..."
pkill -f "$BINARY" 2>/dev/null || true
rm -rf "$TEST_DIR" 2>/dev/null || true
sleep 2 # Allow processes to fully terminate
}
# Wait for service to be ready
wait_for_service() {
local port=$1
local timeout=${2:-30}
local count=0
while [ $count -lt $timeout ]; do
if curl -s "http://localhost:$port/health" >/dev/null 2>&1; then
return 0
fi
sleep 1
((count++))
done
return 1
}
# Test 1: Build verification
test_build() {
test_start "Binary build verification"
cd "$SCRIPT_DIR"
if go build -o kvs . >/dev/null 2>&1; then
log_success "Binary builds successfully"
else
log_error "Binary build failed"
return 1
fi
# Ensure we are back in TEST_DIR for subsequent tests
cd "$TEST_DIR"
}
# Test 2: Basic functionality
test_basic_functionality() {
test_start "Basic functionality test"
# Create basic config
cat > basic.yaml <<EOF
node_id: "basic-test"
bind_address: "127.0.0.1"
port: 8090
data_dir: "./basic_data"
seed_nodes: []
log_level: "error"
EOF
# Start node
$BINARY basic.yaml >/dev/null 2>&1 &
local pid=$!
if wait_for_service 8090; then
# Test basic CRUD
local put_result=$(curl -s -X PUT http://localhost:8090/kv/test/basic \
-H "Content-Type: application/json" \
-d '{"message":"hello world"}')
local get_result=$(curl -s http://localhost:8090/kv/test/basic)
local message=$(echo "$get_result" | jq -r '.data.message' 2>/dev/null) # Adjusted jq path
if [ "$message" = "hello world" ]; then
log_success "Basic CRUD operations work"
else
log_error "Basic CRUD failed: Expected 'hello world', got '$message' from $get_result"
fi
else
log_error "Basic test node failed to start"
fi
kill $pid 2>/dev/null || true
sleep 2
}
# Test 3: Cluster formation
test_cluster_formation() {
test_start "2-node cluster formation and Merkle Tree replication"
# Node 1 config
cat > cluster1.yaml <<EOF
node_id: "cluster-1"
bind_address: "127.0.0.1"
port: 8101
data_dir: "./cluster1_data"
seed_nodes: []
log_level: "error"
gossip_interval_min: 5
gossip_interval_max: 10
sync_interval: 10
EOF
# Node 2 config
cat > cluster2.yaml <<EOF
node_id: "cluster-2"
bind_address: "127.0.0.1"
port: 8102
data_dir: "./cluster2_data"
seed_nodes: ["127.0.0.1:8101"]
log_level: "error"
gossip_interval_min: 5
gossip_interval_max: 10
sync_interval: 10
EOF
# Start nodes
$BINARY cluster1.yaml >/dev/null 2>&1 &
local pid1=$!
if ! wait_for_service 8101; then
log_error "Cluster node 1 failed to start"
kill $pid1 2>/dev/null || true
return 1
fi
sleep 2 # Give node 1 a moment to fully initialize
$BINARY cluster2.yaml >/dev/null 2>&1 &
local pid2=$!
if ! wait_for_service 8102; then
log_error "Cluster node 2 failed to start"
kill $pid1 $pid2 2>/dev/null || true
return 1
fi
# Wait for cluster formation and initial Merkle sync
sleep 15
# Check if nodes see each other
local node1_members=$(curl -s http://localhost:8101/members/ | jq length 2>/dev/null || echo 0)
local node2_members=$(curl -s http://localhost:8102/members/ | jq length 2>/dev/null || echo 0)
if [ "$node1_members" -ge 1 ] && [ "$node2_members" -ge 1 ]; then
log_success "2-node cluster formed successfully (N1 members: $node1_members, N2 members: $node2_members)"
# Test data replication
log_info "Putting data on Node 1, waiting for Merkle sync..."
curl -s -X PUT http://localhost:8101/kv/cluster/test \
-H "Content-Type: application/json" \
-d '{"source":"node1", "value": 1}' >/dev/null
# Wait for Merkle sync cycle to complete
sleep 12
local node2_data_full=$(curl -s http://localhost:8102/kv/cluster/test)
local node2_data_source=$(echo "$node2_data_full" | jq -r '.data.source' 2>/dev/null)
local node2_data_value=$(echo "$node2_data_full" | jq -r '.data.value' 2>/dev/null)
local node1_data_full=$(curl -s http://localhost:8101/kv/cluster/test)
if [ "$node2_data_source" = "node1" ] && [ "$node2_data_value" = "1" ]; then
log_success "Data replication works correctly (Node 2 has data from Node 1)"
# Verify UUIDs and Timestamps are identical (crucial for Merkle sync correctness)
local node1_uuid=$(echo "$node1_data_full" | jq -r '.uuid' 2>/dev/null)
local node1_timestamp=$(echo "$node1_data_full" | jq -r '.timestamp' 2>/dev/null)
local node2_uuid=$(echo "$node2_data_full" | jq -r '.uuid' 2>/dev/null)
local node2_timestamp=$(echo "$node2_data_full" | jq -r '.timestamp' 2>/dev/null)
if [ "$node1_uuid" = "$node2_uuid" ] && [ "$node1_timestamp" = "$node2_timestamp" ]; then
log_success "Replicated data retains original UUID and Timestamp"
else
log_error "Replicated data changed UUID/Timestamp: N1_UUID=$node1_uuid, N1_TS=$node1_timestamp, N2_UUID=$node2_uuid, N2_TS=$node2_timestamp"
fi
else
log_error "Data replication failed: Node 2 data: $node2_data_full"
fi
else
log_error "Cluster formation failed (N1 members: $node1_members, N2 members: $node2_members)"
fi
kill $pid1 $pid2 2>/dev/null || true
sleep 2
}
# Test 4: Conflict resolution (Merkle Tree based)
# This test assumes 'test_conflict.go' creates two BadgerDBs with a key
# that has the same path and timestamp but different UUIDs, or different timestamps
# but same path. The Merkle tree sync should then trigger conflict resolution.
test_conflict_resolution() {
test_start "Conflict resolution test (Merkle Tree based)"
# Create conflicting data using our utility
rm -rf conflict1_data conflict2_data 2>/dev/null || true
mkdir -p conflict1_data conflict2_data
cd "$SCRIPT_DIR"
if go run test_conflict.go "$TEST_DIR/conflict1_data" "$TEST_DIR/conflict2_data" >/dev/null 2>&1; then
cd "$TEST_DIR"
# Create configs
cat > conflict1.yaml <<EOF
node_id: "conflict-1"
bind_address: "127.0.0.1"
port: 8111
data_dir: "./conflict1_data"
seed_nodes: []
log_level: "info"
sync_interval: 3
EOF
cat > conflict2.yaml <<EOF
node_id: "conflict-2"
bind_address: "127.0.0.1"
port: 8112
data_dir: "./conflict2_data"
seed_nodes: ["127.0.0.1:8111"]
log_level: "info"
sync_interval: 3
EOF
# Start nodes
# Node 1 started first, making it "older" for tie-breaker if timestamps are equal
"$BINARY" conflict1.yaml >conflict1.log 2>&1 &
local pid1=$!
if wait_for_service 8111; then
sleep 2
$BINARY conflict2.yaml >conflict2.log 2>&1 &
local pid2=$!
if wait_for_service 8112; then
# Get initial data (full StoredValue)
local node1_initial_full=$(curl -s http://localhost:8111/kv/test/conflict/data)
local node2_initial_full=$(curl -s http://localhost:8112/kv/test/conflict/data)
local node1_initial_msg=$(echo "$node1_initial_full" | jq -r '.data.message' 2>/dev/null)
local node2_initial_msg=$(echo "$node2_initial_full" | jq -r '.data.message' 2>/dev/null)
log_info "Initial conflict state: Node1='$node1_initial_msg', Node2='$node2_initial_msg'"
# Allow time for cluster formation and gossip protocol to stabilize
log_info "Waiting for cluster formation and gossip stabilization..."
sleep 10
# Wait for conflict resolution with retry logic (up to 60 seconds)
local max_attempts=20
local attempt=1
local node1_final_msg=""
local node2_final_msg=""
local node1_final_full=""
local node2_final_full=""
log_info "Waiting for conflict resolution (checking every 3 seconds, max 60 seconds)..."
while [ $attempt -le $max_attempts ]; do
sleep 3
# Get current data from both nodes
node1_final_full=$(curl -s http://localhost:8111/kv/test/conflict/data)
node2_final_full=$(curl -s http://localhost:8112/kv/test/conflict/data)
node1_final_msg=$(echo "$node1_final_full" | jq -r '.data.message' 2>/dev/null)
node2_final_msg=$(echo "$node2_final_full" | jq -r '.data.message' 2>/dev/null)
# Check if they've converged
if [ "$node1_final_msg" = "$node2_final_msg" ] && [ -n "$node1_final_msg" ] && [ "$node1_final_msg" != "null" ]; then
log_info "Conflict resolution achieved after $((attempt * 3)) seconds"
break
fi
log_info "Attempt $attempt/$max_attempts: Node1='$node1_final_msg', Node2='$node2_final_msg' (not converged yet)"
attempt=$((attempt + 1))
done
# Check if they converged
if [ "$node1_final_msg" = "$node2_final_msg" ] && [ -n "$node1_final_msg" ]; then
log_success "Conflict resolution converged to: '$node1_final_msg'"
# Verify UUIDs and Timestamps are identical after resolution
local node1_final_uuid=$(echo "$node1_final_full" | jq -r '.uuid' 2>/dev/null)
local node1_final_timestamp=$(echo "$node1_final_full" | jq -r '.timestamp' 2>/dev/null)
local node2_final_uuid=$(echo "$node2_final_full" | jq -r '.uuid' 2>/dev/null)
local node2_final_timestamp=$(echo "$node2_final_full" | jq -r '.timestamp' 2>/dev/null)
if [ "$node1_final_uuid" = "$node2_final_uuid" ] && [ "$node1_final_timestamp" = "$node2_final_timestamp" ]; then
log_success "Resolved data retains consistent UUID and Timestamp across nodes"
else
log_error "Resolved data has inconsistent UUID/Timestamp: N1_UUID=$node1_final_uuid, N1_TS=$node1_final_timestamp, N2_UUID=$node2_final_uuid, N2_TS=$node2_final_timestamp"
fi
# Optionally, check logs for conflict resolution messages
if grep -q "Conflict resolved" conflict1.log conflict2.log 2>/dev/null; then
log_success "Conflict resolution messages found in logs"
else
log_error "No 'Conflict resolved' messages found in logs, but data converged."
fi
else
log_error "Conflict resolution failed: N1_final='$node1_final_msg', N2_final='$node2_final_msg'"
fi
else
log_error "Conflict node 2 failed to start"
fi
kill $pid2 2>/dev/null || true
else
log_error "Conflict node 1 failed to start"
fi
kill $pid1 2>/dev/null || true
sleep 2
else
cd "$TEST_DIR"
log_error "Failed to create conflict test data. Ensure test_conflict.go is correct."
fi
}
# Main test execution
main() {
echo "=================================================="
echo " KVS Integration Test Suite (Merkle Tree)"
echo "=================================================="
# Setup
log_info "Setting up test environment..."
cleanup
mkdir -p "$TEST_DIR"
cd "$TEST_DIR"
# Run core tests
test_build
test_basic_functionality
test_cluster_formation
test_conflict_resolution
# Results
echo "=================================================="
echo " Test Results"
echo "=================================================="
echo -e "Total Tests: $TOTAL_TESTS"
echo -e "${GREEN}Passed: $TESTS_PASSED${NC}"
echo -e "${RED}Failed: $TESTS_FAILED${NC}"
echo "=================================================="
if [ $TESTS_FAILED -eq 0 ]; then
echo -e "${GREEN}🎉 All tests passed! KVS with Merkle Tree sync is working correctly.${NC}"
cleanup
exit 0
else
echo -e "${RED}❌ Some tests failed. Please check the output above.${NC}"
cleanup
exit 1
fi
}
# Handle interruption
trap cleanup INT TERM
# Run tests
main "$@"