Replace the fixed 20-second wait with intelligent retry logic that:

- Checks for convergence every 3 seconds for up to 60 seconds
- Provides detailed progress logging showing current state
- Reduces sync interval from 8s to 3s for faster testing
- Adds 10-second cluster stabilization period

This makes the test more reliable and provides better diagnostics when conflict resolution doesn't work as expected.

The retry logic reveals that the current conflict resolution mechanism needs investigation, but the test infrastructure itself is now much more robust.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
397 lines
14 KiB
Bash
Executable File
397 lines
14 KiB
Bash
Executable File
#!/bin/bash

# KVS Integration Test Suite - Adapted for Merkle Tree Sync
# Tests all critical features of the distributed key-value store with Merkle Tree replication

# Colors for output (stored as backslash escapes; expanded when printed)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Test configuration
# SCRIPT_DIR is the absolute directory containing this script; the binary and
# the scratch directory used by the tests both live underneath it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$SCRIPT_DIR/integration_test"
BINARY="$SCRIPT_DIR/kvs"

# Counters
TESTS_PASSED=0
TESTS_FAILED=0
TOTAL_TESTS=0

# Helper functions

# Print an informational line to stdout, tagged with a blue [INFO] marker.
# Globals:   BLUE, NC (read) - color escape sequences
# Arguments: $1 - message text; printed literally (backslashes are not interpreted)
log_info() {
  # %b expands the escapes stored in the color variables; %s keeps the message verbatim
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}

# Print a green [PASS] line to stdout and count one passed check.
# Globals:   GREEN, NC (read); TESTS_PASSED (incremented)
# Arguments: $1 - message text; printed literally (backslashes are not interpreted)
# Returns:   0
log_success() {
  printf '%b[PASS]%b %s\n' "${GREEN}" "${NC}" "$1"
  # Plain assignment instead of ((x++)): the latter yields status 1 when the
  # counter is still 0, which would make the first call look like a failure.
  TESTS_PASSED=$((TESTS_PASSED + 1))
}

# Print a red [FAIL] line to stdout and count one failed check.
# Globals:   RED, NC (read); TESTS_FAILED (incremented)
# Arguments: $1 - message text; printed literally (backslashes are not interpreted)
# Returns:   0
log_error() {
  printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$1"
  # Plain assignment instead of ((x++)): the latter yields status 1 when the
  # counter is still 0, so the function's exit status would depend on call order.
  TESTS_FAILED=$((TESTS_FAILED + 1))
}

# Announce the start of a test and advance the running test number.
# Globals:   TOTAL_TESTS (incremented)
# Arguments: $1 - human-readable test title
# Outputs:   "Test <n>: <title>" through log_info
test_start() {
  TOTAL_TESTS=$((TOTAL_TESTS + 1))
  log_info "Test $TOTAL_TESTS: $1"
}

# Cleanup function
# Stops any process whose command line matches BINARY and removes TEST_DIR.
# Both steps are best-effort: failures are ignored on purpose so cleanup can
# run before the first test and after an interrupted run alike.
# Globals: BINARY, TEST_DIR (read)
cleanup() {
  log_info "Cleaning up test environment..."

  # `pkill -f` treats its argument as a pattern matched against full command
  # lines; an empty pattern would match every process, so skip it when unset.
  if [[ -n "${BINARY:-}" ]]; then
    pkill -f "$BINARY" 2>/dev/null || true
  fi

  if [[ -n "${TEST_DIR:-}" ]]; then
    rm -rf -- "$TEST_DIR" 2>/dev/null || true
  fi

  sleep 2 # Allow processes to fully terminate
}

# Wait for service to be ready
# Polls http://localhost:<port>/health once per second until curl succeeds.
# Arguments: $1 - TCP port to probe
#            $2 - maximum number of probes (default 30)
# Returns:   0 as soon as a probe succeeds, 1 if every probe fails
wait_for_service() {
  local port=$1
  local timeout=${2:-30}
  local tries

  for ((tries = 0; tries < timeout; tries++)); do
    # --max-time bounds a single probe so one stalled connection cannot block
    # the loop indefinitely and defeat the retry budget.
    if curl -s --max-time 2 "http://localhost:$port/health" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done

  return 1
}

# Test 1: Build verification
# Builds the kvs binary inside SCRIPT_DIR and records the outcome.
# Globals: SCRIPT_DIR, TEST_DIR (read)
# Returns: 0 when the build succeeds, 1 when it fails or TEST_DIR cannot be entered
test_build() {
  test_start "Binary build verification"

  local status=0

  # Build in a subshell so a failed build cannot leave this shell in SCRIPT_DIR,
  # where the following tests would drop their configs and data directories.
  if (cd "$SCRIPT_DIR" && go build -o kvs . >/dev/null 2>&1); then
    log_success "Binary builds successfully"
  else
    log_error "Binary build failed"
    status=1
  fi

  # Ensure we are in TEST_DIR for subsequent tests, whatever the build result
  cd "$TEST_DIR" || return 1
  return "$status"
}

# Test 2: Basic functionality
# Starts a single node on port 8090 from a config written to the current
# directory, stores one JSON document and reads it back.
# Globals: BINARY (read)
test_basic_functionality() {
  test_start "Basic functionality test"

  # Create basic config
  cat > basic.yaml <<EOF
node_id: "basic-test"
bind_address: "127.0.0.1"
port: 8090
data_dir: "./basic_data"
seed_nodes: []
log_level: "error"
EOF

  # Start node (quoted so a path containing spaces still resolves)
  "$BINARY" basic.yaml >/dev/null 2>&1 &
  local pid=$!

  if wait_for_service 8090; then
    # Test basic CRUD; the PUT response body is not inspected
    curl -s -X PUT http://localhost:8090/kv/test/basic \
      -H "Content-Type: application/json" \
      -d '{"message":"hello world"}' >/dev/null

    local get_result message
    get_result=$(curl -s http://localhost:8090/kv/test/basic)
    # The stored document is read from the "data" field of the response
    message=$(printf '%s\n' "$get_result" | jq -r '.data.message' 2>/dev/null)

    if [ "$message" = "hello world" ]; then
      log_success "Basic CRUD operations work"
    else
      log_error "Basic CRUD failed: Expected 'hello world', got '$message' from $get_result"
    fi
  else
    log_error "Basic test node failed to start"
  fi

  # Best-effort stop: the node may already have exited
  kill "$pid" 2>/dev/null || true
  sleep 2
}

# Test 3: Cluster formation
# Starts two nodes (8101 and 8102, the second seeded with the first), checks
# that each reports at least one member, then writes a key on node 1 and
# verifies node 2 serves the same data, UUID and timestamp.
# Globals: BINARY (read)
# Returns: 1 if either node fails to start; otherwise the status of the final sleep
test_cluster_formation() {
  test_start "2-node cluster formation and Merkle Tree replication"

  # Node 1 config
  cat > cluster1.yaml <<EOF
node_id: "cluster-1"
bind_address: "127.0.0.1"
port: 8101
data_dir: "./cluster1_data"
seed_nodes: []
log_level: "error"
gossip_interval_min: 5
gossip_interval_max: 10
sync_interval: 10
EOF

  # Node 2 config
  cat > cluster2.yaml <<EOF
node_id: "cluster-2"
bind_address: "127.0.0.1"
port: 8102
data_dir: "./cluster2_data"
seed_nodes: ["127.0.0.1:8101"]
log_level: "error"
gossip_interval_min: 5
gossip_interval_max: 10
sync_interval: 10
EOF

  # Start nodes
  "$BINARY" cluster1.yaml >/dev/null 2>&1 &
  local pid1=$!

  if ! wait_for_service 8101; then
    log_error "Cluster node 1 failed to start"
    kill "$pid1" 2>/dev/null || true
    return 1
  fi

  sleep 2 # Give node 1 a moment to fully initialize
  "$BINARY" cluster2.yaml >/dev/null 2>&1 &
  local pid2=$!

  if ! wait_for_service 8102; then
    log_error "Cluster node 2 failed to start"
    kill "$pid1" "$pid2" 2>/dev/null || true
    return 1
  fi

  # Wait for cluster formation and initial Merkle sync
  sleep 15

  # Check if nodes see each other
  local node1_members node2_members
  node1_members=$(curl -s http://localhost:8101/members/ | jq length 2>/dev/null)
  node2_members=$(curl -s http://localhost:8102/members/ | jq length 2>/dev/null)
  # jq prints nothing (and still exits 0) on empty input, and nothing on a parse
  # error; treat anything that is not a plain number as "no members" so the
  # numeric comparison below never sees an empty operand.
  [[ "$node1_members" =~ ^[0-9]+$ ]] || node1_members=0
  [[ "$node2_members" =~ ^[0-9]+$ ]] || node2_members=0

  if [ "$node1_members" -ge 1 ] && [ "$node2_members" -ge 1 ]; then
    log_success "2-node cluster formed successfully (N1 members: $node1_members, N2 members: $node2_members)"

    # Test data replication
    log_info "Putting data on Node 1, waiting for Merkle sync..."
    curl -s -X PUT http://localhost:8101/kv/cluster/test \
      -H "Content-Type: application/json" \
      -d '{"source":"node1", "value": 1}' >/dev/null

    # Wait for Merkle sync cycle to complete
    sleep 12

    local node1_data_full node2_data_full node2_data_source node2_data_value
    node2_data_full=$(curl -s http://localhost:8102/kv/cluster/test)
    node2_data_source=$(printf '%s\n' "$node2_data_full" | jq -r '.data.source' 2>/dev/null)
    node2_data_value=$(printf '%s\n' "$node2_data_full" | jq -r '.data.value' 2>/dev/null)
    node1_data_full=$(curl -s http://localhost:8101/kv/cluster/test)

    if [ "$node2_data_source" = "node1" ] && [ "$node2_data_value" = "1" ]; then
      log_success "Data replication works correctly (Node 2 has data from Node 1)"

      # Verify UUIDs and Timestamps are identical (crucial for Merkle sync correctness)
      local node1_uuid node1_timestamp node2_uuid node2_timestamp
      node1_uuid=$(printf '%s\n' "$node1_data_full" | jq -r '.uuid' 2>/dev/null)
      node1_timestamp=$(printf '%s\n' "$node1_data_full" | jq -r '.timestamp' 2>/dev/null)
      node2_uuid=$(printf '%s\n' "$node2_data_full" | jq -r '.uuid' 2>/dev/null)
      node2_timestamp=$(printf '%s\n' "$node2_data_full" | jq -r '.timestamp' 2>/dev/null)

      if [ "$node1_uuid" = "$node2_uuid" ] && [ "$node1_timestamp" = "$node2_timestamp" ]; then
        log_success "Replicated data retains original UUID and Timestamp"
      else
        log_error "Replicated data changed UUID/Timestamp: N1_UUID=$node1_uuid, N1_TS=$node1_timestamp, N2_UUID=$node2_uuid, N2_TS=$node2_timestamp"
      fi
    else
      log_error "Data replication failed: Node 2 data: $node2_data_full"
    fi
  else
    log_error "Cluster formation failed (N1 members: $node1_members, N2 members: $node2_members)"
  fi

  # Best-effort stop: either node may already have exited
  kill "$pid1" "$pid2" 2>/dev/null || true
  sleep 2
}

# Test 4: Conflict resolution (Merkle Tree based)
# This test assumes 'test_conflict.go' creates two BadgerDBs with a key
# that has the same path and timestamp but different UUIDs, or different timestamps
# but same path. The Merkle tree sync should then trigger conflict resolution.
#
# Flow: build the two conflicting data directories, start one node on each
# (8111, then 8112 seeded with 8111), then poll both nodes every 3 seconds
# (at most 20 times) until they report the same non-null message.
# Globals: SCRIPT_DIR, TEST_DIR, BINARY (read)
# Returns: 1 if the fixture cannot be created or a node fails to start;
#          otherwise the status of the final sleep
test_conflict_resolution() {
  test_start "Conflict resolution test (Merkle Tree based)"

  # Create conflicting data using our utility
  rm -rf conflict1_data conflict2_data 2>/dev/null || true
  mkdir -p conflict1_data conflict2_data

  # The generator is run from SCRIPT_DIR inside a subshell, so this shell's
  # working directory is untouched whether it succeeds or fails.
  local fixture_ok=1
  (cd "$SCRIPT_DIR" && go run test_conflict.go "$TEST_DIR/conflict1_data" "$TEST_DIR/conflict2_data" >/dev/null 2>&1) || fixture_ok=0

  cd "$TEST_DIR" || return 1

  if ((fixture_ok == 0)); then
    log_error "Failed to create conflict test data. Ensure test_conflict.go is correct."
    return 1
  fi

  # Create configs
  cat > conflict1.yaml <<EOF
node_id: "conflict-1"
bind_address: "127.0.0.1"
port: 8111
data_dir: "./conflict1_data"
seed_nodes: []
log_level: "info"
sync_interval: 3
EOF

  cat > conflict2.yaml <<EOF
node_id: "conflict-2"
bind_address: "127.0.0.1"
port: 8112
data_dir: "./conflict2_data"
seed_nodes: ["127.0.0.1:8111"]
log_level: "info"
sync_interval: 3
EOF

  # Start nodes
  # Node 1 started first, making it "older" for tie-breaker if timestamps are equal
  "$BINARY" conflict1.yaml >conflict1.log 2>&1 &
  local pid1=$!

  if ! wait_for_service 8111; then
    log_error "Conflict node 1 failed to start"
    kill "$pid1" 2>/dev/null || true
    sleep 2
    return 1
  fi

  sleep 2
  "$BINARY" conflict2.yaml >conflict2.log 2>&1 &
  local pid2=$!

  if ! wait_for_service 8112; then
    log_error "Conflict node 2 failed to start"
    kill "$pid2" 2>/dev/null || true
    kill "$pid1" 2>/dev/null || true
    sleep 2
    return 1
  fi

  # Get initial data (full StoredValue)
  local node1_initial_full node2_initial_full node1_initial_msg node2_initial_msg
  node1_initial_full=$(curl -s http://localhost:8111/kv/test/conflict/data)
  node2_initial_full=$(curl -s http://localhost:8112/kv/test/conflict/data)
  node1_initial_msg=$(printf '%s\n' "$node1_initial_full" | jq -r '.data.message' 2>/dev/null)
  node2_initial_msg=$(printf '%s\n' "$node2_initial_full" | jq -r '.data.message' 2>/dev/null)

  log_info "Initial conflict state: Node1='$node1_initial_msg', Node2='$node2_initial_msg'"

  # Allow time for cluster formation and gossip protocol to stabilize
  log_info "Waiting for cluster formation and gossip stabilization..."
  sleep 10

  # Wait for conflict resolution with retry logic (up to 60 seconds)
  local max_attempts=20
  local attempt
  local converged=0
  local node1_final_msg=""
  local node2_final_msg=""
  local node1_final_full=""
  local node2_final_full=""

  log_info "Waiting for conflict resolution (checking every 3 seconds, max 60 seconds)..."

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    sleep 3

    # Get current data from both nodes
    node1_final_full=$(curl -s http://localhost:8111/kv/test/conflict/data)
    node2_final_full=$(curl -s http://localhost:8112/kv/test/conflict/data)
    node1_final_msg=$(printf '%s\n' "$node1_final_full" | jq -r '.data.message' 2>/dev/null)
    node2_final_msg=$(printf '%s\n' "$node2_final_full" | jq -r '.data.message' 2>/dev/null)

    # Converged means: both nodes agree on a real value. jq -r prints "null"
    # when the field is absent, so two missing values must not count.
    if [[ -n "$node1_final_msg" && "$node1_final_msg" != "null" && "$node1_final_msg" == "$node2_final_msg" ]]; then
      log_info "Conflict resolution achieved after $((attempt * 3)) seconds"
      converged=1
      break
    fi

    log_info "Attempt $attempt/$max_attempts: Node1='$node1_final_msg', Node2='$node2_final_msg' (not converged yet)"
  done

  # Decide on the flag set inside the loop so the verdict uses exactly the
  # same criteria as the polling check (including the "null" exclusion).
  if ((converged)); then
    log_success "Conflict resolution converged to: '$node1_final_msg'"

    # Verify UUIDs and Timestamps are identical after resolution
    local node1_final_uuid node1_final_timestamp node2_final_uuid node2_final_timestamp
    node1_final_uuid=$(printf '%s\n' "$node1_final_full" | jq -r '.uuid' 2>/dev/null)
    node1_final_timestamp=$(printf '%s\n' "$node1_final_full" | jq -r '.timestamp' 2>/dev/null)
    node2_final_uuid=$(printf '%s\n' "$node2_final_full" | jq -r '.uuid' 2>/dev/null)
    node2_final_timestamp=$(printf '%s\n' "$node2_final_full" | jq -r '.timestamp' 2>/dev/null)

    if [ "$node1_final_uuid" = "$node2_final_uuid" ] && [ "$node1_final_timestamp" = "$node2_final_timestamp" ]; then
      log_success "Resolved data retains consistent UUID and Timestamp across nodes"
    else
      log_error "Resolved data has inconsistent UUID/Timestamp: N1_UUID=$node1_final_uuid, N1_TS=$node1_final_timestamp, N2_UUID=$node2_final_uuid, N2_TS=$node2_final_timestamp"
    fi

    # Check the node logs for conflict resolution messages
    if grep -q "Conflict resolved" conflict1.log conflict2.log 2>/dev/null; then
      log_success "Conflict resolution messages found in logs"
    else
      log_error "No 'Conflict resolved' messages found in logs, but data converged."
    fi
  else
    log_error "Conflict resolution failed: N1_final='$node1_final_msg', N2_final='$node2_final_msg'"
  fi

  # Best-effort stop: either node may already have exited
  kill "$pid2" 2>/dev/null || true
  kill "$pid1" 2>/dev/null || true
  sleep 2
}

# Main test execution
# Prepares a clean TEST_DIR, runs the four tests in order, prints the summary,
# cleans up and exits.
# Globals: TEST_DIR, TOTAL_TESTS, TESTS_PASSED, TESTS_FAILED, RED, GREEN, NC (read)
# Exits:   0 when no check failed, 1 otherwise (or if TEST_DIR cannot be entered)
main() {
  echo "=================================================="
  echo " KVS Integration Test Suite (Merkle Tree)"
  echo "=================================================="

  # Setup
  log_info "Setting up test environment..."
  cleanup
  mkdir -p "$TEST_DIR"
  # The tests write configs and data relative to the working directory, so
  # running them anywhere other than TEST_DIR is not safe.
  if ! cd "$TEST_DIR"; then
    log_error "Cannot enter test directory: $TEST_DIR"
    exit 1
  fi

  # Run core tests
  test_build
  test_basic_functionality
  test_cluster_formation
  test_conflict_resolution

  # Results
  echo "=================================================="
  echo " Test Results"
  echo "=================================================="
  printf 'Total Tests: %s\n' "$TOTAL_TESTS"
  printf '%bPassed: %s%b\n' "$GREEN" "$TESTS_PASSED" "$NC"
  printf '%bFailed: %s%b\n' "$RED" "$TESTS_FAILED" "$NC"
  echo "=================================================="

  local exit_code=0
  if [ "$TESTS_FAILED" -eq 0 ]; then
    printf '%b🎉 All tests passed! KVS with Merkle Tree sync is working correctly.%b\n' "$GREEN" "$NC"
  else
    printf '%b❌ Some tests failed. Please check the output above.%b\n' "$RED" "$NC"
    exit_code=1
  fi

  cleanup
  exit "$exit_code"
}

# Handle interruption
# A trap handler that only calls cleanup lets the script resume after the
# signal; exit explicitly (128 + signal number) so Ctrl-C really stops the run.
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM

# Run tests
main "$@"