---
name: E2E Tests (In-Cluster Runner)

on:
  workflow_dispatch:
    inputs:
      run_benchmarks:
        description: 'Run performance benchmarks'
        required: false
        type: boolean
        default: false
  schedule:
    # Run nightly at 2 AM UTC
    - cron: '0 2 * * *'
  pull_request:
    types: [labeled]

jobs:
  e2e-tests:
    # Use self-hosted runner with 'ibm-e2e' label
    runs-on: [self-hosted, ibm-e2e]
    if: |
      github.event_name == 'workflow_dispatch' ||
      github.event_name == 'schedule' ||
      (github.event_name == 'pull_request' && contains(github.event.label.name, 'run-e2e'))
    # Prevent concurrent e2e runs
    concurrency:
      group: e2e-tests
      cancel-in-progress: false
    timeout-minutes: 210
    container:
      # Run in a container with necessary tools
      image: golang:1.24.6
      options: --user 0
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          apt-get update && apt-get install -y curl jq
          # Install kubectl
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x kubectl
          mv kubectl /usr/local/bin/
          # Install IBM Cloud CLI
          curl -fsSL https://clis.cloud.ibm.com/install/linux | sh
          ibmcloud plugin install vpc-infrastructure

      - name: Setup kubeconfig
        env:
          KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }}
        run: |
          # Create kubeconfig from secret (base64-encoded in the secret store).
          # NOTE: the `export` below only affects this step; later steps set
          # KUBECONFIG explicitly via their env block.
          printf '%s' "$KUBECONFIG_CONTENT" | base64 -d > /tmp/kubeconfig
          chmod 600 /tmp/kubeconfig
          export KUBECONFIG=/tmp/kubeconfig
          # Verify the kubeconfig works
          kubectl version --client

      - name: Verify cluster access
        env:
          KUBECONFIG: /tmp/kubeconfig
        run: |
          # Verify cluster access with provided kubeconfig
          kubectl cluster-info
          kubectl auth can-i create nodeclaims --all-namespaces
          kubectl auth can-i create nodepools --all-namespaces
          kubectl auth can-i create ibmnodeclasses --all-namespaces

      - name: Configure IBM Cloud CLI
        env:
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }}
        run: |
          # Pass secrets via environment variables instead of inlining
          # `${{ }}` expressions into the script text, so the secret value is
          # never substituted into the generated shell script.
          ibmcloud login --apikey "$IBMCLOUD_API_KEY" -r "$IBMCLOUD_REGION"

      - name: Deploy latest version
        env:
          KUBECONFIG: /tmp/kubeconfig
        run: |
          # Install or update Karpenter CRDs
          kubectl apply -f charts/crds/
          # Restart operator pods to pull latest upstream image tag, and wait
          # for the rollout so tests don't race against the old pods.
          kubectl rollout restart deployment/karpenter-karpenter-ibm -n karpenter
          kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=5m

      - name: Pre-test cleanup
        env:
          KUBECONFIG: /tmp/kubeconfig
        run: |
          echo "๐Ÿงน Cleaning up any existing e2e test resources..."
          # Clean up any leftover resources from previous runs
          kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
          kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
          kubectl delete nodeclaims -l test=e2e --timeout=300s || true
          kubectl delete nodepools -l test=e2e --timeout=300s || true
          kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true

          # Wait for cluster stabilization
          echo "โณ Waiting for cluster stabilization..."

          # Wait for no pending e2e pods
          for i in {1..30}; do
            pending_pods=$(kubectl get pods -l test=e2e --all-namespaces --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l)
            if [ "$pending_pods" -eq 0 ]; then
              echo "โœ… No pending e2e pods found"
              break
            fi
            echo "โณ Still have $pending_pods pending e2e pods, waiting..."
            sleep 10
          done

          # Wait for no disrupted nodes.
          # `grep -c` prints 0 AND exits non-zero on no match, so `|| true`
          # (not `|| echo 0`) is the correct guard — the original appended a
          # second "0" and needed a tr/grep hack to repair the value.
          for i in {1..30}; do
            disrupted_nodes=$(kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,TAINTS:.spec.taints[*].key" 2>/dev/null | grep -c "karpenter.sh/disrupted" || true)
            if [ "$disrupted_nodes" -eq 0 ]; then
              echo "โœ… No disrupted nodes found"
              break
            fi
            echo "โณ Still have $disrupted_nodes disrupted nodes, waiting..."
            sleep 10
          done

          # Brief pause for final cleanup
          sleep 30
          echo "โœ… Pre-test cleanup completed"

      - name: Run E2E tests (Sequential)
        env:
          RUN_E2E_TESTS: "true"
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          VPC_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }}
          TEST_VPC_ID: ${{ secrets.E2E_TEST_VPC_ID }}
          TEST_SUBNET_ID: ${{ secrets.E2E_TEST_SUBNET_ID }}
          TEST_IMAGE_ID: ${{ secrets.E2E_TEST_IMAGE_ID }}
          TEST_ZONE: ${{ secrets.E2E_TEST_ZONE }}
          TEST_SECURITY_GROUP_ID: ${{ secrets.E2E_TEST_SECURITY_GROUP_ID }}
          VPC_URL: ${{ secrets.VPC_URL }}
          KUBERNETES_API_SERVER_ENDPOINT: ${{ secrets.KUBERNETES_API_SERVER_ENDPOINT }}
          IBM_RESOURCE_GROUP_ID: ${{ secrets.IBM_RESOURCE_GROUP_ID }}
          IBM_SSH_KEY_ID: ${{ secrets.IBM_SSH_KEY_ID }}
          # Quoted so schedule/PR events (no inputs) yield "" rather than null.
          RUN_E2E_BENCHMARKS: "${{ inputs.run_benchmarks }}"
          # Use the kubeconfig we set up
          KUBECONFIG: /tmp/kubeconfig
          # Configure e2e test behavior
          E2E_SEQUENTIAL: "true"
          E2E_CLEANUP_TIMEOUT: "300s"
          E2E_STABILIZATION_WAIT: "60s"
        run: |
          echo "๐Ÿš€ Starting E2E test suite..."

          # Define test groups
          # Core functionality tests from basic_workflow_test.go
          core_tests="TestE2EFullWorkflow TestE2ENodePoolInstanceTypeSelection TestE2EInstanceTypeSelection TestE2EDriftStability"

          # NodeClass validation tests from validation_test.go
          validation_tests="TestE2ENodeClassValidation TestE2EValidNodeClassCreation TestE2ENodeClassWithMissingFields"

          # Block device mapping tests from block_device_test.go
          block_device_tests="TestE2EBlockDeviceMapping TestE2EBlockDeviceMappingValidation"

          # Scheduling constraint tests from scheduling_test.go and e2e_taints_test.go
          scheduling_tests="TestE2EPodDisruptionBudget TestE2EConsolidationWithPDB TestE2EPodAntiAffinity TestE2ENodeAffinity TestE2EStartupTaints TestE2EStartupTaintsRemoval TestE2ETaintsBasicScheduling TestE2ETaintValues TestE2ETaintSync TestE2EUnregisteredTaintHandling"

          # UserData feature tests from userdata_test.go
          userdata_tests="TestE2EUserDataAppend TestE2EStandardBootstrap"

          # Image selector tests from image_selector_test.go
          image_selector_tests="TestE2EImageSelector"

          # Multi-zone tests from multizone_test.go
          multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover"

          # Cleanup tests from cleanup_test.go
          cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources"

          # Combine all tests
          all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests"

          test_failed="false"
          passed_tests=0
          failed_tests=0
          total_tests=$(echo $all_tests | wc -w)

          echo "๐Ÿ“‹ Test Suite Summary:"
          echo "  Core Tests: $(echo $core_tests | wc -w)"
          echo "  Validation Tests: $(echo $validation_tests | wc -w)"
          echo "  Block Device Tests: $(echo $block_device_tests | wc -w)"
          echo "  Scheduling Tests: $(echo $scheduling_tests | wc -w)"
          echo "  UserData Tests: $(echo $userdata_tests | wc -w)"
          echo "  Image Selector Tests: $(echo $image_selector_tests | wc -w)"
          echo "  Multi-Zone Tests: $(echo $multizone_tests | wc -w)"
          echo "  Cleanup Tests: $(echo $cleanup_tests | wc -w)"
          echo "  Total Tests: $total_tests"
          echo ""

          # Run each test individually with cleanup between
          for test in $all_tests; do
            echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"
            echo "๐Ÿงช Running test: $test"
            echo "Progress: $((passed_tests + failed_tests + 1))/$total_tests"
            echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"

            # Set appropriate timeout based on test type
            timeout="20m"
            case "$test" in
              "TestE2EDriftStability")
                timeout="30m"  # Drift test needs more time for monitoring
                ;;
              "TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*)
                timeout="25m"  # Multi-zone tests need extra time for cross-zone provisioning
                ;;
              "TestE2ECleanup"*)
                timeout="15m"  # Cleanup tests are typically faster
                ;;
              "TestE2EValidation"*|"TestE2ENodeClass"*)
                timeout="10m"  # Validation tests are quick
                ;;
              *)
                timeout="20m"  # Default timeout for other tests
                ;;
            esac

            # Create test-specific log file to capture all output
            test_log="test-artifacts/${test}-$(date +%s).log"
            mkdir -p test-artifacts

            # Run test with enhanced logging and crash recovery.
            # BUGFIX: `$?` after a pipeline is tee's status (always 0), which
            # made every test look as if it passed; PIPESTATUS[0] is the real
            # exit status of the `timeout ... go test` command.
            set +e  # Don't exit on failure
            timeout $timeout go test -tags=e2e -v -timeout $timeout ./test/e2e -run "^$test$" -count=1 2>&1 | tee "$test_log"
            test_exit_code=${PIPESTATUS[0]}
            set -e  # Re-enable exit on failure

            if [ $test_exit_code -eq 0 ]; then
              echo "โœ… Test $test passed"
              passed_tests=$((passed_tests + 1))
            else
              echo "โŒ Test $test failed (exit code: $test_exit_code)"
              failed_tests=$((failed_tests + 1))

              # Enhanced debug information on failure
              echo "๐Ÿ“Š Debug information for failed test $test:"
              echo "  Exit code: $test_exit_code"
              echo "  Log file: $test_log"

              # Collect system state
              kubectl get nodes --no-headers | wc -l | xargs echo "  Total nodes:"
              kubectl get nodeclaims --no-headers 2>/dev/null | wc -l | xargs echo "  Total nodeclaims:" || echo "  Total nodeclaims: 0"
              kubectl get pods -l test=e2e --all-namespaces --no-headers 2>/dev/null | wc -l | xargs echo "  Total e2e pods:" || echo "  Total e2e pods: 0"

              # Collect Karpenter pod status
              echo "  Karpenter pod status:"
              kubectl get pods -n karpenter -l app.kubernetes.io/name=karpenter --no-headers 2>/dev/null || echo "  No Karpenter pods found"

              # Collect recent events (errors and warnings)
              echo "  Recent warning events:"
              kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -5 || echo "  No warning events"

              # Check for panic or crash indicators in test log
              if grep -i "panic\|fatal\|segmentation\|killed" "$test_log" >/dev/null 2>&1; then
                echo "  โš ๏ธ Test appears to have crashed (panic/fatal error detected)"
              fi

              # Collect Karpenter logs immediately after failure
              kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=100 > "test-artifacts/karpenter-logs-${test}-$(date +%s).txt" 2>/dev/null || echo "  Failed to collect Karpenter logs"

              test_failed="true"
            fi

            # Inter-test cleanup and stabilization
            echo "๐Ÿงน Cleaning up after test: $test"

            # Delete test-specific resources
            kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
            kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
            kubectl delete nodeclaims -l test=e2e --timeout=300s || true
            kubectl delete nodepools -l test=e2e --timeout=300s || true
            kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true

            # Wait for cleanup to complete
            echo "โณ Waiting for cleanup to complete..."

            # Extended cleanup wait for drift stability test due to NodeClaim deletion timeouts
            if [ "$test" = "TestE2EDriftStability" ]; then
              echo "โณ Extended cleanup wait for drift stability test..."
              sleep 120  # 2 minutes for NodeClaim finalizers to complete
            else
              sleep 30  # Standard cleanup wait
            fi

            # Check cluster health before next test.
            # BUGFIX: `grep -c Ready` also matched "NotReady"; count the
            # STATUS column exactly instead.
            kubectl get nodes --no-headers | awk '$2 == "Ready"' | wc -l | xargs echo "Ready nodes:"
            kubectl get nodeclaims --no-headers | grep -c True | xargs echo "Ready nodeclaims:" || echo "Ready nodeclaims: 0"

            echo "โœ… Completed test: $test"
            echo ""
          done

          echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"
          echo "๐Ÿ“Š Test Suite Results:"
          echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”"
          echo "  Total Tests: $total_tests"
          echo "  โœ… Passed: $passed_tests"
          echo "  โŒ Failed: $failed_tests"
          echo "  Success Rate: $((passed_tests * 100 / total_tests))%"
          echo ""

          # Check if any test failed
          if [ "$test_failed" = "true" ]; then
            echo "โŒ Test suite failed with $failed_tests failures"
            exit 1
          fi

          echo "โœ… All E2E tests completed successfully!"

          # Run benchmarks if requested
          if [ "$RUN_E2E_BENCHMARKS" = "true" ]; then
            echo "๐Ÿ“Š Running performance benchmarks..."
            go test -tags=e2e -v -timeout 30m ./test/e2e/... -run=^$ -bench=.
          fi

      - name: Collect test artifacts
        if: always()
        env:
          KUBECONFIG: /tmp/kubeconfig
        run: |
          echo "๐Ÿ“ฆ Collecting comprehensive test artifacts..."
          mkdir -p test-artifacts

          # Collect Karpenter logs with different tail sizes for completeness
          echo "  Collecting Karpenter logs..."
          kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=2000 > test-artifacts/karpenter-logs.txt 2>/dev/null || echo "Failed to collect current Karpenter logs" > test-artifacts/karpenter-logs.txt
          kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --previous --tail=1000 > test-artifacts/karpenter-logs-previous.txt 2>/dev/null || echo "No previous Karpenter logs available" > test-artifacts/karpenter-logs-previous.txt

          # Collect events with different filters
          echo "  Collecting events..."
          kubectl get events -A --sort-by='.lastTimestamp' > test-artifacts/events.txt 2>/dev/null || echo "Failed to collect events" > test-artifacts/events.txt
          kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' > test-artifacts/events-warnings.txt 2>/dev/null || echo "No warning events" > test-artifacts/events-warnings.txt
          kubectl get events -A --field-selector type=Normal --sort-by='.lastTimestamp' | tail -50 > test-artifacts/events-normal-recent.txt 2>/dev/null || echo "No normal events" > test-artifacts/events-normal-recent.txt

          # Collect resource states.
          # BUGFIX: bash `echo` does not expand `\n`, so the fallback files
          # contained a literal backslash-n on one line; printf writes real
          # newlines and valid YAML.
          echo "  Collecting resource states..."
          kubectl get nodes -o wide > test-artifacts/nodes.txt 2>/dev/null || echo "Failed to collect nodes" > test-artifacts/nodes.txt
          kubectl get nodeclaims -o yaml > test-artifacts/nodeclaims.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodeclaims.yaml
          kubectl get nodepools -o yaml > test-artifacts/nodepools.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodepools.yaml
          kubectl get ibmnodeclasses -o yaml > test-artifacts/ibmnodeclasses.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/ibmnodeclasses.yaml

          # Collect Karpenter deployment status
          echo "  Collecting Karpenter deployment status..."
          kubectl describe deployment -n karpenter karpenter-karpenter-ibm > test-artifacts/karpenter-deployment.txt 2>/dev/null || echo "Failed to describe Karpenter deployment" > test-artifacts/karpenter-deployment.txt
          kubectl get pods -n karpenter -o wide > test-artifacts/karpenter-pods.txt 2>/dev/null || echo "Failed to get Karpenter pods" > test-artifacts/karpenter-pods.txt

          # Collect any crash dumps or additional logs
          echo "  Collecting additional diagnostics..."
          kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded > test-artifacts/problematic-pods.txt 2>/dev/null || echo "No problematic pods found" > test-artifacts/problematic-pods.txt

          # Create summary of artifacts
          echo "  Creating artifact summary..."
          {
            echo "E2E Test Artifacts Summary"
            echo "========================="
            echo "Generated: $(date)"
            echo "Test run ID: ${{ github.run_id }}"
            echo ""
            echo "Files collected:"
            ls -la test-artifacts/ 2>/dev/null || echo "No artifacts directory"
          } > test-artifacts/README.txt

          echo "โœ… Test artifact collection completed"

      - name: Upload test artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-test-artifacts-${{ github.run_id }}
          path: test-artifacts/
          retention-days: 7

      - name: Cleanup test resources
        if: always()
        env:
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          KUBECONFIG: /tmp/kubeconfig
        run: |
          echo "๐Ÿงน Starting comprehensive cleanup..."

          # Clean up Kubernetes resources with extended timeouts
          echo "Cleaning up Kubernetes resources..."
          kubectl delete pods -l test=e2e --all-namespaces --timeout=10m || true
          kubectl delete deployments -l test=e2e --all-namespaces --timeout=10m || true
          kubectl delete nodeclaims -l test=e2e --timeout=10m || true
          kubectl delete nodepools -l test=e2e --timeout=10m || true
          kubectl delete ibmnodeclasses -l test=e2e --timeout=10m || true

          # Force cleanup any stuck resources by clearing finalizers.
          # BUGFIX: `kubectl patch` does not accept a label selector, so the
          # original patch commands always failed; enumerate matching objects
          # with `kubectl get -o name` and patch each one.
          echo "Force cleaning up any stuck resources..."
          for kind in nodeclaims nodepools ibmnodeclasses; do
            for res in $(kubectl get "$kind" -l test=e2e -o name 2>/dev/null); do
              kubectl patch "$res" --type='merge' -p='{"metadata":{"finalizers":[]}}' || true
            done
          done

          # Clean up IBM Cloud instances created by e2e tests.
          # `xargs -r` skips the delete command entirely when the list is empty.
          echo "Cleaning up IBM Cloud instances..."
          ibmcloud is instances --output json | \
            jq -r '.[] | select(.tags | index("karpenter-e2e")) | .id' | \
            xargs -r -I {} ibmcloud is instance-delete {} --force || true

          # Clean up orphaned VNIs (Virtual Network Interfaces)
          echo "Cleaning up orphaned VNIs..."
          ibmcloud is virtual-network-interfaces --output json | \
            jq -r '.[] | select(.name | test("e2e-.*-vni")) | .id' | \
            xargs -r -I {} ibmcloud is virtual-network-interface-delete {} --force || true

          # Clean up orphaned volumes
          echo "Cleaning up orphaned volumes..."
          ibmcloud is volumes --output json | \
            jq -r '.[] | select(.name | test("e2e-.*-boot")) | .id' | \
            xargs -r -I {} ibmcloud is volume-delete {} --force || true

          echo "โœ… Cleanup completed"