# E2E test workflow for PRs.
#
# Flow: gate (decide whether E2E may run) -> build (ko image push to ghcr) ->
# e2e (deploy PR image to a live cluster on a self-hosted runner and run the
# Go e2e suite test-by-test, collecting artifacts and cleaning up afterwards).
name: E2E Tests for PRs

on:
  workflow_dispatch: {}
  pull_request:
    types: [opened, synchronize, reopened]
  issue_comment:
    types: [created]

permissions:
  contents: read
  packages: write
  issues: read
  pull-requests: read

# One run per PR (or per manual dispatch); deliberately NOT cancelling
# in-progress runs because a half-finished E2E run leaves cloud resources
# behind that the cleanup steps of the running job must be allowed to remove.
concurrency:
  group: e2e-${{ github.event.pull_request.number || github.event.issue.number || github.run_id }}
  cancel-in-progress: false

env:
  E2E_APPROVAL_COMMENT: "/ok-to-e2e"
  GO_VERSION: "1.24"
  KO_VERSION: "0.15.4"
  KUBECTL_VERSION: "1.28.0"

jobs:
  gate:
    name: Gate (manual or approver comment)
    runs-on: ubuntu-latest
    outputs:
      approved: ${{ steps.decide.outputs.approved }}
      pr_number: ${{ steps.decide.outputs.pr_number }}
      head_sha: ${{ steps.decide.outputs.head_sha }}
      ref: ${{ steps.decide.outputs.ref }}
      image_tag: ${{ steps.decide.outputs.image_tag }}
      image_ref: ${{ steps.decide.outputs.image_ref }}
    steps:
      - id: decide
        uses: actions/github-script@v7
        with:
          script: |
            // Returns whether `username` has push-level access to this repo.
            // Any lookup failure is treated as "no access" (fail closed).
            async function checkUserPermission(username) {
              try {
                const { data: collaborator } = await github.rest.repos.getCollaboratorPermissionLevel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  username: username
                });
                const hasWriteAccess = ['admin', 'write', 'maintain'].includes(collaborator.permission);
                return { hasAccess: hasWriteAccess, permission: collaborator.permission };
              } catch (error) {
                console.log(`Could not check permissions for ${username}: ${error.message}`);
                return { hasAccess: false, permission: 'unknown' };
              }
            }

            let approved = false;
            let prNumber = '';
            let headSHA = context.sha;
            let ref = (context.ref || '').replace('refs/heads/', '');

            if (context.eventName === 'pull_request') {
              // PRs from users with write access are auto-approved; everyone
              // else needs a maintainer to comment the approval command.
              prNumber = String(context.payload.pull_request.number);
              headSHA = context.payload.pull_request.head.sha;
              ref = context.payload.pull_request.head.ref;
              const author = context.payload.pull_request.user.login;
              const permCheck = await checkUserPermission(author);
              if (permCheck.hasAccess) {
                approved = true;
                console.log(`Auto-approved E2E for ${author} (${permCheck.permission} access)`);
              } else {
                console.log(`E2E requires manual approval for ${author} (${permCheck.permission} access)`);
              }
            } else if (context.eventName === 'issue_comment') {
              const comment = context.payload.comment.body || '';
              const commenter = context.payload.comment.user.login || '';
              // Single source of truth for the approval command: the workflow
              // env (fallback kept identical for safety if env is unset).
              const approvalCommand = process.env.E2E_APPROVAL_COMMENT || '/ok-to-e2e';
              // Only comments on PRs count, and only from users with write access.
              if (context.payload.issue.pull_request && comment.includes(approvalCommand)) {
                const permCheck = await checkUserPermission(commenter);
                if (permCheck.hasAccess) {
                  approved = true;
                  console.log(`Manual E2E approval by ${commenter} (${permCheck.permission} access)`);
                  // issue_comment payloads carry no head SHA/ref, so fetch the PR.
                  const { data: pr } = await github.rest.pulls.get({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    pull_number: context.payload.issue.number
                  });
                  headSHA = pr.head.sha;
                  ref = pr.head.ref;
                  prNumber = String(context.payload.issue.number);
                } else {
                  console.log(`E2E approval denied for ${commenter} (${permCheck.permission} access)`);
                }
              }
            } else if (context.eventName === 'workflow_dispatch') {
              // Manual dispatch is always trusted (actor already needed repo access).
              approved = true;
            }

            // Image tag: "pr-<n>-<12-char sha>" for PRs, bare short SHA otherwise.
            const tag = (prNumber ? `pr-${prNumber}-` : '') + headSHA.substring(0, 12);
            const imageRef = `ghcr.io/${context.repo.owner}/${context.repo.repo}:${tag}`;
            console.log('Event:', context.eventName);
            console.log('Approved:', approved);
            console.log('PR:', prNumber);
            console.log('Image:', imageRef);
            core.setOutput('approved', approved ? 'true' : 'false');
            core.setOutput('pr_number', prNumber);
            core.setOutput('head_sha', headSHA);
            core.setOutput('ref', ref);
            core.setOutput('image_tag', tag);
            core.setOutput('image_ref', imageRef);

  build:
    name: Build PR image with ko
    needs: [gate]
    if: ${{ needs.gate.outputs.approved == 'true' }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.gate.outputs.ref }}
          fetch-depth: 0
      - uses: actions/setup-go@v4
        with:
          go-version: ${{ env.GO_VERSION }}
          cache: true
      - uses: ko-build/setup-ko@v0.6
        with:
          version: v${{ env.KO_VERSION }}
      - name: Build & push image
        env:
          KO_DOCKER_REPO: ghcr.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
          # Secrets are passed through env rather than interpolated into the
          # shell text (see GitHub's security-hardening guidance).
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          echo "$GHCR_TOKEN" | ko login ghcr.io --username "${{ github.actor }}" --password-stdin
          ko build ./cmd/controller --platform=linux/amd64 --bare --tags="${{ needs.gate.outputs.image_tag }}"
      - name: Cleanup old PR images
        if: ${{ needs.gate.outputs.pr_number != '' }}
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            // Best-effort GHCR garbage collection: keep the 3 newest images
            // tagged for this PR, delete the rest. Never fails the build.
            const packageName = context.repo.repo;
            const prNumber = '${{ needs.gate.outputs.pr_number }}';
            try {
              const { data: versions } = await github.rest.packages.getAllPackageVersionsForPackageOwnedByOrg({
                package_type: 'container',
                package_name: packageName,
                org: context.repo.owner,
                per_page: 100
              });
              const prVersions = versions
                .filter(version =>
                  version.metadata?.container?.tags?.some(tag => tag.startsWith(`pr-${prNumber}-`))
                )
                .sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
              const toDelete = prVersions.slice(3);
              console.log(`Found ${prVersions.length} images for PR #${prNumber}`);
              console.log(`Keeping latest 3, deleting ${toDelete.length} old images`);
              for (const version of toDelete) {
                try {
                  await github.rest.packages.deletePackageVersionForOrg({
                    package_type: 'container',
                    package_name: packageName,
                    org: context.repo.owner,
                    package_version_id: version.id
                  });
                  console.log(`Deleted image version ${version.id}`);
                } catch (error) {
                  console.log(`Failed to delete version ${version.id}: ${error.message}`);
                }
              }
            } catch (error) {
              console.log(`Failed to cleanup images: ${error.message}`);
            }

  e2e:
    name: Run E2E against PR image
    needs: [gate, build]
    if: ${{ needs.gate.outputs.approved == 'true' }}
    runs-on: [self-hosted, ibm-e2e]
    timeout-minutes: 210
    container:
      image: golang:1.24.6
      options: --user 0
    env:
      KUBECONFIG: /tmp/kubeconfig
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.gate.outputs.ref }}
      - name: Install dependencies
        run: |
          apt-get update && apt-get install -y curl jq
          curl -LO "https://dl.k8s.io/release/v${{ env.KUBECTL_VERSION }}/bin/linux/amd64/kubectl"
          chmod +x kubectl && mv kubectl /usr/local/bin/
          curl -fsSL https://clis.cloud.ibm.com/install/linux | sh
          ibmcloud plugin install vpc-infrastructure
      - name: Setup kubeconfig
        env:
          KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }}
        run: |
          # Secret is stored base64-encoded; printf avoids echo mangling it.
          printf '%s' "$KUBECONFIG_CONTENT" | base64 -d > /tmp/kubeconfig
          chmod 600 /tmp/kubeconfig
          kubectl version --client
      - name: Verify cluster access
        run: |
          kubectl cluster-info
          kubectl auth can-i create nodeclaims --all-namespaces
          kubectl auth can-i create nodepools --all-namespaces
          kubectl auth can-i create ibmnodeclasses --all-namespaces
      - name: Configure IBM Cloud CLI
        env:
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }}
        run: |
          ibmcloud login --apikey "$IBMCLOUD_API_KEY" -r "$IBMCLOUD_REGION"
      - name: Deploy PR version
        run: |
          kubectl apply -f charts/crds/
          echo "🔍 Checking deployment structure..."
          kubectl get deployment karpenter-karpenter-ibm -n karpenter -o yaml | grep -A 10 "containers:"
          # Container name is discovered at runtime so a chart rename does not break us.
          CONTAINER_NAME=$(kubectl get deployment karpenter-karpenter-ibm -n karpenter -o jsonpath='{.spec.template.spec.containers[0].name}')
          echo "📋 Found container name: $CONTAINER_NAME"
          kubectl set image deployment/karpenter-karpenter-ibm \
            $CONTAINER_NAME=${{ needs.gate.outputs.image_ref }} \
            -n karpenter
          kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=300s
          CURRENT_IMAGE=$(kubectl get deployment karpenter-karpenter-ibm -n karpenter -o jsonpath='{.spec.template.spec.containers[0].image}')
          echo "✅ Deployment updated to: $CURRENT_IMAGE"
      - name: Pre-test cleanup
        run: |
          echo "🧹 Cleaning up any existing e2e test resources..."
          kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
          kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
          kubectl delete nodeclaims -l test=e2e --timeout=300s || true
          kubectl delete nodepools -l test=e2e --timeout=300s || true
          kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
          echo "⏳ Waiting for cluster stabilization..."
          # Up to 5 minutes for leftover e2e pods to drain.
          for i in {1..30}; do
            pending_pods=$(kubectl get pods -l test=e2e --all-namespaces --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l)
            if [ "$pending_pods" -eq 0 ]; then
              echo "✅ No pending e2e pods found"
              break
            fi
            echo "⏳ Still have $pending_pods pending e2e pods, waiting..."
            sleep 10
          done
          # Up to 5 minutes for Karpenter-disrupted nodes to go away.
          for i in {1..30}; do
            disrupted_nodes=$(kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,TAINTS:.spec.taints[*].key" | grep -c "karpenter.sh/disrupted" 2>/dev/null || echo "0")
            # grep -c already prints 0 on no match before the || fallback fires,
            # so normalise whatever we captured back to a single number.
            disrupted_nodes=$(echo "$disrupted_nodes" | tr -d '\n' | grep -o '[0-9]*' || echo "0")
            if [ "$disrupted_nodes" -eq 0 ]; then
              echo "✅ No disrupted nodes found"
              break
            fi
            echo "⏳ Still have $disrupted_nodes disrupted nodes, waiting..."
            sleep 10
          done
          sleep 30
          echo "✅ Pre-test cleanup completed"
      - name: Run E2E tests
        env:
          RUN_E2E_TESTS: "true"
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          VPC_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }}
          TEST_VPC_ID: ${{ secrets.E2E_TEST_VPC_ID }}
          TEST_SUBNET_ID: ${{ secrets.E2E_TEST_SUBNET_ID }}
          TEST_IMAGE_ID: ${{ secrets.E2E_TEST_IMAGE_ID }}
          TEST_ZONE: ${{ secrets.E2E_TEST_ZONE }}
          TEST_SECURITY_GROUP_ID: ${{ secrets.E2E_TEST_SECURITY_GROUP_ID }}
          VPC_URL: ${{ secrets.VPC_URL }}
          KUBERNETES_API_SERVER_ENDPOINT: ${{ secrets.KUBERNETES_API_SERVER_ENDPOINT }}
          IBM_RESOURCE_GROUP_ID: ${{ secrets.IBM_RESOURCE_GROUP_ID }}
          IBM_SSH_KEY_ID: ${{ secrets.IBM_SSH_KEY_ID }}
          E2E_SEQUENTIAL: "true"
          E2E_CLEANUP_TIMEOUT: "300s"
          E2E_STABILIZATION_WAIT: "60s"
        run: |
          echo "🚀 Starting E2E test suite..."

          # Define test groups
          # Core functionality tests from basic_workflow_test.go
          core_tests="TestE2EFullWorkflow TestE2ENodePoolInstanceTypeSelection TestE2EInstanceTypeSelection TestE2EDriftStability"

          # NodeClass validation tests from validation_test.go
          validation_tests="TestE2ENodeClassValidation TestE2EValidNodeClassCreation TestE2ENodeClassWithMissingFields"

          # Block device mapping tests from block_device_test.go
          block_device_tests="TestE2EBlockDeviceMapping TestE2EBlockDeviceMappingValidation"

          # Scheduling constraint tests from scheduling_test.go and e2e_taints_test.go
          scheduling_tests="TestE2EPodDisruptionBudget TestE2EConsolidationWithPDB TestE2EPodAntiAffinity TestE2ENodeAffinity TestE2EStartupTaints TestE2EStartupTaintsRemoval TestE2ETaintsBasicScheduling TestE2ETaintValues TestE2ETaintSync TestE2EUnregisteredTaintHandling"

          # UserData feature tests from userdata_test.go
          userdata_tests="TestE2EUserDataAppend TestE2EStandardBootstrap"

          # Image selector tests from image_selector_test.go
          image_selector_tests="TestE2EImageSelector"

          # Multi-zone tests from multizone_test.go
          multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover"

          # Cleanup tests from cleanup_test.go
          cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources"

          # Combine all tests
          all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests"

          test_failed="false"
          passed_tests=0
          failed_tests=0
          total_tests=$(echo $all_tests | wc -w)

          echo "📋 Test Suite Summary:"
          echo " Core Tests: $(echo $core_tests | wc -w)"
          echo " Validation Tests: $(echo $validation_tests | wc -w)"
          echo " Block Device Tests: $(echo $block_device_tests | wc -w)"
          echo " Scheduling Tests: $(echo $scheduling_tests | wc -w)"
          echo " UserData Tests: $(echo $userdata_tests | wc -w)"
          echo " Image Selector Tests: $(echo $image_selector_tests | wc -w)"
          echo " Multi-Zone Tests: $(echo $multizone_tests | wc -w)"
          echo " Cleanup Tests: $(echo $cleanup_tests | wc -w)"
          echo " Total Tests: $total_tests"
          echo ""

          # Tests run strictly sequentially: each gets its own timeout, log
          # file, failure diagnostics, and cluster cleanup before the next.
          for test in $all_tests; do
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            echo "🧪 Running test: $test"
            echo "Progress: $((passed_tests + failed_tests + 1))/$total_tests"
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

            # Set appropriate timeout based on test type
            timeout="20m"
            case "$test" in
              "TestE2EDriftStability")
                timeout="30m"  # Drift test needs more time for monitoring
                ;;
              "TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*)
                timeout="25m"  # Multi-zone tests need extra time for cross-zone provisioning
                ;;
              "TestE2ECleanup"*)
                timeout="15m"  # Cleanup tests are typically faster
                ;;
              "TestE2EValidation"*|"TestE2ENodeClass"*)
                timeout="10m"  # Validation tests are quick
                ;;
              *)
                timeout="20m"  # Default timeout for other tests
                ;;
            esac

            # Create test-specific log file to capture all output
            test_log="test-artifacts/${test}-$(date +%s).log"
            mkdir -p test-artifacts

            # Run test with enhanced logging and crash recovery.
            # NOTE: the runner's default shell is `bash -eo pipefail`, so $?
            # after the pipe reflects `go test`, not `tee`.
            set +e  # Don't exit on failure
            timeout $timeout go test -tags=e2e -v -timeout $timeout ./test/e2e -run "^$test$" -count=1 2>&1 | tee "$test_log"
            test_exit_code=$?
            set -e  # Re-enable exit on failure

            if [ $test_exit_code -eq 0 ]; then
              echo "✅ Test $test passed"
              passed_tests=$((passed_tests + 1))
            else
              echo "❌ Test $test failed (exit code: $test_exit_code)"
              failed_tests=$((failed_tests + 1))

              # Enhanced debug information on failure
              echo "📊 Debug information for failed test $test:"
              echo " Exit code: $test_exit_code"
              echo " Log file: $test_log"

              # Collect system state
              kubectl get nodes --no-headers | wc -l | xargs echo " Total nodes:"
              kubectl get nodeclaims --no-headers 2>/dev/null | wc -l | xargs echo " Total nodeclaims:" || echo " Total nodeclaims: 0"
              kubectl get pods -l test=e2e --all-namespaces --no-headers 2>/dev/null | wc -l | xargs echo " Total e2e pods:" || echo " Total e2e pods: 0"

              # Collect Karpenter pod status
              echo " Karpenter pod status:"
              kubectl get pods -n karpenter -l app.kubernetes.io/name=karpenter --no-headers 2>/dev/null || echo " No Karpenter pods found"

              # Collect recent events (errors and warnings)
              echo " Recent warning events:"
              kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -5 || echo " No warning events"

              # Check for panic or crash indicators in test log
              if grep -i "panic\|fatal\|segmentation\|killed" "$test_log" >/dev/null 2>&1; then
                echo " ⚠️ Test appears to have crashed (panic/fatal error detected)"
              fi

              # Collect Karpenter logs immediately after failure
              kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=100 > "test-artifacts/karpenter-logs-${test}-$(date +%s).txt" 2>/dev/null || echo " Failed to collect Karpenter logs"

              test_failed="true"
            fi

            echo "🧹 Cleaning up after test: $test"
            kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
            kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
            kubectl delete nodeclaims -l test=e2e --timeout=300s || true
            kubectl delete nodepools -l test=e2e --timeout=300s || true
            kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
            echo "⏳ Waiting for cleanup to complete..."
            sleep 30
            # Fallbacks needed: grep -c exits non-zero on zero matches and
            # pipefail would otherwise abort the whole loop.
            kubectl get nodes --no-headers | grep -c Ready | xargs echo "Ready nodes:" || echo "Ready nodes: 0"
            kubectl get nodeclaims --no-headers | grep -c True | xargs echo "Ready nodeclaims:" || echo "Ready nodeclaims: 0"
            echo "✅ Completed test: $test"
            echo ""
          done

          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo "📊 Test Suite Results:"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo " Total Tests: $total_tests"
          echo " ✅ Passed: $passed_tests"
          echo " ❌ Failed: $failed_tests"
          echo " Success Rate: $((passed_tests * 100 / total_tests))%"
          echo ""

          if [ "$test_failed" = "true" ]; then
            echo "❌ Test suite failed with $failed_tests failures"
            exit 1
          fi
          echo "✅ All E2E tests completed successfully!"
      - name: Collect test artifacts
        if: always()
        run: |
          echo "📦 Collecting comprehensive test artifacts..."
          mkdir -p test-artifacts

          # Collect Karpenter logs with different tail sizes for completeness
          echo " Collecting Karpenter logs..."
          kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=2000 > test-artifacts/karpenter-logs.txt 2>/dev/null || echo "Failed to collect current Karpenter logs" > test-artifacts/karpenter-logs.txt
          kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --previous --tail=1000 > test-artifacts/karpenter-logs-previous.txt 2>/dev/null || echo "No previous Karpenter logs available" > test-artifacts/karpenter-logs-previous.txt

          # Collect events with different filters
          echo " Collecting events..."
          kubectl get events -A --sort-by='.lastTimestamp' > test-artifacts/events.txt 2>/dev/null || echo "Failed to collect events" > test-artifacts/events.txt
          kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' > test-artifacts/events-warnings.txt 2>/dev/null || echo "No warning events" > test-artifacts/events-warnings.txt
          kubectl get events -A --field-selector type=Normal --sort-by='.lastTimestamp' | tail -50 > test-artifacts/events-normal-recent.txt 2>/dev/null || echo "No normal events" > test-artifacts/events-normal-recent.txt

          # Collect resource states (printf, not echo, so the \n in the
          # empty-List fallback becomes real newlines and stays valid YAML)
          echo " Collecting resource states..."
          kubectl get nodes -o wide > test-artifacts/nodes.txt 2>/dev/null || echo "Failed to collect nodes" > test-artifacts/nodes.txt
          kubectl get nodeclaims -o yaml > test-artifacts/nodeclaims.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodeclaims.yaml
          kubectl get nodepools -o yaml > test-artifacts/nodepools.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodepools.yaml
          kubectl get ibmnodeclasses -o yaml > test-artifacts/ibmnodeclasses.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/ibmnodeclasses.yaml

          # Collect Karpenter deployment status
          echo " Collecting Karpenter deployment status..."
          kubectl describe deployment -n karpenter karpenter-karpenter-ibm > test-artifacts/karpenter-deployment.txt 2>/dev/null || echo "Failed to describe Karpenter deployment" > test-artifacts/karpenter-deployment.txt
          kubectl get pods -n karpenter -o wide > test-artifacts/karpenter-pods.txt 2>/dev/null || echo "Failed to get Karpenter pods" > test-artifacts/karpenter-pods.txt

          # Collect any crash dumps or additional logs
          echo " Collecting additional diagnostics..."
          kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded > test-artifacts/problematic-pods.txt 2>/dev/null || echo "No problematic pods found" > test-artifacts/problematic-pods.txt

          # Create summary of artifacts
          echo " Creating artifact summary..."
          {
            echo "E2E Test Artifacts Summary"
            echo "========================="
            echo "Generated: $(date)"
            echo "Test run ID: ${{ github.run_id }}"
            echo ""
            echo "Files collected:"
            ls -la test-artifacts/ 2>/dev/null || echo "No artifacts directory"
          } > test-artifacts/README.txt

          echo "✅ Test artifact collection completed"
      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: e2e-test-artifacts-${{ github.run_id }}
          path: test-artifacts/
          retention-days: 7
      - name: Cleanup test resources
        if: always()
        env:
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
        run: |
          echo "🧹 Starting comprehensive cleanup..."
          kubectl delete pods -l test=e2e --all-namespaces --timeout=10m || true
          kubectl delete deployments -l test=e2e --all-namespaces --timeout=10m || true
          kubectl delete nodeclaims -l test=e2e --timeout=10m || true
          kubectl delete nodepools -l test=e2e --timeout=10m || true
          kubectl delete ibmnodeclasses -l test=e2e --timeout=10m || true
          # Strip finalizers from anything the deletes above left stuck.
          kubectl patch nodeclaims --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true
          kubectl patch nodepools --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true
          kubectl patch ibmnodeclasses --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true
          # Sweep orphaned IBM Cloud resources by tag/name convention.
          ibmcloud is instances --output json | \
            jq -r '.[] | select(.tags | index("karpenter-e2e")) | .id' | \
            xargs -I {} ibmcloud is instance-delete {} --force || true
          ibmcloud is virtual-network-interfaces --output json | \
            jq -r '.[] | select(.name | test("e2e-.*-vni")) | .id' | \
            xargs -I {} ibmcloud is virtual-network-interface-delete {} --force || true
          ibmcloud is volumes --output json | \
            jq -r '.[] | select(.name | test("e2e-.*-boot")) | .id' | \
            xargs -I {} ibmcloud is volume-delete {} --force || true
          echo "✅ Cleanup completed"
      - name: Restore original deployment
        if: always()
        run: |
          echo "🔄 Restoring original karpenter deployment..."
          kubectl rollout restart deployment/karpenter-karpenter-ibm -n karpenter
          kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=300s