# E2E test workflow for PRs.
#
# Flow: gate (decide whether E2E may run) -> build (ko image push to ghcr) ->
# e2e (deploy PR image to a live cluster on a self-hosted runner and run the
# Go e2e suite test-by-test, collecting artifacts and cleaning up afterwards).
name: E2E Tests for PRs

on:
  workflow_dispatch: {}
  pull_request:
    types: [opened, synchronize, reopened]
  issue_comment:
    types: [created]

permissions:
  contents: read
  packages: write
  issues: read
  pull-requests: read

# One run per PR (or per manual dispatch); deliberately NOT cancelling
# in-progress runs because a half-finished E2E run leaves cloud resources
# behind that the cleanup steps of the running job must be allowed to remove.
concurrency:
  group: e2e-${{ github.event.pull_request.number || github.event.issue.number || github.run_id }}
  cancel-in-progress: false

env:
  E2E_APPROVAL_COMMENT: "/ok-to-e2e"
  GO_VERSION: "1.24"
  KO_VERSION: "0.15.4"
  KUBECTL_VERSION: "1.28.0"

jobs:
  gate:
    name: Gate (manual or approver comment)
    runs-on: ubuntu-latest
    outputs:
      approved: ${{ steps.decide.outputs.approved }}
      pr_number: ${{ steps.decide.outputs.pr_number }}
      head_sha: ${{ steps.decide.outputs.head_sha }}
      ref: ${{ steps.decide.outputs.ref }}
      image_tag: ${{ steps.decide.outputs.image_tag }}
      image_ref: ${{ steps.decide.outputs.image_ref }}
    steps:
      - id: decide
        uses: actions/github-script@v7
        with:
          script: |
            // Returns whether `username` has push-level access to this repo.
            // Any lookup failure is treated as "no access" (fail closed).
            async function checkUserPermission(username) {
              try {
                const { data: collaborator } = await github.rest.repos.getCollaboratorPermissionLevel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  username: username
                });
                const hasWriteAccess = ['admin', 'write', 'maintain'].includes(collaborator.permission);
                return { hasAccess: hasWriteAccess, permission: collaborator.permission };
              } catch (error) {
                console.log(`Could not check permissions for ${username}: ${error.message}`);
                return { hasAccess: false, permission: 'unknown' };
              }
            }

            let approved = false;
            let prNumber = '';
            let headSHA = context.sha;
            let ref = (context.ref || '').replace('refs/heads/', '');

            if (context.eventName === 'pull_request') {
              // PRs from users with write access are auto-approved; everyone
              // else needs a maintainer to comment the approval command.
              prNumber = String(context.payload.pull_request.number);
              headSHA = context.payload.pull_request.head.sha;
              ref = context.payload.pull_request.head.ref;
              const author = context.payload.pull_request.user.login;
              const permCheck = await checkUserPermission(author);
              if (permCheck.hasAccess) {
                approved = true;
                console.log(`Auto-approved E2E for ${author} (${permCheck.permission} access)`);
              } else {
                console.log(`E2E requires manual approval for ${author} (${permCheck.permission} access)`);
              }
            } else if (context.eventName === 'issue_comment') {
              const comment = context.payload.comment.body || '';
              const commenter = context.payload.comment.user.login || '';
              // Single source of truth for the approval command: the workflow
              // env (fallback kept identical for safety if env is unset).
              const approvalCommand = process.env.E2E_APPROVAL_COMMENT || '/ok-to-e2e';
              // Only comments on PRs count, and only from users with write access.
              if (context.payload.issue.pull_request && comment.includes(approvalCommand)) {
                const permCheck = await checkUserPermission(commenter);
                if (permCheck.hasAccess) {
                  approved = true;
                  console.log(`Manual E2E approval by ${commenter} (${permCheck.permission} access)`);
                  // issue_comment payloads carry no head SHA/ref, so fetch the PR.
                  const { data: pr } = await github.rest.pulls.get({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    pull_number: context.payload.issue.number
                  });
                  headSHA = pr.head.sha;
                  ref = pr.head.ref;
                  prNumber = String(context.payload.issue.number);
                } else {
                  console.log(`E2E approval denied for ${commenter} (${permCheck.permission} access)`);
                }
              }
            } else if (context.eventName === 'workflow_dispatch') {
              // Manual dispatch is always trusted (actor already needed repo access).
              approved = true;
            }

            // Image tag: "pr-<n>-<12-char sha>" for PRs, bare short SHA otherwise.
            const tag = (prNumber ? `pr-${prNumber}-` : '') + headSHA.substring(0, 12);
            const imageRef = `ghcr.io/${context.repo.owner}/${context.repo.repo}:${tag}`;
            console.log('Event:', context.eventName);
            console.log('Approved:', approved);
            console.log('PR:', prNumber);
            console.log('Image:', imageRef);
            core.setOutput('approved', approved ? 'true' : 'false');
            core.setOutput('pr_number', prNumber);
            core.setOutput('head_sha', headSHA);
            core.setOutput('ref', ref);
            core.setOutput('image_tag', tag);
            core.setOutput('image_ref', imageRef);

  build:
    name: Build PR image with ko
    needs: [gate]
    if: ${{ needs.gate.outputs.approved == 'true' }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.gate.outputs.ref }}
          fetch-depth: 0
      - uses: actions/setup-go@v4
        with:
          go-version: ${{ env.GO_VERSION }}
          cache: true
      - uses: ko-build/setup-ko@v0.6
        with:
          version: v${{ env.KO_VERSION }}
      - name: Build & push image
        env:
          KO_DOCKER_REPO: ghcr.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
          # Secrets are passed through env rather than interpolated into the
          # shell text (see GitHub's security-hardening guidance).
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          echo "$GHCR_TOKEN" | ko login ghcr.io --username "${{ github.actor }}" --password-stdin
          ko build ./cmd/controller --platform=linux/amd64 --bare --tags="${{ needs.gate.outputs.image_tag }}"
      - name: Cleanup old PR images
        if: ${{ needs.gate.outputs.pr_number != '' }}
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            // Best-effort GHCR garbage collection: keep the 3 newest images
            // tagged for this PR, delete the rest. Never fails the build.
            const packageName = context.repo.repo;
            const prNumber = '${{ needs.gate.outputs.pr_number }}';
            try {
              const { data: versions } = await github.rest.packages.getAllPackageVersionsForPackageOwnedByOrg({
                package_type: 'container',
                package_name: packageName,
                org: context.repo.owner,
                per_page: 100
              });
              const prVersions = versions
                .filter(version =>
                  version.metadata?.container?.tags?.some(tag => tag.startsWith(`pr-${prNumber}-`))
                )
                .sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
              const toDelete = prVersions.slice(3);
              console.log(`Found ${prVersions.length} images for PR #${prNumber}`);
              console.log(`Keeping latest 3, deleting ${toDelete.length} old images`);
              for (const version of toDelete) {
                try {
                  await github.rest.packages.deletePackageVersionForOrg({
                    package_type: 'container',
                    package_name: packageName,
                    org: context.repo.owner,
                    package_version_id: version.id
                  });
                  console.log(`Deleted image version ${version.id}`);
                } catch (error) {
                  console.log(`Failed to delete version ${version.id}: ${error.message}`);
                }
              }
            } catch (error) {
              console.log(`Failed to cleanup images: ${error.message}`);
            }

  e2e:
    name: Run E2E against PR image
    needs: [gate, build]
    if: ${{ needs.gate.outputs.approved == 'true' }}
    runs-on: [self-hosted, ibm-e2e]
    timeout-minutes: 210
    container:
      image: golang:1.24.6
      options: --user 0
    env:
      KUBECONFIG: /tmp/kubeconfig
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ needs.gate.outputs.ref }}
      - name: Install dependencies
        run: |
          apt-get update && apt-get install -y curl jq
          curl -LO "https://dl.k8s.io/release/v${{ env.KUBECTL_VERSION }}/bin/linux/amd64/kubectl"
          chmod +x kubectl && mv kubectl /usr/local/bin/
          curl -fsSL https://clis.cloud.ibm.com/install/linux | sh
          ibmcloud plugin install vpc-infrastructure
      - name: Setup kubeconfig
        env:
          KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }}
        run: |
          # Secret is stored base64-encoded; printf avoids echo mangling it.
          printf '%s' "$KUBECONFIG_CONTENT" | base64 -d > /tmp/kubeconfig
          chmod 600 /tmp/kubeconfig
          kubectl version --client
      - name: Verify cluster access
        run: |
          kubectl cluster-info
          kubectl auth can-i create nodeclaims --all-namespaces
          kubectl auth can-i create nodepools --all-namespaces
          kubectl auth can-i create ibmnodeclasses --all-namespaces
      - name: Configure IBM Cloud CLI
        env:
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }}
        run: |
          ibmcloud login --apikey "$IBMCLOUD_API_KEY" -r "$IBMCLOUD_REGION"
      - name: Deploy PR version
        run: |
          kubectl apply -f charts/crds/
          echo "🔍 Checking deployment structure..."
          kubectl get deployment karpenter-karpenter-ibm -n karpenter -o yaml | grep -A 10 "containers:"
          # Container name is discovered at runtime so a chart rename does not break us.
          CONTAINER_NAME=$(kubectl get deployment karpenter-karpenter-ibm -n karpenter -o jsonpath='{.spec.template.spec.containers[0].name}')
          echo "📋 Found container name: $CONTAINER_NAME"
          kubectl set image deployment/karpenter-karpenter-ibm \
            $CONTAINER_NAME=${{ needs.gate.outputs.image_ref }} \
            -n karpenter
          kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=300s
          CURRENT_IMAGE=$(kubectl get deployment karpenter-karpenter-ibm -n karpenter -o jsonpath='{.spec.template.spec.containers[0].image}')
          echo "✅ Deployment updated to: $CURRENT_IMAGE"
      - name: Pre-test cleanup
        run: |
          echo "🧹 Cleaning up any existing e2e test resources..."
          kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
          kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
          kubectl delete nodeclaims -l test=e2e --timeout=300s || true
          kubectl delete nodepools -l test=e2e --timeout=300s || true
          kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
          echo "⏳ Waiting for cluster stabilization..."
          # Up to 5 minutes for leftover e2e pods to drain.
          for i in {1..30}; do
            pending_pods=$(kubectl get pods -l test=e2e --all-namespaces --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l)
            if [ "$pending_pods" -eq 0 ]; then
              echo "✅ No pending e2e pods found"
              break
            fi
            echo "⏳ Still have $pending_pods pending e2e pods, waiting..."
            sleep 10
          done
          # Up to 5 minutes for Karpenter-disrupted nodes to go away.
          for i in {1..30}; do
            disrupted_nodes=$(kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,TAINTS:.spec.taints[*].key" | grep -c "karpenter.sh/disrupted" 2>/dev/null || echo "0")
            # grep -c already prints 0 on no match before the || fallback fires,
            # so normalise whatever we captured back to a single number.
            disrupted_nodes=$(echo "$disrupted_nodes" | tr -d '\n' | grep -o '[0-9]*' || echo "0")
            if [ "$disrupted_nodes" -eq 0 ]; then
              echo "✅ No disrupted nodes found"
              break
            fi
            echo "⏳ Still have $disrupted_nodes disrupted nodes, waiting..."
            sleep 10
          done
          sleep 30
          echo "✅ Pre-test cleanup completed"
      - name: Run E2E tests
        env:
          RUN_E2E_TESTS: "true"
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          VPC_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
          IBMCLOUD_REGION: ${{ secrets.IBMCLOUD_REGION }}
          TEST_VPC_ID: ${{ secrets.E2E_TEST_VPC_ID }}
          TEST_SUBNET_ID: ${{ secrets.E2E_TEST_SUBNET_ID }}
          TEST_IMAGE_ID: ${{ secrets.E2E_TEST_IMAGE_ID }}
          TEST_ZONE: ${{ secrets.E2E_TEST_ZONE }}
          TEST_SECURITY_GROUP_ID: ${{ secrets.E2E_TEST_SECURITY_GROUP_ID }}
          VPC_URL: ${{ secrets.VPC_URL }}
          KUBERNETES_API_SERVER_ENDPOINT: ${{ secrets.KUBERNETES_API_SERVER_ENDPOINT }}
          IBM_RESOURCE_GROUP_ID: ${{ secrets.IBM_RESOURCE_GROUP_ID }}
          IBM_SSH_KEY_ID: ${{ secrets.IBM_SSH_KEY_ID }}
          E2E_SEQUENTIAL: "true"
          E2E_CLEANUP_TIMEOUT: "300s"
          E2E_STABILIZATION_WAIT: "60s"
        run: |
          echo "🚀 Starting E2E test suite..."

          # Define test groups
          # Core functionality tests from basic_workflow_test.go
          core_tests="TestE2EFullWorkflow TestE2ENodePoolInstanceTypeSelection TestE2EInstanceTypeSelection TestE2EDriftStability"

          # NodeClass validation tests from validation_test.go
          validation_tests="TestE2ENodeClassValidation TestE2EValidNodeClassCreation TestE2ENodeClassWithMissingFields"

          # Block device mapping tests from block_device_test.go
          block_device_tests="TestE2EBlockDeviceMapping TestE2EBlockDeviceMappingValidation"

          # Scheduling constraint tests from scheduling_test.go and e2e_taints_test.go
          scheduling_tests="TestE2EPodDisruptionBudget TestE2EConsolidationWithPDB TestE2EPodAntiAffinity TestE2ENodeAffinity TestE2EStartupTaints TestE2EStartupTaintsRemoval TestE2ETaintsBasicScheduling TestE2ETaintValues TestE2ETaintSync TestE2EUnregisteredTaintHandling"

          # UserData feature tests from userdata_test.go
          userdata_tests="TestE2EUserDataAppend TestE2EStandardBootstrap"

          # Image selector tests from image_selector_test.go
          image_selector_tests="TestE2EImageSelector"

          # Multi-zone tests from multizone_test.go
          multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover"

          # Cleanup tests from cleanup_test.go
          cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources"

          # Combine all tests
          all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests"

          test_failed="false"
          passed_tests=0
          failed_tests=0
          total_tests=$(echo $all_tests | wc -w)

          echo "📋 Test Suite Summary:"
          echo " Core Tests: $(echo $core_tests | wc -w)"
          echo " Validation Tests: $(echo $validation_tests | wc -w)"
          echo " Block Device Tests: $(echo $block_device_tests | wc -w)"
          echo " Scheduling Tests: $(echo $scheduling_tests | wc -w)"
          echo " UserData Tests: $(echo $userdata_tests | wc -w)"
          echo " Image Selector Tests: $(echo $image_selector_tests | wc -w)"
          echo " Multi-Zone Tests: $(echo $multizone_tests | wc -w)"
          echo " Cleanup Tests: $(echo $cleanup_tests | wc -w)"
          echo " Total Tests: $total_tests"
          echo ""

          # Tests run strictly sequentially: each gets its own timeout, log
          # file, failure diagnostics, and cluster cleanup before the next.
          for test in $all_tests; do
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
            echo "🧪 Running test: $test"
            echo "Progress: $((passed_tests + failed_tests + 1))/$total_tests"
            echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

            # Set appropriate timeout based on test type
            timeout="20m"
            case "$test" in
              "TestE2EDriftStability")
                timeout="30m"  # Drift test needs more time for monitoring
                ;;
              "TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*)
                timeout="25m"  # Multi-zone tests need extra time for cross-zone provisioning
                ;;
              "TestE2ECleanup"*)
                timeout="15m"  # Cleanup tests are typically faster
                ;;
              "TestE2EValidation"*|"TestE2ENodeClass"*)
                timeout="10m"  # Validation tests are quick
                ;;
              *)
                timeout="20m"  # Default timeout for other tests
                ;;
            esac

            # Create test-specific log file to capture all output
            test_log="test-artifacts/${test}-$(date +%s).log"
            mkdir -p test-artifacts

            # Run test with enhanced logging and crash recovery.
            # NOTE: the runner's default shell is `bash -eo pipefail`, so $?
            # after the pipe reflects `go test`, not `tee`.
            set +e  # Don't exit on failure
            timeout $timeout go test -tags=e2e -v -timeout $timeout ./test/e2e -run "^$test$" -count=1 2>&1 | tee "$test_log"
            test_exit_code=$?
            set -e  # Re-enable exit on failure

            if [ $test_exit_code -eq 0 ]; then
              echo "✅ Test $test passed"
              passed_tests=$((passed_tests + 1))
            else
              echo "❌ Test $test failed (exit code: $test_exit_code)"
              failed_tests=$((failed_tests + 1))

              # Enhanced debug information on failure
              echo "📊 Debug information for failed test $test:"
              echo " Exit code: $test_exit_code"
              echo " Log file: $test_log"

              # Collect system state
              kubectl get nodes --no-headers | wc -l | xargs echo " Total nodes:"
              kubectl get nodeclaims --no-headers 2>/dev/null | wc -l | xargs echo " Total nodeclaims:" || echo " Total nodeclaims: 0"
              kubectl get pods -l test=e2e --all-namespaces --no-headers 2>/dev/null | wc -l | xargs echo " Total e2e pods:" || echo " Total e2e pods: 0"

              # Collect Karpenter pod status
              echo " Karpenter pod status:"
              kubectl get pods -n karpenter -l app.kubernetes.io/name=karpenter --no-headers 2>/dev/null || echo " No Karpenter pods found"

              # Collect recent events (errors and warnings)
              echo " Recent warning events:"
              kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -5 || echo " No warning events"

              # Check for panic or crash indicators in test log
              if grep -i "panic\|fatal\|segmentation\|killed" "$test_log" >/dev/null 2>&1; then
                echo " ⚠️ Test appears to have crashed (panic/fatal error detected)"
              fi

              # Collect Karpenter logs immediately after failure
              kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=100 > "test-artifacts/karpenter-logs-${test}-$(date +%s).txt" 2>/dev/null || echo " Failed to collect Karpenter logs"

              test_failed="true"
            fi

            echo "🧹 Cleaning up after test: $test"
            kubectl delete pods -l test=e2e --all-namespaces --timeout=300s || true
            kubectl delete deployments -l test=e2e --all-namespaces --timeout=300s || true
            kubectl delete nodeclaims -l test=e2e --timeout=300s || true
            kubectl delete nodepools -l test=e2e --timeout=300s || true
            kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
            echo "⏳ Waiting for cleanup to complete..."
            sleep 30
            # Fallbacks needed: grep -c exits non-zero on zero matches and
            # pipefail would otherwise abort the whole loop.
            kubectl get nodes --no-headers | grep -c Ready | xargs echo "Ready nodes:" || echo "Ready nodes: 0"
            kubectl get nodeclaims --no-headers | grep -c True | xargs echo "Ready nodeclaims:" || echo "Ready nodeclaims: 0"
            echo "✅ Completed test: $test"
            echo ""
          done

          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo "📊 Test Suite Results:"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo " Total Tests: $total_tests"
          echo " ✅ Passed: $passed_tests"
          echo " ❌ Failed: $failed_tests"
          echo " Success Rate: $((passed_tests * 100 / total_tests))%"
          echo ""

          if [ "$test_failed" = "true" ]; then
            echo "❌ Test suite failed with $failed_tests failures"
            exit 1
          fi
          echo "✅ All E2E tests completed successfully!"
      - name: Collect test artifacts
        if: always()
        run: |
          echo "📦 Collecting comprehensive test artifacts..."
          mkdir -p test-artifacts

          # Collect Karpenter logs with different tail sizes for completeness
          echo " Collecting Karpenter logs..."
          kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=2000 > test-artifacts/karpenter-logs.txt 2>/dev/null || echo "Failed to collect current Karpenter logs" > test-artifacts/karpenter-logs.txt
          kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --previous --tail=1000 > test-artifacts/karpenter-logs-previous.txt 2>/dev/null || echo "No previous Karpenter logs available" > test-artifacts/karpenter-logs-previous.txt

          # Collect events with different filters
          echo " Collecting events..."
          kubectl get events -A --sort-by='.lastTimestamp' > test-artifacts/events.txt 2>/dev/null || echo "Failed to collect events" > test-artifacts/events.txt
          kubectl get events -A --field-selector type=Warning --sort-by='.lastTimestamp' > test-artifacts/events-warnings.txt 2>/dev/null || echo "No warning events" > test-artifacts/events-warnings.txt
          kubectl get events -A --field-selector type=Normal --sort-by='.lastTimestamp' | tail -50 > test-artifacts/events-normal-recent.txt 2>/dev/null || echo "No normal events" > test-artifacts/events-normal-recent.txt

          # Collect resource states (printf, not echo, so the \n in the
          # empty-List fallback becomes real newlines and stays valid YAML)
          echo " Collecting resource states..."
          kubectl get nodes -o wide > test-artifacts/nodes.txt 2>/dev/null || echo "Failed to collect nodes" > test-artifacts/nodes.txt
          kubectl get nodeclaims -o yaml > test-artifacts/nodeclaims.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodeclaims.yaml
          kubectl get nodepools -o yaml > test-artifacts/nodepools.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/nodepools.yaml
          kubectl get ibmnodeclasses -o yaml > test-artifacts/ibmnodeclasses.yaml 2>/dev/null || printf 'apiVersion: v1\nitems: []\nkind: List\n' > test-artifacts/ibmnodeclasses.yaml

          # Collect Karpenter deployment status
          echo " Collecting Karpenter deployment status..."
          kubectl describe deployment -n karpenter karpenter-karpenter-ibm > test-artifacts/karpenter-deployment.txt 2>/dev/null || echo "Failed to describe Karpenter deployment" > test-artifacts/karpenter-deployment.txt
          kubectl get pods -n karpenter -o wide > test-artifacts/karpenter-pods.txt 2>/dev/null || echo "Failed to get Karpenter pods" > test-artifacts/karpenter-pods.txt

          # Collect any crash dumps or additional logs
          echo " Collecting additional diagnostics..."
          kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded > test-artifacts/problematic-pods.txt 2>/dev/null || echo "No problematic pods found" > test-artifacts/problematic-pods.txt

          # Create summary of artifacts
          echo " Creating artifact summary..."
          {
            echo "E2E Test Artifacts Summary"
            echo "========================="
            echo "Generated: $(date)"
            echo "Test run ID: ${{ github.run_id }}"
            echo ""
            echo "Files collected:"
            ls -la test-artifacts/ 2>/dev/null || echo "No artifacts directory"
          } > test-artifacts/README.txt

          echo "✅ Test artifact collection completed"
      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: e2e-test-artifacts-${{ github.run_id }}
          path: test-artifacts/
          retention-days: 7
      - name: Cleanup test resources
        if: always()
        env:
          IBMCLOUD_API_KEY: ${{ secrets.IBMCLOUD_API_KEY }}
        run: |
          echo "🧹 Starting comprehensive cleanup..."
          kubectl delete pods -l test=e2e --all-namespaces --timeout=10m || true
          kubectl delete deployments -l test=e2e --all-namespaces --timeout=10m || true
          kubectl delete nodeclaims -l test=e2e --timeout=10m || true
          kubectl delete nodepools -l test=e2e --timeout=10m || true
          kubectl delete ibmnodeclasses -l test=e2e --timeout=10m || true
          # Strip finalizers from anything the deletes above left stuck.
          kubectl patch nodeclaims --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true
          kubectl patch nodepools --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true
          kubectl patch ibmnodeclasses --selector test=e2e --type='merge' -p='{"metadata":{"finalizers":[]}}' || true
          # Sweep orphaned IBM Cloud resources by tag/name convention.
          ibmcloud is instances --output json | \
            jq -r '.[] | select(.tags | index("karpenter-e2e")) | .id' | \
            xargs -I {} ibmcloud is instance-delete {} --force || true
          ibmcloud is virtual-network-interfaces --output json | \
            jq -r '.[] | select(.name | test("e2e-.*-vni")) | .id' | \
            xargs -I {} ibmcloud is virtual-network-interface-delete {} --force || true
          ibmcloud is volumes --output json | \
            jq -r '.[] | select(.name | test("e2e-.*-boot")) | .id' | \
            xargs -I {} ibmcloud is volume-delete {} --force || true
          echo "✅ Cleanup completed"
      - name: Restore original deployment
        if: always()
        run: |
          echo "🔄 Restoring original karpenter deployment..."
          kubectl rollout restart deployment/karpenter-karpenter-ibm -n karpenter
          kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=300s