# Default values for karpenter-ibm.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# Self-contained chart includes base Karpenter CRDs and IBM provider

# Replica count for the chart's workload.
# NOTE(review): presumably the controller Deployment's replica count; confirm
# against the templates.
replicaCount: 1

# Controller container image.
image:
  repository: ghcr.io/kubernetes-sigs/karpenter-provider-ibm-cloud/controller
  # Overrides the image tag whose default is the chart appVersion.
  tag: v0.4.0
  pullPolicy: IfNotPresent
  # Optional image digest (commented out by default).
  # digest: "sha256:98e1441aa5d15f6e23e58a466c2526942e853d49281eefffa64e22361f7a19ad"

# Logging configuration
# All three values are quoted strings, including the numeric verbosity.
# NOTE(review): the defaults are verbose (debug / verbosity 4 of 0-5); confirm
# this is intended for production installs.
logLevel: "debug"           # Options: debug, info, error
controllerVerbosity: "4"    # Controller manager verbosity (0-5)
controllerLogLevel: "debug" # Controller manager log level

# Image pull secrets; empty list by default.
# NOTE(review): presumably a list of `name: <secret>` entries as in the
# standard Helm scaffold; confirm against the templates.
imagePullSecrets: []
# Name overrides; empty by default.
# NOTE(review): presumably feed the chart's name / fullname templates; confirm
# against the chart helpers.
nameOverride: ""
fullnameOverride: ""

# Service account used by the chart's workload.
serviceAccount:
  # Specifies whether a service account should be created
  create: true
  # Annotations to add to the service account
  annotations: {}
  # The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template
  name: ""

# Extra annotations for the pod; none by default.
podAnnotations: {}

# Pod-level security context; empty by default.
# The commented line below is an example value.
podSecurityContext: {}
  # fsGroup: 2000

# Container-level security context; empty by default.
# The commented lines below are example values.
securityContext: {}
  # capabilities:
  #   drop:
  #   - ALL
  # readOnlyRootFilesystem: true
  # runAsNonRoot: true
  # runAsUser: 1000

# Service settings.
service:
  type: ClusterIP
  # NOTE(review): controller.metrics.port is 8080 and
  # controller.healthProbe.port is 8081; confirm which container port this
  # Service port maps to.
  port: 80

# Container resource requests/limits; empty by default (nothing is set).
# Uncomment and adjust the example below to set them.
resources: {}
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

# Pod scheduling constraints; all empty by default.
nodeSelector: {}

tolerations: []

affinity: {}

# Pod Disruption Budget configuration
podDisruptionBudget:
  # The name of the PDB. If not set, it will use the release name
  name: ""
  # Maximum number of pods that can be unavailable
  # NOTE(review): with the default replicaCount of 1, maxUnavailable: 1 allows
  # the only replica to be evicted, so this budget does not block voluntary
  # disruptions. Confirm this is intended.
  maxUnavailable: 1

# IBM Cloud credentials configuration
# NOTE(review): the API keys below are secrets. Supply them at install time
# (for example via --set or a private values file) rather than committing them.
credentials:
  # Required: IBM Cloud API key for general operations
  ibmApiKey: ""
  # Required: IBM Cloud region (e.g., us-south, eu-de)
  region: ""
  # Required: IBM Cloud zone (e.g., us-south-1, eu-de-2)
  zone: ""
  # Required: IBM Cloud resource group ID
  resourceGroupId: ""
  # Optional: VPC API endpoint URL
  # If not provided, defaults to https://{region}.iaas.cloud.ibm.com/v1
  vpcUrl: ""
  # Optional: VPC authentication type
  vpcAuthType: "iam"
  # Required: VPC API key for VPC operations
  vpcApiKey: ""

# IKS cluster configuration (for IKS integration)
# Optional: IKS cluster ID for IKS API integration.
# Required when customResources.enabled is true and customResources.mode is "iks".
iksClusterID: ""
# Optional: cluster name for identification
clusterName: ""
# Bootstrap mode. Options: "auto", "cloud-init", "iks-api"
bootstrapMode: "auto"

# Custom Resource Configuration
customResources:
  # Enable automatic creation of NodePools and IBMNodeClasses.
  # Disabled by default. When enabled, you must provide configuration based on
  # your mode:
  # - For vpc mode: customResources.nodeClass.vpc.vpcId is required
  # - For iks mode: iksClusterID is required
  enabled: false

  # Mode: "iks" for IBM Kubernetes Service, "vpc" for self-managed VPC clusters
  mode: "vpc"  # Options: "iks", "vpc"

  # Global settings for all node pools
  global:
    # Default instance types across all pools
    instanceTypes:
      - "bx2-2x8"
      - "bx2-4x16"
      - "bx2-8x32"

    # Default capacity type
    capacityType: "on-demand"  # Options: "on-demand", "spot"

    # Default architecture
    architecture: "amd64"  # Options: "amd64", "arm64"

    # Default disruption settings
    disruption:
      # Options (Karpenter v1 NodePool API): "WhenEmpty", "WhenEmptyOrUnderutilized".
      # The older v1beta1 name "WhenUnderutilized" is not accepted by the v1 API,
      # and under v1beta1 it could not be combined with consolidateAfter.
      consolidationPolicy: WhenEmptyOrUnderutilized
      consolidateAfter: 30s
      expireAfter: 720h  # 30 days

    # Default resource limits
    limits:
      cpu: 1000
      memory: 1000Gi

    # Default taints for all nodes; none by default.
    # The commented entry below is an example.
    taints: []
    # - key: "example.com/special-nodes"
    #   value: "true"
    #   effect: "NoSchedule"

  # NodeClass configuration (IBM-specific settings)
  nodeClass:
    # Create a default IBMNodeClass
    enabled: true
    name: "default"

    # VPC-specific settings (when mode=vpc)
    vpc:
      # VPC ID (required for VPC mode)
      vpcId: ""

      # Subnet selection strategy
      subnetSelection:
        # Strategy: "all", "zone-balanced", "cost-optimized", "availability-preferred"
        strategy: "zone-balanced"

        # Specific subnet IDs (optional - overrides strategy)
        subnetIds: []
        # - "subnet-123"
        # - "subnet-456"

        # Availability zones to use (optional)
        zones: []
        # - "us-south-1"
        # - "us-south-2"

      # Security group settings
      securityGroups:
        # Selection strategy: "auto", "manual"
        strategy: "auto"

        # Specific security group IDs (when strategy=manual)
        groupIds: []
        # - "sg-123"
        # - "sg-456"

      # Boot volume configuration
      bootVolume:
        # Volume type: "general-purpose", "5iops-tier", "10iops-tier", "custom"
        volumeType: "general-purpose"
        size: 100  # GB
        encrypted: true

        # Custom IOPS (when volumeType=custom); explicitly null (unset) by default
        iops: null

      # Network interface settings
      networkInterface:
        # Allow source IP spoofing on the interface; disabled by default.
        # NOTE(review): the original comment described this as "Enable IP
        # forwarding"; presumably it maps to the IBM VPC allow_ip_spoofing
        # interface attribute. Confirm against the templates.
        allowIpSpoofing: false

        # Primary interface subnet strategy
        primaryInterfaceSubnetStrategy: "zone-balanced"

    # IKS-specific settings (when mode=iks)
    iks:
      # Worker pool template settings
      workerPoolTemplate:
        # Machine type for IKS worker pools.
        # Note the dotted form ("bx2.4x16"), unlike the hyphenated instance
        # type names used elsewhere in this file ("bx2-4x16").
        machineType: "bx2.4x16"

        # Disk encryption
        diskEncryption: true

        # Operating system
        # NOTE(review): confirm this OS identifier is still offered for the
        # target IKS cluster version.
        operatingSystem: "UBUNTU_20_64"

    # Common settings for both VPC and IKS
    common:
      # Image selection
      image:
        # Image selection strategy: "latest", "specific", "family"
        strategy: "latest"

        # Specific image ID (when strategy=specific)
        imageId: ""

        # Image family (when strategy=family)
        family: "ubuntu-minimal"

        # Operating system filter
        operatingSystem: "ubuntu"

        # Architecture filter.
        # Same default as customResources.global.architecture; keep the two
        # consistent when changing either.
        architecture: "amd64"

      # User data / cloud-init configuration; empty by default
      userData: ""

      # Instance metadata service
      metadataService:
        enabled: true
        hopLimit: 1

      # Resource tags
      # NOTE(review): the default Environment tag is "development"; override it
      # for other environments.
      tags:
        Environment: "development"
        ManagedBy: "karpenter"
        # Add custom tags here

  # NodePool configurations - supports multiple pools.
  # Each list entry carries its own `enabled` flag; only "general-purpose" is
  # enabled by default.
  nodePools:
    # Default general-purpose pool
    - name: "general-purpose"
      enabled: true

      # Pool-specific requirements
      requirements:
        # Instance types for this pool
        instanceTypes:
          - "bx2-2x8"
          - "bx2-4x16"

        # Capacity type
        capacityType: "on-demand"

        # Architecture
        architecture: "amd64"

        # Availability zones; empty by default
        zones: []
        # - "us-south-1"
        # - "us-south-2"

      # Resource limits for this pool
      limits:
        cpu: 100
        memory: 100Gi

      # Disruption settings for this pool
      disruption:
        consolidationPolicy: WhenEmpty
        consolidateAfter: 30s
        expireAfter: 720h

      # Taints specific to this pool; none by default
      taints: []

      # Labels specific to this pool
      labels:
        "nodepool": "general-purpose"
        "workload-type": "general"

      # NodeClass reference (defaults to global nodeClass.name)
      nodeClassRef: ""

    # Spot instances pool (disabled by default)
    - name: "spot-instances"
      enabled: false

      # Pool-specific requirements
      requirements:
        instanceTypes:
          - "bx2-4x16"
          - "bx2-8x32"
        capacityType: "spot"
        architecture: "amd64"

      # Resource limits for this pool
      limits:
        cpu: 200
        memory: 200Gi

      # Disruption settings for this pool
      disruption:
        # Options (Karpenter v1 NodePool API): "WhenEmpty", "WhenEmptyOrUnderutilized".
        # The older v1beta1 name "WhenUnderutilized" is not accepted by the v1 API,
        # and under v1beta1 it could not be combined with consolidateAfter.
        consolidationPolicy: WhenEmptyOrUnderutilized
        consolidateAfter: 10s
        expireAfter: 12h  # Shorter expiry for spot

      # Taints specific to this pool
      taints:
        # NOTE(review): "node.kubernetes.io/instance-type" is the well-known
        # instance-type label key; confirm that reusing it as a taint key with
        # value "spot" is intended.
        - key: "node.kubernetes.io/instance-type"
          value: "spot"
          effect: "NoSchedule"

      # Labels specific to this pool
      labels:
        "nodepool": "spot-instances"
        "workload-type": "batch"
        "cost-optimization": "enabled"

    # High-memory pool for data processing (disabled by default)
    - name: "high-memory"
      enabled: false

      # Pool-specific requirements
      requirements:
        instanceTypes:
          - "mx2-8x64"
          - "mx2-16x128"
        capacityType: "on-demand"
        architecture: "amd64"

      # Resource limits for this pool
      limits:
        cpu: 50
        memory: 500Gi

      # Disruption settings for this pool
      disruption:
        consolidationPolicy: WhenEmpty
        consolidateAfter: 60s
        expireAfter: 168h  # 7 days

      # Taints specific to this pool; workloads need a matching toleration
      taints:
        - key: "workload-type"
          value: "high-memory"
          effect: "NoSchedule"

      # Labels specific to this pool
      labels:
        "nodepool": "high-memory"
        "workload-type": "memory-intensive"

    # GPU pool for AI/ML workloads (disabled by default)
    - name: "gpu-pool"
      enabled: false

      # Pool-specific requirements
      requirements:
        instanceTypes:
          - "gx2-8x64x1v100"
          - "gx2-16x128x2v100"
        capacityType: "on-demand"
        architecture: "amd64"

      # Resource limits for this pool
      # NOTE(review): if the profile names encode vCPU x GiB (presumably
      # "16x128" = 16 vCPU / 128 GiB), the 100Gi memory limit is smaller than a
      # single gx2-16x128x2v100 node and allows only one gx2-8x64x1v100 node.
      # Confirm these limits are intended.
      limits:
        cpu: 20
        memory: 100Gi

      # Disruption settings for this pool
      disruption:
        consolidationPolicy: WhenEmpty
        consolidateAfter: 300s  # Longer consolidation for expensive instances
        expireAfter: 24h

      # Taints specific to this pool; workloads need tolerations for both
      taints:
        - key: "nvidia.com/gpu"
          value: "true"
          effect: "NoSchedule"
        - key: "workload-type"
          value: "gpu"
          effect: "NoSchedule"

      # Labels specific to this pool
      labels:
        "nodepool": "gpu-pool"
        "workload-type": "gpu"
        "accelerator": "nvidia-v100"

# Circuit Breaker configuration
# Protects against API rate limits and cascading failures
circuitBreaker:
  # Enable circuit breaker (set to false to disable all protections)
  enabled: true

  # Configuration values - supports time.Duration format (e.g., "5m", "30s")
  # These individual values are overridden by `preset` below unless the preset
  # is "custom", so with the default preset ("balanced") they are not used.
  config:
    # Number of consecutive failures before opening the circuit
    failureThreshold: 3

    # Time window for counting failures
    failureWindow: "5m"

    # How long to wait before attempting recovery (half-open state)
    recoveryTimeout: "15m"

    # Maximum test requests allowed in half-open state
    halfOpenMaxRequests: 2

    # Maximum instances created per minute (rate limiting)
    # For development/demo: increase to 10-20
    # For production: keep at 2-5 based on IBM Cloud quotas
    # NOTE(review): the default of 10 is in the development/demo range above
    # and differs from the "balanced" preset's value of 5; confirm intended.
    rateLimitPerMinute: 10

    # Maximum instances being created simultaneously
    maxConcurrentInstances: 5

  # Preset configurations (overrides individual config values)
  # Options: "conservative", "balanced", "aggressive", "demo", "custom"
  preset: "balanced"

  # Preset definitions (only used when preset != "custom")
  # Each preset sets the same six keys as `circuitBreaker.config`.
  presets:
    # Lowest failure threshold and rate limit, longest recovery timeout
    conservative:
      failureThreshold: 2
      failureWindow: "3m"
      recoveryTimeout: "20m"
      halfOpenMaxRequests: 1
      rateLimitPerMinute: 2
      maxConcurrentInstances: 3

    # Default preset (see `circuitBreaker.preset`)
    balanced:
      failureThreshold: 3
      failureWindow: "5m"
      recoveryTimeout: "15m"
      halfOpenMaxRequests: 2
      rateLimitPerMinute: 5
      maxConcurrentInstances: 5

    # Higher failure threshold and rate limit, shorter recovery timeout
    aggressive:
      failureThreshold: 5
      failureWindow: "10m"
      recoveryTimeout: "5m"
      halfOpenMaxRequests: 3
      rateLimitPerMinute: 10
      maxConcurrentInstances: 10

    # Highest failure threshold and rate limit, shortest recovery timeout
    demo:
      failureThreshold: 10
      failureWindow: "30m"
      recoveryTimeout: "1m"
      halfOpenMaxRequests: 5
      rateLimitPerMinute: 20
      maxConcurrentInstances: 15

# Additional CA certificates for cross-VPC kubelet authentication
additionalCAs:
  enabled: false
  # Map of CA certificates (key: filename, value: base64-encoded certificate)
  # Example (the values are the base64 encoding of a PEM file, which begins
  # with "-----BEGIN CERTIFICATE-----"):
  # certs:
  #   rke2-ca.crt: "LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0t..."
  #   proxy-ca.crt: "LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0t..."
  certs: {}

# Controller configuration
controller:
  # Orphan cleanup configuration
  orphanCleanup:
    # Enable cleanup of orphaned instances (instances without NodeClaims)
    # This prevents resource leaks when NodeClaims are deleted before instances are cleaned up
    enabled: true

  # Port numbers for the metrics and health-probe endpoints
  metrics:
    port: 8080
  healthProbe:
    port: 8081
  # Liveness and readiness probes. Both use HTTP GET against the port named
  # "health", with a 5-second initial delay and a 1-second timeout.
  # NOTE(review): presumably "health" is the container port name bound to
  # healthProbe.port; confirm in the Deployment template.
  livenessProbe:
    httpGet:
      path: /healthz
      port: health
    initialDelaySeconds: 5
    timeoutSeconds: 1
  readinessProbe:
    httpGet:
      path: /readyz
      port: health
    initialDelaySeconds: 5
    timeoutSeconds: 1

# Metrics configuration
metrics:
  # Enable ServiceMonitor creation for Prometheus scraping (disabled by default)
  serviceMonitor:
    enabled: false
    # Extra labels for the ServiceMonitor; none by default
    additionalLabels: {}
    # Scraping interval for metrics collection
    interval: 30s
    # Timeout for each scrape
    scrapeTimeout: 10s
    # Relabeling configuration for the ServiceMonitor
    relabelings: []
    # Metric relabeling configuration for the ServiceMonitor
    metricRelabelings: []
    # Additional endpoint configuration
    endpointConfig: {}

  # Enable PrometheusRule creation for alerting (disabled by default)
  prometheusRule:
    enabled: false
    # Extra labels for the PrometheusRule; none by default
    additionalLabels: {}
    # Rule evaluation interval
    interval: 30s
    # URL for runbook documentation
    runbookUrl: "https://github.com/kubernetes-sigs/karpenter-provider-ibm-cloud/blob/main/docs/troubleshooting.md"

    # Alert configuration with customizable thresholds and timing.
    # Each alert has a `for` duration and a `severity`; most also have a
    # `threshold`.
    # NOTE(review): per the inline comments, threshold units differ by alert:
    # fractions (0.1 = 10%), whole percentages (80 = 80%), seconds, and MB.
    # Confirm each against the rule expressions before changing a value.
    alerts:
      # Controller availability alerts
      controllerDown:
        for: 5m
        severity: critical

      # Error rate alerts
      highErrorRate:
        threshold: 0.1  # 10% error rate
        for: 10m
        severity: warning

      # Performance alerts
      slowReconciliation:
        threshold: 60  # 60 seconds
        for: 15m
        severity: warning

      slowNodeProvisioning:
        threshold: 600  # 10 minutes
        for: 15m
        severity: warning

      # IBM Cloud API alerts
      apiErrors:
        threshold: 0.05  # 5% error rate
        for: 5m
        severity: warning

      apiRateLimit:
        threshold: 0.01  # 1% rate limit occurrence
        for: 2m
        severity: warning

      # Node provisioning alerts (CRITICAL - impacts workload scheduling)
      nodeProvisioningFailures:
        threshold: 0.01  # 1% failure rate
        for: 5m
        severity: critical

      # Uses `podThreshold` rather than `threshold`
      noNodeProvisioning:
        podThreshold: 0  # Any unschedulable pods
        for: 10m
        severity: critical

      # Cache performance alerts
      lowCacheHitRate:
        threshold: 80  # 80% hit rate
        for: 30m
        severity: info

      # Resource utilization alerts
      highMemoryUsage:
        threshold: 500  # 500MB
        for: 30m
        severity: warning

      highCPUUsage:
        threshold: 80  # 80% CPU usage
        for: 30m
        severity: warning

    # Additional custom rule groups; none by default
    additionalRuleGroups: []