---
# Workload scenario selection (either 'homogeneous' or 'heterogeneous')
workload_scenario: homogeneous

# Workload configurations
# NOTE(review): the nesting below was reconstructed from key order (args under
# container, replicas at workload level) — confirm against the consumer.
homogeneous_workload:
  namespace: homogeneous-workload
  container:
    image: quay.io/kako1/stress:22
    resources:
      cpu: '250m'
      memory_request: '256Mi'
      memory_limit: '512Mi'
    args: ['--cpu', '1', '--timeout', '3600', '--vm', '1', '--vm-bytes', '128M', '--io', '2']
  replicas: 40

heterogeneous_workload:
  namespace: heterogeneous-workload
  # One entry per workload size; each carries its own replica count,
  # resource values and stress arguments.
  cpu_intensive:
    - size: large
      replicas: 20
      cpu: '4000m'
      memory_request: '512Mi'
      memory_limit: '1Gi'
      args: ['--cpu', '4', '--timeout', '3600', '--vm', '2', '--vm-bytes', '256M', '--vm-hang', '1']
    - size: medium
      replicas: 30
      cpu: '2000m'
      memory_request: '256Mi'
      memory_limit: '512Mi'
      args: ['--cpu', '2', '--timeout', '3600', '--vm', '1', '--vm-bytes', '128M', '--io', '2']
    - size: small
      replicas: 40
      cpu: '1000m'
      memory_request: '128Mi'
      memory_limit: '256Mi'
      args: ['--cpu', '1', '--timeout', '3600', '--vm', '1', '--vm-bytes', '64M', '--vm-stride', '128', '--hdd', '1', '--hdd-bytes', '128M']

# Prometheus metrics configuration
# Each entry pairs a PromQL expression (query) with the name it is reported
# under (metricName).
prometheus_queries:
  - query: 'count(kube_node_info)'
    metricName: 'NodeCount'
  # NOTE(review): 'Percantage' is misspelled; left unchanged because consumers
  # may key on this name — confirm before renaming.
  - query: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
    metricName: 'NodeMemoryPercantage'
  - query: '100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) * 100)'
    metricName: 'CPUPercentageAllCores'
  # NOTE(review): this query is identical to the memory-usage query above and
  # does not measure latency — looks like a copy-paste error; confirm the
  # intended expression.
  - query: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
    metricName: 'NodeLatency'
  - query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0'
    metricName: 'API99thLatency'
  - query: 'histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type="create_container"}[5m])) by (instance, le)) > 0'
    metricName: 'ContainerStartupLatency'
  - query: 'count(kube_pod_status_phase{phase="Pending"})'
    metricName: 'PendingPods'
  - query: 'count(kube_pod_status_phase)'
    metricName: 'TotalPods'
  - query: 'sum(rate(node_cpu_seconds_total{mode!="idle"}[1m]))'
    metricName: 'TotalCPUUtilization'
  - query: 'sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / 1024 / 1024'
    metricName: 'TotalMemoryUtilization'

# Metrics collection configuration
metrics_collection:
  # Port-forward service configuration
  port_forward_wait_timeout: 120  # increased timeout for stability

  # Prometheus health check configuration
  health_check_retries: 30
  health_check_delay: 20  # increased delay between retries

  # Metrics query configuration
  query_retries: 30
  query_delay: 20  # increased delay between retries
  query_step: '60s'  # increased step to reduce load