# ---------------------------------------------------------------------------
# IKS experiment: metrics plumbing and workload deployment.
#
# Prepares the metrics profile and per-cluster endpoint files, installs and
# starts one port-forward systemd unit per cluster (karpenter, cas), waits for
# an initial collection window and then deploys the heterogeneous workload
# (CPU, memory and IO bound) onto the cas cluster.
# ---------------------------------------------------------------------------

- name: Copy metrics profile file
  ansible.builtin.copy:
    src: "{{ playbook_dir }}/files/metrics-profiles/metrics-aggregated.yaml"
    dest: "metrics-aggregated.yaml"

# One endpoints file per cluster; endpoint and token are resolved from the
# PROM_ENDPOINT_IKS_<CLUSTER> / PROM_TOKEN_IKS_<CLUSTER> variables.
- name: Template metrics-endpoints.yaml for IKS clusters
  ansible.builtin.template:
    src: templates/metrics-endpoints.yaml.j2
    dest: "metrics-endpoints-iks-{{ item }}.yaml"
  vars:
    prom_endpoint: "{{ lookup('vars', 'PROM_ENDPOINT_IKS_' ~ (item | upper)) }}"
    prom_metrics_profile_path: "metrics-aggregated.yaml"
    prom_token: "{{ lookup('vars', 'PROM_TOKEN_IKS_' ~ (item | upper)) }}"
    env: "iks"
    uuid: "{{ tag_uuid }}"
    cluster: "{{ item }}"
  loop:
    - karpenter
    - cas

# Each cluster gets its own unit file; the kubeconfig path comes from the
# KUBECONFIG_IKS_<CLUSTER> variable.
- name: deploy systemd service for port-forwarding
  become: true
  ansible.builtin.template:
    src: templates/port-forward@.service.j2
    dest: "/etc/systemd/system/port-forward-{{ item.cluster }}-iks.service"
    mode: '0644'
  loop:
    - { cluster: 'karpenter', port: 9095 }
    - { cluster: 'cas', port: 9096 }
  vars:
    kubeconfig_path: "{{ lookup('vars', 'KUBECONFIG_IKS_' ~ (item.cluster | upper)) }}"

- name: start port-forward service for each cluster
  become: true
  ansible.builtin.systemd:
    name: "port-forward-{{ item }}-iks"
    state: started
    enabled: true
    # Pick up the unit files written by the previous task.
    daemon_reload: true
  loop:
    - karpenter
    - cas

- name: wait 10 minutes for initial metrics collection
  ansible.builtin.wait_for:
    timeout: 600

# Homogeneous workload tasks for karpenter cluster (currently disabled)
# - name: Create namespace for homogeneous workload on karpenter
#   kubernetes.core.k8s:
#     name: homogeneous-workload
#     api_version: v1
#     kind: Namespace
#     state: present
#   environment:
#     KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_KARPENTER') }}"

# - name: Deploy homogeneous workload using medium CPU template on karpenter
#   kubernetes.core.k8s:
#     definition:
#       apiVersion: apps/v1
#       kind: Deployment
#       metadata:
#         name: homogeneous-workload
#         namespace: homogeneous-workload
#       spec:
#         replicas: 40
#         selector:
#           matchLabels:
#             app: homogeneous-workload
#         template:
#           metadata:
#             labels:
#               app: homogeneous-workload
#           spec:
#             containers:
#               - name: cpu-intensive-container
#                 image: quay.io/kako1/stress:22
#                 resources:
#                   requests:
#                     cpu: "2000m"
#                     memory: "256Mi"
#                   limits:
#                     cpu: "2000m"
#                     memory: "512Mi"
#                 command: ["stress"]
#                 args: ["--cpu", "2", "--timeout", "3600", "--vm", "1", "--vm-bytes", "128M", "--io", "2"]
#     state: present
#   environment:
#     KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_KARPENTER') }}"

# Heterogeneous workload tasks for cas cluster
- name: Create namespace for heterogeneous workload on cas
  kubernetes.core.k8s:
    name: heterogeneous-workload
    api_version: v1
    kind: Namespace
    state: present
  environment:
    KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_CAS') }}"

# Apply CPU intensive deployments
# The manifest is passed as YAML text (not a dict) so that the per-size
# `item.replicas` value is rendered into the document and parsed by the module
# as an integer. Previously the replica count was hard-coded to 40 and the
# `replicas` field of every loop item was silently ignored.
- name: Apply CPU intensive deployments on cas
  kubernetes.core.k8s:
    definition: |
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: "cpu-intensive-{{ item.size }}"
        namespace: heterogeneous-workload
      spec:
        replicas: {{ item.replicas }}
        selector:
          matchLabels:
            app: "cpu-intensive-{{ item.size }}"
        template:
          metadata:
            labels:
              app: "cpu-intensive-{{ item.size }}"
          spec:
            containers:
              - name: cpu-intensive-container
                image: quay.io/kako1/stress:22
                resources:
                  requests:
                    cpu: "{{ item.cpu }}"
                    memory: "{{ item.memory_request }}"
                  limits:
                    cpu: "{{ item.cpu }}"
                    memory: "{{ item.memory_limit }}"
                command: ["stress"]
                args: {{ item.args | to_json }}
    state: present
  loop:
    - size: 'large'
      replicas: 20
      cpu: '4000m'
      memory_request: '512Mi'
      memory_limit: '1Gi'
      args: ['--cpu', '4', '--timeout', '3600', '--vm', '2', '--vm-bytes', '256M', '--vm-hang', '1']
    - size: 'medium'
      replicas: 30
      cpu: '2000m'
      memory_request: '256Mi'
      memory_limit: '512Mi'
      args: ['--cpu', '2', '--timeout', '3600', '--vm', '1', '--vm-bytes', '128M', '--io', '2']
    - size: 'small'
      replicas: 40
      cpu: '1000m'
      memory_request: '128Mi'
      memory_limit: '256Mi'
      args: ['--cpu', '1', '--timeout', '3600', '--vm', '1', '--vm-bytes', '64M', '--vm-stride', '128', '--hdd', '1', '--hdd-bytes', '128M']
  environment:
    KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_CAS') }}"

# Apply Memory intensive jobs
# NOTE(review): each job starts two --vm workers of `item.bytes` each while the
# container memory limit equals a single `item.bytes`; confirm that this
# over-commit is intentional.
- name: Apply Memory intensive jobs on cas
  kubernetes.core.k8s:
    definition:
      apiVersion: batch/v1
      kind: Job
      metadata:
        name: "memory-heavy-{{ item.size }}"
        namespace: heterogeneous-workload
      spec:
        template:
          spec:
            containers:
              - name: memory-heavy-container
                image: quay.io/kako1/stress:22
                resources:
                  requests:
                    memory: "{{ item.request }}"
                  limits:
                    memory: "{{ item.limit }}"
                command: ["stress"]
                args: ["--vm", "2", "--vm-bytes", "{{ item.bytes }}", "--timeout", "600"]
            restartPolicy: Never
        backoffLimit: 4
    state: present
  loop:
    - { size: 'large', request: '2Gi', limit: '32Gi', bytes: '32G' }
    - { size: 'medium', request: '2Gi', limit: '16Gi', bytes: '16G' }
    - { size: 'small', request: '2Gi', limit: '4Gi', bytes: '4G' }
  environment:
    KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_CAS') }}"

# Apply IO intensive deployments
# Manifest passed as YAML text so `item.replicas` is honoured (it used to be
# overridden by a hard-coded 40) and arrives as an integer.
- name: Apply IO intensive deployments on cas
  kubernetes.core.k8s:
    definition: |
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: "io-intensive-{{ item.size }}"
        namespace: heterogeneous-workload
      spec:
        replicas: {{ item.replicas }}
        selector:
          matchLabels:
            app: "io-intensive-{{ item.size }}"
        template:
          metadata:
            labels:
              app: "io-intensive-{{ item.size }}"
          spec:
            containers:
              - name: io-bound-container
                image: busybox:latest
                resources:
                  requests:
                    cpu: "{{ item.cpu }}"
                    memory: "{{ item.memory_request }}"
                  limits:
                    cpu: "{{ item.cpu }}"
                    memory: "{{ item.memory_limit }}"
                volumeMounts:
                  - name: test-volume
                    mountPath: /data
                command: ["/bin/sh"]
                args: ["-c", "dd if=/dev/zero of=/data/testfile bs=1M count=1024 && sleep 600"]
            volumes:
              - name: test-volume
                emptyDir: {}
    state: present
  loop:
    - { size: 'large', replicas: 20, cpu: '2000m', memory_request: '512Mi', memory_limit: '1Gi' }
    - { size: 'medium', replicas: 30, cpu: '1000m', memory_request: '256Mi', memory_limit: '512Mi' }
    - { size: 'small', replicas: 40, cpu: '500m', memory_request: '128Mi', memory_limit: '256Mi' }
  environment:
    KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_CAS') }}"

# Deploy iperf server for network tests
# Single iperf3 server that the network-intensive clients connect to.
- name: Deploy iperf server on cas
  kubernetes.core.k8s:
    definition:
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: iperf-server
        namespace: heterogeneous-workload
      spec:
        replicas: 1
        selector:
          matchLabels:
            app: iperf-server
        template:
          metadata:
            labels:
              app: iperf-server
          spec:
            containers:
              - name: iperf-server
                image: networkstatic/iperf3:latest
                command: ["iperf3"]
                args: ["-s"]
                ports:
                  - containerPort: 5201
    state: present
  environment:
    KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_CAS') }}"

# Exposes the server under the DNS name `iperf-server` used by the clients.
- name: Create iperf server service on cas
  kubernetes.core.k8s:
    definition:
      apiVersion: v1
      kind: Service
      metadata:
        name: iperf-server
        namespace: heterogeneous-workload
      spec:
        selector:
          app: iperf-server
        ports:
          - port: 5201
            targetPort: 5201
    state: present
  environment:
    KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_CAS') }}"

# Apply Network intensive deployments
# The manifest is passed as YAML text (not a dict) so that the per-size
# `item.replicas` value is rendered into the document and parsed by the module
# as an integer. Previously the replica count was hard-coded to 40 and the
# `replicas` field of every loop item was silently ignored.
- name: Apply Network intensive deployments on cas
  kubernetes.core.k8s:
    definition: |
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: "network-intensive-{{ item.size }}"
        namespace: heterogeneous-workload
      spec:
        replicas: {{ item.replicas }}
        selector:
          matchLabels:
            app: "network-intensive-{{ item.size }}"
        template:
          metadata:
            labels:
              app: "network-intensive-{{ item.size }}"
          spec:
            containers:
              - name: network-intensive-container
                image: networkstatic/iperf3:latest
                resources:
                  requests:
                    cpu: "{{ item.cpu }}"
                    memory: "{{ item.memory_request }}"
                  limits:
                    cpu: "{{ item.cpu }}"
                    memory: "{{ item.memory_limit }}"
                command: ["iperf3"]
                args: ["-c", "iperf-server", "-t", "600"]
    state: present
  loop:
    - { size: 'large', replicas: 20, cpu: '1000m', memory_request: '256Mi', memory_limit: '512Mi' }
    - { size: 'medium', replicas: 30, cpu: '500m', memory_request: '128Mi', memory_limit: '256Mi' }
    - { size: 'small', replicas: 40, cpu: '250m', memory_request: '64Mi', memory_limit: '128Mi' }
  environment:
    KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_CAS') }}"

- name: wait 10 minutes for workloads to stabilize
  ansible.builtin.wait_for:
    timeout: 600

# NOTE(review): no `become` is set on this task although the systemd tasks
# request it explicitly; confirm privilege escalation is configured elsewhere.
- name: ensure jq is installed
  ansible.builtin.package:
    name: jq
    state: present

# One result directory per cluster under experiment-data/iks/.
- name: create output directories
  ansible.builtin.file:
    path: "{{ playbook_dir }}/experiment-data/iks/{{ item.cluster }}"
    state: directory
  loop:
    - { cluster: 'karpenter', port: 9095 }
    - { cluster: 'cas', port: 9096 }

# PromQL expressions to export. `metricName` becomes the output file name
# (<metricName>.json), so the names are kept exactly as they were.
- name: Set prometheus queries
  ansible.builtin.set_fact:
    prometheus_queries:
      - query: 'count(kube_node_info)'
        metricName: 'NodeCount'
      - query: 'avg(node_load1) by (instance)'
        metricName: 'CPUByInstance'
      - query: '100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) * 100)'
        metricName: 'CPUPercentageAllCores'
      # Spelling of the name is preserved because it is used as a file name.
      - query: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
        metricName: 'NodeMemoryPercantage'
      - query: 'rate(kube_pod_start_time_seconds{condition="Running"}[5m]) - rate(kube_pod_start_time_seconds{condition="Pending"}[5m])'
        metricName: 'PodLatency'
      # NOTE(review): this expression is identical to NodeMemoryPercantage
      # above; it looks like a placeholder rather than a latency metric.
      - query: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
        metricName: 'NodeLatency'
      - query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0'
        metricName: 'API99thLatency'
      - query: 'sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0'
        metricName: 'APIRequestRate'
      - query: 'sum(apiserver_current_inflight_requests{}) by (request_kind) > 0'
        metricName: 'APIInflightRequests'
      - query: '(sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0'
        metricName: 'containerMemory-Masters'
      - query: '(sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|.*apiserver|authentication|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0'
        metricName: 'containerCPU-Masters'
      - query: '(sum(irate(container_cpu_usage_seconds_total{pod!="",container="prometheus",namespace="openshift-monitoring"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0'
        metricName: 'containerCPU-Prometheus'
      - query: '(avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"}[2m]) * 100 and on (node) kube_node_role{role="worker"}) by (namespace, container)) > 0'
        metricName: 'containerCPU-AggregatedWorkers'
      - query: '(avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"}[2m]) * 100 and on (node) kube_node_role{role="infra"}) by (namespace, container)) > 0'
        metricName: 'containerCPU-AggregatedInfra'
      - query: '(sum(container_memory_rss{pod!="",namespace="openshift-monitoring",name!="",container="prometheus"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0'
        metricName: 'containerMemory-Prometheus'
      - query: 'avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"} and on (node) kube_node_role{role="worker"}) by (container, namespace)'
        metricName: 'containerMemory-AggregatedWorkers'
      - query: 'avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"} and on (node) kube_node_role{role="infra"}) by (container, namespace)'
        metricName: 'containerMemory-AggregatedInfra'
      - query: '(sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0'
        metricName: 'nodeCPU-Masters'
      - query: '(avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0'
        metricName: 'nodeCPU-AggregatedWorkers'
      - query: '(avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0'
        metricName: 'nodeCPU-AggregatedInfra'
      - query: 'avg(node_memory_MemAvailable_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'nodeMemoryAvailable-Masters'
      - query: 'avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryAvailable-AggregatedWorkers'
      - query: 'avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryAvailable-AggregatedInfra'
      # Stray trailing ")" removed; the expression was not valid PromQL.
      - query: 'avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'nodeMemoryActive-Masters'
      - query: 'avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryActive-AggregatedWorkers'
      - query: 'avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryActive-AggregatedInfra'
      # Stray trailing ")" removed.
      - query: 'avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'nodeMemoryCached+nodeMemoryBuffers-Masters'
      - query: 'avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryCached+nodeMemoryBuffers-AggregatedWorkers'
      - query: 'avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryCached+nodeMemoryBuffers-AggregatedInfra'
      # Network queries: the Masters variants had a stray trailing ")"; the
      # Aggregated variants had a truncated capture group "(.+" plus an
      # unbalanced ")". All six now use "(.+)" with balanced parentheses.
      - query: 'irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'rxNetworkBytes-Masters'
      - query: 'avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'rxNetworkBytes-AggregatedWorkers'
      - query: 'avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'rxNetworkBytes-AggregatedInfra'
      - query: 'irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'txNetworkBytes-Masters'
      - query: 'avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'txNetworkBytes-AggregatedWorkers'
      - query: 'avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'txNetworkBytes-AggregatedInfra'

- name: set current epoch time
  ansible.builtin.set_fact:
    current_epoch: "{{ lookup('pipe', 'date +%s') }}"

# Query window: the last two hours (7200 s) up to now, in UTC.
- name: set start and end time for prometheus query
  ansible.builtin.set_fact:
    start_time: "{{ lookup('pipe', 'date --utc --date=\"@$(($(date +%s) - 7200))\" +%Y-%m-%dT%H:%M:%SZ') }}"
    end_time: "{{ lookup('pipe', 'date --utc +%Y-%m-%dT%H:%M:%SZ') }}"

# Runs every query against every cluster through the local port-forward.
# Each loop item is a pair: item.0 is the query entry, item.1 the cluster.
- name: fetch range metrics from prometheus for each cluster and query
  ansible.builtin.command:
    cmd: "curl -s 'http://127.0.0.1:{{ cluster_info.port }}/api/v1/query_range?query={{ prom_query.query | urlencode }}&start={{ start_time }}&end={{ end_time }}&step=30s'"
  register: range_query_response
  # Read-only HTTP GET; never report the host as changed.
  changed_when: false
  loop: "{{ prometheus_queries | product(cluster_list) | list }}"
  loop_control:
    label: "{{ item.1.cluster }} - {{ item.0.metricName }}"
  vars:
    prom_query: "{{ item.0 }}"
    cluster_info: "{{ item.1 }}"
    cluster_list:
      - { cluster: 'karpenter', port: 9095 }
      - { cluster: 'cas', port: 9096 }

# Writes each response body to experiment-data/iks/<cluster>/<metricName>.json.
- name: save range query results to json files
  ansible.builtin.copy:
    content: "{{ item.stdout }}"
    dest: "{{ playbook_dir }}/experiment-data/iks/{{ item.item.1.cluster }}/{{ item.item.0.metricName }}.json"
  loop: "{{ range_query_response.results }}"
  loop_control:
    label: "{{ item.item.1.cluster }} - {{ item.item.0.metricName }}"

- name: stop and disable port-forward service for each cluster
  become: true
  ansible.builtin.systemd:
    name: "port-forward-{{ item }}-iks"
    state: stopped
    enabled: false
  loop:
    - karpenter
    - cas