---
# Tasks: install kube-burner, stand up Prometheus port-forwards for two IKS
# clusters (karpenter / cas), then pull query_range metrics from each and
# persist them as JSON under experiment-data/iks/<cluster>/.

# NOTE(review): original URL hardcoded v1.10.9/x86_64 while dest was templated
# from kube_burner_version/kube_burner_arch — templated both so the filename
# can't lie about the contents. Assumes kube_burner_version == "1.10.9" style
# and kube_burner_arch == "x86_64" — TODO confirm against group_vars.
- name: Download kube-burner archive
  become: true
  ansible.builtin.get_url:
    url: "https://github.com/kube-burner/kube-burner/releases/download/v{{ kube_burner_version }}/kube-burner-V{{ kube_burner_version }}-linux-{{ kube_burner_arch }}.tar.gz"
    dest: "/tmp/kube-burner-{{ kube_burner_version }}-{{ kube_burner_arch }}.tar.gz"
    mode: '0644'
    force: true

- name: Unarchive kube-burner binary
  become: true
  ansible.builtin.unarchive:
    src: "/tmp/kube-burner-{{ kube_burner_version }}-{{ kube_burner_arch }}.tar.gz"
    dest: /usr/local/bin/
    remote_src: true
    mode: '0755'

# Relative dest: lands in the remote user's home directory (kube-burner is
# later invoked from there).
- name: Prepare kube-burner configuration files
  ansible.builtin.copy:
    src: files/kube-burner-heterogenous.yaml
    dest: kube-burner-config.yaml

- name: Copy metrics profile file
  ansible.builtin.copy:
    src: "{{ playbook_dir }}/files/metrics-profiles/metrics-aggregated.yaml"
    dest: "metrics-aggregated.yaml"

# Renders one endpoints file per cluster; PROM_* credentials are resolved
# dynamically from host vars named PROM_ENDPOINT_IKS_<CLUSTER> etc.
- name: Template metrics-endpoints.yaml for IKS clusters
  ansible.builtin.template:
    src: templates/metrics-endpoints.yaml.j2
    dest: "metrics-endpoints-iks-{{ item }}.yaml"
  vars:
    prom_endpoint: "{{ lookup('vars', 'PROM_ENDPOINT_IKS_' ~ (item | upper)) }}"
    prom_metrics_profile_path: "metrics-aggregated.yaml"
    prom_token: "{{ lookup('vars', 'PROM_TOKEN_IKS_' ~ (item | upper)) }}"
    env: "iks"
    uuid: "{{ tag_uuid }}"
    cluster: "{{ item }}"
  loop:
    - karpenter
    - cas

- name: deploy systemd service for port-forwarding
  become: true
  ansible.builtin.template:
    src: templates/port-forward@.service.j2
    dest: /etc/systemd/system/port-forward-{{ item.cluster }}-iks.service
    mode: '0644'
  loop:
    - { cluster: 'karpenter', port: 9095 }
    - { cluster: 'cas', port: 9096 }
  vars:
    kubeconfig_path: "{{ lookup('vars', 'KUBECONFIG_IKS_' ~ (item.cluster | upper)) }}"

- name: start port-forward service for each cluster
  become: true
  ansible.builtin.systemd:
    name: "port-forward-{{ item }}-iks"
    state: started
    enabled: true
    daemon_reload: true
  loop:
    - karpenter
    - cas

# NOTE(review): original task name said "10 minutes" but timeout is 1200 s
# (20 minutes). Renamed to match the value; confirm the intended duration.
- name: wait 20 minutes
  ansible.builtin.wait_for:
    timeout: 1200

# - name: Run kube-burner on all clusters
#   become: true
#   ignore_errors: true
#   ansible.builtin.shell: |
#     /usr/local/bin/kube-burner init -c kube-burner-config.yaml -e metrics-endpoints-iks-{{ item }}.yaml --uuid {{ tag_uuid }}-iks-{{ item }} --timeout 1h
#   register: benchmark_run_iks
#   environment:
#     KUBECONFIG: "{{ lookup('vars', 'KUBECONFIG_IKS_' ~ (item | upper)) }}"
#   loop:
#     - karpenter
#     - cas

- name: ensure jq is installed
  ansible.builtin.package:
    name: jq
    state: present

# Output dirs live on the machine ansible runs the tasks against; only
# item.cluster is used (port is kept for symmetry with the other loops).
# Dropped `loop_control: loop_var: item` — 'item' is already the default
# loop variable and redeclaring it triggers an Ansible warning.
- name: create output directories
  ansible.builtin.file:
    path: "{{ playbook_dir }}/experiment-data/iks/{{ item.cluster }}"
    state: directory
  loop:
    - { cluster: 'karpenter', port: 9095 }
    - { cluster: 'cas', port: 9096 }

# PromQL queries to collect; metricName doubles as the output JSON filename.
# Unbalanced parentheses in several queries (extra trailing ')' and a ')'
# escaped outside the "(.+)" regex quote) have been fixed — they previously
# failed to parse in Prometheus.
- name: Set prometheus queries
  ansible.builtin.set_fact:
    prometheus_queries:
      - query: 'count(kube_node_info)'
        metricName: 'NodeCount'
      - query: 'avg(node_load1) by (instance)'
        metricName: 'CPUByInstance'
      - query: '100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) * 100)'
        metricName: 'CPUPercentageAllCores'
      # NOTE(review): "Percantage" typo kept — the name is the output filename
      # and downstream consumers may depend on it.
      - query: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
        metricName: 'NodeMemoryPercantage'
      # NOTE(review): kube-state-metrics does not expose a condition label on
      # pod start-time metrics — verify this query returns data.
      - query: 'rate(kube_pod_start_time_seconds{condition="Running"}[5m]) - rate(kube_pod_start_time_seconds{condition="Pending"}[5m])'
        metricName: 'PodLatency'
      # NOTE(review): identical to NodeMemoryPercantage above — looks like a
      # copy-paste placeholder; confirm what 'NodeLatency' should measure.
      - query: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
        metricName: 'NodeLatency'
      - query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0'
        metricName: 'API99thLatency'
      - query: 'sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0'
        metricName: 'APIRequestRate'
      - query: 'sum(apiserver_current_inflight_requests{}) by (request_kind) > 0'
        metricName: 'APIInflightRequests'
      - query: '(sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0'
        metricName: 'containerMemory-Masters'
      - query: '(sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|.*apiserver|authentication|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0'
        metricName: 'containerCPU-Masters'
      - query: '(sum(irate(container_cpu_usage_seconds_total{pod!="",container="prometheus",namespace="openshift-monitoring"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0'
        metricName: 'containerCPU-Prometheus'
      - query: '(avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"}[2m]) * 100 and on (node) kube_node_role{role="worker"}) by (namespace, container)) > 0'
        metricName: 'containerCPU-AggregatedWorkers'
      - query: '(avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"}[2m]) * 100 and on (node) kube_node_role{role="infra"}) by (namespace, container)) > 0'
        metricName: 'containerCPU-AggregatedInfra'
      - query: '(sum(container_memory_rss{pod!="",namespace="openshift-monitoring",name!="",container="prometheus"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0'
        metricName: 'containerMemory-Prometheus'
      - query: 'avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"} and on (node) kube_node_role{role="worker"}) by (container, namespace)'
        metricName: 'containerMemory-AggregatedWorkers'
      - query: 'avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"} and on (node) kube_node_role{role="infra"}) by (container, namespace)'
        metricName: 'containerMemory-AggregatedInfra'
      - query: '(sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0'
        metricName: 'nodeCPU-Masters'
      - query: '(avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0'
        metricName: 'nodeCPU-AggregatedWorkers'
      - query: '(avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0'
        metricName: 'nodeCPU-AggregatedInfra'
      - query: 'avg(node_memory_MemAvailable_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'nodeMemoryAvailable-Masters'
      - query: 'avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryAvailable-AggregatedWorkers'
      - query: 'avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryAvailable-AggregatedInfra'
      # Fixed: original had an extra trailing ')'.
      - query: 'avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'nodeMemoryActive-Masters'
      - query: 'avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryActive-AggregatedWorkers'
      - query: 'avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryActive-AggregatedInfra'
      # Fixed: original had an extra trailing ')'.
      - query: 'avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'nodeMemoryCached+nodeMemoryBuffers-Masters'
      - query: 'avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryCached+nodeMemoryBuffers-AggregatedWorkers'
      - query: 'avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))'
        metricName: 'nodeMemoryCached+nodeMemoryBuffers-AggregatedInfra'
      # Fixed: original had an extra trailing ')'.
      - query: 'irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'rxNetworkBytes-Masters'
      # Fixed: original had "(.+" with the ')' outside the regex quote.
      - query: 'avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'rxNetworkBytes-AggregatedWorkers'
      - query: 'avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'rxNetworkBytes-AggregatedInfra'
      # Fixed: original had an extra trailing ')'.
      - query: 'irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")'
        metricName: 'txNetworkBytes-Masters'
      # Fixed: original had "(.+" with the ')' outside the regex quote.
      - query: 'avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'txNetworkBytes-AggregatedWorkers'
      - query: 'avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)'
        metricName: 'txNetworkBytes-AggregatedInfra'

# NOTE(review): current_epoch is not referenced in this file — presumably
# consumed by a later tasks file; confirm before removing.
- name: set current epoch time
  ansible.builtin.set_fact:
    current_epoch: "{{ lookup('pipe', 'date +%s') }}"

# Query window: last 2 hours (7200 s) up to now, in UTC RFC3339.
- name: set start and end time for prometheus query
  ansible.builtin.set_fact:
    start_time: "{{ lookup('pipe', 'date --utc --date=\"@$(($(date +%s) - 7200))\" +%Y-%m-%dT%H:%M:%SZ') }}"
    end_time: "{{ lookup('pipe', 'date --utc +%Y-%m-%dT%H:%M:%SZ') }}"

# command (not shell) keeps the '&' in the URL literal; the single-quoted URL
# survives the module's shlex split as one argument.
# Dropped redundant `loop_var: item` (see note above).
- name: fetch range metrics from prometheus for each cluster and query
  ansible.builtin.command:
    cmd: "curl -s 'http://127.0.0.1:{{ cluster_info.port }}/api/v1/query_range?query={{ prom_query.query | urlencode }}&start={{ start_time }}&end={{ end_time }}&step=30s'"
  register: range_query_response
  loop: "{{ prometheus_queries | product(cluster_list) | list }}"
  loop_control:
    label: "{{ item.1.cluster }} - {{ item.0.metricName }}"
  vars:
    prom_query: "{{ item.0 }}"
    cluster_info: "{{ item.1 }}"
    cluster_list:
      - { cluster: 'karpenter', port: 9095 }
      - { cluster: 'cas', port: 9096 }

# item here is a registered result; item.item is the (query, cluster) pair
# the fetch task looped over.
# NOTE(review): dest uses playbook_dir but ansible.builtin.copy writes to the
# managed node — correct only if these tasks target localhost; confirm.
- name: save range query results to json files
  ansible.builtin.copy:
    content: "{{ item.stdout }}"
    dest: "{{ playbook_dir }}/experiment-data/iks/{{ item.item.1.cluster }}/{{ item.item.0.metricName }}.json"
  loop: "{{ range_query_response.results }}"
  loop_control:
    label: "{{ item.item.1.cluster }} - {{ item.item.0.metricName }}"

- name: stop and disable port-forward service for each cluster
  become: true
  ansible.builtin.systemd:
    name: "port-forward-{{ item }}-iks"
    state: stopped
    enabled: false
  loop:
    - karpenter
    - cas