---
# Container Mom Prometheus Multi-Tenant Configuration
#
# Helm values for metrics collection with customer isolation.
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed
# source. prometheusOperator / nodeExporter / kubeStateMetrics are nested
# under kube-prometheus-stack, while containerMom / commonLabels / namespace
# are kept at the top level -- confirm against the chart templates.

# Override values for the kube-prometheus-stack chart
kube-prometheus-stack:
  # Disable components we don't need or will configure separately
  alertmanager:
    enabled: false

  grafana:
    enabled: false  # We're using our own Grafana deployment

  # Prometheus configuration
  prometheus:
    enabled: true

    # Service configuration
    service:
      type: ClusterIP
      port: 9090

    # Ingress configuration for Prometheus UI (development only)
    ingress:
      enabled: false  # Enable only in development

    # Prometheus server configuration
    prometheusSpec:
      # Multi-tenant configuration
      externalLabels:
        cluster: hub
        service: container-mom

      # Resource limits
      resources:
        requests:
          cpu: 500m
          memory: 2Gi
        limits:
          cpu: 2000m
          memory: 8Gi

      # Storage configuration
      storageSpec:
        volumeClaimTemplate:
          spec:
            storageClassName: standard
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 50Gi

      # Retention configuration (size cap kept below the 50Gi volume)
      retention: 30d
      retentionSize: "45GB"

      # WAL compression
      walCompression: true

      # Service discovery configuration: only namespaces and monitors that
      # carry the container.mom labels below are selected.
      serviceMonitorNamespaceSelector:
        matchLabels:
          container.mom/monitoring: "true"
      serviceMonitorSelector:
        matchLabels:
          container.mom/monitor: "true"
      podMonitorNamespaceSelector:
        matchLabels:
          container.mom/monitoring: "true"
      podMonitorSelector:
        matchLabels:
          container.mom/monitor: "true"

      # Additional scrape configs for multi-tenant isolation
      additionalScrapeConfigs:
        - job_name: 'container-mom-deployments'
          # `namespaces.names` takes literal namespace names (no globs), so
          # a `customer-*` entry would match nothing. Discover pods in all
          # namespaces and restrict to customer namespaces via relabeling.
          kubernetes_sd_configs:
            - role: pod
          relabel_configs:
            # Only keep pods that live in a customer namespace
            - source_labels: [__meta_kubernetes_namespace]
              action: keep
              regex: 'customer-.+'
            # Extract customer ID from namespace
            - source_labels: [__meta_kubernetes_namespace]
              regex: 'customer-(.*)'
              target_label: customer_id
              replacement: '$1'
            # Add namespace label
            - source_labels: [__meta_kubernetes_namespace]
              target_label: namespace
            # Add pod name
            - source_labels: [__meta_kubernetes_pod_name]
              target_label: pod
            # Extract deployment name from the pod's `app` label
            - source_labels: [__meta_kubernetes_pod_label_app]
              target_label: deployment
            # Only scrape pods with the prometheus.io/scrape annotation
            # (quoted so the pattern stays a string, not a YAML boolean)
            - source_labels:
                - __meta_kubernetes_pod_annotation_prometheus_io_scrape
              action: keep
              regex: 'true'
            # Get metrics path from the prometheus.io/path annotation
            - source_labels:
                - __meta_kubernetes_pod_annotation_prometheus_io_path
              action: replace
              target_label: __metrics_path__
              regex: '(.+)'
            # Get metrics port: swap the discovered port for the one given
            # in the prometheus.io/port annotation
            - source_labels:
                - __address__
                - __meta_kubernetes_pod_annotation_prometheus_io_port
              action: replace
              regex: '([^:]+)(?::\d+)?;(\d+)'
              replacement: '$1:$2'
              target_label: __address__

        - job_name: 'container-mom-operator'
          static_configs:
            - targets:
                - 'container-mom-operator.container-mom.svc.cluster.local:8080'
          # Drop every series whose name does not start with container_mom_
          metric_relabel_configs:
            - source_labels: [__name__]
              regex: 'container_mom_.*'
              action: keep

      # Rules for multi-tenant alerting
      ruleNamespaceSelector:
        matchLabels:
          container.mom/monitoring: "true"
      ruleSelector:
        matchLabels:
          container.mom/rules: "true"

  # Prometheus Operator configuration
  prometheusOperator:
    enabled: true

    # Resource configuration
    resources:
      requests:
        cpu: 100m
        memory: 100Mi
      limits:
        cpu: 200m
        memory: 200Mi

  # Node exporter for infrastructure metrics
  nodeExporter:
    enabled: true

  # Kube state metrics for Kubernetes object metrics
  kubeStateMetrics:
    enabled: true

# Container Mom specific configuration
# NOTE(review): presumably consumed by this chart's own templates, which are
# not visible here -- verify key names against them.
containerMom:
  # Multi-tenancy settings
  multiTenancy:
    enabled: true

    # Customer isolation configuration
    customerIsolation:
      enabled: true
      namespacePrefix: "customer-"

    # Metrics to collect per customer
    metrics:
      - container_cpu_usage_seconds_total
      - container_memory_usage_bytes
      - container_network_receive_bytes_total
      - container_network_transmit_bytes_total
      - kube_deployment_status_replicas
      - kube_deployment_status_replicas_available
      - kube_pod_status_phase

  # Recording rules for performance
  recordingRules:
    enabled: true

    # Pre-aggregate metrics for better query performance
    # NOTE(review): these aggregate by customer_id/deployment; that only
    # yields per-customer series if the source metrics carry those labels.
    # Only the container-mom-deployments job above is shown adding them --
    # confirm for the container_* / kube_* metrics.
    rules:
      - record: container_mom:cpu_usage:rate5m
        expr: |
          sum by (customer_id, deployment) (
            rate(container_cpu_usage_seconds_total[5m])
          )

      - record: container_mom:memory_usage:avg
        expr: |
          avg by (customer_id, deployment) (
            container_memory_usage_bytes
          )

      - record: container_mom:network_receive:rate5m
        expr: |
          sum by (customer_id, deployment) (
            rate(container_network_receive_bytes_total[5m])
          )

      - record: container_mom:deployment_availability
        expr: |
          kube_deployment_status_replicas_available
            / kube_deployment_status_replicas

  # Alerting rules (if alertmanager is enabled separately)
  alertingRules:
    enabled: true

    rules:
      # Fires when a deployment in a customer namespace has had zero
      # available replicas for 5 minutes.
      - alert: CustomerDeploymentDown
        expr: kube_deployment_status_replicas_available{namespace=~"customer-.*"} == 0
        for: 5m
        labels:
          severity: critical
          component: deployment
        annotations:
          summary: "Customer deployment {{ $labels.deployment }} is down"
          description: "Deployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has no available replicas"

      # NOTE(review): the recorded value is a summed CPU rate, yet it is
      # compared to 0.8 and rendered with humanizePercentage -- confirm the
      # intended unit.
      - alert: CustomerHighCPUUsage
        expr: container_mom:cpu_usage:rate5m > 0.8
        for: 10m
        labels:
          severity: warning
          component: resources
        annotations:
          summary: "High CPU usage for customer {{ $labels.customer_id }}"
          description: "Deployment {{ $labels.deployment }} is using {{ $value | humanizePercentage }} CPU"

      - alert: CustomerHighMemoryUsage
        expr: container_mom:memory_usage:avg > 0.9 * 1073741824  # 90% of 1GB
        for: 10m
        labels:
          severity: warning
          component: resources
        annotations:
          summary: "High memory usage for customer {{ $labels.customer_id }}"
          description: "Deployment {{ $labels.deployment }} is using {{ $value | humanize }} memory"

# Global labels
commonLabels:
  app.kubernetes.io/part-of: "container-mom"
  app.kubernetes.io/component: "observability"

# Namespace configuration
namespace: container-mom-monitoring