#!/bin/bash

# must-gather.sh - Collect diagnostic information for Karpenter IBM Cloud Provider support
# Usage: ./must-gather.sh [--output-dir DIR] [--namespace NAMESPACE]
#
# Gathers cluster, Karpenter, node, and IBM Cloud state into a sanitized
# tarball suitable for attaching to support requests.

# Strict mode: exit on error (-e), error on unset variables (-u), and fail a
# pipeline when any stage fails (pipefail).
set -euo pipefail

# Configuration defaults (overridable via CLI flags parsed in parse_args)
DEFAULT_NAMESPACE="karpenter"
DEFAULT_OUTPUT_DIR="/tmp/karpenter-must-gather-$(date +%Y%m%d-%H%M%S)"
KUBECONFIG_PATH=""  # resolved later by determine_kubeconfig; empty means "use ambient kubectl auth"

# ANSI color escape sequences used by the log_* helpers
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color (reset)

# Global state populated during the run
OUTPUT_DIR=""       # directory where collected artifacts are written
NAMESPACE=""        # Karpenter namespace being inspected
COLLECTED_FILES=()  # every file gathered, used for the final manifest

# --- Logging helpers -------------------------------------------------------
# Each prints a colored, level-tagged message to stdout.  printf '%b'
# interprets the backslash escapes stored in the color variables, matching
# the behavior of `echo -e`.

log_info() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}

log_warn() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}

log_step() {
    printf '%b\n' "${BLUE}[STEP]${NC} $1"
}

# Print the command-line help text to stdout.
# The heredoc is emitted verbatim ($0 and command substitutions expand;
# \$KUBECONFIG is escaped so it prints literally), so its wording is part of
# the user-facing interface — do not reword casually.
show_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Collect diagnostic information for Karpenter IBM Cloud Provider support requests.
Automatically sanitizes sensitive information like API keys and tokens.

OPTIONS:
    --output-dir DIR     Directory to store collected data (default: /tmp/karpenter-must-gather-TIMESTAMP)
    --namespace NS       Kubernetes namespace for Karpenter (default: karpenter)
    --kubeconfig PATH    Path to kubeconfig file (default: ./kubeconfig or \$KUBECONFIG)
    --help              Show this help message

EXAMPLES:
    # Basic collection
    ./must-gather.sh

    # Custom namespace and output directory
    ./must-gather.sh --namespace karpenter-system --output-dir ./diagnostics

    # Use specific kubeconfig
    KUBECONFIG=/path/to/config ./must-gather.sh

OUTPUT:
    Creates a tarball containing sanitized logs, configurations, and cluster state
    suitable for sharing with support teams.

SECURITY:
    - Automatically redacts API keys, tokens, and other sensitive data
    - Uses pattern matching to identify and sanitize secrets
    - Provides clear indication of redacted content
EOF
}

# Parse command line arguments into the OUTPUT_DIR / NAMESPACE /
# KUBECONFIG_PATH globals, applying defaults for anything not supplied.
#
# Arguments:    "$@" from the script invocation
# Side effects: sets OUTPUT_DIR, NAMESPACE, KUBECONFIG_PATH
# Exits:        0 on --help; 1 on unknown options or a missing option value
parse_args() {
    local user_specified_kubeconfig=""

    while [[ $# -gt 0 ]]; do
        case $1 in
            --output-dir|--namespace|--kubeconfig)
                # All three options require a value.  Without this guard a
                # trailing option would trip `set -u` ($2 unbound), and in a
                # shell without -u the failing `shift 2` would not consume
                # the argument, leaving the while-loop spinning forever.
                if [[ $# -lt 2 ]]; then
                    log_error "Option $1 requires a value"
                    show_usage
                    exit 1
                fi
                case $1 in
                    --output-dir) OUTPUT_DIR="$2" ;;
                    --namespace) NAMESPACE="$2" ;;
                    --kubeconfig) user_specified_kubeconfig="$2" ;;
                esac
                shift 2
                ;;
            --help)
                show_usage
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                show_usage
                exit 1
                ;;
        esac
    done

    # Set defaults if not provided
    OUTPUT_DIR="${OUTPUT_DIR:-$DEFAULT_OUTPUT_DIR}"
    NAMESPACE="${NAMESPACE:-$DEFAULT_NAMESPACE}"

    # Store user-specified kubeconfig if provided (validated later by
    # determine_kubeconfig)
    if [[ -n "$user_specified_kubeconfig" ]]; then
        KUBECONFIG_PATH="$user_specified_kubeconfig"
    fi
}

# Resolve which kubeconfig (if any) to use, in priority order:
#   1. explicit --kubeconfig (must exist, otherwise fatal)
#   2. ambient kubectl authentication (no kubeconfig flag needed)
#   3. $KUBECONFIG environment variable
#   4. ~/.kube/config
#   5. ./kubeconfig
# Side effects: sets the KUBECONFIG_PATH global; exits 1 when nothing works.
determine_kubeconfig() {
    # Highest priority: a kubeconfig explicitly passed on the command line.
    if [[ -n "$KUBECONFIG_PATH" ]]; then
        if [[ ! -f "$KUBECONFIG_PATH" ]]; then
            log_error "Specified kubeconfig not found: $KUBECONFIG_PATH"
            exit 1
        fi
        log_info "Using specified kubeconfig: $KUBECONFIG_PATH"
        return 0
    fi

    # Already authenticated?  Then no kubeconfig flag is needed at all.
    if kubectl cluster-info >/dev/null 2>&1; then
        log_info "Using existing kubectl authentication (no kubeconfig needed)"
        KUBECONFIG_PATH=""
        return 0
    fi

    # Probe the conventional locations, first match wins.
    if [[ -n "${KUBECONFIG:-}" && -f "${KUBECONFIG}" ]]; then
        KUBECONFIG_PATH="${KUBECONFIG}"
        log_info "Using kubeconfig from KUBECONFIG env: $KUBECONFIG_PATH"
    elif [[ -f "$HOME/.kube/config" ]]; then
        KUBECONFIG_PATH="$HOME/.kube/config"
        log_info "Using default kubeconfig: $KUBECONFIG_PATH"
    elif [[ -f "./kubeconfig" ]]; then
        KUBECONFIG_PATH="./kubeconfig"
        log_info "Using local kubeconfig: $KUBECONFIG_PATH"
    else
        log_error "No valid kubeconfig found. Please ensure you are authenticated or specify a kubeconfig with --kubeconfig"
        log_error "Checked: existing auth, KUBECONFIG env, ~/.kube/config, ./kubeconfig"
        exit 1
    fi
    return 0
}

# Sanitize sensitive data from input.
#
# Arguments: $1 - raw text to scrub
# Outputs:   sanitized text on stdout; secrets replaced by [REDACTED-*]
#
# Fixes over the previous version:
#  - the certificate pattern used `sed ... /gs` — `s` is a Perl regex flag,
#    not a sed one, so sed errored out and (under `set -e`) aborted the
#    whole script on the first call; multi-line PEM blocks are now handled
#    with an address-range `c\` command instead.
#  - input is fed via printf rather than `echo "$content"`, which is unsafe
#    for content starting with `-` or containing backslashes.
#  - `\s` (GNU-only) replaced by [[:space:]]; case-insensitive flag spelled
#    `I` as documented by GNU sed.
sanitize_content() {
    local content="$1"

    printf '%s\n' "$content" \
        | sed -E 's/-----BEGIN[^-]*-----.*-----END[^-]*-----/[REDACTED-CERTIFICATE]/g' \
        | sed -E '/-----BEGIN[^-]*-----/,/-----END[^-]*-----/c\
[REDACTED-CERTIFICATE]' \
        | sed -E 's@(token|password|secret|key):[[:space:]]*[A-Za-z0-9+/=]{20,}@\1: [REDACTED]@Ig' \
        | sed -E 's/[A-Za-z0-9_-]{40,}/[REDACTED-API-KEY]/g' \
        | sed -E 's@[A-Za-z0-9+/]{40,}={0,2}@[REDACTED-BASE64]@g' \
        | sed -E 's/10\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}/[REDACTED-PRIVATE-IP]/g' \
        | sed -E 's/172\.(1[6-9]|2[0-9]|3[0-1])\.[0-9]{1,3}\.[0-9]{1,3}/[REDACTED-PRIVATE-IP]/g' \
        | sed -E 's/192\.168\.[0-9]{1,3}\.[0-9]{1,3}/[REDACTED-PRIVATE-IP]/g'
}

# kubectl wrapper: applies the resolved kubeconfig (when one was selected),
# silences stderr, and emits a placeholder line instead of failing so a
# single missing resource never aborts the whole collection run.
safe_kubectl() {
    local -a kc=(kubectl)
    if [[ -n "$KUBECONFIG_PATH" ]]; then
        kc+=(--kubeconfig="$KUBECONFIG_PATH")
    fi
    "${kc[@]}" "$@" 2>/dev/null || echo "Error: Command failed or resource not found"
}

# Copy one file into the gather tree, sanitizing its contents.
#
# Arguments: $1 - source path, $2 - destination path, $3 - description for the log
# Side effects: writes $2 (placeholder text when the source is missing or
#               unreadable) and appends it to COLLECTED_FILES.
collect_file() {
    local src="$1" dst="$2" what="$3"

    log_info "Collecting: $what"

    if [[ ! -f "$src" ]]; then
        echo "File not found: $src" > "$dst"
    else
        local raw
        raw=$(cat "$src" 2>/dev/null || echo "Error: Could not read file")
        sanitize_content "$raw" > "$dst"
    fi
    COLLECTED_FILES+=("$dst")
}

# Run a command string, sanitize its combined stdout+stderr, and store it.
#
# Arguments: $1 - command string (deliberately run via eval so callers can
#                 pass full pipelines), $2 - destination path, $3 - description
# Side effects: writes $2 and appends it to COLLECTED_FILES.  On failure the
#               captured output is followed by an "Error: Command failed" line.
collect_command() {
    local cmd="$1" dst="$2" what="$3"

    log_info "Collecting: $what"

    local captured
    captured=$(eval "$cmd" 2>&1 || echo "Error: Command failed")
    sanitize_content "$captured" > "$dst"
    COLLECTED_FILES+=("$dst")
}

# Create the output directory tree and write the collection metadata header.
# Reads OUTPUT_DIR / NAMESPACE / KUBECONFIG_PATH; appends the metadata file
# to COLLECTED_FILES.
setup_output_dir() {
    log_step "Setting up output directory: $OUTPUT_DIR"

    local sub
    for sub in cluster karpenter nodes problem-pods logs ibmcloud; do
        mkdir -p "$OUTPUT_DIR/$sub"
    done

    # Record where credentials came from for the report header
    local kubeconfig_source="$KUBECONFIG_PATH"
    if [[ -z "$kubeconfig_source" ]]; then
        kubeconfig_source="Using existing kubectl authentication"
    fi

    # Create collection metadata
    cat > "$OUTPUT_DIR/collection-info.txt" <<EOF
Karpenter IBM Cloud Provider Must-Gather Report
Generated: $(date)
Namespace: $NAMESPACE
Kubeconfig: $kubeconfig_source

SECURITY NOTICE:
This collection has been automatically sanitized to remove:
- API keys and tokens
- Certificate data
- Private IP addresses
- Base64 encoded secrets
- Other sensitive patterns

Redacted content is marked with [REDACTED-*] placeholders.
EOF
    COLLECTED_FILES+=("$OUTPUT_DIR/collection-info.txt")
}

# Gather cluster-wide state: control-plane facts, nodes, storage, networking.
collect_cluster_info() {
    log_step "Collecting cluster information"

    local dir="$OUTPUT_DIR/cluster"

    # Core cluster facts
    collect_command "safe_kubectl cluster-info" "$dir/cluster-info.txt" "Cluster info"
    collect_command "safe_kubectl version" "$dir/version.txt" "Kubernetes version"
    collect_command "safe_kubectl get nodes -o wide" "$dir/nodes.txt" "Node list"
    collect_command "safe_kubectl get nodes -o yaml" "$dir/nodes-yaml.txt" "Node details"

    # Storage layer
    collect_command "safe_kubectl get storageclass -o yaml" "$dir/storageclasses.txt" "Storage classes"
    collect_command "safe_kubectl get csinode -o yaml" "$dir/csinodes.txt" "CSI nodes"

    # Networking
    collect_command "safe_kubectl get networkpolicy -A -o yaml" "$dir/networkpolicies.txt" "Network policies"
    collect_command "safe_kubectl get service -A -o wide" "$dir/services.txt" "Services"
}

# Gather Karpenter controller state: workloads, container logs, custom
# resources, and provisioning events.
collect_karpenter_info() {
    log_step "Collecting Karpenter information"

    local dir="$OUTPUT_DIR/karpenter"

    # Controller deployment / pod state
    collect_command "safe_kubectl get deployment -n $NAMESPACE" "$dir/deployments.txt" "Karpenter deployments"
    collect_command "safe_kubectl get pods -n $NAMESPACE -o wide" "$dir/pods.txt" "Karpenter pods"
    collect_command "safe_kubectl describe pods -n $NAMESPACE" "$dir/pods-describe.txt" "Karpenter pod details"

    # Current and previous container logs for every Karpenter-looking pod
    local pod_names
    pod_names=$(safe_kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -E '(karpenter|nodepool)' || echo "")

    if [[ -z "$pod_names" ]]; then
        echo "No Karpenter pods found" > "$dir/no-pods-found.txt"
        COLLECTED_FILES+=("$dir/no-pods-found.txt")
    else
        local pod
        while IFS= read -r pod; do
            if [[ -n "$pod" ]]; then
                collect_command "safe_kubectl logs -n $NAMESPACE $pod --previous" "$dir/logs-$pod-previous.txt" "Previous logs for $pod"
                collect_command "safe_kubectl logs -n $NAMESPACE $pod" "$dir/logs-$pod.txt" "Current logs for $pod"
            fi
        done <<< "$pod_names"
    fi

    # Karpenter custom resources
    collect_command "safe_kubectl get ibmnodeclass -A -o yaml" "$dir/ibmnodeclasses.txt" "IBM Node Classes"
    collect_command "safe_kubectl get nodepool -A -o yaml" "$dir/nodepools.txt" "Node Pools"
    collect_command "safe_kubectl get nodeclaim -A -o yaml" "$dir/nodeclaims.txt" "Node Claims"

    # Provisioning-related events
    collect_command "safe_kubectl get events -A --field-selector reason=Provisioned -o yaml" "$dir/provisioning-events.txt" "Provisioning events"
    collect_command "safe_kubectl get events -A --field-selector involvedObject.kind=IBMNodeClass -o yaml" "$dir/nodeclass-events.txt" "Node class events"
}

# Gather per-node details: descriptions, live usage, and scheduling placement.
collect_node_info() {
    log_step "Collecting node information"

    local dir="$OUTPUT_DIR/nodes"

    # Describe output and live resource usage (`top` needs metrics-server;
    # collect_command records the failure text when it is absent)
    collect_command "safe_kubectl describe nodes" "$dir/nodes-describe.txt" "Node descriptions"
    collect_command "safe_kubectl top nodes" "$dir/node-usage.txt" "Node resource usage"

    # Where workloads landed
    collect_command "safe_kubectl get pods -A -o wide --sort-by=.spec.nodeName" "$dir/pods-by-node.txt" "Pods by node"

    # Readiness and taints in one table
    collect_command "safe_kubectl get nodes -o custom-columns='NAME:.metadata.name,STATUS:.status.conditions[?(@.type==\"Ready\")].status,TAINTS:.spec.taints[*].key'" "$dir/node-conditions.txt" "Node conditions"
}

# Gather details for every pod not in Running/Succeeded phase: description,
# manifest, events, current and previous logs, plus an overall summary.
# Requires jq to enumerate pods; when the listing cannot be produced this
# degrades to a "no problems found" placeholder.
collect_problem_pods() {
    log_step "Collecting problem pod information"

    local pods_dir="$OUTPUT_DIR/problem-pods"
    mkdir -p "$pods_dir"

    # Get all pods not in Running or Completed state
    collect_command "safe_kubectl get pods -A --field-selector='status.phase!=Running,status.phase!=Succeeded' -o wide" "$pods_dir/problem-pods-list.txt" "List of problem pods"

    # Enumerate problem pods as "namespace/name" lines.  Declaration and
    # assignment are separated so a jq/kubectl failure is not masked by `local`.
    local problem_pods
    problem_pods=$(safe_kubectl get pods -A --field-selector='status.phase!=Running,status.phase!=Succeeded' -o json | jq -r '.items[] | "\(.metadata.namespace)/\(.metadata.name)"' 2>/dev/null || echo "")

    if [[ -n "$problem_pods" ]]; then
        log_info "Found problem pods, collecting detailed information..."

        local pod_ref namespace pod_name safe_name
        while IFS= read -r pod_ref; do
            if [[ -n "$pod_ref" ]]; then
                # Split "namespace/name" with parameter expansion instead of
                # per-pod `cut` subshells; Kubernetes names cannot contain '/'.
                namespace="${pod_ref%%/*}"
                pod_name="${pod_ref#*/}"
                safe_name="${namespace}_${pod_name}"

                # Collect pod description
                collect_command "safe_kubectl describe pod -n $namespace $pod_name" "$pods_dir/pod-describe-$safe_name.txt" "Description for $namespace/$pod_name"

                # Collect pod YAML
                collect_command "safe_kubectl get pod -n $namespace $pod_name -o yaml" "$pods_dir/pod-yaml-$safe_name.txt" "YAML for $namespace/$pod_name"

                # Collect pod events
                collect_command "safe_kubectl get events -n $namespace --field-selector involvedObject.name=$pod_name --sort-by='.lastTimestamp'" "$pods_dir/pod-events-$safe_name.txt" "Events for $namespace/$pod_name"

                # Try to get logs (might fail for pods that never started)
                collect_command "safe_kubectl logs -n $namespace $pod_name --tail=100" "$pods_dir/pod-logs-$safe_name.txt" "Logs for $namespace/$pod_name"

                # If pod has previous logs (from restarts), collect those too
                collect_command "safe_kubectl logs -n $namespace $pod_name --previous --tail=100" "$pods_dir/pod-logs-previous-$safe_name.txt" "Previous logs for $namespace/$pod_name"
            fi
        done <<< "$problem_pods"

        # Create a summary of problem pods
        log_info "Creating problem pods summary..."
        {
            echo "Problem Pods Summary"
            echo "===================="
            echo ""
            echo "Pods by Status:"
            safe_kubectl get pods -A -o json | jq -r '.items[] | select(.status.phase != "Running" and .status.phase != "Succeeded") | "\(.status.phase): \(.metadata.namespace)/\(.metadata.name)"' | sort | uniq -c || echo "Error getting pod status summary"
            echo ""
            echo "Pods with Container Issues:"
            safe_kubectl get pods -A -o json | jq -r '.items[] | select(.status.containerStatuses[]?.state | has("waiting") or has("terminated")) | "\(.metadata.namespace)/\(.metadata.name): \(.status.containerStatuses[].state | keys[0])"' || echo "Error getting container status"
            echo ""
            echo "Recent Pod Events (last 20):"
            safe_kubectl get events -A --field-selector reason=Failed --sort-by='.lastTimestamp' | tail -20 || echo "Error getting recent events"
        } > "$pods_dir/problem-pods-summary.txt"

        COLLECTED_FILES+=("$pods_dir/problem-pods-summary.txt")
    else
        echo "No problem pods found" > "$pods_dir/no-problems-found.txt"
        COLLECTED_FILES+=("$pods_dir/no-problems-found.txt")
    fi
}

# Attempt to gather host-level logs.  These only succeed when the script is
# executed directly on a node; elsewhere the collectors record placeholder
# text instead of failing.
collect_system_logs() {
    log_step "Collecting system logs"

    local dir="$OUTPUT_DIR/logs"

    echo "Note: System log collection requires direct node access" > "$dir/system-logs-note.txt"
    COLLECTED_FILES+=("$dir/system-logs-note.txt")

    # Kubelet service logs (journalctl only works on the node itself)
    collect_command "journalctl -u kubelet --no-pager -n 100" "$dir/kubelet.txt" "Kubelet logs (if available)"

    # Instance bootstrap logs written by cloud-init
    collect_file "/var/log/cloud-init.log" "$dir/cloud-init.txt" "Cloud-init logs (if available)"
    collect_file "/var/log/cloud-init-output.log" "$dir/cloud-init-output.txt" "Cloud-init output (if available)"

    # Karpenter's own bootstrap log, when present
    collect_file "/var/log/karpenter-bootstrap.log" "$dir/karpenter-bootstrap.txt" "Karpenter bootstrap logs (if available)"
}

# Gather IBM Cloud account/VPC context via the ibmcloud CLI (when installed)
# plus the link-local instance metadata endpoint when running on a VPC
# instance.
collect_ibmcloud_info() {
    log_step "Collecting IBM Cloud information"

    local dir="$OUTPUT_DIR/ibmcloud"

    if ! command -v ibmcloud >/dev/null 2>&1; then
        echo "IBM Cloud CLI not available" > "$dir/cli-not-available.txt"
        COLLECTED_FILES+=("$dir/cli-not-available.txt")
    else
        # Current login target; the sanitizer scrubs anything sensitive
        collect_command "ibmcloud target" "$dir/target.txt" "IBM Cloud target"

        # VPC inventory relevant to provisioning
        collect_command "ibmcloud is vpcs --output json" "$dir/vpcs.txt" "VPC list"
        collect_command "ibmcloud is instances --output json" "$dir/instances.txt" "Instance list"
        collect_command "ibmcloud is subnets --output json" "$dir/subnets.txt" "Subnet list"

        # Managed-cluster view, if IKS is in use
        collect_command "ibmcloud ks clusters --output json" "$dir/iks-clusters.txt" "IKS clusters"
    fi

    # The metadata service only answers on IBM Cloud instances; -m 5 bounds the wait
    collect_command "curl -sf -m 5 http://169.254.169.254/metadata/v1/instance" "$dir/instance-metadata.txt" "Instance metadata (if available)"
}

# Write the file manifest, tar up the output directory, and print a summary.
# Reads OUTPUT_DIR and COLLECTED_FILES.  The archive is created alongside
# (not inside) OUTPUT_DIR, so it survives the EXIT-trap cleanup of the
# working directory.
create_archive() {
    log_step "Creating archive"

    # Declaration and assignment separated so a failing $(...) is not masked
    local archive_name archive_path
    archive_name="karpenter-must-gather-$(date +%Y%m%d-%H%M%S).tar.gz"
    archive_path="$(dirname "$OUTPUT_DIR")/$archive_name"

    # Create file list
    cat > "$OUTPUT_DIR/file-list.txt" <<EOF
Collected Files:
EOF

    local file relative_path file_size
    for file in "${COLLECTED_FILES[@]}"; do
        # Quote the prefix pattern so glob characters in OUTPUT_DIR are literal
        relative_path="${file#"$OUTPUT_DIR/"}"
        file_size="unknown"
        if [[ -f "$file" ]]; then
            file_size=$(du -h "$file" | cut -f1)
        fi
        echo "  $relative_path ($file_size)" >> "$OUTPUT_DIR/file-list.txt"
    done

    # Create the archive
    tar -czf "$archive_path" -C "$(dirname "$OUTPUT_DIR")" "$(basename "$OUTPUT_DIR")"

    log_info "Archive created: $archive_path"
    log_info "Archive size: $(du -h "$archive_path" | cut -f1)"

    # Show summary.  NOTE(review): the ${GREEN}/${NC} expansions below print
    # the literal escape text, since a heredoc does not interpret backslash
    # escapes the way `echo -e` does — cosmetic only; confirm before changing.
    cat <<EOF

${GREEN}Must-gather collection completed successfully!${NC}

Archive: $archive_path
Files collected: ${#COLLECTED_FILES[@]}

${YELLOW}IMPORTANT SECURITY NOTICE:${NC}
This archive has been automatically sanitized to remove sensitive data.
However, please review the contents before sharing with support teams.

${BLUE}Next steps:${NC}
1. Review the archive contents if needed
2. Share the archive with Karpenter IBM Cloud Provider support
3. Clean up temporary files: rm -rf "$OUTPUT_DIR"

EOF
}

# Remove the working directory (invoked via the EXIT trap set in main).
# ${OUTPUT_DIR:-} keeps the -d test safe under `set -u` even if the variable
# was never assigned; ${OUTPUT_DIR:?} aborts rather than expanding an empty
# path into `rm -rf`, and `--` protects against names starting with '-'.
cleanup() {
    if [[ -n "${OUTPUT_DIR:-}" && -d "$OUTPUT_DIR" ]]; then
        log_info "Cleaning up temporary directory: $OUTPUT_DIR"
        rm -rf -- "${OUTPUT_DIR:?}"
    fi
}

# Entry point: resolve configuration, verify cluster access, then run every
# collection phase and package the results.
main() {
    log_info "Starting Karpenter IBM Cloud Provider must-gather collection"

    parse_args "$@"

    # Determine kubeconfig to use
    determine_kubeconfig

    # Fail fast when the cluster is unreachable.  This must call kubectl
    # directly: safe_kubectl appends `|| echo ...` and therefore always
    # returns 0, which made the previous `if ! safe_kubectl ...` check
    # unreachable dead code.
    local -a kc=(kubectl)
    if [[ -n "$KUBECONFIG_PATH" ]]; then
        kc+=(--kubeconfig="$KUBECONFIG_PATH")
    fi
    if ! "${kc[@]}" cluster-info >/dev/null 2>&1; then
        log_error "Cannot connect to Kubernetes cluster"
        log_error "Please check your kubeconfig and cluster connectivity"
        exit 1
    fi

    # Remove the working directory on any exit path; the archive is written
    # next to it (not inside), so it is preserved.
    trap cleanup EXIT

    # Execute collection steps
    setup_output_dir
    collect_cluster_info
    collect_karpenter_info
    collect_node_info
    collect_problem_pods
    collect_system_logs
    collect_ibmcloud_info
    create_archive

    log_info "Must-gather collection completed successfully"
}

# Run main function, forwarding all command-line arguments
main "$@"
