"""
Least Angle Regression algorithm. See the documentation on the
Generalized Linear Model for a complete discussion.
"""

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import sys
import warnings
from math import log
from numbers import Integral, Real

import numpy as np
from scipy import interpolate, linalg
from scipy.linalg.lapack import get_lapack_funcs

from ..base import MultiOutputMixin, RegressorMixin, _fit_context
from ..exceptions import ConvergenceWarning
from ..model_selection import check_cv

# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs'
from ..utils import (  # type: ignore
    Bunch,
    arrayfuncs,
    as_float_array,
    check_random_state,
)
from ..utils._metadata_requests import (
    MetadataRouter,
    MethodMapping,
    _raise_for_params,
    _routing_enabled,
    process_routing,
)
from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params
from ..utils.parallel import Parallel, delayed
from ..utils.validation import validate_data
from ._base import LinearModel, LinearRegression, _preprocess_data

SOLVE_TRIANGULAR_ARGS = {"check_finite": False}


@validate_params(
    {
        "X": [np.ndarray, None],
        "y": [np.ndarray, None],
        "Xy": [np.ndarray, None],
        "Gram": [StrOptions({"auto"}), "boolean", np.ndarray, None],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
        "alpha_min": [Interval(Real, 0, None, closed="left")],
        "method": [StrOptions({"lar", "lasso"})],
        "copy_X": ["boolean"],
        "eps": [Interval(Real, 0, None, closed="neither"), None],
        "copy_Gram": ["boolean"],
        "verbose": ["verbose"],
        "return_path": ["boolean"],
        "return_n_iter": ["boolean"],
        "positive": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def lars_path(
    X,
    y,
    Xy=None,
    *,
    Gram=None,
    max_iter=500,
    alpha_min=0,
    method="lar",
    copy_X=True,
    eps=np.finfo(float).eps,
    copy_Gram=True,
    verbose=0,
    return_path=True,
    return_n_iter=False,
    positive=False,
):
    """Compute Least Angle Regression or Lasso path using the LARS algorithm.

    The optimization objective for the case method='lasso' is::

    (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    in the case of method='lar', the objective function is only known in
    the form of an implicit equation (see discussion in [1]_).

    Read more in the :ref:`User Guide <least_angle_regression>`.

    Parameters
    ----------
    X : None or ndarray of shape (n_samples, n_features)
        Input data. If X is `None`, Gram must also be `None`.
        If only the Gram matrix is available, use `lars_path_gram` instead.

    y : None or ndarray of shape (n_samples,)
        Input targets.

    Xy : array-like of shape (n_features,), default=None
        `Xy = X.T @ y` that can be precomputed. It is useful
        only when the Gram matrix is precomputed.

    Gram : None, 'auto', bool, ndarray of shape (n_features, n_features), \
            default=None
        Precomputed Gram matrix `X.T @ X`, if `'auto'`, the Gram
        matrix is precomputed from the given X, if there are more samples
        than features.

    max_iter : int, default=500
        Maximum number of iterations to perform, set to infinity for no limit.

    alpha_min : float, default=0
        Minimum correlation along the path. It corresponds to the
        regularization parameter `alpha` in the Lasso.

    method : {'lar', 'lasso'}, default='lar'
        Specifies the returned model. Select `'lar'` for Least Angle
        Regression, `'lasso'` for the Lasso.

    copy_X : bool, default=True
        If `False`, `X` is overwritten.

    eps : float, default=np.finfo(float).eps
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems. Unlike the `tol` parameter in some iterative
        optimization-based algorithms, this parameter does not control
        the tolerance of the optimization.

    copy_Gram : bool, default=True
        If `False`, `Gram` is overwritten.

    verbose : int, default=0
        Controls output verbosity.

    return_path : bool, default=True
        If `True`, returns the entire path, else returns only the
        last point of the path.

    return_n_iter : bool, default=False
        Whether to return the number of iterations.

    positive : bool, default=False
        Restrict coefficients to be >= 0.
        This option is only allowed with method 'lasso'. Note that the model
        coefficients will not converge to the ordinary-least-squares solution
        for small values of alpha. Only coefficients up to the smallest alpha
        value (`alphas_[alphas_ > 0.].min()` when fit_path=True) reached by
        the stepwise Lars-Lasso algorithm are typically in congruence with the
        solution of the coordinate descent `lasso_path` function.

    Returns
    -------
    alphas : ndarray of shape (n_alphas + 1,)
        Maximum of covariances (in absolute value) at each iteration.
        `n_alphas` is either `max_iter`, `n_features`, or the
        number of nodes in the path with `alpha >= alpha_min`, whichever
        is smaller.

    active : ndarray of shape (n_alphas,)
        Indices of active variables at the end of the path.

    coefs : ndarray of shape (n_features, n_alphas + 1)
        Coefficients along the path.

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is set
        to True.

    See Also
    --------
    lars_path_gram : Compute LARS path in the sufficient stats mode.
    lasso_path : Compute Lasso path with coordinate descent.
    LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.
    Lars : Least Angle Regression model a.k.a. LAR.
    LassoLarsCV : Cross-validated Lasso, using the LARS algorithm.
    LarsCV : Cross-validated Least Angle Regression model.
    sklearn.decomposition.sparse_encode : Sparse coding.

    References
    ----------
    .. [1] "Least Angle Regression", Efron et al.
           http://statweb.stanford.edu/~tibs/ftp/lars.pdf

    .. [2] `Wikipedia entry on the Least-angle regression
           <https://en.wikipedia.org/wiki/Least-angle_regression>`_

    .. [3] `Wikipedia entry on the Lasso
           <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_

    Examples
    --------
    >>> from sklearn.linear_model import lars_path
    >>> from sklearn.datasets import make_regression
    >>> X, y, true_coef = make_regression(
    ...    n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0
    ... )
    >>> true_coef
    array([ 0.        ,  0.        ,  0.        , 97.9..., 45.7...])
    >>> alphas, _, estimated_coef = lars_path(X, y)
    >>> alphas.shape
    (3,)
    >>> estimated_coef
    array([[ 0.     ,  0.     ,  0.     ],
           [ 0.     ,  0.     ,  0.     ],
           [ 0.     ,  0.     ,  0.     ],
           [ 0.     , 46.96..., 97.99...],
           [ 0.     ,  0.     , 45.70...]])
    """
    if X is None and Gram is not None:
        raise ValueError(
            "X cannot be None if Gram is not None"
            "Use lars_path_gram to avoid passing X and y."
        )
    return _lars_path_solver(
        X=X,
        y=y,
        Xy=Xy,
        Gram=Gram,
        n_samples=None,
        max_iter=max_iter,
        alpha_min=alpha_min,
        method=method,
        copy_X=copy_X,
        eps=eps,
        copy_Gram=copy_Gram,
        verbose=verbose,
        return_path=return_path,
        return_n_iter=return_n_iter,
        positive=positive,
    )


@validate_params(
    {
        "Xy": [np.ndarray],
        "Gram": [np.ndarray],
        "n_samples": [Interval(Integral, 0, None, closed="left")],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
        "alpha_min": [Interval(Real, 0, None, closed="left")],
        "method": [StrOptions({"lar", "lasso"})],
        "copy_X": ["boolean"],
        "eps": [Interval(Real, 0, None, closed="neither"), None],
        "copy_Gram": ["boolean"],
        "verbose": ["verbose"],
        "return_path": ["boolean"],
        "return_n_iter": ["boolean"],
        "positive": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def lars_path_gram(
    Xy,
    Gram,
    *,
    n_samples,
    max_iter=500,
    alpha_min=0,
    method="lar",
    copy_X=True,
    eps=np.finfo(float).eps,
    copy_Gram=True,
    verbose=0,
    return_path=True,
    return_n_iter=False,
    positive=False,
):
    """The lars_path in the sufficient stats mode.

    The optimization objective for the case method='lasso' is::

    (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    in the case of method='lar', the objective function is only known in
    the form of an implicit equation (see discussion in [1]_).

    Read more in the :ref:`User Guide <least_angle_regression>`.

    Parameters
    ----------
    Xy : ndarray of shape (n_features,)
        `Xy = X.T @ y`.

    Gram : ndarray of shape (n_features, n_features)
        `Gram = X.T @ X`.

    n_samples : int
        Equivalent size of sample.

    max_iter : int, default=500
        Maximum number of iterations to perform, set to infinity for no limit.

    alpha_min : float, default=0
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha parameter in the Lasso.

    method : {'lar', 'lasso'}, default='lar'
        Specifies the returned model. Select `'lar'` for Least Angle
        Regression, ``'lasso'`` for the Lasso.

    copy_X : bool, default=True
        If `False`, `X` is overwritten.

    eps : float, default=np.finfo(float).eps
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems. Unlike the `tol` parameter in some iterative
        optimization-based algorithms, this parameter does not control
        the tolerance of the optimization.

    copy_Gram : bool, default=True
        If `False`, `Gram` is overwritten.

    verbose : int, default=0
        Controls output verbosity.

    return_path : bool, default=True
        If `return_path==True` returns the entire path, else returns only the
        last point of the path.

    return_n_iter : bool, default=False
        Whether to return the number of iterations.

    positive : bool, default=False
        Restrict coefficients to be >= 0.
        This option is only allowed with method 'lasso'. Note that the model
        coefficients will not converge to the ordinary-least-squares solution
        for small values of alpha. Only coefficients up to the smallest alpha
        value (`alphas_[alphas_ > 0.].min()` when `fit_path=True`) reached by
        the stepwise Lars-Lasso algorithm are typically in congruence with the
        solution of the coordinate descent lasso_path function.

    Returns
    -------
    alphas : ndarray of shape (n_alphas + 1,)
        Maximum of covariances (in absolute value) at each iteration.
        `n_alphas` is either `max_iter`, `n_features` or the
        number of nodes in the path with `alpha >= alpha_min`, whichever
        is smaller.

    active : ndarray of shape (n_alphas,)
        Indices of active variables at the end of the path.

    coefs : ndarray of shape (n_features, n_alphas + 1)
        Coefficients along the path.

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is set
        to True.

    See Also
    --------
    lars_path_gram : Compute LARS path.
    lasso_path : Compute Lasso path with coordinate descent.
    LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.
    Lars : Least Angle Regression model a.k.a. LAR.
    LassoLarsCV : Cross-validated Lasso, using the LARS algorithm.
    LarsCV : Cross-validated Least Angle Regression model.
    sklearn.decomposition.sparse_encode : Sparse coding.

    References
    ----------
    .. [1] "Least Angle Regression", Efron et al.
           http://statweb.stanford.edu/~tibs/ftp/lars.pdf

    .. [2] `Wikipedia entry on the Least-angle regression
           <https://en.wikipedia.org/wiki/Least-angle_regression>`_

    .. [3] `Wikipedia entry on the Lasso
           <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_

    Examples
    --------
    >>> from sklearn.linear_model import lars_path_gram
    >>> from sklearn.datasets import make_regression
    >>> X, y, true_coef = make_regression(
    ...    n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0
    ... )
    >>> true_coef
    array([ 0.        ,  0.        ,  0.        , 97.9..., 45.7...])
    >>> alphas, _, estimated_coef = lars_path_gram(X.T @ y, X.T @ X, n_samples=100)
    >>> alphas.shape
    (3,)
    >>> estimated_coef
    array([[ 0.     ,  0.     ,  0.     ],
           [ 0.     ,  0.     ,  0.     ],
           [ 0.     ,  0.     ,  0.     ],
           [ 0.     , 46.96..., 97.99...],
           [ 0.     ,  0.     , 45.70...]])
    """
    return _lars_path_solver(
        X=None,
        y=None,
        Xy=Xy,
        Gram=Gram,
        n_samples=n_samples,
        max_iter=max_iter,
        alpha_min=alpha_min,
        method=method,
        copy_X=copy_X,
        eps=eps,
        copy_Gram=copy_Gram,
        verbose=verbose,
        return_path=return_path,
        return_n_iter=return_n_iter,
        positive=positive,
    )


def _lars_path_solver(
    X,
    y,
    Xy=None,
    Gram=None,
    n_samples=None,
    max_iter=500,
    alpha_min=0,
    method="lar",
    copy_X=True,
    eps=np.finfo(float).eps,
    copy_Gram=True,
    verbose=0,
    return_path=True,
    return_n_iter=False,
    positive=False,
):
    """Compute Least Angle Regression or Lasso path using LARS algorithm [1]

    The optimization objective for the case method='lasso' is::

    (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    in the case of method='lar', the objective function is only known in
    the form of an implicit equation (see discussion in [1])

    Read more in the :ref:`User Guide <least_angle_regression>`.

    Parameters
    ----------
    X : None or ndarray of shape (n_samples, n_features)
        Input data. Note that if X is None then Gram must be specified,
        i.e., cannot be None or False.

    y : None or ndarray of shape (n_samples,)
        Input targets.

    Xy : array-like of shape (n_features,), default=None
        `Xy = np.dot(X.T, y)` that can be precomputed. It is useful
        only when the Gram matrix is precomputed.

    Gram : None, 'auto' or array-like of shape (n_features, n_features), \
            default=None
        Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram
        matrix is precomputed from the given X, if there are more samples
        than features.

    n_samples : int or float, default=None
        Equivalent size of sample. If `None`, it will be `n_samples`.

    max_iter : int, default=500
        Maximum number of iterations to perform, set to infinity for no limit.

    alpha_min : float, default=0
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha parameter in the Lasso.

    method : {'lar', 'lasso'}, default='lar'
        Specifies the returned model. Select ``'lar'`` for Least Angle
        Regression, ``'lasso'`` for the Lasso.

    copy_X : bool, default=True
        If ``False``, ``X`` is overwritten.

    eps : float, default=np.finfo(float).eps
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems. Unlike the ``tol`` parameter in some iterative
        optimization-based algorithms, this parameter does not control
        the tolerance of the optimization.

    copy_Gram : bool, default=True
        If ``False``, ``Gram`` is overwritten.

    verbose : int, default=0
        Controls output verbosity.

    return_path : bool, default=True
        If ``return_path==True`` returns the entire path, else returns only the
        last point of the path.

    return_n_iter : bool, default=False
        Whether to return the number of iterations.

    positive : bool, default=False
        Restrict coefficients to be >= 0.
        This option is only allowed with method 'lasso'. Note that the model
        coefficients will not converge to the ordinary-least-squares solution
        for small values of alpha. Only coefficients up to the smallest alpha
        value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by
        the stepwise Lars-Lasso algorithm are typically in congruence with the
        solution of the coordinate descent lasso_path function.

    Returns
    -------
    alphas : array-like of shape (n_alphas + 1,)
        Maximum of covariances (in absolute value) at each iteration.
        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
        number of nodes in the path with ``alpha >= alpha_min``, whichever
        is smaller.

    active : array-like of shape (n_alphas,)
        Indices of active variables at the end of the path.

    coefs : array-like of shape (n_features, n_alphas + 1)
        Coefficients along the path

    n_iter : int
        Number of iterations run. Returned only if return_n_iter is set
        to True.

    See Also
    --------
    lasso_path
    LassoLars
    Lars
    LassoLarsCV
    LarsCV
    sklearn.decomposition.sparse_encode

    References
    ----------
    .. [1] "Least Angle Regression", Efron et al.
           http://statweb.stanford.edu/~tibs/ftp/lars.pdf

    .. [2] `Wikipedia entry on the Least-angle regression
           <https://en.wikipedia.org/wiki/Least-angle_regression>`_

    .. [3] `Wikipedia entry on the Lasso
           <https://en.wikipedia.org/wiki/Lasso_(statistics)>`_

    """
    if method == "lar" and positive:
        raise ValueError("Positive constraint not supported for 'lar' coding method.")

    n_samples = n_samples if n_samples is not None else y.size

    if Xy is None:
        Cov = np.dot(X.T, y)
    else:
        Cov = Xy.copy()

    if Gram is None or Gram is False:
        Gram = None
        if X is None:
            raise ValueError("X and Gram cannot both be unspecified.")
    elif isinstance(Gram, str) and Gram == "auto" or Gram is True:
        if Gram is True or X.shape[0] > X.shape[1]:
            Gram = np.dot(X.T, X)
        else:
            Gram = None
    elif copy_Gram:
        Gram = Gram.copy()

    if Gram is None:
        n_features = X.shape[1]
    else:
        n_features = Cov.shape[0]
        if Gram.shape != (n_features, n_features):
            raise ValueError("The shapes of the inputs Gram and Xy do not match.")

    if copy_X and X is not None and Gram is None:
        # force copy. setting the array to be fortran-ordered
        # speeds up the calculation of the (partial) Gram matrix
        # and allows to easily swap columns
        X = X.copy("F")

    max_features = min(max_iter, n_features)

    dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None)
    if len(dtypes) == 1:
        # use the precision level of input data if it is consistent
        return_dtype = next(iter(dtypes))
    else:
        # fallback to double precision otherwise
        return_dtype = np.float64

    if return_path:
        coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype)
        alphas = np.zeros(max_features + 1, dtype=return_dtype)
    else:
        coef, prev_coef = (
            np.zeros(n_features, dtype=return_dtype),
            np.zeros(n_features, dtype=return_dtype),
        )
        alpha, prev_alpha = (
            np.array([0.0], dtype=return_dtype),
            np.array([0.0], dtype=return_dtype),
        )
        # above better ideas?

    n_iter, n_active = 0, 0
    active, indices = list(), np.arange(n_features)
    # holds the sign of covariance
    sign_active = np.empty(max_features, dtype=np.int8)
    drop = False

    # will hold the cholesky factorization. Only lower part is
    # referenced.
    if Gram is None:
        L = np.empty((max_features, max_features), dtype=X.dtype)
        swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (X,))
    else:
        L = np.empty((max_features, max_features), dtype=Gram.dtype)
        swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (Cov,))
    (solve_cholesky,) = get_lapack_funcs(("potrs",), (L,))

    if verbose:
        if verbose > 1:
            print("Step\t\tAdded\t\tDropped\t\tActive set size\t\tC")
        else:
            sys.stdout.write(".")
            sys.stdout.flush()

    tiny32 = np.finfo(np.float32).tiny  # to avoid division by 0 warning
    cov_precision = np.finfo(Cov.dtype).precision
    equality_tolerance = np.finfo(np.float32).eps

    if Gram is not None:
        Gram_copy = Gram.copy()
        Cov_copy = Cov.copy()

    while True:
        if Cov.size:
            if positive:
                C_idx = np.argmax(Cov)
            else:
                C_idx = np.argmax(np.abs(Cov))

            C_ = Cov[C_idx]

            if positive:
                C = C_
            else:
                C = np.fabs(C_)
        else:
            C = 0.0

        if return_path:
            alpha = alphas[n_iter, np.newaxis]
            coef = coefs[n_iter]
            prev_alpha = alphas[n_iter - 1, np.newaxis]
            prev_coef = coefs[n_iter - 1]

        alpha[0] = C / n_samples
        if alpha[0] <= alpha_min + equality_tolerance:  # early stopping
            if abs(alpha[0] - alpha_min) > equality_tolerance:
                # interpolation factor 0 <= ss < 1
                if n_iter > 0:
                    # In the first iteration, all alphas are zero, the formula
                    # below would make ss a NaN
                    ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0])
                    coef[:] = prev_coef + ss * (coef - prev_coef)
                alpha[0] = alpha_min
            if return_path:
                coefs[n_iter] = coef
            break

        if n_iter >= max_iter or n_active >= n_features:
            break
        if not drop:
            ##########################################################
            # Append x_j to the Cholesky factorization of (Xa * Xa') #
            #                                                        #
            #            ( L   0 )                                   #
            #     L  ->  (       )  , where L * w = Xa' x_j          #
            #            ( w   z )    and z = ||x_j||                #
            #                                                        #
            ##########################################################

            if positive:
                sign_active[n_active] = np.ones_like(C_)
            else:
                sign_active[n_active] = np.sign(C_)
            m, n = n_active, C_idx + n_active

            Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
            indices[n], indices[m] = indices[m], indices[n]
            Cov_not_shortened = Cov
            Cov = Cov[1:]  # remove Cov[0]

            if Gram is None:
                X.T[n], X.T[m] = swap(X.T[n], X.T[m])
                c = nrm2(X.T[n_active]) ** 2
                L[n_active, :n_active] = np.dot(X.T[n_active], X.T[:n_active].T)
            else:
                # swap does only work inplace if matrix is fortran
                # contiguous ...
                Gram[m], Gram[n] = swap(Gram[m], Gram[n])
                Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n])
                c = Gram[n_active, n_active]
                L[n_active, :n_active] = Gram[n_active, :n_active]

            # Update the cholesky decomposition for the Gram matrix
            if n_active:
                linalg.solve_triangular(
                    L[:n_active, :n_active],
                    L[n_active, :n_active],
                    trans=0,
                    lower=1,
                    overwrite_b=True,
                    **SOLVE_TRIANGULAR_ARGS,
                )

            v = np.dot(L[n_active, :n_active], L[n_active, :n_active])
            diag = max(np.sqrt(np.abs(c - v)), eps)
            L[n_active, n_active] = diag

            if diag < 1e-7:
                # The system is becoming too ill-conditioned.
                # We have degenerate ve