        if z_pos < gamma_:
            # some coefficients have changed sign
            idx = np.where(z == z_pos)[0][::-1]

            # update the sign, important for LAR
            sign_active[idx] = -sign_active[idx]

            if method == "lasso":
                gamma_ = z_pos
            drop = True

        n_iter += 1

        if return_path:
            if n_iter >= coefs.shape[0]:
                del coef, alpha, prev_alpha, prev_coef
                # resize the coefs and alphas array
                add_features = 2 * max(1, (max_features - n_active))
                coefs = np.resize(coefs, (n_iter + add_features, n_features))
                coefs[-add_features:] = 0
                alphas = np.resize(alphas, n_iter + add_features)
                alphas[-add_features:] = 0
            coef = coefs[n_iter]
            prev_coef = coefs[n_iter - 1]
        else:
            # mimic the effect of incrementing n_iter on the array references
            prev_coef = coef
            prev_alpha[0] = alpha[0]
            coef = np.zeros_like(coef)

        coef[active] = prev_coef[active] + gamma_ * least_squares

        # update correlations
        Cov -= gamma_ * corr_eq_dir

        # See if any coefficient has changed sign
        if drop and method == "lasso":
            # handle the case when idx is not of length 1: downdate the
            # Cholesky factor once per dropped variable
            for ii in idx:
                arrayfuncs.cholesky_delete(L[:n_active, :n_active], ii)

            n_active -= 1
            # remove the dropped variables from the active set
            drop_idx = [active.pop(ii) for ii in idx]

            if Gram is None:
                # propagate dropped variable
                for ii in idx:
                    for i in range(ii, n_active):
                        X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1])
                        # keep `indices` in sync with the swapped columns of X
                        indices[i], indices[i + 1] = indices[i + 1], indices[i]

                # TODO: this could be updated incrementally
                residual = y - np.dot(X[:, :n_active], coef[active])
                temp = np.dot(X.T[n_active], residual)

                Cov = np.r_[temp, Cov]
            else:
                for ii in idx:
                    for i in range(ii, n_active):
                        indices[i], indices[i + 1] = indices[i + 1], indices[i]
                        Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
                        Gram[:, i], Gram[:, i + 1] = swap(
                            Gram[:, i], Gram[:, i + 1]
                        )

                # Cov_n = Cov_j + x_j * X + increment(betas)
                # TODO: will this still work with multiple drops?

                # recompute covariance; this could probably be done better.
                # It would otherwise be wrong, as Xy is not swapped along
                # with the rest of the variables.
                # TODO: this could be updated incrementally
                temp = Cov_copy[drop_idx] - np.dot(Gram_copy[drop_idx], coef)
                Cov = np.r_[temp, Cov]

            sign_active = np.delete(sign_active, idx)
            sign_active = np.append(sign_active, 0.0)  # just to maintain size
            if verbose > 1:
                print(
                    "%s\t\t%s\t\t%s\t\t%s\t\t%s"
                    % (n_iter, "", drop_idx, n_active, abs(temp))
                )

    if return_path:
        # resize coefs in case of early stop
        alphas = alphas[: n_iter + 1]
        coefs = coefs[: n_iter + 1]

        if return_n_iter:
            return alphas, active, coefs.T, n_iter
        else:
            return alphas, active, coefs.T
    else:
        if return_n_iter:
            return alpha, active, coef, n_iter
        else:
            return alpha, active, coef
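
# A minimal usage sketch for the path function above, assuming the public
# `lars_path` signature exported by scikit-learn (the data below is
# illustrative only):
#
#   import numpy as np
#   from sklearn.linear_model import lars_path
#
#   X = np.array([[-1.0, 1.0], [0.0, 0.0], [1.0, 1.0]])
#   y = np.array([-1.1111, 0.0, -1.1111])
#   alphas, active, coefs = lars_path(X, y, method="lasso")
#   # `coefs` has one column per knot of the path; `active` holds the
#   # indices of the active set in the order the features entered the model.
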
###############################################################################
# Estimator classes


class Lars(MultiOutputMixin, RegressorMixin, LinearModel):
    """Least Angle Regression model a.k.a. LAR.

    Read more in the :ref:`User Guide <least_angle_regression>`.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    verbose : bool or int, default=False
        Sets the verbosity amount.

    precompute : bool, 'auto' or array-like, default='auto'
        Whether to use a precomputed Gram matrix to speed up
        calculations. If set to ``'auto'`` let us decide. The Gram
        matrix can also be passed as argument.

    n_nonzero_coefs : int, default=500
        Target number of non-zero coefficients. Use ``np.inf`` for no limit.

    eps : float, default=np.finfo(float).eps
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems. Unlike the ``tol`` parameter in some iterative
        optimization-based algorithms, this parameter does not control
        the tolerance of the optimization.

    copy_X : bool, default=True
        If ``True``, X will be copied; else, it may be overwritten.

    fit_path : bool, default=True
        If True the full path is stored in the ``coef_path_`` attribute.
        If you compute the solution for a large problem or many targets,
        setting ``fit_path`` to ``False`` will lead to a speedup, especially
        with a small alpha.

    jitter : float, default=None
        Upper bound on a uniform noise parameter to be added to the
        `y` values, to satisfy the model's assumption of
        one-at-a-time computations. Might help with stability.

        .. versionadded:: 0.23

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for jittering. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`. Ignored if `jitter` is None.

        .. versionadded:: 0.23

    Attributes
    ----------
    alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays
        Maximum of covariances (in absolute value) at each iteration.
        ``n_alphas`` is either ``max_iter``, ``n_features`` or the
        number of nodes in the path with ``alpha >= alpha_min``, whichever
        is smaller. If this is a list of array-like, the length of the outer
        list is `n_targets`.

    active_ : list of shape (n_alphas,) or list of such lists
        Indices of active variables at the end of the path.
        If this is a list of list, the length of the outer list is
        `n_targets`.

    coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \
            of such arrays
        The varying values of the coefficients along the path. It is not
        present if the ``fit_path`` parameter is ``False``. If this is a list
        of array-like, the length of the outer list is `n_targets`.

    coef_ : array-like of shape (n_features,) or (n_targets, n_features)
        Parameter vector (w in the formulation formula).

    intercept_ : float or array-like of shape (n_targets,)
        Independent term in decision function.

    n_iter_ : array-like or int
        The number of iterations taken by lars_path to find the
        grid of alphas for each target.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    lars_path : Compute Least Angle Regression or Lasso path using
        LARS algorithm.
    LarsCV : Cross-validated Least Angle Regression model.
    sklearn.decomposition.sparse_encode : Sparse coding.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> reg = linear_model.Lars(n_nonzero_coefs=1)
    >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111])
    Lars(n_nonzero_coefs=1)
    >>> print(reg.coef_)
    [ 0. -1.11...]
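
    With the default ``fit_path=True`` the full coefficient path and the
    number of path steps are also available (outputs shown for the toy
    problem above):

    >>> reg.coef_path_.shape
    (2, 2)
    >>> reg.n_iter_
    1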
""" _parameter_constraints: dict = { "fit_intercept": ["boolean"], "verbose": ["verbose"], "precompute": ["boolean", StrOptions({"auto"}), np.ndarray, Hidden(None)], "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left")], "eps": [Interval(Real, 0, None, closed="left")], "copy_X": ["boolean"], "fit_path": ["boolean"], "jitter": [Interval(Real, 0, None, closed="left"), None], "random_state": ["random_state"], } method = "lar" positive = False def __init__( self, *, fit_intercept=True, verbose=False, precompute="auto", n_nonzero_coefs=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, jitter=None, random_state=None, ): self.fit_intercept = fit_intercept self.verbose = verbose self.precompute = precompute self.n_nonzero_coefs = n_nonzero_coefs self.eps = eps self.copy_X = copy_X self.fit_path = fit_path self.jitter = jitter self.random_state = random_state @staticmethod def _get_gram(precompute, X, y): if (not hasattr(precompute, "__array__")) and ( (precompute is True) or (precompute == "auto" and X.shape[0] > X.shape[1]) or (precompute == "auto" and y.shape[1] > 1) ): precompute = np.dot(X.T, X) return precompute def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None): """Auxiliary method to fit the model using X, y as training data""" n_features = X.shape[1] X, y, X_offset, y_offset, X_scale = _preprocess_data( X, y, fit_intercept=self.fit_intercept, copy=self.copy_X ) if y.ndim == 1: y = y[:, np.newaxis] n_targets = y.shape[1] Gram = self._get_gram(self.precompute, X, y) self.alphas_ = [] self.n_iter_ = [] self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype) if fit_path: self.active_ = [] self.coef_path_ = [] for k in range(n_targets): this_Xy = None if Xy is None else Xy[:, k] alphas, active, coef_path, n_iter_ = lars_path( X, y[:, k], Gram=Gram, Xy=this_Xy, copy_X=self.copy_X, copy_Gram=True, alpha_min=alpha, method=self.method, verbose=max(0, self.verbose - 1), max_iter=max_iter, eps=self.eps, return_path=True, return_n_iter=True, positive=self.positive, ) self.alphas_.append(alphas) self.active_.append(active) self.n_iter_.append(n_iter_) self.coef_path_.append(coef_path) self.coef_[k] = coef_path[:, -1] if n_targets == 1: self.alphas_, self.active_, self.coef_path_, self.coef_ = [ a[0] for a in (self.alphas_, self.active_, self.coef_path_, self.coef_) ] self.n_iter_ = self.n_iter_[0] else: for k in range(n_targets): this_Xy = None if Xy is None else Xy[:, k] alphas, _, self.coef_[k], n_iter_ = lars_path( X, y[:, k], Gram=Gram, Xy=this_Xy, copy_X=self.copy_X, copy_Gram=True, alpha_min=alpha, method=self.method, verbose=max(0, self.verbose - 1), max_iter=max_iter, eps=self.eps, return_path=False, return_n_iter=True, positive=self.positive, ) self.alphas_.append(alphas) self.n_iter_.append(n_iter_) if n_targets == 1: self.alphas_ = self.alphas_[0] self.n_iter_ = self.n_iter_[0] self._set_intercept(X_offset, y_offset, X_scale) return self @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, Xy=None): """Fit the model using X, y as training data. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data. y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. Xy : array-like of shape (n_features,) or (n_features, n_targets), \ default=None Xy = np.dot(X.T, y) that can be precomputed. It is useful only when the Gram matrix is precomputed. Returns ------- self : object Returns an instance of self. 
""" X, y = validate_data( self, X, y, force_writeable=True, y_numeric=True, multi_output=True ) alpha = getattr(self, "alpha", 0.0) if hasattr(self, "n_nonzero_coefs"): alpha = 0.0 # n_nonzero_coefs parametrization takes priority max_iter = self.n_nonzero_coefs else: max_iter = self.max_iter if self.jitter is not None: rng = check_random_state(self.random_state) noise = rng.uniform(high=self.jitter, size=len(y)) y = y + noise self._fit( X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy, ) return self class LassoLars(Lars): """Lasso model fit with Least Angle Regression a.k.a. Lars. It is a Linear Model trained with an L1 prior as regularizer. The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1.0 Constant that multiplies the penalty term. Defaults to 1.0. ``alpha = 0`` is equivalent to an ordinary least square, solved by :class:`LinearRegression`. For numerical reasons, using ``alpha = 0`` with the LassoLars object is not advised and you should prefer the LinearRegression object. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). verbose : bool or int, default=False Sets the verbosity amount. precompute : bool, 'auto' or array-like, default='auto' Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. max_iter : int, default=500 Maximum number of iterations to perform. eps : float, default=np.finfo(float).eps The machine-precision regularization in the computation of the Cholesky diagonal factors. Increase this for very ill-conditioned systems. Unlike the ``tol`` parameter in some iterative optimization-based algorithms, this parameter does not control the tolerance of the optimization. copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. fit_path : bool, default=True If ``True`` the full path is stored in the ``coef_path_`` attribute. If you compute the solution for a large problem or many targets, setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. positive : bool, default=False Restrict coefficients to be >= 0. Be aware that you might want to remove fit_intercept which is set True by default. Under the positive restriction the model coefficients will not converge to the ordinary-least-squares solution for small values of alpha. Only coefficients up to the smallest alpha value (``alphas_[alphas_ >