= est.cost_complexity_pruning_path(X, y) pruning_path = info.ccp_alphas impurities = info.impurities assert np.all(np.diff(pruning_path) >= 0) assert np.all(np.diff(impurities) >= 0) assert_pruning_creates_subtree(tree_cls, X, y, pruning_path) @pytest.mark.parametrize("dataset", DATASETS.keys()) @pytest.mark.parametrize("tree_cls", [DecisionTreeRegressor, ExtraTreeRegressor]) def test_prune_tree_regression_are_subtrees(dataset, tree_cls): dataset = DATASETS[dataset] X, y = dataset["X"], dataset["y"] est = tree_cls(max_leaf_nodes=20, random_state=0) info = est.cost_complexity_pruning_path(X, y) pruning_path = info.ccp_alphas impurities = info.impurities assert np.all(np.diff(pruning_path) >= 0) assert np.all(np.diff(impurities) >= 0) assert_pruning_creates_subtree(tree_cls, X, y, pruning_path) def test_prune_single_node_tree(): # single node tree clf1 = DecisionTreeClassifier(random_state=0) clf1.fit([[0], [1]], [0, 0]) # pruned single node tree clf2 = DecisionTreeClassifier(random_state=0, ccp_alpha=10) clf2.fit([[0], [1]], [0, 0]) assert_is_subtree(clf1.tree_, clf2.tree_) def assert_pruning_creates_subtree(estimator_cls, X, y, pruning_path): # generate trees with increasing alphas estimators = [] for ccp_alpha in pruning_path: est = estimator_cls(max_leaf_nodes=20, ccp_alpha=ccp_alpha, random_state=0).fit( X, y ) estimators.append(est) # A pruned tree must be a subtree of the previous tree (which had a # smaller ccp_alpha) for prev_est, next_est in zip(estimators, estimators[1:]): assert_is_subtree(prev_est.tree_, next_est.tree_) def assert_is_subtree(tree, subtree): assert tree.node_count >= subtree.node_count assert tree.max_depth >= subtree.max_depth tree_c_left = tree.children_left tree_c_right = tree.children_right subtree_c_left = subtree.children_left subtree_c_right = subtree.children_right stack = [(0, 0)] while stack: tree_node_idx, subtree_node_idx = stack.pop() assert_array_almost_equal( tree.value[tree_node_idx], subtree.value[subtree_node_idx] ) assert_almost_equal( tree.impurity[tree_node_idx], subtree.impurity[subtree_node_idx] ) assert_almost_equal( tree.n_node_samples[tree_node_idx], subtree.n_node_samples[subtree_node_idx] ) assert_almost_equal( tree.weighted_n_node_samples[tree_node_idx], subtree.weighted_n_node_samples[subtree_node_idx], ) if subtree_c_left[subtree_node_idx] == subtree_c_right[subtree_node_idx]: # is a leaf assert_almost_equal(TREE_UNDEFINED, subtree.threshold[subtree_node_idx]) else: # not a leaf assert_almost_equal( tree.threshold[tree_node_idx], subtree.threshold[subtree_node_idx] ) stack.append((tree_c_left[tree_node_idx], subtree_c_left[subtree_node_idx])) stack.append( (tree_c_right[tree_node_idx], subtree_c_right[subtree_node_idx]) ) @pytest.mark.parametrize("name", ALL_TREES) @pytest.mark.parametrize("splitter", ["best", "random"]) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) def test_apply_path_readonly_all_trees(name, splitter, sparse_container): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) if sparse_container is None: X_readonly = create_memmap_backed_data(X_small) else: X_readonly = sparse_container(dataset["X"]) X_readonly.data = np.array(X_readonly.data, dtype=tree._tree.DTYPE) ( X_readonly.data, X_readonly.indices, X_readonly.indptr, ) = create_memmap_backed_data( (X_readonly.data, X_readonly.indices, X_readonly.indptr) ) y_readonly = create_memmap_backed_data(np.array(y_small, dtype=tree._tree.DTYPE)) est = ALL_TREES[name](splitter=splitter) est.fit(X_readonly, y_readonly) assert_array_equal(est.predict(X_readonly), est.predict(X_small)) assert_array_equal( est.decision_path(X_readonly).todense(), est.decision_path(X_small).todense() ) @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse", "poisson"]) @pytest.mark.parametrize("Tree", REG_TREES.values()) def test_balance_property(criterion, Tree): # Test that sum(y_pred)=sum(y_true) on training set. # This works if the mean is predicted (should even be true for each leaf). # MAE predicts the median and is therefore excluded from this test. # Choose a training set with non-negative targets (for poisson) X, y = diabetes.data, diabetes.target reg = Tree(criterion=criterion) reg.fit(X, y) assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) @pytest.mark.parametrize("seed", range(3)) def test_poisson_zero_nodes(seed): # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes. X = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 2], [1, 3]] y = [0, 0, 0, 0, 1, 2, 3, 4] # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can # easily learn that: reg = DecisionTreeRegressor(criterion="squared_error", random_state=seed) reg.fit(X, y) assert np.amin(reg.predict(X)) == 0 # whereas Poisson must predict strictly positive numbers reg = DecisionTreeRegressor(criterion="poisson", random_state=seed) reg.fit(X, y) assert np.all(reg.predict(X) > 0) # Test additional dataset where something could go wrong. n_features = 10 X, y = datasets.make_regression( effective_rank=n_features * 2 // 3, tail_strength=0.6, n_samples=1_000, n_features=n_features, n_informative=n_features * 2 // 3, random_state=seed, ) # some excess zeros y[(-1 < y) & (y < 0)] = 0 # make sure the target is positive y = np.abs(y) reg = DecisionTreeRegressor(criterion="poisson", random_state=seed) reg.fit(X, y) assert np.all(reg.predict(X) > 0) def test_poisson_vs_mse(): # For a Poisson distributed target, Poisson loss should give better results # than squared error measured in Poisson deviance as metric. # We have a similar test, test_poisson(), in # sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py rng = np.random.RandomState(42) n_train, n_test, n_features = 500, 500, 10 X = datasets.make_low_rank_matrix( n_samples=n_train + n_test, n_features=n_features, random_state=rng ) # We create a log-linear Poisson model and downscale coef as it will get # exponentiated. coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) y = rng.poisson(lam=np.exp(X @ coef)) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=n_test, random_state=rng ) # We prevent some overfitting by setting min_samples_split=10. tree_poi = DecisionTreeRegressor( criterion="poisson", min_samples_split=10, random_state=rng ) tree_mse = DecisionTreeRegressor( criterion="squared_error", min_samples_split=10, random_state=rng ) tree_poi.fit(X_train, y_train) tree_mse.fit(X_train, y_train) dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]: metric_poi = mean_poisson_deviance(y, tree_poi.predict(X)) # squared_error might produce non-positive predictions => clip metric_mse = mean_poisson_deviance(y, np.clip(tree_mse.predict(X), 1e-15, None)) metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) # As squared_error might correctly predict 0 in train set, its train # score can be better than Poisson. This is no longer the case for the # test set. if val == "test": assert metric_poi < 0.5 * metric_mse assert metric_poi < 0.75 * metric_dummy @pytest.mark.parametrize("Tree", [DecisionTreeClassifier, ExtraTreeClassifier]) @pytest.mark.parametrize("n_classes", [2, 4]) def test_criterion_entropy_same_as_log_loss(Tree, n_classes): """Test that criterion=entropy gives same as log_loss.""" n_samples, n_features = 50, 5 X, y = datasets.make_classification( n_classes=n_classes, n_samples=n_samples, n_features=n_features, n_informative=n_features, n_redundant=0, random_state=42, ) tree_log_loss = Tree(criterion="log_loss", random_state=43).fit(X, y) tree_entropy = Tree(criterion="entropy", random_state=43).fit(X, y) assert_tree_equal( tree_log_loss.tree_, tree_entropy.tree_, f"{Tree!r} with criterion 'entropy' and 'log_loss' gave different trees.", ) assert_allclose(tree_log_loss.predict(X), tree_entropy.predict(X)) def test_different_endianness_pickle(): X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) def reduce_ndarray(arr): return arr.byteswap().view(arr.dtype.newbyteorder()).__reduce__() def get_pickle_non_native_endianness(): f = io.BytesIO() p = pickle.Pickler(f) p.dispatch_table = copyreg.dispatch_table.copy() p.dispatch_table[np.ndarray] = reduce_ndarray p.dump(clf) f.seek(0) return f new_clf = pickle.load(get_pickle_non_native_endianness()) new_score = new_clf.score(X, y) assert np.isclose(score, new_score) def test_different_endianness_joblib_pickle(): X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) class NonNativeEndiannessNumpyPickler(NumpyPickler): def save(self, obj): if isinstance(obj, np.ndarray): obj = obj.byteswap().view(obj.dtype.newbyteorder()) super().save(obj) def get_joblib_pickle_non_native_endianness(): f = io.BytesIO() p = NonNativeEndiannessNumpyPickler(f) p.dump(clf) f.seek(0) return f new_clf = joblib.load(get_joblib_pickle_non_native_endianness()) new_score = new_clf.score(X, y) assert np.isclose(score, new_score) def get_different_bitness_node_ndarray(node_ndarray): new_dtype_for_indexing_fields = np.int64 if _IS_32BIT else np.int32 # field names in Node struct with SIZE_t types (see sklearn/tree/_tree.pxd) indexing_field_names = ["left_child", "right_child", "feature", "n_node_samples"] new_dtype_dict = { name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items() } for name in indexing_field_names: new_dtype_dict[name] = new_dtype_for_indexing_fields new_dtype = np.dtype( {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} ) return node_ndarray.astype(new_dtype, casting="same_kind") def get_different_alignment_node_ndarray(node_ndarray): new_dtype_dict = { name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items() } offsets = [offset for dtype, offset in node_ndarray.dtype.fields.values()] shifted_offsets = [8 + offset for offset in offsets] new_dtype = np.dtype( { "names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values()), "offsets": shifted_offsets, } ) return node_ndarray.astype(new_dtype, casting="same_kind") def reduce_tree_with_different_bitness(tree): new_dtype = np.int64 if _IS_32BIT else np.int32 tree_cls, (n_features, n_classes, n_outputs), state = tree.__reduce__() new_n_classes = n_classes.astype(new_dtype, casting="same_kind") new_state = state.copy() new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"]) return (tree_cls, (n_features, new_n_classes, n_outputs), new_state) def test_different_bitness_pickle(): X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) def pickle_dump_with_different_bitness(): f = io.BytesIO() p = pickle.Pickler(f) p.dispatch_table = copyreg.dispatch_table.copy() p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness p.dump(clf) f.seek(0) return f new_clf = pickle.load(pickle_dump_with_different_bitness()) new_score = new_clf.score(X, y) assert score == pytest.approx(new_score) def test_different_bitness_joblib_pickle(): # Make sure that a platform specific pickle generated on a 64 bit # platform can be converted at pickle load time into an estimator # with Cython code that works with the host's native integer precision # to index nodes in the tree data structure when the host is a 32 bit # platform (and vice versa). X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) def joblib_dump_with_different_bitness(): f = io.BytesIO() p = NumpyPickler(f) p.dispatch_table = copyreg.dispatch_table.copy() p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness p.dump(clf) f.seek(0) return f new_clf = joblib.load(joblib_dump_with_different_bitness()) new_score = new_clf.score(X, y) assert score == pytest.approx(new_score) def test_check_n_classes(): expected_dtype = np.dtype(np.int32) if _IS_32BIT else np.dtype(np.int64) allowed_dtypes = [np.dtype(np.int32), np.dtype(np.int64)] allowed_dtypes += [dt.newbyteorder() for dt in allowed_dtypes] n_classes = np.array([0, 1], dtype=expected_dtype) for dt in allowed_dtypes: _check_n_classes(n_classes.astype(dt), expected_dtype) with pytest.raises(ValueError, match="Wrong dimensions.+n_classes"): wrong_dim_n_classes = np.array([[0, 1]], dtype=expected_dtype) _check_n_classes(wrong_dim_n_classes, expected_dtype) with pytest.raises(ValueError, match="n_classes.+incompatible dtype"): wrong_dtype_n_classes = n_classes.astype(np.float64) _check_n_classes(wrong_dtype_n_classes, expected_dtype) def test_check_value_ndarray(): expected_dtype = np.dtype(np.float64) expected_shape = (5, 1, 2) value_ndarray = np.zeros(expected_shape, dtype=expected_dtype) allowed_dtypes = [expected_dtype, expected_dtype.newbyteorder()] for dt in allowed_dtypes: _check_value_ndarray( value_ndarray, expected_dtype=dt, expected_shape=expected_shape ) with pytest.raises(ValueError, match="Wrong shape.+value array"): _check_value_ndarray( value_ndarray, expected_dtype=expected_dtype, expected_shape=(1, 2) ) for problematic_arr in [value_ndarray[:, :, :1], np.asfortranarray(value_ndarray)]: with pytest.raises(ValueError, match="value array.+C-contiguous"): _check_value_ndarray( problematic_arr, expected_dtype=expected_dtype, expected_shape=problematic_arr.shape, ) with pytest.raises(ValueError, match="value array.+incompatible dtype"): _check_value_ndarray( value_ndarray.astype(np.float32), expected_dtype=expected_dtype, expected_shape=expected_shape, ) def test_check_node_ndarray(): expected_dtype = NODE_DTYPE node_ndarray = np.zeros((5,), dtype=expected_dtype) valid_node_ndarrays = [ node_ndarray, get_different_bitness_node_ndarray(node_ndarray), get_different_alignment_node_ndarray(node_ndarray), ] valid_node_ndarrays += [ arr.astype(arr.dtype.newbyteorder()) for arr in valid_node_ndarrays ] for arr in valid_node_ndarrays: _check_node_ndarray(node_ndarray, expected_dtype=expected_dtype) with pytest.raises(ValueError, match="Wrong dimensions.+node array"): problematic_node_ndarray = np.zeros((5, 2), dtype=expected_dtype) _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) with pytest.raises(ValueError, match="node array.+C-contiguous"): problematic_node_ndarray = node_ndarray[::2] _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) dtype_dict = {name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()} # array with wrong 'threshold' field dtype (int64 rather than float64) new_dtype_dict = dtype_dict.copy() new_dtype_dict["threshold"] = np.int64 new_dtype = np.dtype( {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} ) problematic_node_ndarray = node_ndarray.astype(new_dtype) with pytest.raises(ValueError, match="node array.+incompatible dtype"): _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) # array with wrong 'left_child' field dtype (float64 rather than int64 or int32) new_dtype_dict = dtype_dict.copy() new_dtype_dict["left_child"] = np.float64 new_dtype = np.dtype( {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} ) problematic_node_ndarray = node_ndarray.astype(new_dtype) with pytest.raises(ValueError, match="node array.+incompatible dtype"): _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) @pytest.mark.parametrize( "Splitter", chain(DENSE_SPLITTERS.values(), SPARSE_SPLITTERS.values()) ) def test_splitter_serializable(Splitter): """Check that splitters are serializable.""" rng = np.random.RandomState(42) max_features = 10 n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp) criterion = CRITERIA_CLF["gini"](n_outputs, n_classes) splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None) splitter_serialize = pickle.dumps(splitter) splitter_back = pickle.loads(splitter_serialize) assert splitter_back.max_features == max_features assert isinstance(splitter_back, Splitter) def test_tree_deserialization_from_read_only_buffer(tmpdir): """Check that Trees can be deserialized with read only buffers. Non-regression test for gh-25584. """ pickle_path = str(tmpdir.join("clf.joblib")) clf = DecisionTreeClassifier(random_state=0) clf.fit(X_small, y_small) joblib.dump(clf, pickle_path) loaded_clf = joblib.load(pickle_path, mmap_mode="r") assert_tree_equal( loaded_clf.tree_, clf.tree_, "The trees of the original and loaded classifiers are not equal.", ) @pytest.mark.parametrize("Tree", ALL_TREES.values()) def test_min_sample_split_1_error(Tree): """Check that an error is raised when min_sample_split=1. non-regression test for issue gh-25481. """ X = np.array([[0, 0], [1, 1]]) y = np.array([0, 1]) # min_samples_split=1.0 is valid Tree(min_samples_split=1.0).fit(X, y) # min_samples_split=1 is invalid tree = Tree(min_samples_split=1) msg = ( r"'min_samples_split' .* must be an int in the range \[2, inf\) " r"or a float in the range \(0.0, 1.0\]" ) with pytest.raises(ValueError, match=msg): tree.fit(X, y) @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"]) def test_missing_values_best_splitter_on_equal_nodes_no_missing(criterion): """Check missing values goes to correct node during predictions.""" X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) dtc.fit(X, y) # Goes to right node because it has the most data points y_pred = dtc.predict([[np.nan]]) assert_allclose(y_pred, [np.mean(y[-5:])]) # equal number of elements in both nodes X_equal = X[:-1] y_equal = y[:-1] dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) dtc.fit(X_equal, y_equal) # Goes to right node because the implementation sets: # missing_go_to_left = n_left > n_right, which is False y_pred = dtc.predict([[np.nan]]) assert_allclose(y_pred, [np.mean(y_equal[-4:])]) @pytest.mark.parametrize("seed", range(3)) @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"]) def test_missing_values_random_splitter_on_equal_nodes_no_missing(criterion, seed): """Check missing values go to the correct node during predictions for ExtraTree. Since ETC use random splits, we use different seeds to verify that the left/right node is chosen correctly when the splits occur. """ X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) etr = ExtraTreeRegressor(random_state=seed, max_depth=1, criterion=criterion) etr.fit(X, y) # Get the left and right children of the root node left_child = etr.tree_.children_left[0] right_child = etr.tree_.children_right[0] # Get the number of samples for the left and right children left_samples = etr.tree_.weighted_n_node_samples[left_child] right_samples = etr.tree_.weighted_n_node_samples[right_child] went_left = left_samples > right_samples # predictions y_pred_left = etr.tree_.value[left_child][0] y_pred_right = etr.tree_.value[right_child][0] # Goes to node with the most data points y_pred = etr.predict([[np.nan]]) if went_left: assert_allclose(y_pred_left, y_pred) else: assert_allclose(y_pred_right, y_pred) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_best_splitter_three_classes(criterion): """Test when missing values are uniquely present in a class among 3 classes.""" missing_values_class = 0 X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 8, 9, 11, 12]]).T y = np.array([missing_values_class] * 4 + [1] * 4 + [2] * 4) dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 3, 12]]).T y_nan_pred = dtc.predict(X_test) # Missing values necessarily are associated to the observed class. assert_array_equal(y_nan_pred, [missing_values_class, 1, 2]) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_best_splitter_to_left(criterion): """Missing values spanning only one class at fit-time must make missing values at predict-time be classified has belonging to this class.""" X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T y = np.array([0] * 4 + [1] * 6) dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 5, np.nan]]).T y_pred = dtc.predict(X_test) assert_array_equal(y_pred, [0, 1, 0]) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_best_splitter_to_right(criterion): """Missing values and non-missing values sharing one class at fit-time must make missing values at predict-time be classified has belonging to this class.""" X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T y = np.array([1] * 4 + [0] * 4 + [1] * 2) dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 1.2, 4.8]]).T y_pred = dtc.predict(X_test) assert_array_equal(y_pred, [1, 0, 1]) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_best_splitter_missing_both_classes_has_nan(criterion): """Check behavior of missing value when there is one missing value in each class.""" X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T y = np.array([0] * 5 + [1] * 5) dtc = DecisionTreeClassifier(random_state=42, max_depth=1, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 2.3, 34.2]]).T y_pred = dtc.predict(X_test) # Missing value goes to the class at the right (here 1) because the implementation # searches right first. assert_array_equal(y_pred, [1, 0, 1]) @pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) @pytest.mark.parametrize( "tree", [ DecisionTreeRegressor(criterion="absolute_error"), ExtraTreeRegressor(criterion="absolute_error"), ], ) def test_missing_value_errors(sparse_container, tree): """Check unsupported configurations for missing values.""" X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T y = np.array([0] * 5 + [1] * 5) if sparse_container is not None: X = sparse_container(X) with pytest.raises(ValueError, match="Input X contains NaN"): tree.fit(X, y) @pytest.mark.parametrize("Tree", REG_TREES.values()) def test_missing_values_poisson(Tree): """Smoke test for poisson regression and missing values.""" X, y = diabetes.data.copy(), diabetes.target # Set some values missing X[::5, 0] = np.nan X[::6, -1] = np.nan reg = Tree(criterion="poisson", random_state=42) reg.fit(X, y) y_pred = reg.predict(X) assert (y_pred >= 0.0).all() def make_friedman1_classification(*args, **kwargs): X, y = datasets.make_friedman1(*args, **kwargs) y = y > 14 return X, y @pytest.mark.parametrize( "make_data, Tree, tolerance", [ # Due to the sine link between X and y, we expect the native handling of # missing values to always be better than the naive mean imputation in the # regression case. # # Due to randomness in ExtraTree, we expect the native handling of missing # values to be sometimes better than the naive mean imputation, but not always (datasets.make_friedman1, DecisionTreeRegressor, 0), (datasets.make_friedman1, ExtraTreeRegressor, 0.07), (make_friedman1_classification, DecisionTreeClassifier, 0.03), (make_friedman1_classification, ExtraTreeClassifier, 0.12), ], ) @pytest.mark.parametrize("sample_weight_train", [None, "ones"]) def test_missing_values_is_resilience( make_data, Tree, sample_weight_train, global_random_seed, tolerance ): """Check that trees can deal with missing values have decent performance.""" n_samples, n_features = 5_000, 10 X, y = make_data( n_samples=n_samples, n_features=n_features, noise=1.0, random_state=global_random_seed, ) X_missing = X.copy() rng = np.random.RandomState(global_random_seed) X_missing[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan X_missing_train, X_missing_test, y_train, y_test = train_test_split( X_missing, y, random_state=global_random_seed ) if sample_weight_train == "ones": sample_weight = np.ones(X_missing_train.shape[0]) else: sample_weight = None # max_depth is used to avoid overfitting and also improve the runtime # of the test. max_depth = 10 native_tree = Tree(max_depth=max_depth, random_state=global_random_seed) native_tree.fit(X_missing_train, y_train, sample_weight=sample_weight) score_native_tree = native_tree.score(X_missing_test, y_test) tree_with_imputer = make_pipeline( SimpleImputer(), Tree(max_depth=max_depth, random_state=global_random_seed) ) tree_with_imputer.fit(X_missing_train, y_train) score_tree_with_imputer = tree_with_imputer.score(X_missing_test, y_test) assert score_native_tree + tolerance > score_tree_with_imputer, ( f"{score_native_tree=} + {tolerance} should be strictly greater than" f" {score_tree_with_imputer}" ) # A single ExtraTree will randomly send missing values down the left, or right child, # and therefore will not necessarily have the same performance as the greedy # handling of missing values. @pytest.mark.parametrize("Tree, expected_score", zip(CLF_TREES.values(), [0.85, 0.53])) def test_missing_value_is_predictive(Tree, expected_score, global_random_seed): """Check the tree learns when only the missing value is predictive.""" rng = np.random.RandomState(0) n_samples = 500 X = rng.standard_normal(size=(n_samples, 20)) y = np.concatenate([np.zeros(n_samples // 2), np.ones(n_samples // 2)]) # y = rng.randint(0, high=2, size=n_samples) # Create a predictive feature using `y` and with some noise X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) y_mask = y.copy().astype(bool) y_mask[X_random_mask] = ~y_mask[X_random_mask] X_predictive = rng.standard_normal(size=n_samples) X_predictive[y_mask] = np.nan X[:, 5] = X_predictive tree = Tree(random_state=global_random_seed) # Check that the tree can learn the predictive feature # over an average of cross-validation fits. tree_cv_score = cross_val_score(tree, X, y, cv=5).mean() assert ( tree_cv_score >= expected_score ), f"Expected CV score: {expected_score} but got {tree_cv_score}" @pytest.mark.parametrize( "make_data, Tree", [ (datasets.make_regression, DecisionTreeRegressor), (datasets.make_classification, DecisionTreeClassifier), ], ) def test_sample_weight_non_uniform(make_data, Tree): """Check sample weight is correctly handled with missing values.""" rng = np.random.RandomState(0) n_samples, n_features = 1000, 10 X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng) # Create dataset with missing values X[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan # Zero sample weight is the same as removing the sample sample_weight = np.ones(X.shape[0]) sample_weight[::2] = 0.0 tree_with_sw = Tree(random_state=0) tree_with_sw.fit(X, y, sample_weight=sample_weight) tree_samples_removed = Tree(random_state=0) tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) def test_deterministic_pickle(): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/27268 # Uninitialised memory would lead to the two pickle strings being different. tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) tree2 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) pickle1 = pickle.dumps(tree1) pickle2 = pickle.dumps(tree2) assert pickle1 == pickle2 @pytest.mark.parametrize("Tree", [DecisionTreeRegressor, ExtraTreeRegressor]) @pytest.mark.parametrize( "X", [ # missing values will go left for greedy splits np.array([np.nan, 2, np.nan, 4, 5, 6]), np.array([np.nan, np.nan, 3, 4, 5, 6]), # missing values will go right for greedy splits np.array([1, 2, 3, 4, np.nan, np.nan]), np.array([1, 2, 3, np.nan, 6, np.nan]), ], ) @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"]) def test_regression_tree_missing_values_toy(Tree, X, criterion): """Check that we properly handle missing values in regression trees using a toy dataset. The regression targeted by this test was that we were not reinitializing the criterion when it comes to the number of missing values. Therefore, the value of the critetion (i.e. MSE) was completely wrong. This test check that the MSE is null when there is a single sample in the leaf. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/28254 https://github.com/scikit-learn/scikit-learn/issues/28316 """ X = X.reshape(-1, 1) y = np.arange(6) tree = Tree(criterion=criterion, random_state=0).fit(X, y) tree_ref = clone(tree).fit(y.reshape(-1, 1), y) impurity = tree.tree_.impurity assert all(impurity >= 0), impurity.min() # MSE should always be positive # Check the impurity match after the first split assert_allclose(tree.tree_.impurity[:2], tree_ref.tree_.impurity[:2]) # Find the leaves with a single sample where the MSE should be 0 leaves_idx = np.flatnonzero( (tree.tree_.children_left == -1) & (tree.tree_.n_node_samples == 1) ) assert_allclose(tree.tree_.impurity[leaves_idx], 0.0) def test_regression_extra_tree_missing_values_toy(global_random_seed): rng = np.random.RandomState(global_random_seed) n_samples = 100 X = np.arange(n_samples, dtype=np.float64).reshape(-1, 1) X[-20:, :] = np.nan rng.shuffle(X) y = np.arange(n_samples) tree = ExtraTreeRegressor(random_state=global_random_seed, max_depth=5).fit(X, y) impurity = tree.tree_.impurity assert all(impurity >= 0), impurity # MSE should always be positive def test_classification_tree_missing_values_toy(): """Check that we properly handle missing values in clasification trees using a toy dataset. The test is more involved because we use a case where we detected a regression in a random forest. We therefore define the seed and bootstrap indices to detect one of the non-frequent regression. Here, we check that the impurity is null or positive in the leaves. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/28254 """ X, y = datasets.load_iris(return_X_y=True) rng = np.random.RandomState(42) X_missing = X.copy() mask = rng.binomial( n=np.ones(shape=(1, 4), dtype=np.int32), p=X[:, [2]] / 8 ).astype(bool) X_missing[mask] = np.nan X_train, _, y_train, _ = train_test_split(X_missing, y, random_state=13) # fmt: off # no black reformatting for this specific array indices = np.array([ 2, 81, 39, 97, 91, 38, 46, 31, 101, 13, 89, 82, 100, 42, 69, 27, 81, 16, 73, 74, 51, 47, 107, 17, 75, 110, 20, 15, 104, 57, 26, 15, 75, 79, 35, 77, 90, 51, 46, 13, 94, 91, 23, 8, 93, 93, 73, 77, 12, 13, 74, 109, 110, 24, 10, 23, 104, 27, 92, 52, 20, 109, 8, 8, 28, 27, 35, 12, 12, 7, 43, 0, 30, 31, 78, 12, 24, 105, 50, 0, 73, 12, 102, 105, 13, 31, 1, 69, 11, 32, 75, 90, 106, 94, 60, 56, 35, 17, 62, 85, 81, 39, 80, 16, 63, 6, 80, 84, 3, 3, 76, 78 ], dtype=np.int32) # fmt: on tree = DecisionTreeClassifier( max_depth=3, max_features="sqrt", random_state=1857819720 ) tree.fit(X_train[indices], y_train[indices]) assert all(tree.tree_.impurity >= 0) leaves_idx = np.flatnonzero( (tree.tree_.children_left == -1) & (tree.tree_.n_node_samples == 1) ) assert_allclose(tree.tree_.impurity[leaves_idx], 0.0) def test_build_pruned_tree_py(): """Test pruning a tree with the Python caller of the Cythonized prune tree.""" tree = DecisionTreeClassifier(random_state=0, max_depth=1) tree.fit(iris.data, iris.target) n_classes = np.atleast_1d(tree.n_classes_) pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) # only keep the root note leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) leave_in_subtree[0] = 1 _build_pruned_tree_py(pruned_tree, tree.tree_, leave_in_subtree) assert tree.tree_.node_count == 3 assert pruned_tree.node_count == 1 with pytest.raises(AssertionError): assert_array_equal(tree.tree_.value, pruned_tree.value) assert_array_equal(tree.tree_.value[0], pruned_tree.value[0]) # now keep all the leaves pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) leave_in_subtree[1:] = 1 # Prune the tree _build_pruned_tree_py(pruned_tree, tree.tree_, leave_in_subtree) assert tree.tree_.node_count == 3 assert pruned_tree.node_count == 3, pruned_tree.node_count assert_array_equal(tree.tree_.value, pruned_tree.value) def test_build_pruned_tree_infinite_loop(): """Test pruning a tree does not result in an infinite loop.""" # Create a tree with root and two children tree = DecisionTreeClassifier(random_state=0, max_depth=1) tree.fit(iris.data, iris.target) n_classes = np.atleast_1d(tree.n_classes_) pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) # only keeping one child as a leaf results in an improper tree leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) leave_in_subtree[1] = 1 with pytest.raises( ValueError, match="Node has reached a leaf in the original tree" ): _build_pruned_tree_py(pruned_tree, tree.tree_, leave_in_subtree) def test_sort_log2_build(): """Non-regression test for gh-30554. Using log2 and log in sort correctly sorts feature_values, but the tie breaking is different which can results in placing samples in a different order. """ rng = np.random.default_rng(75) some = rng.normal(loc=0.0, scale=10.0, size=10).astype(np.float32) feature_values = np.concatenate([some] * 5) samples = np.arange(50) _py_sort(feature_values, samples, 50) # fmt: off # no black reformatting for this specific array expected_samples = [ 0, 40, 30, 20, 10, 29, 39, 19, 49, 9, 45, 15, 35, 5, 25, 11, 31, 41, 1, 21, 22, 12, 2, 42, 32, 23, 13, 43, 3, 33, 6, 36, 46, 16, 26, 4, 14, 24, 34, 44, 27, 47, 7, 37, 17, 8, 38, 48, 28, 18 ] # fmt: on assert_array_equal(samples, expected_samples)