sebp · sebp · Jun 17, 2023 · Jun 8, 2023 · Jun 8, 2023 · Jun 8, 2023
diff --git a/sksurv/tree/_criterion.pyx b/sksurv/tree/_criterion.pyx
@@ -328,12 +328,29 @@ cdef class LogrankCriterion(Criterion):
             DOUBLE_t ratio
             DOUBLE_t n_events
             DOUBLE_t n_at_risk
+            DOUBLE_t dest_j0
+            DOUBLE_t dest_j1
 
         self.riskset_total.at(0, &n_at_risk, &n_events)
         ratio = n_events / n_at_risk
         dest[0] = ratio  # Nelson-Aalen estimator
         dest[1] = 1.0 - ratio  # Kaplan-Meier estimator
 
+        # low memory mode
+        if  self.n_outputs == 1:
+            dest_j0 = dest[0]
+            dest_j1 = dest[1]
+            for i in range(1, self.n_unique_times):
+                self.riskset_total.at(i, &n_at_risk, &n_events)
+                if n_at_risk != 0:
+                    ratio = n_events / n_at_risk
+                    dest_j0 += ratio
+                    dest_j1 *= 1.0 - ratio
+                if True: # TODO: only sum for event times
+                    dest[0] += dest_j0
+                    dest[1] += dest_j1
+            return
+
         j = 2
         for i in range(1, self.n_unique_times):
             self.riskset_total.at(i, &n_at_risk, &n_events)

diff --git a/sksurv/tree/tree.py b/sksurv/tree/tree.py
@@ -106,6 +106,10 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.
 
+    low_memory : boolean, default: False
+        If set, ``predict`` computations use reduced memory but ``predict_cumulative_hazard_function``
+        and ``predict_survival_function`` are not implemented.
+
     Attributes
     ----------
     unique_times_ : array of shape = (n_unique_times,)
@@ -162,6 +166,7 @@ class SurvivalTree(BaseEstimator, SurvivalAnalysisMixin):
         ],
         "random_state": ["random_state"],
         "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None],
+        "low_memory": ["boolean"],
     }
 
     def __init__(
@@ -175,6 +180,7 @@ def __init__(
         max_features=None,
         random_state=None,
         max_leaf_nodes=None,
+        low_memory=False,
     ):
         self.splitter = splitter
         self.max_depth = max_depth
@@ -184,6 +190,7 @@ def __init__(
         self.max_features = max_features
         self.random_state = random_state
         self.max_leaf_nodes = max_leaf_nodes
+        self.low_memory = low_memory
 
     def fit(self, X, y, sample_weight=None, check_input=True):
         """Build a survival tree from the training set (X, y).
@@ -229,6 +236,11 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         # one "class" for CHF, one for survival function
         self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2
 
+        if self.low_memory:
+            self.n_outputs_ = 1
+            # one "class" for the sum over the CHF, one for the sum over the survival function
+            self.n_classes_ = np.ones(self.n_outputs_, dtype=np.intp) * 2
+
         # Build tree
         self.criterion = "logrank"
         criterion = LogrankCriterion(self.n_outputs_, n_samples, self.unique_times_)
@@ -364,6 +376,13 @@ def predict(self, X, check_input=True):
         risk_scores : ndarray, shape = (n_samples,)
             Predicted risk scores.
         """
+
+        if self.low_memory:
+            check_is_fitted(self, "tree_")
+            X = self._validate_X_predict(X, check_input, accept_sparse="csr")
+            pred = self.tree_.predict(X)
+            return pred[..., 0]
+
         chf = self.predict_cumulative_hazard_function(X, check_input, return_array=True)
         return chf[:, self.is_event_time_].sum(1)
 
@@ -424,6 +443,10 @@ def predict_cumulative_hazard_function(self, X, check_input=True, return_array=F
         >>> plt.ylim(0, 1)
         >>> plt.show()
         """
+
+        if self.low_memory:
+            raise NotImplementedError("predict_cumulative_hazard_function is not implemented in low memory mode.")
+
         check_is_fitted(self, "tree_")
         X = self._validate_X_predict(X, check_input, accept_sparse="csr")
 
@@ -491,6 +514,10 @@ def predict_survival_function(self, X, check_input=True, return_array=False):
         >>> plt.ylim(0, 1)
         >>> plt.show()
         """
+
+        if self.low_memory:
+            raise NotImplementedError("predict_survival_function is not implemented in low memory mode.")
+
         check_is_fitted(self, "tree_")
         X = self._validate_X_predict(X, check_input, accept_sparse="csr")
 

diff --git a/tests/test_tree.py b/tests/test_tree.py
@@ -779,3 +779,36 @@ def test_predict_sparse(make_whas500):
     assert_array_equal(y_pred, y_pred_csr)
     assert_array_equal(y_cum_h, y_cum_h_csr)
     assert_array_equal(y_surv, y_surv_csr)
+
+
+def test_predict_low_memory(make_whas500):
+    seed = 42
+    whas500 = make_whas500(to_numeric=True)
+    X, y = whas500.x, whas500.y
+    # Duplicate values in whas500 lead to assertion errors because of
+    # tie resolution during tree fitting.
+    # Replacing the features with synthetic values avoids this issue.
+    X = np.random.RandomState(seed).binomial(n=5, p=0.1, size=X.shape)
+
+    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=seed)
+
+    tree0 = SurvivalTree(min_samples_leaf=10, random_state=seed, low_memory=False)
+    tree0.fit(X_train, y_train)
+    y_pred_0 = tree0.predict(X_test)
+
+    tree1 = SurvivalTree(min_samples_leaf=10, random_state=seed, low_memory=True)
+    tree1.fit(X_train, y_train)
+    y_pred_1 = tree1.predict(X_test)
+
+    assert y_pred_0.shape[0] == X_test.shape[0]
+    assert y_pred_1.shape[0] == X_test.shape[0]
+
+    assert_array_almost_equal(y_pred_0, y_pred_1)
+
+    msg = r"predict_cumulative_hazard_function is not implemented in low memory mode."
+    with pytest.raises(NotImplementedError, match=msg):
+        tree1.predict_cumulative_hazard_function(X_test)
+
+    msg = r"predict_survival_function is not implemented in low memory mode."
+    with pytest.raises(NotImplementedError, match=msg):
+        tree1.predict_survival_function(X_test)