Update pandas>2 and networkx>3 dependencies (#65)
* Allow pandas >2 dependency

* Allow networkx >3 dependency

* GitHub Actions: add tests for Python 3.10 and 3.11

* Change `faiss` dependency from GPU to CPU

GPU not yet supported for Python 3.11
facebookresearch/faiss#2861 (comment)

* Bump version: v0.9.6

* Fix requirements: pandas was bumped twice, networkx not at all

The previous edit bumped pandas's version cap twice instead of bumping pandas and networkx once each

---------

Signed-off-by: Ehud-Karavani <[email protected]>
ehudkr authored Oct 25, 2023
1 parent f871ac1 commit d80e3ea
Showing 9 changed files with 41 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -8,7 +8,7 @@ jobs:
     strategy:
       fail-fast: false  # Don't cancel entire run if one python-version fails
       matrix:
-        python-version: ["3.7", "3.8", "3.9"]
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
     name: Build and test on Python ${{ matrix.python-version }}
     steps:
       - uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion causallib/__init__.py
@@ -1 +1 @@
-__version__ = "0.9.5"
+__version__ = "0.9.6"
2 changes: 1 addition & 1 deletion causallib/contrib/requirements.txt
@@ -1,3 +1,3 @@
 -f https://download.pytorch.org/whl/cpu/  # To support cpu torch installation
 torch>=1.2.0
-faiss-gpu~=1.7.0
+faiss-cpu~=1.7.0  # Can also use gpu for some Python versions
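Worth noting: the faiss-cpu and faiss-gpu wheels install the same `faiss` Python package, so swapping the pinned wheel needs no code changes. A minimal sketch of the unchanged import path, assuming faiss-cpu ~=1.7 is installed (illustrative, not from this repository):

    import numpy as np
    import faiss  # same import regardless of the faiss-cpu / faiss-gpu wheel

    dim = 8
    vectors = np.random.rand(100, dim).astype("float32")  # faiss expects float32
    index = faiss.IndexFlatL2(dim)  # exact L2 nearest-neighbor index, runs on CPU
    index.add(vectors)
    distances, neighbors = index.search(vectors[:5], 3)  # top-3 neighbors for 5 queries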
2 changes: 1 addition & 1 deletion causallib/estimation/matching.py
@@ -679,7 +679,7 @@ def _matches_to_weights_single_matching(self, s, t, match_df):
         name = {0: "control", 1: "treatment"}
         weights.name = "{s}_to_{t}".format(s=name[s], t=name[t])
         s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches
-        for source_idx, matches_list in s_to_t_matches.iteritems():
+        for source_idx, matches_list in s_to_t_matches.items():
             if matches_list:
                 weights.loc[source_idx] += 1
                 for match in matches_list:
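Background on this change: pandas 2.0 removed `Series.iteritems()`, which had been deprecated since pandas 1.5. `Series.items()` yields the same (index, value) pairs and also exists in older pandas versions, making it a drop-in replacement. A minimal sketch with made-up data (not from the codebase):

    import pandas as pd

    matches = pd.Series({"u1": [0, 2], "u2": []})
    for source_idx, matches_list in matches.items():  # was `.iteritems()` in pandas <2
        print(source_idx, len(matches_list))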
41 changes: 27 additions & 14 deletions causallib/simulation/CausalSimulator3.py
@@ -186,7 +186,7 @@ def __init__(self, topology, var_types, prob_categories, link_types, snr, treatm

         # Create a graph out of matrix topology:
         self.topology = topology
-        self.graph_topology = nx.from_numpy_matrix(topology.transpose(), create_using=nx.DiGraph())  # type: nx.DiGraph
+        self.graph_topology = nx.from_numpy_array(topology.transpose(), create_using=nx.DiGraph())  # type: nx.DiGraph
         self.graph_topology = nx.relabel_nodes(self.graph_topology,
                                                dict(list(zip(list(range(self.m)), self.var_names))))
@@ -751,13 +751,17 @@ def generate_outcome_col(self, X_parents, link_type, snr, prob_category, outcome
         elif outcome_type == SURVIVAL:
             if survival_distribution == "expon":
                 rnd_state = np.random.randint(low=0, high=999999)
-                param = survival_baseline * np.exp(x_outcome)
+                param = survival_baseline * np.exp(x_outcome.astype(float))
                 x_outcome = pd.Series(
                     stats.expon(loc=0.0, scale=(1.0 / param)).rvs(x_outcome.size, random_state=rnd_state),
                     index=x_outcome.index)
                 cf = {i: pd.Series(
-                    stats.expon(loc=0.0, scale=(1 / (survival_baseline * np.exp(cf[i])))).rvs(x_outcome.size,
-                                                                                              random_state=rnd_state),
+                    stats.expon(
+                        loc=0.0,
+                        scale=(1 / (survival_baseline * np.exp(cf[i].astype(float))))).rvs(
+                        x_outcome.size,
+                        random_state=rnd_state
+                    ),
                     index=x_outcome.index)
                     if has_treatment_parent else cf[i] for i in list(cf.keys())}
                 # Supplying the random state assures that the resulting outcome and cfs is consistent while sampling rvs
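The `.astype(float)` casts added throughout this file all guard against the same failure mode: a Series carrying numbers under object dtype (as these simulated columns apparently can) makes numpy ufuncs such as `np.exp` raise a `TypeError`, because numpy looks for an `exp` method on each element instead of taking the numeric fast path. A minimal reproduction sketch (illustrative):

    import numpy as np
    import pandas as pd

    s = pd.Series([0.1, 0.2], dtype=object)
    # np.exp(s) raises TypeError: float objects have no callable `exp` method
    result = np.exp(s.astype(float))  # cast first, then apply the ufunc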
@@ -826,7 +830,7 @@ def generate_censor_col(self, X_parents, link_type, snr, prob_category, outcome_
                                                              var_name=var_name)
             if survival_distribution == "expon":
                 # param = survival_baseline * (prob_category.iloc[0]/prob_category.loc[1]) * np.exp(x_signal)  # Cox ph
-                param = survival_baseline * np.exp(x_signal)  # Cox ph model
+                param = survival_baseline * np.exp(x_signal.astype(float))  # Cox ph model
                 survival_distribution = stats.expon(loc=0.0, scale=(1.0 / param))
                 x_censor = pd.Series(survival_distribution.rvs(size=x_signal.size), index=x_signal.index)
                 # scale values with censoring proportions - 0 is non censored, 1 is censored:
@@ -941,7 +945,7 @@ def _treatment_logistic_dichotomous(x_continuous, prob_category, params=None):
         # compute propensities:
         t = x_continuous.quantile(prob_category.iloc[1], interpolation="higher")
         slope = params.get("slope", 1.0) if params is not None else 1.0
-        cur_propensity = 1.0 / (1 + np.exp(slope * (x_continuous - np.repeat(t, x_continuous.size))))
+        cur_propensity = 1.0 / (1 + np.exp(slope * (x_continuous - np.repeat(t, x_continuous.size)).astype(float)))
         # assign the propensity values:
         propensity.loc[:, columns_names[1]] = cur_propensity
         propensity.loc[:, columns_names[0]] = np.ones(cur_propensity.size) - cur_propensity
@@ -968,11 +972,12 @@ def _treatment_odds_ratio(x_continuous, prob_category, snr):
             - **propensity** (*pd.DataFrame*): The marginal conditional probability of treatment given covariates.
               A DataFrame shaped (num_samples x num_of_possible_treatment_categories).
         """
+        x_continuous = x_continuous.astype(float)
         index_names = x_continuous.index
         columns_names = prob_category.index
         propensity = pd.DataFrame(index=index_names, columns=columns_names)
         # start with filling up the odds ratio:
-        for cur_category, p in prob_category.iteritems():
+        for cur_category, p in prob_category.items():
             t = x_continuous.quantile(p, interpolation="higher")
             cur_propensity = (1.0 / (1 + np.exp((x_continuous - np.repeat(t, x_continuous.size)))))  # type: pd.Series
             cur_propensity = cur_propensity.div(np.ones_like(cur_propensity) - cur_propensity)
@@ -1012,8 +1017,12 @@ def _treatment_quantile_gauss_fit(x_continuous, prob_category, snr):
         columns_names = prob_category.index
         propensity = pd.DataFrame(index=index_names, columns=columns_names)
         # section the signal into bins based on the probabilities (quantiles)
-        bins = pd.qcut(x=x_continuous, q=np.cumsum(pd.Series(0, index=["null"]).append(prob_category)),
-                       labels=columns_names)
+        x_continuous = x_continuous.astype(float)
+        bins = pd.qcut(
+            x=x_continuous,
+            q=np.cumsum(pd.concat([pd.Series(0, index=["null"]), prob_category])),
+            labels=columns_names
+        )
         for cur_category in columns_names:
             cur_samples_mask = (bins == cur_category)
             cur_samples = x_continuous[cur_samples_mask]
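Background on the `append` changes in this file: pandas 2.0 removed `Series.append()` (deprecated since pandas 1.4); `pd.concat()` on a list of Series produces the identical concatenated result. A minimal sketch of the quantile-edge pattern used here, with made-up probabilities:

    import pandas as pd

    prob_category = pd.Series([0.5, 0.5], index=["a", "b"])
    # pandas <2: pd.Series(0, index=["null"]).append(prob_category).cumsum()
    edges = pd.concat([pd.Series(0, index=["null"]), prob_category]).cumsum()
    print(edges.tolist())  # [0.0, 0.5, 1.0]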
@@ -1103,8 +1112,10 @@ def _discretize_col(x_col, prob_category, method="empiric", retbins=False, bins=
             res = cutoffs.sum(axis="columns")
         elif method == "empiric":  # discretize according to percentiles from the empirical data itself
             try:
-                cumulative_ps = pd.Series(0, index=["null"]).append(prob_category).cumsum()
-                res, bins = pd.qcut(x=x_col, q=cumulative_ps,
+                cumulative_ps = pd.concat(
+                    [pd.Series(0, index=["null"]), prob_category]
+                ).cumsum()
+                res, bins = pd.qcut(x=x_col.astype(float), q=cumulative_ps,
                                     labels=prob_category.index, retbins=True)
                 bins = pd.Series(data=bins, index=cumulative_ps.index)
                 # TODO: maybe noise this a little?
@@ -1541,7 +1552,7 @@ def generate_random_topology(n_covariates, p, n_treatments=1, n_outcomes=1, n_ce
     generated_vars = covariates + treatments + outcomes + censoring
     generated_vars = pd.Series(data=generated_vars, index=generated_vars)

-    total_vars = given_vars.append(generated_vars)
+    total_vars = pd.concat([given_vars, generated_vars])
     topology = pd.DataFrame(data=0, index=total_vars, columns=total_vars, dtype=bool)

     # generate between the independent given set to generated set:
@@ -1577,10 +1588,12 @@ def generate_random_topology(n_covariates, p, n_treatments=1, n_outcomes=1, n_ce
     generated_types[treatments] = TREATMENT
     generated_types[outcomes] = OUTCOME
     generated_types[censoring] = CENSOR
-    var_types = pd.Series(data=COVARIATE, index=given_vars).append(generated_types)
+    var_types = pd.concat(
+        [pd.Series(data=COVARIATE, index=given_vars), generated_types]
+    )

     # create a hidden variables mask:
-    hidden_vars = given_vars.append(pd.Series(covariates)).sample(frac=p_hidden)
+    hidden_vars = pd.concat([given_vars, pd.Series(covariates)]).sample(frac=p_hidden)
     var_types[hidden_vars] = HIDDEN

     return topology, var_types
6 changes: 3 additions & 3 deletions causallib/tests/test_causal_simulator3.py
@@ -308,7 +308,7 @@ def test_random_topology_generation(self):
         np.testing.assert_array_equal(T.loc[X.columns, :].sum(axis="columns"), np.zeros(5))

         # Test for DAGness:
-        from networkx import DiGraph, from_numpy_matrix, is_directed_acyclic_graph
+        from networkx import DiGraph, from_numpy_array, is_directed_acyclic_graph
         NUM_TESTS = 50
         for test in range(NUM_TESTS):
             n_cov = np.random.randint(low=10, high=100)
@@ -317,7 +317,7 @@ def test_random_topology_generation(self):
             n_cen = np.random.randint(low=0, high=n_tre_out)
             T, _ = CS3m.generate_random_topology(n_covariates=n_cov, p=p, n_treatments=n_tre_out, n_outcomes=n_tre_out,
                                                  n_censoring=n_cen, given_vars=[], p_hidden=0)
-            G = from_numpy_matrix(T.values.transpose(), create_using=DiGraph())
+            G = from_numpy_array(T.values.transpose(), create_using=DiGraph())
             res = is_directed_acyclic_graph(G)
             self.assertTrue(res)

@@ -350,7 +350,7 @@ def test_linear_linking(self):
                              outcome_types=self.no_X.outcome_types, snr=snr, effect_sizes=self.no_X.effect_sizes)
         X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)

-        singular_values = np.linalg.svd(X.values, compute_uv=False)
+        singular_values = np.linalg.svd(X.astype(float).values, compute_uv=False)
         eps = 1e-10
         rank = np.sum(singular_values > eps)
         self.assertEqual(rank, 2,
3 changes: 2 additions & 1 deletion causallib/tests/test_overlap_weights.py
@@ -80,6 +80,7 @@ def test_ow_weights_reversed_to_propensity(self):
         propensity = pd.DataFrame(propensity)
         ow_weights = self.estimator.compute_weight_matrix(self.data_r_100["X"], self.data_r_100["a"],
                                                           clip_min=None, clip_max=None)
+        propensity.columns = propensity.columns.astype(ow_weights.columns.dtype)  # Avoid column dtype assert
         pd.testing.assert_series_equal(propensity.loc[:, 0], ow_weights.loc[:, 1], check_names=False)
         pd.testing.assert_series_equal(propensity.loc[:, 1], ow_weights.loc[:, 0], check_names=False)
         pd.testing.assert_index_equal(propensity.columns, ow_weights.columns)
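Background on these test fixes (here and in test_survival.py below): `pd.testing.assert_index_equal`, which the frame and series asserts also run internally, compares index dtypes strictly, and pandas 2 can infer a different column-index dtype than pandas 1 did. Casting one side's columns to the other's dtype keeps the value comparison while sidestepping the dtype mismatch. A minimal sketch (illustrative):

    import pandas as pd

    left = pd.Index([0, 1], dtype="int64")
    right = pd.Index([0, 1], dtype="int32")
    # pd.testing.assert_index_equal(left, right) fails: Attribute "dtype" are different
    pd.testing.assert_index_equal(left.astype(right.dtype), right)  # passes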
3 changes: 3 additions & 0 deletions causallib/tests/test_survival.py
@@ -635,6 +635,9 @@ def test_marginal_kaplan_meier_curves(self):
                                                             1: lifelines_km_a1.predict(sorted(self.t.unique()))})
         marginal_curves_lifelines.columns.name = 'a'
         marginal_curves_lifelines.index.name = 't'
+        marginal_curves_lifelines.columns = (
+            marginal_curves_lifelines.columns.astype(marginal_curves_causallib.columns.dtype)
+        )  # Avoid column dtype assert

         pd.testing.assert_frame_equal(marginal_curves_causallib, marginal_curves_causallib_lifelines)
         pd.testing.assert_frame_equal(marginal_curves_causallib, marginal_curves_lifelines)
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
-pandas>=0.25.2,<2
+pandas>=0.25.2,<3
 scipy>=0.19,<2
 statsmodels>=0.9,<1
-networkx>=1.1,<3
+networkx>=1.1,<4
 numpy>=1.13,<2
 scikit-learn>=0.20,<1.2
 matplotlib>=2.2,<4
