MaxHalford · MaxHalford · Jun 17, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,20 @@
 # Changelog
 
-## 0.19.0
+## 0.20.0 — unreleased
+
+### Bug fixes
+
+- **MCA: incorrect Greenacre correction with subset MCA**. When `one_hot_columns_to_drop` was set, the closed-form Benzécri/Greenacre formula was still applied, but it assumes uniform row sums in the indicator matrix — which subsetting violates. The corrected path uses Greenacre's subset-MCA adjustment (CA in Practice, ch. 21), ported from R's `ca::mjca(lambda='adjusted', subsetcat=...)`. The non-subset path is unchanged. Fixes [#206](https://github.com/MaxHalford/prince/issues/206).
+- **PCA: supplementary column coordinates were missing a factor of √n**. Coordinates exceeded [-1, 1] instead of being correlations with the principal components. The standardization of `X_sup` and `_column_dist` also ignored `sample_weight`, making them inconsistent with the active-variable treatment. The fix now matches FactoMineR's `pca$quanti.sup$coord` and propagates to MFA supplementary groups.
+- **SVD: `random_state` was ignored with `engine='fbpca'`**. Output was deterministic regardless of the user-supplied seed. The seed is now passed through to fbpca, and the fbpca path no longer mutates the global numpy RNG state.
+
+### New features
+
+- **MFA: categorical groups**. Categorical groups are now fitted with MCA (indicator columns centered and divided by √(p_j·(1-p_j)), with column weight (1-p_j)/(λ₁·Q)). This allows numeric and categorical groups to be mixed in a single MFA. Adds `prince.datasets.load_poison()`. Closes [#231](https://github.com/MaxHalford/prince/issues/231).
+- **MCA: faster `fit`**. The indicator matrix is now built directly as a scipy CSC sparse matrix (factorize columns, build COO from offsets) instead of going through a dense intermediate, with a single densification at the end. Materially faster on wide categorical datasets.
+- **`prince.__version__`**. The package version is now exposed as `prince.__version__`.
+
+## 0.19.0 — 2026-05-05
 
 ### Bug fixes
 
@@ -14,7 +28,7 @@
 - **MFA: partial axes results**. Added `partial_correlations_` and `partial_contributions_` properties. These describe how each group's own PCA axes relate to the global MFA axes, corresponding to FactoMineR's `result$partial.axes$cor` and `result$partial.axes$contrib`. Requested in [#217](https://github.com/MaxHalford/prince/issues/217).
 - **MFA: partial axes correlation plot**. Added `plot_partial` method. Produces a correlation circle showing how each group's PCA axes project onto the global MFA plane, corresponding to FactoMineR's `plot.MFA(res, choix="axes")`.
 
-## 0.18.0
+## 0.18.0 — 2026-05-04
 
 ### Bug fixes
 

diff --git a/prince/mca.py b/prince/mca.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import pandas as pd
+import scipy.sparse as sp
 import sklearn.base
 import sklearn.preprocessing
 import sklearn.utils
@@ -82,25 +83,66 @@ def __init__(
         self.one_hot_columns_to_drop = one_hot_columns_to_drop
         self.correction = correction
 
+    def _one_hot(self, X):
+        if self.one_hot:
+            return pd.get_dummies(X, columns=X.columns, prefix_sep=self.one_hot_prefix_sep)
+        return X
+
     def _prepare(self, X):
         """One-hot encode the input if needed, and align columns with the fitted indicator matrix."""
-        if self.one_hot:
-            X = pd.get_dummies(X, columns=X.columns, prefix_sep=self.one_hot_prefix_sep)
-            if self.one_hot_columns_to_drop is not None:
-                X = X.drop(columns=self.one_hot_columns_to_drop, errors="ignore")
-            if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None:
-                X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False)
+        X = self._one_hot(X)
+        if self.one_hot and self.one_hot_columns_to_drop is not None:
+            X = X.drop(columns=self.one_hot_columns_to_drop, errors="ignore")
+        if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None:
+            X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False)
         return X
 
     def get_feature_names_out(self, input_features=None):
         return np.arange(self.n_components_)
 
+    def _subset_greenacre_quantities(self):
+        """Adjusted eigenvalues and total inertia for subset MCA with Greenacre correction.
+
+        Ports the ``lambda = "adjusted"`` + ``subsetcat`` branch of R's ``ca::mjca``
+        (Nenadić & Greenacre 2007; *Correspondence Analysis in Practice* ch. 19 & 21).
+        Notation follows the R source so the two implementations can be cross-checked.
+
+        """
+        B = self._subset_full_burt_
+        cm = B.sum(axis=0) / B.sum()
+        sqrt_cm_outer = np.sqrt(np.outer(cm, cm))
+        cm_outer = np.outer(cm, cm)
+
+        # S_null: standardised residuals of the off-block-diagonal Burt.
+        B_null = B - np.diag(np.diag(B))
+        S_null = (B_null / B_null.sum() - cm_outer) / sqrt_cm_outer
+
+        # Se: standardised residuals under the "independence within each variable" model.
+        Pe = (B / B.sum()).copy()
+        for q in range(self.K_):
+            idx = np.where(self._subset_col_to_var_ == q)[0]
+            Pe[np.ix_(idx, idx)] = np.outer(cm[idx], cm[idx])
+        Se = (Pe - cm_outer) / sqrt_cm_outer
+
+        # Principal inertias on the active subset; clip numerical negatives.
+        mask = self._subset_mask_
+        eigvals = np.linalg.eigvalsh(S_null[np.ix_(mask, mask)])[::-1]
+        lambda_adj = np.clip(eigvals, 0, None) ** 2
+        # Total adjusted inertia. The Q/(Q-1) factor mirrors the non-subset Greenacre
+        # adjustment (see eqn. 19.5 in Greenacre, CA in Practice).
+        Q = self.K_
+        lambda_t = (Se[np.ix_(mask, mask)] ** 2).sum() * Q / (Q - 1)
+        return lambda_adj, lambda_t
+
     @property
     def eigenvalues_(self):
         """Returns the eigenvalues associated with each principal component."""
         eigenvalues = super().eigenvalues_
         # Benzécri and Greenacre corrections
         if self.correction in {"benzecri", "greenacre"}:
+            if self.correction == "greenacre" and self.one_hot_columns_to_drop is not None:
+                lambda_adj, _ = self._subset_greenacre_quantities()
+                return lambda_adj[: len(eigenvalues)]
             K = self.K_
             return np.array(
                 [(K / (K - 1) * (eig - 1 / K)) ** 2 if eig > 1 / K else 0 for eig in eigenvalues]
@@ -111,6 +153,12 @@ def eigenvalues_(self):
     @utils.check_is_fitted
     def percentage_of_variance_(self):
         """Returns the percentage of explained inertia per principal component."""
+        # Greenacre correction on a subset MCA: closed-form Benzécri assumes uniform row
+        # sums in the indicator matrix and so mis-calibrates when categories are dropped.
+        if self.correction == "greenacre" and self.one_hot_columns_to_drop is not None:
+            lambda_adj, lambda_t = self._subset_greenacre_quantities()
+            n = len(super().eigenvalues_)
+            return 100 * lambda_adj[:n] / lambda_t
         # Benzécri correction
         if self.correction == "benzecri":
             eigenvalues = self.eigenvalues_
@@ -151,16 +199,65 @@ def fit(self, X, y=None):
         # K is the number of actual variables, to apply the Benzécri correction
         self.K_ = X.shape[1]
 
-        # One-hot encode the data
-        one_hot = self._prepare(X)
-        self.one_hot_columns_ = one_hot.columns
+        # Build the indicator matrix directly as a scipy sparse CSC: factorize each
+        # input column to get integer codes, then construct one COO from offsets. This
+        # avoids ``pd.get_dummies`` (which is dominated by per-column object work) and
+        # gives a sparse Zᵀ Z for the subset-Greenacre Burt with no extra conversion.
+        # We densify once into a single contiguous float ndarray right before CA.fit,
+        # so sklearn's ``check_array`` sees one block instead of iterating over J
+        # per-column ExtensionArrays.
+        if self.one_hot:
+            categories_per_col = [pd.Categorical(X[c]) for c in X.columns]
+            n_levels = np.array([len(c.categories) for c in categories_per_col])
+            offsets = np.concatenate(([0], np.cumsum(n_levels)[:-1]))
+            J_full = int(n_levels.sum())
+            sep = self.one_hot_prefix_sep
+            full_columns = pd.Index(
+                [
+                    f"{col}{sep}{level}"
+                    for col, cat in zip(X.columns, categories_per_col)
+                    for level in cat.categories
+                ]
+            )
 
-        # We need the number of columns to apply the Greenacre correction
-        self.J_ = one_hot.shape[1]
+            n_rows = len(X)
+            codes = np.column_stack([c.codes.astype(np.int64) for c in categories_per_col])
+            valid = (codes >= 0).ravel()
+            row_idx = np.repeat(np.arange(n_rows), self.K_)[valid]
+            col_idx = (codes + offsets[None, :]).ravel()[valid]
+            full_one_hot_sp = sp.csc_matrix(
+                (np.ones(row_idx.size, dtype=np.float64), (row_idx, col_idx)),
+                shape=(n_rows, J_full),
+            )
 
-        # Apply CA to the indicator matrix
-        super().fit(one_hot)
+            if self.one_hot_columns_to_drop is not None:
+                keep_mask = ~full_columns.isin(self.one_hot_columns_to_drop)
+            else:
+                keep_mask = np.ones(J_full, dtype=bool)
+            kept_columns = full_columns[keep_mask]
+            one_hot_sp = full_one_hot_sp[:, keep_mask] if not keep_mask.all() else full_one_hot_sp
+            one_hot_dense = one_hot_sp.toarray()
 
+            if self.one_hot_columns_to_drop is not None and self.correction == "greenacre":
+                # Sparse Zᵀ Z is much faster than dense for wide indicator matrices.
+                self._subset_full_burt_ = np.asarray(
+                    (full_one_hot_sp.T @ full_one_hot_sp).todense()
+                )
+                self._subset_col_to_var_ = np.repeat(np.arange(self.K_), n_levels)
+                self._subset_mask_ = np.asarray(keep_mask, dtype=bool)
+        else:
+            keep_mask = np.ones(X.shape[1], dtype=bool)
+            kept_columns = X.columns
+            one_hot_dense = np.ascontiguousarray(X.to_numpy(), dtype=np.float64)
+
+        self.one_hot_columns_ = kept_columns
+        self.J_ = len(kept_columns)
+
+        # Wrap the dense ndarray in a single-block DataFrame so CA.fit sees one
+        # contiguous array instead of J per-column ExtensionArrays.
+        one_hot = pd.DataFrame(one_hot_dense, index=X.index, columns=kept_columns)
+
+        super().fit(one_hot)
         return self
 
     @utils.check_is_dataframe_input

diff --git a/prince/svd.py b/prince/svd.py
@@ -50,17 +50,30 @@ def compute_svd(
         else:
             raise ValueError("fbpca is not installed; please install it if you want to use it")
     elif engine == "scipy":
-        U, s, V = scipy.linalg.svd(X)
-        U = U[:, :n_components]
-        s = s[:n_components]
-        V = V[:n_components, :]
+        U, s, V = scipy.linalg.svd(X, full_matrices=False)
     elif engine == "sklearn":
         U, s, V = extmath.randomized_svd(
             X, n_components=n_components, n_iter=n_iter, random_state=random_state
         )
     else:
         raise ValueError("engine has to be one of ('fbpca', 'scipy', 'sklearn')")
 
+    # Normalise to exactly ``n_components``. ``scipy.linalg.svd`` (and ``fbpca``) cap
+    # at ``min(M, N)`` for rank reasons, while ``sklearn.randomized_svd`` always returns
+    # the requested number, padding the tail with noise. We unify on the sklearn shape
+    # contract — extra components past the matrix rank get zero singular values, and the
+    # corresponding U/V columns are zeros (downstream code multiplies them by 0 anyway).
+    k = len(s)
+    if k < n_components:
+        pad = n_components - k
+        s = np.concatenate([s, np.zeros(pad)])
+        U = np.hstack([U, np.zeros((U.shape[0], pad))])
+        V = np.vstack([V, np.zeros((pad, V.shape[1]))])
+    elif k > n_components:
+        s = s[:n_components]
+        U = U[:, :n_components]
+        V = V[:n_components, :]
+
     # U, V = extmath.svd_flip(U, V)
 
     if row_weights is not None:

diff --git a/tests/DESCRIPTION b/tests/DESCRIPTION
@@ -2,4 +2,5 @@ Package: prince-test
 Version: 0.0.0.1
 Title: Test dependencies
 Imports:
-    FactoMineR
+    FactoMineR,
+    ca
diff --git a/tests/test_mca.py b/tests/test_mca.py
@@ -271,6 +271,93 @@ def test_issue_161():
     """
 
 
+def test_non_subset_correction_matches_ca_mjca():
+    """Non-subset Benzécri/Greenacre corrections match R's ``ca::mjca(lambda='adjusted')``.
+
+    FactoMineR doesn't expose these corrections, but Greenacre's own ``ca`` package does,
+    and its closed-form is what prince's non-subset path implements.
+    """
+    wines = prince.datasets.load_burgundy_wines().drop(columns=["Oak type"], level=0)
+    wines.columns = [f"{a}_{b}" for a, b in wines.columns]
+
+    R("library('ca')")
+    with tempfile.NamedTemporaryFile(suffix=".csv") as fp:
+        wines.to_csv(fp.name, index=False)
+        R(f"dataset <- read.csv('{fp.name}')")
+    R("""
+    dataset <- data.frame(lapply(dataset, factor))
+    mj <- mjca(dataset, lambda='adjusted', nd=4)
+    """)
+    r_lambda = np.array(R("mj$sv")) ** 2
+    r_inertia_e = np.array(R("mj$inertia.e"))
+    n_nonzero = int(np.sum(r_lambda > 0))
+
+    mca_g = prince.MCA(n_components=4, correction="greenacre").fit(wines)
+    np.testing.assert_allclose(mca_g.eigenvalues_[:n_nonzero], r_lambda[:n_nonzero], atol=1e-8)
+    np.testing.assert_allclose(
+        mca_g.percentage_of_variance_[:n_nonzero] / 100, r_inertia_e[:n_nonzero], atol=1e-8
+    )
+
+    # Benzécri shares the same adjusted eigenvalues; only the percentages differ — they
+    # renormalise to sum to 100% instead of using Greenacre's adjusted-inertia denominator.
+    mca_b = prince.MCA(n_components=4, correction="benzecri").fit(wines)
+    np.testing.assert_allclose(mca_b.eigenvalues_[:n_nonzero], r_lambda[:n_nonzero], atol=1e-8)
+    expected_benzecri_pct = r_lambda[:n_nonzero] / r_lambda.sum() * 100
+    np.testing.assert_allclose(
+        mca_b.percentage_of_variance_[:n_nonzero], expected_benzecri_pct, atol=1e-8
+    )
+
+
+def test_subset_greenacre_matches_ca_mjca():
+    """Subset-MCA Greenacre correction matches R's ``ca::mjca(lambda='adjusted', subsetcat=...)``.
+
+    Background: https://github.com/MaxHalford/prince/issues/206
+    """
+    wines = prince.datasets.load_burgundy_wines().drop(columns=["Oak type"], level=0)
+    wines.columns = [f"{a}_{b}" for a, b in wines.columns]
+    sep = "__"
+    one_hot = pd.get_dummies(wines, columns=wines.columns, prefix_sep=sep)
+    to_drop = [c for c in one_hot.columns if c.endswith(f"{sep}No")]
+
+    mca = prince.MCA(
+        n_components=4,
+        correction="greenacre",
+        one_hot_prefix_sep=sep,
+        one_hot_columns_to_drop=to_drop,
+    ).fit(wines)
+
+    R("library('ca')")
+    with tempfile.NamedTemporaryFile(suffix=".csv") as fp:
+        wines.to_csv(fp.name, index=False)
+        R(f"dataset <- read.csv('{fp.name}')")
+    R("""
+    dataset <- data.frame(lapply(dataset, factor))
+    lvl_lens <- sapply(dataset, nlevels)
+    offs <- c(0, cumsum(lvl_lens)[-length(lvl_lens)])
+    subsetcat <- c()
+    for (i in seq_along(dataset)) {
+      lv <- levels(dataset[[i]])
+      for (j in seq_along(lv)) {
+        if (lv[j] != 'No') subsetcat <- c(subsetcat, offs[i] + j)
+      }
+    }
+    mj <- mjca(dataset, lambda='adjusted', subsetcat=subsetcat, nd=4)
+    """)
+    r_lambda = np.array(R("mj$sv"))[: mca.eigenvalues_.shape[0]] ** 2
+    r_inertia_e = np.array(R("mj$inertia.e"))
+    r_inertia_t = float(np.array(R("mj$inertia.t"))[0])
+
+    np.testing.assert_allclose(mca.eigenvalues_[: len(r_lambda)], r_lambda, atol=1e-8)
+    np.testing.assert_allclose(
+        mca.percentage_of_variance_[: len(r_inertia_e)] / 100, r_inertia_e, atol=1e-8
+    )
+    np.testing.assert_allclose(
+        mca.percentage_of_variance_[: len(r_inertia_e)] / 100,
+        mca.eigenvalues_[: len(r_inertia_e)] / r_inertia_t,
+        atol=1e-10,
+    )
+
+
 def test_abdi_2007_correction():
     """
 

diff --git a/tests/test_mfa.py b/tests/test_mfa.py
@@ -253,7 +253,10 @@ def _prepare(self, sup_rows, sup_groups):
         if self.sup_rows:
             active = active.drop(index=self._sup_row_indices)
         supplementary_groups = [self._sup_group_name] if self.sup_groups else []
-        self.mfa = prince.MFA(n_components=self.n_components).fit(
+        # engine="scipy" forces a deterministic full SVD so the tight (atol=1e-4)
+        # comparisons against FactoMineR don't flake on randomized-SVD precision noise
+        # on the last component (see CI flake on PR #236).
+        self.mfa = prince.MFA(n_components=self.n_components, engine="scipy").fit(
             active,
             supplementary_groups=supplementary_groups,
         )