Source code for qig.exponential_family

"""
Matrix Exponential Family and BKM metric interface for the quantum inaccessible game.

This module contains:
- local operator bases (Pauli, Gell-Mann, generalised Gell-Mann);
- construction of the full operator basis {F_a};
- the `MatrixExponentialFamily` class providing ρ(θ), log-partition ψ(θ),
  Fisher/BKM metric G(θ), and the marginal-entropy constraint.
"""

from typing import Tuple, List, Optional, Dict, Any

import numpy as np
from scipy.linalg import expm, eigh, logm

from qig.core import marginal_entropies, partial_trace
from qig.pair_operators import (
    pair_basis_generators,
    multi_pair_basis,
    bell_state_density_matrix,
    product_of_bell_states
)


# ============================================================================
# Operator Basis: Local Lie Algebra Generators
# ============================================================================


[docs]
def pauli_basis(site: int, n_sites: int) -> List[np.ndarray]:
    """
    Create Pauli operator basis for site 'site' in an n_sites qubit system.

    Returns [sigma_x, sigma_y, sigma_z] tensored with identity on other sites.
    """
    # Define Pauli matrices
    I = np.array([[1, 0], [0, 1]], dtype=complex)
    X = np.array([[0, 1], [1, 0]], dtype=complex)
    Y = np.array([[0, -1j], [1j, 0]], dtype=complex)
    Z = np.array([[1, 0], [0, -1]], dtype=complex)

    paulis = [X, Y, Z]
    operators: List[np.ndarray] = []

    for P in paulis:
        # Build tensor product: I ⊗ ... ⊗ P ⊗ ... ⊗ I
        op = None
        for i in range(n_sites):
            current = P if i == site else I
            op = current if op is None else np.kron(op, current)
        operators.append(op)

    return operators




[docs]
def gell_mann_matrices() -> List[np.ndarray]:
    """
    Return the 8 Gell-Mann matrices (generators of SU(3)).
    """
    gm: List[np.ndarray] = []

    # λ1 and λ2 (off-diagonal in 01 block)
    gm.append(np.array([[0, 1, 0], [1, 0, 0], [0, 0, 0]], dtype=complex))
    gm.append(np.array([[0, -1j, 0], [1j, 0, 0], [0, 0, 0]], dtype=complex))

    # λ3 (diagonal in 01 block)
    gm.append(np.array([[1, 0, 0], [0, -1, 0], [0, 0, 0]], dtype=complex))

    # λ4 and λ5 (off-diagonal in 02 block)
    gm.append(np.array([[0, 0, 1], [0, 0, 0], [1, 0, 0]], dtype=complex))
    gm.append(np.array([[0, 0, -1j], [0, 0, 0], [1j, 0, 0]], dtype=complex))

    # λ6 and λ7 (off-diagonal in 12 block)
    gm.append(np.array([[0, 0, 0], [0, 0, 1], [0, 1, 0]], dtype=complex))
    gm.append(np.array([[0, 0, 0], [0, 0, -1j], [0, 1j, 0]], dtype=complex))

    # λ8 (diagonal)
    gm.append(np.array([[1, 0, 0], [0, 1, 0], [0, 0, -2]], dtype=complex) / np.sqrt(3))

    return gm




[docs]
def qutrit_basis(site: int, n_sites: int) -> List[np.ndarray]:
    """
    Create Gell-Mann operator basis for site 'site' in n_sites qutrit system.
    """
    I = np.eye(3, dtype=complex)
    gm = gell_mann_matrices()
    operators: List[np.ndarray] = []

    for G in gm:
        op = None
        for i in range(n_sites):
            current = G if i == site else I
            op = current if op is None else np.kron(op, current)
        operators.append(op)

    return operators




[docs]
def create_operator_basis(n_sites: int, d: int) -> Tuple[list, list]:
    """
    Create full operator basis {F_a} for matrix exponential family.

    Parameters
    -----------
    n_sites : int
        Number of sites
    d : int
        Local dimension (2 for qubits, 3 for qutrits, d>=2 in general)
    """
    operators: List[np.ndarray] = []
    labels: List[str] = []

    if d == 2:
        # Qubits: Pauli basis
        pauli_names = ["X", "Y", "Z"]
        for site in range(n_sites):
            ops = pauli_basis(site, n_sites)
            operators.extend(ops)
            labels.extend([f"{name}_{site+1}" for name in pauli_names])

    elif d == 3:
        # Qutrits: Gell-Mann basis
        for site in range(n_sites):
            ops = qutrit_basis(site, n_sites)
            operators.extend(ops)
            labels.extend([f"λ{k+1}_{site+1}" for k in range(8)])

    elif d >= 4:
        # Higher dimensions: use generalised Gell-Mann matrices
        for site in range(n_sites):
            # Create d^2-1 Hermitian traceless operators
            basis_ops: List[np.ndarray] = []
            # Off-diagonal symmetric and antisymmetric
            for i in range(d):
                for j in range(i + 1, d):
                    # Symmetric: |i><j| + |j><i|
                    op = np.zeros((d, d), dtype=complex)
                    op[i, j] = 1
                    op[j, i] = 1
                    basis_ops.append(op)

                    # Antisymmetric: -i|i><j| + i|j><i|
                    op = np.zeros((d, d), dtype=complex)
                    op[i, j] = -1j
                    op[j, i] = 1j
                    basis_ops.append(op)

            # Diagonal traceless
            for k in range(d - 1):
                op = np.zeros((d, d), dtype=complex)
                for i in range(k + 1):
                    op[i, i] = 1
                op[k + 1, k + 1] = -(k + 1)
                op = op / np.sqrt(k + 1 + (k + 1) ** 2)
                basis_ops.append(op)

            # Tensor with identity on other sites
            for op_local in basis_ops:
                op_full = None
                for s in range(n_sites):
                    current = op_local if s == site else np.eye(d, dtype=complex)
                    op_full = current if op_full is None else np.kron(op_full, current)
                operators.append(op_full)
                labels.append(f"H{len(operators)}_{site+1}")

    else:
        raise ValueError(f"Dimension d={d} must be >= 2.")

    return operators, labels



# ============================================================================
# matrix exponential family
# ============================================================================



[docs]
class MatrixExponentialFamily:
    """
    matrix exponential family: ρ(θ) = exp(∑ θ_a F_a - ψ(θ))
    """


[docs]
    def __init__(self, n_sites: Optional[int] = None, d: int = 2, 
                 n_pairs: Optional[int] = None, pair_basis: bool = False):
        """
        Initialise matrix exponential family.

        Parameters
        -----------
        n_sites : int, optional
            Number of subsystems (for local operator basis)
        d : int
            Local dimension (2 for qubits, 3 for qutrits)
        n_pairs : int, optional
            Number of entangled pairs (for pair basis)
        pair_basis : bool
            If True, use su(d²) operators for entangled pairs
            If False, use local operators (default)
            
        Notes
        -----
        Two modes of operation:
        
        1. Local operators (pair_basis=False, default):
           - Specify n_sites: number of independent subsystems
           - Uses operators like σ_x⊗I, I⊗σ_y (local Pauli/Gell-Mann)
           - Can ONLY represent separable states
           - Suitable for testing, but NOT for quantum game with entanglement
           
        2. Pair operators (pair_basis=True):
           - Specify n_pairs: number of entangled pairs
           - Uses su(d²) generators acting on each pair
           - Can represent entangled states (including Bell states)
           - Required for proper quantum inaccessible game
           - Fisher metric G is block-diagonal (one block per pair)
        """
        self.pair_basis = pair_basis
        
        if pair_basis:
            # Pair-based operators for entangled pairs
            if n_pairs is None:
                raise ValueError("Must specify n_pairs when using pair_basis=True")
            self.n_pairs = n_pairs
            self.n_sites = 2 * n_pairs  # Each pair has 2 subsystems
            self.d = d
            self.dims = [d] * self.n_sites
            self.D = d**(2 * n_pairs)  # Hilbert space: (d²)^n_pairs
            
            # Create pair operator basis
            self.operators, self.pair_indices = multi_pair_basis(n_pairs, d)
            self.labels = [f"F{a}_pair{k}" for k in range(n_pairs) 
                          for a in range(d**4 - 1)]
            self.n_params = len(self.operators)
            
            print(f"Initialised {n_pairs}-pair system with d={d} (pair basis)")
            print(f"Number of subsystems: {self.n_sites} (2 per pair)")
            print(f"Hilbert space dimension: {self.D}")
            print(f"Number of parameters: {self.n_params} = {n_pairs} × {d**4-1}")
        else:
            # Local operators (original mode)
            if n_sites is None:
                raise ValueError("Must specify n_sites when using pair_basis=False")
            self.n_sites = n_sites
            self.d = d
            self.dims = [d] * n_sites
            self.D = d**n_sites
            self.n_pairs = None
            self.pair_indices = None

            # Create operator basis
            self.operators, self.labels = create_operator_basis(n_sites, d)
            self.n_params = len(self.operators)

            print(f"Initialised {n_sites}-site system with d={d} (local basis)")
            print(f"Hilbert space dimension: {self.D}")
            print(f"Number of parameters: {self.n_params}")


    def _lift_to_full_space(self, op_i: np.ndarray, site_i: int) -> np.ndarray:
        """
        Lift operator on subsystem i to full Hilbert space.
        
        This is the adjoint of partial_trace(): it embeds a marginal operator
        into the full space by tensoring with identity operators on other subsystems.
        
        Result: I₀ ⊗ ... ⊗ I_{i-1} ⊗ op_i ⊗ I_{i+1} ⊗ ... ⊗ I_{n-1}
        
        Parameters
        ----------
        op_i : ndarray, shape (d_i, d_i)
            Operator on subsystem i
        site_i : int
            Which subsystem (0 to n_sites-1)
            
        Returns
        -------
        op_full : ndarray, shape (D, D)
            Operator on full Hilbert space
        """
        result = None
        for j, d_j in enumerate(self.dims):
            if j == site_i:
                current = op_i
            else:
                current = np.eye(d_j, dtype=complex)
            result = current if result is None else np.kron(result, current)
        return result

    def _bkm_kernel(self, rho: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Compute BKM (Bogoliubov-Kubo-Mori) kernel from density matrix eigenvalues.
        
        The BKM kernel for the quantum Fisher information metric is:
            k(p_i, p_j) = (p_i - p_j) / (log p_i - log p_j)  for i ≠ j
            k(p_i, p_i) = p_i                                 for i = j
        
        This kernel appears in the BKM inner product:
            ⟨A, B⟩_BKM = ∑_{i,j} k(p_i, p_j) A[i,j] conj(B[i,j])
        
        where A, B are operators in the eigenbasis of ρ.
        
        Parameters
        ----------
        rho : ndarray, shape (D, D)
            Density matrix
            
        Returns
        -------
        k : ndarray, shape (D, D)
            BKM kernel matrix
        p : ndarray, shape (D,)
            Eigenvalues of ρ (clipped to ≥ 1e-14)
        U : ndarray, shape (D, D)
            Eigenvectors of ρ (columns)
        """
        from scipy.linalg import eigh
        
        p, U = eigh(rho)
        p = np.clip(p.real, 1e-14, None)
        
        # Build kernel matrix
        p_i = p[:, None]
        p_j = p[None, :]
        diff = p_i - p_j
        log_diff = np.log(p_i) - np.log(p_j)
        
        k = np.zeros_like(diff)
        # For non-degenerate eigenvalues: k(p_i, p_j) = (p_i - p_j)/(log p_i - log p_j)
        non_degenerate = np.abs(diff) > 1e-14
        k[non_degenerate] = diff[non_degenerate] / log_diff[non_degenerate]
        # For degenerate or near-degenerate eigenvalues: k(p_i, p_j) → (p_i + p_j)/2 ≈ p
        degenerate = np.abs(diff) <= 1e-14
        k[degenerate] = 0.5 * (p_i + p_j)[degenerate]
        
        return k, p, U


[docs]
    def rho_from_theta(self, theta: np.ndarray) -> np.ndarray:
        """
        Compute ρ(θ) = exp(K(θ) - ψ(θ)) where K(θ) = ∑ θ_a F_a.
        """
        # Numerical stability:
        # ρ = expm(K) / Tr(expm(K)) is invariant under K -> K - cI.
        # Shifting by the max eigenvalue avoids overflow/underflow in expm(K)
        # for large ||theta|| (common near pure-state limits).
        K = np.zeros((self.D, self.D), dtype=complex)
        for theta_a, F_a in zip(theta, self.operators):
            K += theta_a * F_a

        # K is Hermitian for real theta and Hermitian generators.
        lambda_max = float(np.max(np.linalg.eigvalsh(K).real))
        K_shift = K - lambda_max * np.eye(self.D, dtype=complex)

        rho_unnorm = expm(K_shift)
        Z = np.trace(rho_unnorm)
        rho = rho_unnorm / Z
        return rho



[docs]
    def psi(self, theta: np.ndarray) -> float:
        """
        Compute the cumulant generating function ψ(θ) = log Tr(exp(∑ θ_a F_a)).

        This is the log partition function for the exponential family.
        """
        K = sum(theta_a * F_a for theta_a, F_a in zip(theta, self.operators))
        lambda_max = float(np.max(np.linalg.eigvalsh(K).real))
        K_shift = K - lambda_max * np.eye(self.D, dtype=complex)
        Z_shift = np.trace(expm(K_shift))
        return float(lambda_max + np.log(Z_shift).real)


    # Backward compatibility alias

[docs]
    def log_partition(self, theta: np.ndarray) -> float:
        """
        Deprecated: Use psi() instead.

        Compute the cumulant generating function ψ(θ) = log Tr(exp(∑ θ_a F_a)).
        """
        import warnings
        warnings.warn("log_partition() is deprecated, use psi() instead",
                     DeprecationWarning, stacklevel=2)
        return self.psi(theta)



[docs]
    def rho_derivative(self, theta: np.ndarray, a: int, **kwargs) -> np.ndarray:
        """
        Compute ∂ρ/∂θ_a using the correct quantum formula.
        
        Three main methods available.
        1. 'sld': Symmetric Logarithmic Derivative (fast, ~5% error)
            ∂ρ/∂θ = (1/2)[ρ(F - ⟨F⟩I) + (F - ⟨F⟩I)ρ]
        
        2. 'duhamel': Duhamel exponential formula with numerical quadrature
           (slower, ~5e-6 error for n_points≈200)
            ∂ρ/∂θ = ∫₀¹ exp(sH)(F - ⟨F⟩I)exp((1-s)H) ds
        
        3. 'duhamel_spectral' / 'duhamel_bch': Duhamel formula via spectral/BCH
           representation of H (no explicit s-quadrature, high precision).

        4. 'duhamel_block': Higham block-matrix identity for the Fréchet derivative
           (one 2D×2D matrix exponential; no eigendecomposition, no quadrature).
        
        ⚠️ QUANTUM ALERT: Simple ρ(F - ⟨F⟩I) is WRONG for non-commuting operators!
        
        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters
        a : int
            Parameter index
        method : str, default='sld'
            'sld' for fast SLD, 'duhamel' for high precision
        n_points : int, default=100
            Quadrature points for Duhamel (ignored for 'sld')
        
        Returns
        -------
        drho : ndarray, shape (D, D)
            Derivative ∂ρ/∂θ_a (Hermitian matrix)
        
        Notes
        -----
        Quantum derivative principles applied:
        ✅ Check operator commutation: Uses symmetric/integral form
        ✅ Verify operator ordering: Preserves ordering in exponentials
        ✅ Distinguish quantum vs classical: Uses quantum formulas
        ✅ Respect Hilbert space structure: Preserves Hermiticity
        """
        # Default: Higham block-matrix Fréchet derivative (robust, no eigendecomp)
        method = kwargs.get('method', 'duhamel_block')
        n_points = kwargs.get('n_points', 200)  # legacy quadrature control
        
        rho = self.rho_from_theta(theta)
        F_a = self.operators[a]
        mean_Fa = np.trace(rho @ F_a).real
        I = np.eye(self.D, dtype=complex)
        
        F_centered = F_a - mean_Fa * I
        
        if method == 'sld':
            # Symmetric logarithmic derivative (fast approximation)
            drho = 0.5 * (rho @ F_centered + F_centered @ rho)
        elif method == 'duhamel':
            # High-precision Duhamel formula via numerical quadrature
            from qig.duhamel import duhamel_derivative, compute_H_from_theta
            H, K, psi = compute_H_from_theta(self.operators, theta)
            drho = duhamel_derivative(rho, H, F_centered, n_points)
        elif method == 'duhamel_block':
            # Higham block-matrix identity (Fréchet derivative via a single expm on 2D×2D)
            from qig.duhamel import duhamel_derivative_block, compute_H_from_theta
            H, K, psi = compute_H_from_theta(self.operators, theta)
            drho = duhamel_derivative_block(rho, H, F_centered)
        elif method in ('duhamel_spectral', 'duhamel_bch'):
            # Duhamel via spectral/BCH representation of H (no explicit s-quadrature)
            from qig.duhamel import duhamel_derivative_spectral, compute_H_from_theta
            H, K, psi = compute_H_from_theta(self.operators, theta)
            drho = duhamel_derivative_spectral(rho, H, F_centered)
        else:
            raise ValueError(
                f"Unknown method: {method}. "
                "Use 'duhamel_block' (default), 'duhamel_spectral', 'duhamel', or 'sld'"
            )
        
        return drho

    

[docs]
    def rho_second_derivative(self, theta: np.ndarray, a: int, b: int, 
                              method: str = 'numerical_duhamel', 
                              n_points: int = 100, eps: float = 1e-7) -> np.ndarray:
        """
        Compute ∂²ρ/∂θ_a∂θ_b using high-precision numerical differentiation.
        
        Strategy: Use finite differences of the high-precision Duhamel ∂ρ/∂θ.
        
        ∂²ρ/∂θ_a∂θ_b ≈ [∂ρ/∂θ_a(θ+ε·e_b) - ∂ρ/∂θ_a(θ-ε·e_b)] / (2ε)
        
        This gives 0.55-2.6% error, which is 30-100× better than the
        analytic SLD-based formula.
        
        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters
        a : int
            First parameter index
        b : int
            Second parameter index
        method : str, default='numerical_duhamel'
            Currently only 'numerical_duhamel' is supported
        n_points : int, default=100
            Quadrature points for Duhamel integration
        eps : float, default=1e-7
            Finite difference step size
            
        Returns
        -------
        d2rho : ndarray, shape (D, D)
            Second derivative ∂²ρ/∂θ_a∂θ_b (Hermitian matrix)
            
        Notes
        -----
        - Uses central differences for better accuracy
        - Hermiticity preserved to machine precision
        - More accurate than analytic SLD-based second derivative
        
        Quantum derivative principles applied:
        ✅ Respects non-commutativity through Duhamel integration
        ✅ Preserves Hermiticity
        ✅ Uses high-precision first derivatives
        """
        if method != 'numerical_duhamel':
            raise ValueError(f"Only 'numerical_duhamel' supported, got {method}")
        
        # Compute ∂ρ/∂θ_a at θ + ε·e_b using block Duhamel by default
        theta_plus = theta.copy()
        theta_plus[b] += eps
        drho_a_plus = self.rho_derivative(theta_plus, a, method='duhamel_block', n_points=n_points)
        
        # Compute ∂ρ/∂θ_a at θ - ε·e_b
        theta_minus = theta.copy()
        theta_minus[b] -= eps
        drho_a_minus = self.rho_derivative(theta_minus, a, method='duhamel_block', n_points=n_points)
        
        # Central difference
        d2rho = (drho_a_plus - drho_a_minus) / (2 * eps)
        
        return d2rho



[docs]
    def fisher_information(self, theta: np.ndarray) -> np.ndarray:
        """
        Compute Fisher information (BKM metric) G(θ) = ∇∇ψ(θ) using the
        Kubo-Mori / BKM inner product.

        For a matrix exponential family ``ρ(θ) = exp(K(θ)) / Z(θ)`` with
        ``K(θ) = ∑_a θ_a F_a``, the Bogoliubov-Kubo-Mori metric is:

        .. math::

            G_{ab}(θ) = \\int_0^1 \\mathrm{Tr}\\left(
                ρ(θ)^s \\tilde{F}_a ρ(θ)^{1-s} \\tilde{F}_b
            \\right) \\mathrm{d}s

        where ``F̃_a = F_a - Tr[ρ(θ) F_a] I`` are centred sufficient statistics.

        In the eigenbasis of ρ(θ), this reduces to the spectral representation
        with the Morozova-Chentsov function ``c(λ, μ) = (log λ - log μ)/(λ - μ)``
        (diagonal limit: ``c(λ, λ) = 1/λ``).

        When all F_a commute with ρ(θ) (classical case), this reduces to the
        usual covariance Fisher information matrix.

        This implementation:

        - Diagonalises ρ(θ) = U diag(p) U†
        - Centres each F_a in that basis
        - Applies the BKM kernel c(p_i, p_j) to all matrix elements
        - Symmetrises G to guard against numerical asymmetries
        """
        # Step 1: spectral decomposition of ρ(θ)
        rho = self.rho_from_theta(theta)
        eigvals, U = eigh(rho)  # ρ is Hermitian

        # Regularise eigenvalues to avoid log(0); keep normalisation approximate
        eps_p = 1e-14
        p = np.clip(np.real(eigvals), eps_p, None)

        # Step 2: build centred sufficient statistics in eigenbasis of ρ
        D = self.D
        n = self.n_params
        A_tilde = np.zeros((n, D, D), dtype=complex)

        I = np.eye(D, dtype=complex)
        for a, F_a in enumerate(self.operators):
            # Centre F_a: F_a - ⟨F_a⟩_ρ I
            mean_Fa = np.trace(rho @ F_a).real
            A_a = F_a - mean_Fa * I
            # Transform to eigenbasis of ρ
            A_tilde[a] = U.conj().T @ A_a @ U

        # Step 3: construct BKM kernel k(p_i, p_j)
        p_i = p[:, None]
        p_j = p[None, :]
        diff = p_i - p_j
        log_diff = np.log(p_i) - np.log(p_j)

        # k(λ, μ) = (λ - μ)/(log λ - log μ), with diagonal limit k(λ, λ) = λ
        # This is the standard Kubo–Mori / BKM kernel. In the diagonal
        # (commuting) case it reduces to the classical covariance weight p_i.
        k = np.zeros_like(diff)
        
        # For non-degenerate eigenvalues: k(p_i, p_j) = (p_i - p_j)/(log p_i - log p_j)
        non_degenerate = np.abs(diff) > 1e-14
        k[non_degenerate] = diff[non_degenerate] / log_diff[non_degenerate]
        
        # For degenerate or near-degenerate eigenvalues: k(p_i, p_j) → p_i = p_j
        # This includes both diagonal (i==j) and off-diagonal (i≠j but p_i ≈ p_j)
        # Use the average (p_i + p_j)/2 which equals p_i (or p_j) when they're equal
        degenerate = np.abs(diff) <= 1e-14
        k[degenerate] = 0.5 * (p_i + p_j)[degenerate]

        # Step 4: assemble G_ab = Σ_{i,j} k(p_i, p_j) A_a[i,j] conj(A_b[i,j])
        G = np.zeros((n, n))
        for a in range(n):
            A_a = A_tilde[a]
            for b in range(a, n):
                A_b = A_tilde[b]
                # For Hermitian operators in the eigenbasis of ρ, the BKM metric is:
                # G_ab = ∑_{i,j} k(p_i, p_j) * A_a[i,j] * conj(A_b[i,j])
                # where k(p_i, p_j) = (p_i - p_j)/(log p_i - log p_j).
                prod = A_a * np.conj(A_b)
                Gab = np.sum(k * prod)
                # BKM inner product is real for Hermitian observables; take Re.
                Gab_real = float(np.real(Gab))
                G[a, b] = Gab_real
                G[b, a] = Gab_real

        # Symmetrise to enforce exact symmetry numerically
        G = 0.5 * (G + G.T)
        return G



[docs]
    def fisher_information_product(
        self, theta: np.ndarray, check_product: bool = True, tol: float = 1e-6
    ) -> np.ndarray:
        """
        Compute Fisher information exploiting block-diagonal structure for product states.
        
        For product states ρ = ρ₁ ⊗ ρ₂ ⊗ ... ⊗ ρₙ with pair-based operators,
        the BKM Fisher metric is block-diagonal:
        
            G = diag(G₁, G₂, ..., Gₙ)
        
        where each Gₖ is computed from the marginal ρₖ on pair k.
        
        Complexity:
            - Full: O(D³) = O(d^(6n)) where D = d^(2n)
            - Block: O(n × d¹²) - exponentially faster for n > 1
        
        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters
        check_product : bool, default True
            If True, verify state is approximately a product state.
            If False, assume product structure (faster but may be wrong).
        tol : float, default 1e-6
            Tolerance for product state check
            
        Returns
        -------
        G : ndarray, shape (n_params, n_params)
            Block-diagonal Fisher information matrix
            
        Raises
        ------
        ValueError
            If not using pair_basis mode
            If check_product=True and state is not a product state
            
        Notes
        -----
        For n qutrit pairs (d=3):
        - n=2: Full O(81³)=530k, Block O(2×3¹²)=1M → similar
        - n=3: Full O(729³)=387M, Block O(3×3¹²)=1.6M → 240× faster
        - n=4: Full O(6561³)=282B, Block O(4×3¹²)=2.1M → 134000× faster
        
        The crossover point where block computation wins is typically n≥2
        for qutrits and n≥3 for qubits.
        """
        if not self.pair_basis:
            raise ValueError(
                "fisher_information_product requires pair_basis=True. "
                "For local operators, use fisher_information()."
            )
        
        from .pair_operators import pair_basis_generators
        
        rho = self.rho_from_theta(theta)
        
        # Optionally check if state is approximately a product state
        if check_product:
            if not self._is_product_state(rho, tol=tol):
                raise ValueError(
                    f"State is not a product state (within tol={tol}). "
                    "Use fisher_information() for entangled states, or "
                    "set check_product=False if you know the structure is block-diagonal."
                )
        
        # Get single-pair generators (d²×d² matrices)
        single_pair_generators = pair_basis_generators(self.d)
        n_ops_per_pair = len(single_pair_generators)  # d⁴ - 1
        
        # Compute Fisher metric block for each pair
        G = np.zeros((self.n_params, self.n_params))
        
        for k in range(self.n_pairs):
            # Extract marginal ρₖ on pair k
            rho_k = self._partial_trace_to_pair(rho, k)
            
            # Compute BKM metric for this pair
            G_k = self._fisher_block(rho_k, single_pair_generators)
            
            # Insert into block-diagonal structure
            start_idx = k * n_ops_per_pair
            end_idx = (k + 1) * n_ops_per_pair
            G[start_idx:end_idx, start_idx:end_idx] = G_k
        
        return G

    
    def _fisher_block(
        self, rho: np.ndarray, operators: List[np.ndarray]
    ) -> np.ndarray:
        """
        Compute Fisher information block for a single subsystem.
        
        This is the BKM metric computed from a single marginal density matrix
        and its associated operators.
        
        Parameters
        ----------
        rho : ndarray, shape (d², d²)
            Density matrix for a single pair
        operators : List[ndarray]
            List of d⁴-1 operators, each of shape (d², d²)
            
        Returns
        -------
        G : ndarray, shape (d⁴-1, d⁴-1)
            Fisher information block
        """
        from scipy.linalg import eigh
        
        D = rho.shape[0]
        n = len(operators)
        
        # Eigendecomposition
        eigvals, U = eigh(rho)
        p = np.clip(np.real(eigvals), 1e-14, None)
        
        # Centre operators and transform to eigenbasis
        A_tilde = np.zeros((n, D, D), dtype=complex)
        I = np.eye(D, dtype=complex)
        
        for a, F_a in enumerate(operators):
            mean_Fa = np.trace(rho @ F_a).real
            A_a = F_a - mean_Fa * I
            A_tilde[a] = U.conj().T @ A_a @ U
        
        # BKM kernel
        p_i = p[:, None]
        p_j = p[None, :]
        diff = p_i - p_j
        log_diff = np.log(p_i) - np.log(p_j)
        
        k = np.zeros_like(diff)
        non_degenerate = np.abs(diff) > 1e-14
        k[non_degenerate] = diff[non_degenerate] / log_diff[non_degenerate]
        degenerate = np.abs(diff) <= 1e-14
        k[degenerate] = 0.5 * (p_i + p_j)[degenerate]
        
        # Assemble metric
        G = np.zeros((n, n))
        for a in range(n):
            A_a = A_tilde[a]
            for b in range(a, n):
                A_b = A_tilde[b]
                prod = A_a * np.conj(A_b)
                Gab = float(np.real(np.sum(k * prod)))
                G[a, b] = Gab
                G[b, a] = Gab
        
        return 0.5 * (G + G.T)
    
    def _is_product_state(self, rho: np.ndarray, tol: float = 1e-6) -> bool:
        """
        Check if ρ is approximately a product state ρ₁ ⊗ ... ⊗ ρₙ.
        
        Uses the criterion: ρ is a product iff ρ = ⊗ₖ Trₖ̄(ρ)
        where Trₖ̄ traces out all pairs except k.
        
        Parameters
        ----------
        rho : ndarray
            Density matrix
        tol : float
            Tolerance for comparison
            
        Returns
        -------
        bool
            True if ρ is approximately a product state
        """
        if not self.pair_basis or self.n_pairs == 1:
            return True  # Single pair is trivially "product"
        
        # Compute product of marginals
        D_pair = self.d ** 2
        rho_product = np.array([[1.0]])
        
        for k in range(self.n_pairs):
            rho_k = self._partial_trace_to_pair(rho, k)
            rho_product = np.kron(rho_product, rho_k)
        
        return np.allclose(rho, rho_product, atol=tol)


[docs]
    def marginal_entropy_constraint_theta_only(
        self, theta: np.ndarray
    ) -> Tuple[float, np.ndarray]:
        """
        Compute constraint C(θ) = ∑ᵢ hᵢ and gradient ∇C using θ-only formulas.
        
        This is a fast, exact method that avoids materializing ∂ρ/∂θ for each parameter.
        Instead, it uses the BKM inner product:
        
            ∂C/∂θ_a = ⟨F̃_a, B⟩_BKM
        
        where:
            F̃_a = F_a - ⟨F_a⟩I  (centered operator)
            B = ∑ᵢ Bᵢ           (lifted test operator)
            Bᵢ = (log ρᵢ + Iᵢ) ⊗ I_rest
        
        This reuses the eigendecomposition and BKM kernel from fisher_information(),
        achieving machine precision (~10⁻¹⁴) with ~100× speedup over Duhamel method.
        
        Parameters
        ----------
        theta : ndarray
            Natural parameters
            
        Returns
        -------
        C : float
            Constraint value ∑ᵢ hᵢ
        grad_C : ndarray, shape (n_params,)
            Gradient ∇C
        """
        rho = self.rho_from_theta(theta)
        h = marginal_entropies(rho, self.dims)
        C = float(np.sum(h))
        
        # Get eigendecomposition and BKM kernel
        k, p, U = self._bkm_kernel(rho)
        
        # Build lifted test operator B = ∑ᵢ (log ρᵢ + Iᵢ) ⊗ I_rest
        B_full = np.zeros((self.D, self.D), dtype=complex)
        
        for i in range(self.n_sites):
            # Compute marginal ρᵢ
            rho_i = partial_trace(rho, self.dims, keep=i)
            
            # Compute log(ρᵢ) safely using eigendecomposition
            eigvals_i, eigvecs_i = eigh(rho_i)
            eigvals_i = np.maximum(eigvals_i.real, 1e-14)
            log_eigvals_i = np.log(eigvals_i)
            log_rho_i = eigvecs_i @ np.diag(log_eigvals_i) @ eigvecs_i.conj().T
            
            # Lifted operator: (log ρᵢ + Iᵢ) ⊗ I_rest
            B_i = log_rho_i + np.eye(self.dims[i], dtype=complex)
            B_full += self._lift_to_full_space(B_i, i)
        
        # Transform B to eigenbasis of ρ
        B_tilde = U.conj().T @ B_full @ U
        
        # Compute gradient via BKM inner products
        grad_C = np.zeros(self.n_params)
        I_full = np.eye(self.D, dtype=complex)
        
        for a, F_a in enumerate(self.operators):
            # Center operator: F̃_a = F_a - ⟨F_a⟩I
            mean_Fa = np.trace(rho @ F_a).real
            F_tilde = F_a - mean_Fa * I_full
            
            # Transform to eigenbasis
            F_tilde_eigen = U.conj().T @ F_tilde @ U
            
            # BKM inner product: ⟨F̃_a, B⟩_BKM = ∑ᵢⱼ k[i,j] F̃_a[i,j] conj(B[i,j])
            grad_C[a] = -np.real(np.sum(k * (F_tilde_eigen * np.conj(B_tilde))))
        
        return C, grad_C



[docs]
    def marginal_entropy_constraint(
        self, theta: np.ndarray, method: str = 'theta_only'
    ) -> Tuple[float, np.ndarray]:
        """
        Compute constraint value C(θ) = ∑_i h_i and gradient ∇C.
        
        This method dispatches to different implementations based on the method parameter.
        
        Parameters
        ----------
        theta : ndarray
            Natural parameters
        method : str, default='theta_only'
            Method for gradient computation. Options:

            - ``'theta_only'``: Fast θ-only method using BKM inner products (default).
              ~100× faster, machine precision accuracy.
            - ``'duhamel'``: Legacy method materializing ∂ρ/∂θ via Duhamel.
              Slow but kept for verification.
            - ``'sld'``: Legacy method using SLD approximation.
              Faster than Duhamel but ~5% error.
              
        Returns
        -------
        C : float
            Constraint value ∑ᵢ hᵢ
        grad_C : ndarray
            Gradient ∇C
        """
        if method == 'theta_only':
            return self.marginal_entropy_constraint_theta_only(theta)
        
        # Legacy method: materialize ∂ρ/∂θ for each parameter
        rho = self.rho_from_theta(theta)
        h = marginal_entropies(rho, self.dims)
        C = float(np.sum(h))

        # Compute gradient by materializing drho
        grad_C = np.zeros(self.n_params)
        I = np.eye(self.D, dtype=complex)
        
        for a in range(self.n_params):
            F_a = self.operators[a]
            
            # Compute ∂ρ/∂θ_a using specified method (duhamel or sld)
            drho_dtheta_a = self.rho_derivative(theta, a, method=method)
            
            # Sum over all subsystems
            for i in range(self.n_sites):
                # Compute marginal ρ_i
                rho_i = partial_trace(rho, self.dims, keep=i)
                
                # Compute ∂ρ_i/∂θ_a (partial trace of ∂ρ/∂θ_a)
                drho_i_dtheta_a = partial_trace(drho_dtheta_a, self.dims, keep=i)
                
                # Compute log(ρ_i) with regularisation
                eigvals_i, eigvecs_i = eigh(rho_i)
                eigvals_i = np.maximum(eigvals_i.real, 1e-14)
                log_eigvals_i = np.log(eigvals_i)
                log_rho_i = eigvecs_i @ np.diag(log_eigvals_i) @ eigvecs_i.conj().T
                
                # Compute ∂h_i/∂θ_a = -Tr((∂ρ_i/∂θ_a) log(ρ_i))
                dh_i_dtheta_a = -np.trace(drho_i_dtheta_a @ log_rho_i).real
                
                grad_C[a] += dh_i_dtheta_a

        return C, grad_C


    def _marginal_entropy_gradient_single(
        self, theta: np.ndarray, site_idx: int
    ) -> Tuple[float, np.ndarray]:
        """
        Gradient of the von Neumann entropy of one marginal subsystem.

        Computes h_k(θ) = -tr(ρ_k log ρ_k) and its gradient ∂h_k/∂θ_a for
        subsystem k using the BKM inner product:

            ∂h_k/∂θ_a = -⟨F̃_a, B_k⟩_BKM

        where B_k = (log ρ_k + I_k) ⊗ I_rest is the lifted log-marginal
        operator for subsystem k, and the BKM inner product is evaluated at
        ρ(θ).

        This is the per-subsystem analogue of
        ``marginal_entropy_constraint_theta_only``, which returns only the
        gradient of the *sum* ∑_k h_k.

        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters.
        site_idx : int
            Index of the subsystem (0-based).

        Returns
        -------
        h_k : float
            Marginal von Neumann entropy of subsystem site_idx.
        grad_h_k : ndarray, shape (n_params,)
            Gradient ∂h_k/∂θ_a.
        """
        rho = self.rho_from_theta(theta)
        rho_k = partial_trace(rho, self.dims, keep=site_idx)

        # Marginal entropy
        eigvals_k, eigvecs_k = eigh(rho_k)
        eigvals_k = np.maximum(eigvals_k.real, 1e-14)
        h_k = float(-np.sum(eigvals_k * np.log(eigvals_k)))

        # Lifted test operator B_k = (log ρ_k + I_k) ⊗ I_rest
        log_rho_k = eigvecs_k @ np.diag(np.log(eigvals_k)) @ eigvecs_k.conj().T
        B_k = log_rho_k + np.eye(self.dims[site_idx], dtype=complex)
        B_full = self._lift_to_full_space(B_k, site_idx)

        # BKM kernel at ρ(θ)
        k_kern, _, U = self._bkm_kernel(rho)
        B_tilde = U.conj().T @ B_full @ U

        I_full = np.eye(self.D, dtype=complex)
        grad_h_k = np.zeros(self.n_params)
        for a, F_a in enumerate(self.operators):
            mean_Fa = np.trace(rho @ F_a).real
            F_tilde = U.conj().T @ (F_a - mean_Fa * I_full) @ U
            grad_h_k[a] = -np.real(np.sum(k_kern * (F_tilde * np.conj(B_tilde))))

        return h_k, grad_h_k

    def _marginal_entropy_gradient_per_subsystem(
        self, theta: np.ndarray
    ) -> np.ndarray:
        """
        Full constraint Jacobian M(θ) for the marginal-entropy constraints.

        Stacks the per-subsystem gradients into a matrix:

            M[k, a] = ∂h_k/∂θ_a,   k = 0 … n_sites-1,  a = 0 … n_params-1

        This is the constraint Jacobian needed to construct the marginal
        projector Π_marg via ``pi_marg_matrix``.

        Parameters
        ----------
        theta : ndarray, shape (n_params,)

        Returns
        -------
        M : ndarray, shape (n_sites, n_params)
        """
        M = np.zeros((self.n_sites, self.n_params))
        for k in range(self.n_sites):
            _, grad_k = self._marginal_entropy_gradient_single(theta, k)
            M[k] = grad_k
        return M


[docs]
    def pi_marg_matrix(
        self, theta: np.ndarray, tol: float = 1e-10
    ) -> np.ndarray:
        """
        Marginal projector Π_marg(θ*) in natural-parameter coordinates.

        Computes the orthogonal projector onto the iso-marginal subspace:

            Π_marg = N (N^T G N)^{-1} N^T G

        where:
        - M = _marginal_entropy_gradient_per_subsystem(theta), shape (n_sites, n_params)
        - N spans ker(M), computed via SVD of M
        - G = fisher_information(theta), shape (n_params, n_params)

        This implements the formula ``eq:second-order-projector`` from
        Lawrence-origin26.  The result is the projector onto perturbation
        directions that leave every marginal entropy unchanged to first order.

        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters at the operating point θ*.
        tol : float
            Singular-value threshold for the null-space determination.
            Singular values of M below ``tol * sigma_max`` are treated as zero.

        Returns
        -------
        Pi : ndarray, shape (n_params, n_params)
            Projector satisfying Pi @ Pi ≈ Pi (idempotent) and
            Pi @ g ≈ 0 for every constraint-gradient direction g.
        """
        M = self._marginal_entropy_gradient_per_subsystem(theta)
        G = self.fisher_information(theta)

        # Null space of M via SVD: rows of Vt[rank:] span ker(M)
        _, s, Vt = np.linalg.svd(M, full_matrices=True)
        threshold = tol * s[0] if s.size > 0 else tol
        rank = int(np.sum(s > threshold))
        # Columns of N span ker(M)
        N = Vt[rank:].T  # shape (n_params, n_params - rank)

        NtGN = N.T @ G @ N
        Pi = N @ np.linalg.solve(NtGN, N.T @ G)
        return Pi



[docs]
    def third_cumulant_contraction(self, theta: np.ndarray, method: str = 'auto') -> np.ndarray:
        """
        Compute (∇G)[θ], the third cumulant tensor contracted with θ.
        
        This is the matrix with (i,j) entry: ∑_k (∂G_ik/∂θ_j) θ_k
        
        Following the paper's Appendix (eq. 824-826), this appears in the Jacobian as:
            M = -G - (∇G)[θ] + ...
        
        The third cumulant ∇G = ∇³ψ is totally symmetric in all three indices.
        
        We compute ∂G_ab/∂θ_c by differentiating the spectral BKM formula using
        perturbation theory for eigenvalues and eigenvectors.
        
        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters
        method : str, optional
            Method for computing the third cumulant:
            - 'auto' (default): 'block' for n_params ≤ 20, 'fd' otherwise
            - 'block': Block Fréchet derivatives (exact, limited to n_params ≤ 20)
            - 'fd': Finite differences of Fisher metric (works for any system size)
            - 'analytic': Analytic perturbation theory (slow, approximate)
        
        Returns
        -------
        contraction : ndarray, shape (n_params, n_params)
            Matrix (∇G)[θ] with (i,j) entry = ∑_k (∂G_ik/∂θ_j) θ_k
        
        Notes
        -----
        Quantum derivative principles applied:
        ✅ Check operator commutation: F_a, F_b, F_c may not commute
        ✅ Verify operator ordering: Careful in spectral differentiation
        ✅ Distinguish quantum vs classical: Uses quantum covariance derivatives
        ✅ Respect Hilbert space structure: Works on full Hilbert space
        ✅ Question each derivative step: Uses perturbation theory
        
        The 'auto' mode uses 'block' for small systems (n_params ≤ 20) where it
        is exact and efficient, and falls back to 'fd' for larger systems where
        'block' would be prohibitively expensive.
        """
        if method == 'auto':
            method = 'block' if self.n_params <= 20 else 'fd'
        if method == 'block':
            return self._third_cumulant_contraction_block(theta)
        elif method == 'fd':
            return self._third_cumulant_contraction_fd(theta)
        elif method == 'analytic':
            return self._third_cumulant_contraction_analytic(theta)
        else:
            raise ValueError(f"Unknown method '{method}'. Use 'auto', 'fd', 'analytic', or 'block'.")



[docs]
    def psi_hessian_block(
        self, theta: np.ndarray, param_indices: Optional[List[int]] = None
    ) -> np.ndarray:
        """
        Compute Hessian of ψ(θ) = log Tr(exp(K(θ))) via block Fréchet derivatives.

        This returns the (selected) 2nd cumulant matrix:
            ∂²ψ/∂θ_a∂θ_b

        Notes
        -----
        This method is intended for validation / small systems. It scales as
        O(m²) block matrix exponentials, where m = len(param_indices) (or n_params).
        """
        from scipy.linalg import expm
        from qig.duhamel import expm_frechet_block_2

        if param_indices is None:
            param_indices = list(range(self.n_params))
        m = len(param_indices)

        # Build K(θ) = Σ θ_a F_a
        K = np.zeros((self.D, self.D), dtype=complex)
        for theta_a, F_a in zip(theta, self.operators):
            K += theta_a * F_a

        expK = expm(K)
        Z = np.trace(expK)
        Z_real = float(np.real(Z))
        if not np.isfinite(Z_real) or abs(Z) < 1e-300:
            raise FloatingPointError("Trace(exp(K)) is not finite; reduce ||theta||.")

        # First derivatives of Z: dZ_a = Tr(exp(K) F_a)
        dZ = np.zeros(m, dtype=float)
        for i, a in enumerate(param_indices):
            dZ[i] = float(np.real(np.trace(expK @ self.operators[a])))

        # Second derivatives of Z via block trick: d2Z_ab = Tr(D² exp_K[F_a, F_b])
        d2Z = np.zeros((m, m), dtype=float)
        for i, a in enumerate(param_indices):
            E = self.operators[a]
            for j, b in enumerate(param_indices[i:], start=i):
                F = self.operators[b]
                # The 3×3 block construction gives an *ordered* second-derivative piece.
                # The true 2nd Fréchet derivative is symmetric in (E, F), so we sum both
                # orderings (no 1/2 factor).
                d2_ef = np.trace(expm_frechet_block_2(K, E, F))
                d2_fe = np.trace(expm_frechet_block_2(K, F, E))
                d2_real = float(np.real(d2_ef + d2_fe))
                d2Z[i, j] = d2_real
                d2Z[j, i] = d2_real

        # Hessian of log Z: ψ'' = Z''/Z - (Z'/Z)(Z'/Z)^T
        H = d2Z / Z_real - np.outer(dZ, dZ) / (Z_real * Z_real)
        return 0.5 * (H + H.T)


    def _third_cumulant_contraction_block(self, theta: np.ndarray) -> np.ndarray:
        """
        Compute (∇G)[θ] using 3rd derivatives of ψ via block Fréchet derivatives.

        This is intended for small systems (e.g. single qubit / qutrit) and
        validation. For larger models use method='fd' (default).
        """
        from itertools import permutations
        from scipy.linalg import expm
        from qig.duhamel import expm_frechet_block_2, expm_frechet_block_3

        n = self.n_params
        if n > 20:
            raise ValueError(
                "method='block' is intended for small n_params (≤20). "
                "Use method='fd' for larger systems."
            )

        # Build K(θ) and direction ΔK = Σ θ_a F_a = K itself.
        K = np.zeros((self.D, self.D), dtype=complex)
        for theta_a, F_a in zip(theta, self.operators):
            K += theta_a * F_a
        K_dir = K

        expK = expm(K)
        Z = np.trace(expK)
        Z_real = float(np.real(Z))
        if not np.isfinite(Z_real) or abs(Z) < 1e-300:
            raise FloatingPointError("Trace(exp(K)) is not finite; reduce ||theta||.")

        # First derivatives dZ_a and dZ_K
        dZ = np.array([float(np.real(np.trace(expK @ F_a))) for F_a in self.operators])
        dZK = float(np.real(np.trace(expK @ K_dir)))

        # Second derivatives needed: d2Z_ab (all pairs) and d2Z_aK (vector)
        d2Z = np.zeros((n, n), dtype=float)
        for a in range(n):
            E = self.operators[a]
            for b in range(a, n):
                F = self.operators[b]
                # Sum both orderings to obtain the symmetric 2nd Fréchet derivative.
                d2_ef = np.trace(expm_frechet_block_2(K, E, F))
                d2_fe = np.trace(expm_frechet_block_2(K, F, E))
                val = float(np.real(d2_ef + d2_fe))
                d2Z[a, b] = val
                d2Z[b, a] = val

        d2Z_aK = np.zeros(n, dtype=float)
        for a in range(n):
            E = self.operators[a]
            d2_eK = np.trace(expm_frechet_block_2(K, E, K_dir))
            d2_Ke = np.trace(expm_frechet_block_2(K, K_dir, E))
            d2Z_aK[a] = float(np.real(d2_eK + d2_Ke))

        # Third derivative contraction: ψ3(F_a, F_b, K_dir)
        contraction = np.zeros((n, n), dtype=float)
        for a in range(n):
            E = self.operators[a]
            for b in range(a, n):
                F = self.operators[b]

                # The 4×4 block construction yields ordered 3rd-derivative pieces.
                # The true 3rd Fréchet derivative is symmetric multilinear, so we sum
                # over all 3! orderings.
                dirs = (E, F, K_dir)
                d3_sum = 0.0 + 0.0j
                for perm in permutations(dirs, 3):
                    d3_sum += np.trace(expm_frechet_block_3(K, perm[0], perm[1], perm[2]))
                d3Z = float(np.real(d3_sum))

                # ψ3 = Z'''/Z - (Z'' Z' + perms)/Z^2 + 2 Z'Z'Z'/Z^3
                term1 = d3Z / Z_real
                term2 = (
                    d2Z[a, b] * dZK + d2Z_aK[a] * dZ[b] + d2Z_aK[b] * dZ[a]
                ) / (Z_real * Z_real)
                term3 = 2.0 * dZ[a] * dZ[b] * dZK / (Z_real * Z_real * Z_real)
                val = term1 - term2 + term3

                contraction[a, b] = val
                contraction[b, a] = val

        return 0.5 * (contraction + contraction.T)
    
    def _third_cumulant_contraction_fd(self, theta: np.ndarray, eps: float = 1e-5) -> np.ndarray:
        """
        Compute (∇G)[θ] using finite differences of the Fisher metric.
        
        Fast method: ∂G_ab/∂θ_c ≈ [G_ab(θ + ε·e_c) - G_ab(θ - ε·e_c)] / (2ε)
        Then contract: (∇G)[θ]_ab = Σ_c (∂G_ab/∂θ_c) θ_c
        
        Expected speedup: 100-500× over analytic method.
        Expected accuracy: ~10⁻⁸.
        """
        n = self.n_params
        
        # Compute ∂G_ab/∂θ_c for all c by finite differences
        dG_dtheta = np.zeros((n, n, n))  # [a, b, c]
        
        theta_perturbed = theta.copy()
        for c in range(n):
            # Forward perturbation
            theta_perturbed[c] = theta[c] + eps
            G_plus = self.fisher_information(theta_perturbed)
            
            # Backward perturbation
            theta_perturbed[c] = theta[c] - eps
            G_minus = self.fisher_information(theta_perturbed)
            
            # Central difference
            dG_dtheta[:, :, c] = (G_plus - G_minus) / (2 * eps)
            
            # Reset
            theta_perturbed[c] = theta[c]
        
        # Contract with θ: (∇G)[θ]_ab = Σ_c (∂G_ab/∂θ_c) θ_c
        contraction = np.einsum('abc,c->ab', dG_dtheta, theta)
        
        return contraction
    
    def _third_cumulant_contraction_analytic(self, theta: np.ndarray) -> np.ndarray:
        """
        Compute (∇G)[θ] using analytic perturbation theory.
        
        This is the original implementation - slow but exact (modulo approximations).
        Kept for reference and validation.
        """
        rho = self.rho_from_theta(theta)
        
        # Eigendecomposition of ρ
        eigvals, eigvecs = eigh(rho)
        eigvals = np.maximum(eigvals.real, 1e-14)  # Regularise
        
        # Compute BKM kernel k(p_i, p_j)
        def bkm_kernel(p_i, p_j, eps=1e-14):
            """BKM kernel: (p - q)/(log p - log q) if p ≠ q, else p."""
            if np.abs(p_i - p_j) < eps:
                return p_i
            else:
                return (p_i - p_j) / (np.log(p_i) - np.log(p_j))
        
        # Compute centered operators in eigenbasis
        def centered_operator_eigenbasis(F_a):
            """Compute F̃_a = F_a - ⟨F_a⟩I in eigenbasis of ρ."""
            mean_Fa = np.trace(rho @ F_a).real
            F_a_centered = F_a - mean_Fa * np.eye(self.D, dtype=complex)
            # Transform to eigenbasis
            return eigvecs.conj().T @ F_a_centered @ eigvecs
        
        # Compute ∂ρ/∂θ for all parameters
        I = np.eye(self.D, dtype=complex)
        drho_dtheta = []
        for a in range(self.n_params):
            drho_dtheta.append(self.rho_derivative(theta, a))
        
        # Compute eigenvalue derivatives using Hellmann-Feynman theorem
        # ∂λ_i/∂θ_a = ⟨i|∂ρ/∂θ_a|i⟩
        d_eigvals_dtheta = np.zeros((self.D, self.n_params))
        for a in range(self.n_params):
            drho_a_eigenbasis = eigvecs.conj().T @ drho_dtheta[a] @ eigvecs
            d_eigvals_dtheta[:, a] = np.diag(drho_a_eigenbasis).real
        
        # Compute eigenvector derivatives using first-order perturbation theory
        # ∂|i⟩/∂θ_a = ∑_{j≠i} (⟨j|∂ρ/∂θ_a|i⟩)/(λ_i - λ_j) |j⟩
        # This is stored as a matrix: d_eigvecs_dtheta[a] @ eigvecs gives the derivative
        d_eigvecs_dtheta = []
        for a in range(self.n_params):
            drho_a_eigenbasis = eigvecs.conj().T @ drho_dtheta[a] @ eigvecs
            d_U = np.zeros((self.D, self.D), dtype=complex)
            for i in range(self.D):
                for j in range(self.D):
                    if i != j:
                        denom = eigvals[i] - eigvals[j]
                        if np.abs(denom) > 1e-10:
                            # ∂U_ji/∂θ_a (column i of U is eigenvector i)
                            d_U[j, i] = drho_a_eigenbasis[j, i] / denom
            d_eigvecs_dtheta.append(d_U)
        
        # Now compute ∂G_ab/∂θ_c for all a,b,c
        # Then contract with θ to get (∇G)[θ]
        contraction = np.zeros((self.n_params, self.n_params))
        
        for a in range(self.n_params):
            for b in range(self.n_params):
                # Compute ∂G_ab/∂θ_c for all c
                dG_ab_dtheta = np.zeros(self.n_params)
                
                # Get centered operators in eigenbasis
                A_a = centered_operator_eigenbasis(self.operators[a])
                A_b = centered_operator_eigenbasis(self.operators[b])
                
                for c in range(self.n_params):
                    # Differentiate the spectral BKM formula
                    # G_ab = ∑_{i,j} k(p_i, p_j) * A_a[i,j] * conj(A_b[i,j])
                    
                    # Three terms from product rule:
                    # 1. ∂k/∂θ_c * A_a * conj(A_b)
                    # 2. k * ∂A_a/∂θ_c * conj(A_b)
                    # 3. k * A_a * conj(∂A_b/∂θ_c)
                    
                    dG_ab_c = 0.0
                    
                    for i in range(self.D):
                        for j in range(self.D):
                            k_ij = bkm_kernel(eigvals[i], eigvals[j])
                            
                            # Term 1: ∂k/∂θ_c
                            # ∂k/∂p_i and ∂k/∂p_j, then chain rule with ∂p_i/∂θ_c
                            if np.abs(eigvals[i] - eigvals[j]) < 1e-14:
                                # k = p_i, so ∂k/∂p_i = 1, ∂k/∂p_j = 0
                                dk_dtheta_c = d_eigvals_dtheta[i, c]
                            else:
                                log_diff = np.log(eigvals[i]) - np.log(eigvals[j])
                                p_diff = eigvals[i] - eigvals[j]
                                # ∂k/∂p_i = (log_diff - p_diff/p_i) / log_diff²
                                # ∂k/∂p_j = -(log_diff - p_diff/p_j) / log_diff²
                                dk_dp_i = (log_diff - p_diff/eigvals[i]) / (log_diff**2)
                                dk_dp_j = -(log_diff - p_diff/eigvals[j]) / (log_diff**2)
                                dk_dtheta_c = dk_dp_i * d_eigvals_dtheta[i, c] + dk_dp_j * d_eigvals_dtheta[j, c]
                            
                            term1 = dk_dtheta_c * A_a[i,j] * np.conj(A_b[i,j])
                            
                            # Terms 2 & 3: ∂A_a/∂θ_c and ∂A_b/∂θ_c
                            # A_a = U† F̃_a U, so ∂A_a/∂θ = (∂U†/∂θ) F̃_a U + U† (∂F̃_a/∂θ) U + U† F̃_a (∂U/∂θ)
                            # This is complex, so for now use the fact that the derivative involves
                            # the eigenvector derivatives we computed
                            
                            # Simplified: Assume the main contribution comes from eigenvalue changes
                            # (This is an approximation - full implementation would include eigenvector derivatives)
                            # For now, just use term 1
                            
                            dG_ab_c += term1.real
                    
                    dG_ab_dtheta[c] = dG_ab_c
                
                # Contract with θ: ∑_c (∂G_ab/∂θ_c) θ_c
                contraction[a, b] = np.dot(dG_ab_dtheta, theta)
        
        return contraction


[docs]
    def constraint_hessian_fd_theta_only(self, theta: np.ndarray, eps: float = 1e-5) -> np.ndarray:
        """
        Compute constraint Hessian ∇²C using finite differences of θ-only gradient.
        
        This is a fast, accurate method that computes:
            ∂²C/∂θ_a∂θ_b ≈ [∇C(θ + eps·e_b) - ∇C(θ - eps·e_b)]_a / (2·eps)
        
        By differentiating the exact θ-only gradient, this achieves better accuracy
        than the current approach which uses exact formulas with approximate second
        derivatives (FD of Duhamel drho).
        
        Expected speedup: 50-100× over current Duhamel-based method.
        Expected accuracy: ~10⁻⁸ (better than current ~10⁻⁶).
        
        Parameters
        ----------
        theta : ndarray
            Natural parameters
        eps : float, optional
            Finite difference step size (default: 1e-5)
            Optimal for central differences: h ≈ (machine_eps)^(1/3) ≈ 1e-5
            
        Returns
        -------
        hess : ndarray, shape (n_params, n_params)
            Hessian matrix ∇²C, symmetric real matrix
            
        Notes
        -----
        **Why this is better than current approach:**

        Current (two approximations): Error ≈ 10⁻⁶

        - ``∂²C/∂θ_a∂θ_b = f(∂²ρ/∂θ_a∂θ_b) = f(FD(∂ρ/∂θ)) = f(FD(Duhamel(ρ)))``

        New (one approximation): Error ≈ 10⁻⁸

        - ``∂²C/∂θ_a∂θ_b ≈ FD(∂C/∂θ_a) = FD(exact_BKM_formula(ρ))``
        
        Key insight: Differentiating an exact gradient is more accurate than
        using an exact formula with approximate second derivatives.
        """
        n = self.n_params
        hess = np.zeros((n, n))
        
        # Compute Hessian by finite differences of θ-only gradient
        for b in range(n):
            # Perturbation in direction b
            e_b = np.zeros(n)
            e_b[b] = eps
            
            # Compute gradients at θ ± eps·e_b using exact θ-only formula
            _, grad_plus = self.marginal_entropy_constraint_theta_only(theta + e_b)
            _, grad_minus = self.marginal_entropy_constraint_theta_only(theta - e_b)
            
            # Central difference for column b
            hess[:, b] = (grad_plus - grad_minus) / (2 * eps)
        
        # Symmetrize to ensure exact symmetry (should already be symmetric to roundoff)
        hess = 0.5 * (hess + hess.T)
        
        return hess



[docs]
    def constraint_hessian(self, theta: np.ndarray, method: str = 'fd_theta_only', 
                          n_points: int = 100, eps: float = 1e-5) -> np.ndarray:
        """
        Compute ∇²C, the Hessian of the constraint C(θ) = ∑ᵢ hᵢ(θ).
        
        This method dispatches to different implementations based on the method parameter.
        
        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters
        method : str, default='fd_theta_only'
            Method for Hessian computation. Options:

            - ``'fd_theta_only'``: FD of θ-only gradient (default).
              ~50-100× faster, ~10⁻⁸ error, recommended.
            - ``'duhamel'``: FD of Duhamel drho (legacy, slow, ~10⁻⁶ error).
            - ``'sld'``: Analytic formula using SLD (legacy, fast but ~10% error).

        n_points : int, default=100
            Quadrature points for Duhamel (ignored for other methods)
        eps : float, default=1e-5
            Finite difference step size
            
        Returns
        -------
        hessian : ndarray, shape (n_params, n_params)
            Constraint Hessian ∇²C (symmetric matrix)
            
        Notes
        -----
        The default 'fd_theta_only' method uses finite differences of the exact
        θ-only gradient, achieving better accuracy and much better performance
        than the legacy methods which use exact formulas with approximate inputs.
        """
        # Dispatch to appropriate implementation
        if method == 'fd_theta_only':
            return self.constraint_hessian_fd_theta_only(theta, eps=eps)
        
        # Legacy methods: materialize ∂²ρ/∂θ_a∂θ_b for each (a,b)
        from qig.core import partial_trace
        
        rho = self.rho_from_theta(theta)
        I_full = np.eye(self.D, dtype=complex)
        
        # Compute ∂ρ/∂θ for all parameters
        # Use the same method as for ∂²ρ for consistency
        drho_method = 'duhamel' if method == 'duhamel' else 'sld'
        drho_dtheta = []
        for a in range(self.n_params):
            drho_dtheta.append(self.rho_derivative(theta, a, method=drho_method, n_points=n_points))
        
        # Initialise Hessian
        hessian = np.zeros((self.n_params, self.n_params))
        
        # Sum over all marginals
        for i in range(self.n_sites):
            # Compute marginal ρᵢ
            rho_i = partial_trace(rho, self.dims, keep=i)
            d_i = rho_i.shape[0]
            I_i = np.eye(d_i, dtype=complex)
            
            # Eigendecomposition of ρᵢ
            eigvals_i, eigvecs_i = eigh(rho_i)
            eigvals_i = np.maximum(eigvals_i.real, 1e-14)
            
            # Compute log(ρᵢ)
            log_eigvals_i = np.log(eigvals_i)
            log_rho_i = eigvecs_i @ np.diag(log_eigvals_i) @ eigvecs_i.conj().T
            
            # Compute ∂ρᵢ/∂θ for all parameters
            drho_i_dtheta = []
            for a in range(self.n_params):
                drho_i_dtheta.append(partial_trace(drho_dtheta[a], self.dims, keep=i))
            
            # Compute ∂²hᵢ/∂θ_a∂θ_b for all a, b
            for a in range(self.n_params):
                for b in range(a, self.n_params):  # Only upper triangle (symmetric)
                    # Term 1: -Tr(∂²ρᵢ/∂θ_a∂θ_b (I + log ρᵢ))
                    # We need ∂²ρᵢ/∂θ_a∂θ_b
                    
                    if method == 'duhamel':
                        # High-precision: numerical differentiation of Duhamel
                        # Gives ~0.5-2.6% error (30-100× better than SLD)
                        d2rho_dtheta_ab = self.rho_second_derivative(
                            theta, a, b, method='numerical_duhamel', 
                            n_points=n_points, eps=eps
                        )
                    else:  # method == 'sld'
                        # Fast analytic: SLD-based formula
                        # Gives ~8-12% error but much faster
                        # From ∂ρ/∂θ_a = (1/2)[ρ(F_a - ⟨F_a⟩I) + (F_a - ⟨F_a⟩I)ρ]:
                        # ∂²ρ/∂θ_a∂θ_b = (1/2)[∂ρ/∂θ_b (F_a - ⟨F_a⟩I) + (F_a - ⟨F_a⟩I) ∂ρ/∂θ_b]
                        #                 - ρ Cov_sym(F_b, F_a)
                        
                        F_a = self.operators[a]
                        F_b = self.operators[b]
                        mean_Fa = np.trace(rho @ F_a).real
                        mean_Fb = np.trace(rho @ F_b).real
                        
                        F_a_centered = F_a - mean_Fa * I_full
                        
                        # Symmetrized covariance
                        cov_sym = 0.5 * (np.trace(rho @ F_b @ F_a).real + 
                                         np.trace(rho @ F_a @ F_b).real) - mean_Fb * mean_Fa
                        
                        # ∂²ρ/∂θ_a∂θ_b (SLD-based)
                        d2rho_dtheta_ab = (0.5 * (drho_dtheta[b] @ F_a_centered 
                                                  + F_a_centered @ drho_dtheta[b])
                                           - cov_sym * rho)
                    
                    # Partial trace to get ∂²ρᵢ/∂θ_a∂θ_b
                    d2rho_i_dtheta_ab = partial_trace(d2rho_dtheta_ab, self.dims, keep=i)
                    
                    # Term 1
                    term1 = -np.trace(d2rho_i_dtheta_ab @ (I_i + log_rho_i)).real
                    
                    # Term 2: -Tr(∂ρᵢ/∂θ_a ∂(log ρᵢ)/∂θ_b)
                    # Compute ∂(log ρᵢ)/∂θ_b using Daleckii-Krein formula
                    
                    # Transform ∂ρᵢ/∂θ_b to eigenbasis of ρᵢ
                    drho_i_b_eigenbasis = eigvecs_i.conj().T @ drho_i_dtheta[b] @ eigvecs_i
                    
                    # Apply Daleckii-Krein formula element-wise
                    dlog_rho_i_eigenbasis = np.zeros_like(drho_i_b_eigenbasis)
                    for ii in range(d_i):
                        for jj in range(d_i):
                            if ii == jj:
                                # Diagonal: ∂(log ρᵢ)/∂θ_b = (∂ρᵢ/∂θ_b) / ρᵢ
                                dlog_rho_i_eigenbasis[ii, jj] = (
                                    drho_i_b_eigenbasis[ii, jj] / eigvals_i[ii]
                                )
                            else:
                                # Off-diagonal: Daleckii-Krein formula
                                log_diff = log_eigvals_i[ii] - log_eigvals_i[jj]
                                p_diff = eigvals_i[ii] - eigvals_i[jj]
                                if np.abs(p_diff) > 1e-10:
                                    dlog_rho_i_eigenbasis[ii, jj] = (
                                        drho_i_b_eigenbasis[ii, jj] * log_diff / p_diff
                                    )
                                else:
                                    # Limit as p_i → p_j: (log p_i - log p_j)/(p_i - p_j) → 1/p_i
                                    dlog_rho_i_eigenbasis[ii, jj] = (
                                        drho_i_b_eigenbasis[ii, jj] / eigvals_i[ii]
                                    )
                    
                    # Transform back to original basis
                    dlog_rho_i_dtheta_b = (eigvecs_i @ dlog_rho_i_eigenbasis 

[docs]
                                           @ eigvecs_i.conj().T)
                    
                    # Term 2
                    term2 = -np.trace(drho_i_dtheta[a] @ dlog_rho_i_dtheta_b).real
                    
                    # ∂²hᵢ/∂θ_a∂θ_b
                    d2h_i = term1 + term2
                    
                    # Add to Hessian (symmetric, so fill both (a,b) and (b,a))
                    hessian[a, b] += d2h_i
                    if a != b:
                        hessian[b, a] += d2h_i
        
        return hessian

    
    def lagrange_multiplier_gradient(self, theta: np.ndarray, 
                                     method: str = 'sld',
                                     n_points: int = 100) -> np.ndarray:
        """
        Compute ∇ν, the gradient of the Lagrange multiplier ν(θ).
        
        From the paper (equations 831-832), the gradient components are:

        ``∂ν/∂θ_j = (1/||a||²) [a^T G e_j + a^T (∇G)[θ] e_j + (∇a)_j^T G θ - 2ν a^T (∇a)_j]``

        where:

        - ``ν = (a^T G θ)/(a^T a)`` is the Lagrange multiplier
        - ``a = ∇C = ∇(∑h_i)`` is the constraint gradient
        - ``G`` = Fisher information (BKM metric)
        - ``(∇G)[θ]`` = third cumulant tensor contracted with θ
        - ``∇a = ∇²C`` is the constraint Hessian
        
        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters
        method : str, default='sld'
            'sld' or 'duhamel' for derivative precision
        n_points : int, default=100
            Quadrature points for Duhamel (ignored for 'sld')
            
        Returns
        -------
        grad_nu : ndarray, shape (n_params,)
            Gradient ∇ν
            
        Notes
        -----
        Uses all the high-precision components from Steps 1-3:
        - BKM metric G (spectral formula)
        - Third cumulant (∇G)[θ] (perturbation theory)
        - Constraint Hessian ∇²C (Duhamel for high precision)
        """
        # Get constraint gradient a = ∇C and constraint value (use optimized default)
        C, a = self.marginal_entropy_constraint(theta)  # Uses theta_only by default
        
        # Get BKM metric G
        G = self.fisher_information(theta)
        
        # Compute Lagrange multiplier ν = (a^T G θ)/(||a||²)
        a_norm_sq = np.dot(a, a)
        nu = np.dot(a, G @ theta) / a_norm_sq
        
        # Get third cumulant contraction (∇G)[θ] (use optimized default)
        third_cumulant_contracted = self.third_cumulant_contraction(theta)  # Uses fd by default
        
        # Get constraint Hessian ∇²C = ∇a (use optimized default)
        hessian_C = self.constraint_hessian(theta)  # Uses fd_theta_only by default
        
        # Compute gradient ∇ν for each parameter j
        grad_nu = np.zeros(self.n_params)
        
        for j in range(self.n_params):
            # e_j is the j-th standard basis vector (handled implicitly)
            
            # Term 1: a^T G e_j = (G^T a)_j = (Ga)_j (since G is symmetric)
            term1 = (G @ a)[j]
            
            # Term 2: a^T (∇G)[θ] e_j = [a^T (∇G)[θ]]_j
            # This is the j-th component of a^T times the third cumulant contraction
            term2 = np.dot(a, third_cumulant_contracted[:, j])
            
            # Term 3: (∇a)_j^T G θ
            # (∇a)_j is the j-th column of the Hessian
            grad_a_j = hessian_C[:, j]
            term3 = np.dot(grad_a_j, G @ theta)
            
            # Term 4: -2ν a^T (∇a)_j
            term4 = -2 * nu * np.dot(a, grad_a_j)
            
            # Combine all terms
            grad_nu[j] = (term1 + term2 + term3 + term4) / a_norm_sq
        
        return grad_nu

    

[docs]
    def jacobian(self, theta: np.ndarray,
                method: str = 'duhamel',
                n_points: int = 100) -> np.ndarray:
        """
        Compute the Jacobian M = ∂F/∂θ of the constrained dynamics.
        
        From the paper (equations 824-827):
            F(θ) = -G(θ)θ + ν(θ)a(θ)
            M = ∂F/∂θ = -G - (∇G)[θ] + ν∇²C + a(∇ν)^T
        
        For systems with local operators only (no entanglement):
            - Structural identity Gθ = -a holds
            - ν = -1 always, ∇ν = 0
            - Simplifies to: M = -G - (∇G)[θ] - ∇²C
        
        For systems with entangling operators (pair basis):
            - Structural identity BROKEN: Gθ ≠ -a
            - ν ≠ -1, ∇ν ≠ 0
            - Must use full formula: M = -G - (∇G)[θ] + ν∇²C + a(∇ν)^T
        
        Parameters
        ----------
        theta : ndarray, shape (n_params,)
            Natural parameters
        method : str, default='duhamel'
            'sld' or 'duhamel' for derivative precision
            Changed default to 'duhamel' for better accuracy with entangled states
        n_points : int, default=100
            Quadrature points for Duhamel (ignored for 'sld')
            
        Returns
        -------
        M : ndarray, shape (n_params, n_params)
            Jacobian matrix
            
        Notes
        -----
        This is the full Jacobian for GENERIC dynamics:
            θ̇ = F(θ) = -Gθ + νa
        
        The degeneracy of M determines the geometry of the constraint manifold
        and is central to the paper's analysis of the inaccessible game.
        """
        # Get all components (use optimized defaults for each)
        G = self.fisher_information(theta)
        C, a = self.marginal_entropy_constraint(theta)  # Uses theta_only by default
        third_cumulant = self.third_cumulant_contraction(theta)  # Uses fd by default
        hessian_C = self.constraint_hessian(theta)  # Uses fd_theta_only by default
        
        # Compute Lagrange multiplier
        Gtheta = G @ theta
        nu = np.dot(a, Gtheta) / np.dot(a, a)
        
        # Compute Lagrange multiplier gradient
        grad_nu = self.lagrange_multiplier_gradient(theta, method=method, n_points=n_points)
        
        # Assemble full Jacobian: M = -G - (∇G)[θ] + ν∇²C + a(∇ν)^T
        M = -G - third_cumulant + nu * hessian_C + np.outer(a, grad_nu)
        
        return M

    

[docs]
    def symmetric_part(self, theta: np.ndarray,
                      method: str = 'duhamel',
                      n_points: int = 100) -> np.ndarray:
        """
        Compute symmetric part S of the flow Jacobian M.
        
        The GENERIC decomposition splits M into symmetric and antisymmetric parts:
        M = S + A, where S = (M + M^T)/2
        
        The symmetric part S generates the irreversible (dissipative) dynamics.
        
        Parameters
        ----------
        theta : np.ndarray
            Natural parameters
        method : str
            Method for computing Jacobian
        n_points : int
            Number of points for numerical integration
            
        Returns
        -------
        S : np.ndarray, shape (n_params, n_params)
            Symmetric part of Jacobian, satisfies S = S^T
            
        Notes
        -----
        The symmetric part satisfies key degeneracy conditions:
        - S @ a ≈ 0, where a = ∇C is the constraint gradient
        - θ^T S θ ≥ 0 (entropy production non-negative on tangent space)
        
        See Also
        --------
        antisymmetric_part : Antisymmetric part of Jacobian
        verify_degeneracy_conditions : Verify GENERIC structure
        """
        M = self.jacobian(theta, method=method, n_points=n_points)
        S = 0.5 * (M + M.T)
        return S

    

[docs]
    def antisymmetric_part(self, theta: np.ndarray,
                          method: str = 'duhamel',
                          n_points: int = 100) -> np.ndarray:
        """
        Compute antisymmetric part A of the flow Jacobian M.
        
        The GENERIC decomposition splits M into symmetric and antisymmetric parts:
        M = S + A, where A = (M - M^T)/2
        
        The antisymmetric part A generates the reversible (Hamiltonian) dynamics.
        
        Parameters
        ----------
        theta : np.ndarray
            Natural parameters
        method : str
            Method for computing Jacobian
        n_points : int
            Number of points for numerical integration
            
        Returns
        -------
        A : np.ndarray, shape (n_params, n_params)
            Antisymmetric part of Jacobian, satisfies A = -A^T
            
        Notes
        -----
        The antisymmetric part satisfies key degeneracy conditions:
        - A @ (-G @ theta) ≈ 0, where -G @ theta = ∇H is the entropy gradient
        
        The antisymmetric part encodes the effective Hamiltonian through:
        A_ab θ_b = Σ_c f_abc η_c, where η are the Hamiltonian coefficients.
        
        See Also
        --------
        symmetric_part : Symmetric part of Jacobian
        verify_degeneracy_conditions : Verify GENERIC structure
        """
        M = self.jacobian(theta, method=method, n_points=n_points)
        A = 0.5 * (M - M.T)
        return A

    

[docs]
    def verify_degeneracy_conditions(self, theta: np.ndarray,
                                    method: str = 'duhamel',
                                    n_points: int = 100,
                                    tol: float = 1e-6) -> Dict[str, Any]:
        """
        Verify degeneracy conditions for GENERIC structure.
        
        The GENERIC structure requires that the symmetric and antisymmetric
        parts satisfy specific degeneracy conditions related to the constraint
        and entropy gradients.
        
        Parameters
        ----------
        theta : np.ndarray
            Natural parameters
        method : str
            Method for computing Jacobian
        n_points : int
            Number of points for numerical integration
        tol : float
            Tolerance for degeneracy conditions
            
        Returns
        -------
        diagnostics : dict
            Dictionary containing:
            - 'S': Symmetric part
            - 'A': Antisymmetric part
            - 'constraint_gradient': a = ∇C
            - 'entropy_gradient': ∇H = -G @ theta
            - 'S_annihilates_constraint': ||S @ a||
            - 'A_annihilates_entropy_gradient': ||A @ (-G @ theta)||
            - 'entropy_production': θ^T S θ
            - 'S_symmetric_error': ||S - S^T||
            - 'A_antisymmetric_error': ||A + A^T||
            - 'reconstruction_error': ||M - (S + A)||
            - 'all_passed': bool indicating if all checks passed
            
        Notes
        -----
        Degeneracy conditions (should hold within tolerance):
        1. S @ a ≈ 0: Symmetric part annihilates constraint gradient
        2. A @ ∇H ≈ 0: Antisymmetric part annihilates entropy gradient
        3. θ^T S θ ≥ 0: Entropy production non-negative
        4. S = S^T: Symmetry (machine precision)
        5. A = -A^T: Antisymmetry (machine precision)
        6. M = S + A: Reconstruction (machine precision)
        """
        from qig.validation import ValidationReport
        
        # Compute components
        M = self.jacobian(theta, method=method, n_points=n_points)
        S = self.symmetric_part(theta, method=method, n_points=n_points)
        A = self.antisymmetric_part(theta, method=method, n_points=n_points)
        
        # Get constraint gradient (using marginal entropies constraint)
        marginals = marginal_entropies(self.rho_from_theta(theta), 
                                      [self.d] * (self.n_pairs if self.pair_basis else self.n_sites))
        
        # Gradient of constraint C(θ) = Σ h_i
        # Use finite differences for gradient
        eps = 1e-7
        a = np.zeros(len(theta))
        C0 = np.sum(marginals)
        for i in range(len(theta)):
            theta_plus = theta.copy()
            theta_plus[i] += eps
            rho_plus = self.rho_from_theta(theta_plus)
            marginals_plus = marginal_entropies(rho_plus,
                                               [self.d] * (self.n_pairs if self.pair_basis else self.n_sites))
            C_plus = np.sum(marginals_plus)
            a[i] = (C_plus - C0) / eps
        
        # Entropy gradient: ∇H = -G @ theta
        G = self.fisher_information(theta)
        entropy_gradient = -G @ theta
        
        # Compute degeneracy violations
        S_a = S @ a
        A_entropy_grad = A @ entropy_gradient
        
        S_annihilates_constraint = np.linalg.norm(S_a)
        A_annihilates_entropy_gradient = np.linalg.norm(A_entropy_grad)
        
        # Entropy production
        entropy_production = theta @ S @ theta
        
        # Symmetry/antisymmetry errors
        S_symmetric_error = np.max(np.abs(S - S.T))
        A_antisymmetric_error = np.max(np.abs(A + A.T))
        
        # Reconstruction error
        reconstruction_error = np.max(np.abs(M - (S + A)))
        
        # Check all conditions
        all_passed = (
            S_annihilates_constraint < tol and
            A_annihilates_entropy_gradient < tol and
            entropy_production >= -tol and
            S_symmetric_error < 1e-14 and
            A_antisymmetric_error < 1e-14 and
            reconstruction_error < 1e-14
        )
        
        diagnostics = {
            'S': S,
            'A': A,
            'constraint_gradient': a,
            'entropy_gradient': entropy_gradient,
            'S_annihilates_constraint': S_annihilates_constraint,
            'A_annihilates_entropy_gradient': A_annihilates_entropy_gradient,
            'entropy_production': entropy_production,
            'S_symmetric_error': S_symmetric_error,
            'A_antisymmetric_error': A_antisymmetric_error,
            'reconstruction_error': reconstruction_error,
            'all_passed': all_passed,
            'tolerance': tol
        }
        
        return diagnostics

    
    def _grad_psi(self, theta: np.ndarray) -> np.ndarray:
        """
        Compute ∇ψ(θ), the analytical gradient of the cumulant generating function ψ(θ).

        Since ψ(θ) = log Tr(exp(K(θ))) where K(θ) = ∑ θ_a F_a, differentiate directly:
        ∂ψ/∂θ_a = ∂/∂θ_a [log Tr(exp(K))] = [1/Tr(exp(K))] * Tr( ∂/∂θ_a exp(K) )

        Since ∂/∂θ_a exp(K) = exp(K) * F_a (in the sense of matrix multiplication),
        we get: ∂ψ/∂θ_a = Tr( exp(K) F_a ) / Tr(exp(K))

        This computes the gradient directly from the cumulant generating function
        without materializing ρ(θ) as an intermediate.

        Parameters
        ----------
        theta : ndarray
            Natural parameters

        Returns
        -------
        grad_psi : ndarray, shape (n_params,)
            ∇ψ(θ), the gradient of the cumulant generating function
        """
        # Compute K(θ) = ∑ θ_a F_a
        K = sum(theta_a * F_a for theta_a, F_a in zip(theta, self.operators))

        # Shift for stability. Since exp(K) = exp(c) exp(K-cI), the scalar cancels
        # in Tr(exp(K) F_a) / Tr(exp(K)).
        lambda_max = float(np.max(np.linalg.eigvalsh(K).real))
        K_shift = K - lambda_max * np.eye(self.D, dtype=complex)
        exp_K = expm(K_shift)

        # Compute normalization Z = Tr(exp(K))
        Z = np.trace(exp_K)

        # Compute ∇ψ(θ)_a = Tr(exp(K) F_a) / Z for each a
        return np.array([np.trace(exp_K @ F_a).real / Z.real for F_a in self.operators])


[docs]
    def von_neumann_entropy(self, theta: np.ndarray) -> float:
        """
        Compute the von Neumann entropy.

        For exponential families ρ(θ) = exp(K(θ) - ψ(θ)), we use the fundamental identity:
        H(ρ) = ψ(θ) - θ^T ∇ψ(θ)

        where ∇ψ(θ) is the analytical gradient of the cumulant generating function ψ(θ).

        This directly uses the exponential family structure and is more numerically
        stable than eigendecomposition for states within the exponential family manifold.

        Parameters
        ----------
        theta : ndarray
            Natural parameters

        Returns
        -------
        float
            Von Neumann entropy in nats
        """
        # Use exponential family identity: H(ρ) = ψ(θ) - θ^T ∇ψ(θ)
        psi_theta = self.psi(theta)
        grad_psi = self._grad_psi(theta)
        entropy = psi_theta - np.dot(theta, grad_psi)

        # Ensure non-negative (numerical precision issues)
        return max(0.0, float(entropy.real))

    

[docs]
    def mutual_information(self, theta: np.ndarray) -> float:
        """
        Compute mutual information I = C - H where C = ∑h_i, H = S(ρ).
        
        For separable states: I = 0
        For entangled states: I > 0
        Maximum for Bell states: I = 2log(d)
        
        Parameters
        ----------
        theta : ndarray
            Natural parameters
            
        Returns
        -------
        float
            Mutual information in nats
            
        Notes
        -----
        This only makes sense for pair-based systems. For local operators,
        I ≈ 0 always since those can only create separable states.
        """
        rho = self.rho_from_theta(theta)
        h_marginals = marginal_entropies(rho, self.dims)
        C = np.sum(h_marginals)
        H = self.von_neumann_entropy(theta)
        return C - H

    

[docs]
    def purity(self, theta: np.ndarray) -> float:
        """
        Compute purity Tr(ρ²).
        
        Pure states: Tr(ρ²) = 1
        Maximally mixed: Tr(ρ²) = 1/D
        
        Parameters
        ----------
        theta : ndarray
            Natural parameters
            
        Returns
        -------
        float
            Purity, between 1/D and 1
        """
        rho = self.rho_from_theta(theta)
        return np.trace(rho @ rho).real

    
    # =========================================================================
    # CIP-0008: σ-parametrised regularisation infrastructure
    # =========================================================================
    

[docs]
    def validate_sigma(self, sigma: np.ndarray) -> Tuple[bool, str]:
        """
        Validate that σ is a valid density matrix of correct dimension.
        
        Parameters
        ----------
        sigma : ndarray
            Matrix to validate
            
        Returns
        -------
        is_valid : bool
            True if σ is a valid density matrix
        message : str
            "Valid" or error description
        """
        D = self.D
        
        # Check shape
        if sigma.shape != (D, D):
            return False, f"Shape mismatch: expected ({D}, {D}), got {sigma.shape}"
        
        # Check Hermitian
        if not np.allclose(sigma, sigma.conj().T, atol=1e-10):
            return False, "σ must be Hermitian"
        
        # Check positive semidefinite
        eigvals = np.linalg.eigvalsh(sigma)
        if np.any(eigvals < -1e-10):
            return False, f"σ must be PSD, min eigenvalue: {eigvals.min():.2e}"
        
        # Check unit trace
        tr = np.trace(sigma).real
        if not np.isclose(tr, 1.0, atol=1e-10):
            return False, f"Tr(σ) must be 1, got {tr:.6f}"
        
        return True, "Valid"

    

[docs]
    def is_product_sigma(
        self, sigma: np.ndarray, tol: float = 1e-6
    ) -> Tuple[bool, Optional[List[np.ndarray]]]:
        """
        Check if σ has product structure σ = σ₁⊗σ₂⊗...⊗σₙ.
        
        Parameters
        ----------
        sigma : ndarray
            Density matrix to check
        tol : float
            Tolerance for product structure detection
            
        Returns
        -------
        is_product : bool
            True if σ is (approximately) a product state
        factors : list of ndarray or None
            If product, the individual σₖ matrices; otherwise None
        """
        if self.n_pairs == 1:
            # Single pair is trivially a "product"
            return True, [sigma]
        
        # For multi-pair, check if σ can be factorised
        # This is expensive in general - use partial trace to check consistency
        D_pair = self.d ** 2
        
        # Extract marginals for each pair
        marginals = []
        for k in range(self.n_pairs):
            # Trace out all pairs except k
            sigma_k = self._partial_trace_to_pair(sigma, k)
            marginals.append(sigma_k)
        
        # Reconstruct product state from marginals
        sigma_product = marginals[0]
        for k in range(1, self.n_pairs):
            sigma_product = np.kron(sigma_product, marginals[k])
        
        # Check if reconstruction matches
        if np.allclose(sigma, sigma_product, atol=tol):
            return True, marginals
        
        return False, None

    
    def _partial_trace_to_pair(self, sigma: np.ndarray, pair_idx: int) -> np.ndarray:
        """Trace out all pairs except pair_idx, returning d²×d² density matrix."""
        D_pair = self.d ** 2
        n = self.n_pairs
        
        if n == 1:
            return sigma  # Nothing to trace
        
        # Reshape to tensor with legs for each pair
        # Shape: (D_pair,) * n for bra, then (D_pair,) * n for ket
        sigma_tensor = sigma.reshape([D_pair] * (2 * n))
        
        # Use einsum to trace out all pairs except pair_idx
        # Build index strings: bra indices 0..n-1, ket indices n..2n-1
        # Pairs to trace get same index letter; pair to keep gets distinct letters
        bra_indices = []
        ket_indices = []
        trace_letter = ord('a')
        keep_bra = chr(ord('z') - 1)  # 'y'
        keep_ket = chr(ord('z'))      # 'z'
        
        for k in range(n):
            if k == pair_idx:
                bra_indices.append(keep_bra)
                ket_indices.append(keep_ket)
            else:
                letter = chr(trace_letter)
                bra_indices.append(letter)
                ket_indices.append(letter)  # Same letter = trace over this pair
                trace_letter += 1
        
        input_str = ''.join(bra_indices + ket_indices)
        output_str = keep_bra + keep_ket
        einsum_str = f"{input_str}->{output_str}"
        
        result = np.einsum(einsum_str, sigma_tensor)
        return result.reshape(D_pair, D_pair)
    

[docs]
    def detect_sigma_structure(self, sigma: np.ndarray) -> str:
        """
        Detect structure of σ for efficiency optimisation.
        
        Parameters
        ----------
        sigma : ndarray
            Regularisation matrix
            
        Returns
        -------
        structure : str
            One of: 'isotropic', 'product', 'pure', 'general'
        """
        D = self.D
        
        # Check if isotropic (I/D)
        if np.allclose(sigma, np.eye(D) / D, atol=1e-10):
            return 'isotropic'
        
        # Check if product state (for multi-pair)
        if self.n_pairs > 1:
            is_prod, _ = self.is_product_sigma(sigma)
            if is_prod:
                return 'product'
        
        # Check if rank-1 (pure state)
        eigvals = np.linalg.eigvalsh(sigma)
        n_nonzero = np.sum(eigvals > 1e-10)
        if n_nonzero == 1:
            return 'pure'
        
        return 'general'

    

[docs]
    def regularise_pure_state(
        self,
        psi: np.ndarray,
        epsilon: float,
        sigma: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        """
        Create regularised density matrix from pure state.
        
        ρ_ε = (1-ε)|ψ⟩⟨ψ| + ε σ
        
        Parameters
        ----------
        psi : ndarray
            Pure state vector (length D)
        epsilon : float
            Regularisation strength, must be in (0, 1)
        sigma : ndarray, optional
            Regularisation direction. Default: I/D (isotropic)
            
        Returns
        -------
        rho_epsilon : ndarray
            Regularised density matrix (D×D)
        """
        D = self.D
        
        # Validate psi
        if psi.shape != (D,):
            raise ValueError(f"psi must have length {D}, got {psi.shape}")
        
        # Normalise psi if needed
        norm = np.linalg.norm(psi)
        if not np.isclose(norm, 1.0, atol=1e-10):
            psi = psi / norm
        
        # Validate epsilon
        if not 0 < epsilon < 1:
            raise ValueError(f"epsilon must be in (0, 1), got {epsilon}")
        
        # Default sigma is isotropic
        if sigma is None:
            sigma = np.eye(D, dtype=complex) / D
        else:
            # Validate sigma
            is_valid, msg = self.validate_sigma(sigma)
            if not is_valid:
                raise ValueError(f"Invalid sigma: {msg}")
        
        # Create regularised state
        rho_pure = np.outer(psi, psi.conj())
        rho_epsilon = (1 - epsilon) * rho_pure + epsilon * sigma
        
        return rho_epsilon


    def _bell_parameters_product_sigma(
        self,
        eps_val: float,
        log_eps: float,
        sigma_per_pair: List[np.ndarray],
        bell_indices: Optional[List[int]] = None,
    ) -> np.ndarray:
        """
        Efficient θ computation for product σ = σ₁⊗...⊗σₙ.
        
        For each pair k, the marginal is:
            ρₖ_ε = (1-ε)|Φₖ⟩⟨Φₖ| + ε σₖ
        
        We compute log(ρₖ_ε) via small d²×d² eigendecomposition,
        then project onto the per-pair operators.
        
        Complexity: O(n × d⁶) instead of O(d^(6n))
        
        Parameters
        ----------
        eps_val : float
            Regularisation ε value
        log_eps : float
            log(ε) for numerical stability
        sigma_per_pair : list of ndarray
            List of n density matrices, each d²×d² for one pair
        bell_indices : list of int, optional
            Which Bell state (k=0,...,d-1) to use for each pair.
            Default: all zeros (standard Bell state |Φ₀⟩).
            
        Returns
        -------
        theta : ndarray
            Natural parameters [θ₁, θ₂, ..., θₙ]
        """
        from .pair_operators import pair_basis_generators, bell_state
        
        d = self.d
        D_pair = d ** 2
        n_ops_per_pair = D_pair ** 2 - 1  # d⁴ - 1
        
        # Get single-pair generators
        single_pair_ops = pair_basis_generators(d)
        
        # Handle bell_indices
        if bell_indices is None:
            bell_indices = [0] * self.n_pairs
        
        # Compute θ for each pair
        theta = np.zeros(self.n_params)
        
        for k in range(self.n_pairs):
            sigma_k = sigma_per_pair[k]
            
            # Get Bell state for this pair (may differ per pair via bell_indices)
            psi_bell_k = bell_state(d, bell_indices[k])
            rho_bell_k = np.outer(psi_bell_k, psi_bell_k.conj())
            
            # Form ρₖ_ε = (1-ε)|Φₖ⟩⟨Φₖ| + ε σₖ
            rho_k_eps = (1 - eps_val) * rho_bell_k + eps_val * sigma_k
            
            # Ensure Hermitian
            rho_k_eps = (rho_k_eps + rho_k_eps.conj().T) / 2
            
            # Compute log via eigendecomposition (small d²×d² matrix)
            eigvals, U = eigh(rho_k_eps)
            eigvals = np.maximum(eigvals, np.finfo(float).tiny)
            log_eigvals = np.log(eigvals)
            log_rho_k = U @ np.diag(log_eigvals) @ U.conj().T
            
            # Project onto per-pair operators
            # θₐ = Tr(log(ρₖ) Fₐ) / Tr(Fₐ²)
            start_idx = k * n_ops_per_pair
            for a, F_a in enumerate(single_pair_ops):
                numerator = np.real(np.trace(log_rho_k @ F_a))
                denominator = np.real(np.trace(F_a @ F_a))
                if denominator > 0:
                    theta[start_idx + a] = numerator / denominator
        
        return theta


[docs]
    def get_bell_state_parameters(
        self,
        epsilon: float = 1e-6,
        log_epsilon: Optional[float] = None,
        sigma: Optional[np.ndarray] = None,
        sigma_per_pair: Optional[List[np.ndarray]] = None,
        bell_indices: Optional[List[int]] = None,
    ) -> np.ndarray:
        """
        Get natural parameters θ corresponding to a regularised Bell state.
        
        A Bell state is a pure state (rank 1), which lies at the boundary
        of the exponential family where natural parameters θ → -∞.
        
        For regularised state: ``ρ_ε = (1-ε)|Φ⟩⟨Φ| + ε σ``
        
        We compute θ by solving: ``ρ_ε = exp(Σ θₐFₐ - ψ(θ))``
        
        This gives: ``θₐ ∝ Tr(log(ρ_ε) Fₐ)``
        
        Parameters
        ----------
        epsilon : float, default=1e-6
            Regularisation parameter.
            Smaller epsilon → closer to pure Bell state (more negative θ).
            Must be > 0 to avoid singularities.
        log_epsilon : float, optional
            If provided, overrides ``epsilon`` via log ε = log_epsilon.
            This is numerically convenient when exploring very small ε.
        sigma : ndarray, optional
            Full D×D regularisation matrix. Any valid density matrix.
            Use for entangled σ (inter-pair correlations in perturbation).
            If None and sigma_per_pair is None, uses I/D (isotropic).
        bell_indices : list of int, optional
            Which Bell state (k=0,...,d-1) to use for each pair.
            Default: all zeros (standard Bell state ``|Φ₀⟩``).
            Allows exploring different pure state origins.
        sigma_per_pair : list of ndarray, optional
            List of n density matrices, each d²×d² for one pair.
            Constructs σ = σ₁⊗σ₂⊗...⊗σₙ (product structure).
            Efficient O(n) computation preserved.
            Only valid for multi-pair systems (n_pairs > 1).
            
        Returns
        -------
        theta : ndarray
            Natural parameters for the regularized Bell state.
            Many components will be large and negative (approaching -∞
            for pure state).
            
        Raises
        ------
        ValueError
            If both sigma and sigma_per_pair are provided.
            If sigma_per_pair has wrong length or invalid matrices.
            
        Notes
        -----
        The regularisation matrix σ encodes the "direction of approach" 
        to the pure-state boundary (see CIP-0008 and entropy_time_paths.ipynb):
        
        - Different σ = different "meridians" from the north pole
        - Isotropic σ = I/D gives the "boring" symmetric departure
        - Anisotropic σ reveals the tangent cone of possible departures
        
        For the exponential family ρ(θ) = exp(H(θ))/Z where H = Σ θₐFₐ,
        we have: log(ρ) = H - log(Z)·I
        
        Since Tr(Fₐ) = 0 for our operators, we get:
        θₐ = Tr(log(ρ) Fₐ) / Tr(FₐFₐ)
        
        **Multi-pair note**: For n_pairs > 1, our operator basis is the direct
        sum of per-pair su(d²) algebras, NOT the full su(D) algebra. The 
        regularised Bell state may have components outside this subspace.
        The returned θ is the projection onto our subspace, which is correct
        for the inaccessible game dynamics (which stay in this subspace).
        Reconstruction via ``rho_from_theta(θ)`` will NOT exactly match the
        target ρ_ε, but the dynamics are correct.
        """
        if not self.pair_basis:
            raise ValueError("Bell states only defined for pair_basis=True")
        
        # Validate sigma options
        if sigma is not None and sigma_per_pair is not None:
            raise ValueError("Cannot specify both sigma and sigma_per_pair")
        
        # Handle sigma_per_pair for multi-pair systems (efficient O(n × d⁶) path)
        if sigma_per_pair is not None:
            if self.n_pairs == 1:
                raise ValueError("sigma_per_pair only valid for n_pairs > 1")
            if len(sigma_per_pair) != self.n_pairs:
                raise ValueError(
                    f"sigma_per_pair must have {self.n_pairs} matrices, got {len(sigma_per_pair)}"
                )
            # Validate each per-pair sigma (fast: O(n × d⁶) not O(D³))
            D_pair = self.d ** 2
            for k, sigma_k in enumerate(sigma_per_pair):
                if sigma_k.shape != (D_pair, D_pair):
                    raise ValueError(
                        f"sigma_per_pair[{k}] must be {D_pair}×{D_pair}, got {sigma_k.shape}"
                    )
                # Quick validation (skip full eigenvalue check for speed)
                if not np.allclose(sigma_k, sigma_k.conj().T, atol=1e-10):
                    raise ValueError(f"sigma_per_pair[{k}] must be Hermitian")
                if not np.isclose(np.trace(sigma_k), 1.0, atol=1e-10):
                    raise ValueError(f"sigma_per_pair[{k}] must have unit trace")
            
            # Go directly to efficient per-pair computation (skip full sigma build!)
            if log_epsilon is not None:
                log_eps = float(log_epsilon)
            else:
                log_eps = float(np.log(epsilon))
            eps_val = float(np.exp(log_eps))
            
            return self._bell_parameters_product_sigma(eps_val, log_eps, sigma_per_pair, bell_indices)
        
        # Work with log ε directly for numerical stability near the boundary.
        if log_epsilon is not None:
            log_eps = float(log_epsilon)
            if not np.isfinite(log_eps):
                raise ValueError("log_epsilon must be a finite real number")
        else:
            if epsilon <= 0:
                raise ValueError("epsilon must be > 0 (pure Bell state has θ → -∞)")
            log_eps = float(np.log(epsilon))

        # IMPORTANT:
        # - For σ = I/D (isotropic), we can keep working in log-space even when
        #   log_eps is far below float64 underflow (≈ -708). This avoids artificial
        #   saturation in θ when exploring extreme log_eps.
        # - For more general σ (including per-pair σₖ), we must materialize
        #   (1-ε)ρ + εσ in float64, so ε will underflow; those paths clamp.
        D = self.D
        
        # Get Bell state (product of n_pairs Bell states)
        psi_bell = product_of_bell_states(self.n_pairs, self.d, bell_indices=bell_indices)
        rho_bell = np.outer(psi_bell, psi_bell.conj())
        
        # Determine sigma structure and compute log(ρ_ε) accordingly
        if sigma is None:
            # Isotropic case: σ = I/D (efficient analytic formula)
            sigma_structure = 'isotropic'
        else:
            # Validate sigma
            is_valid, msg = self.validate_sigma(sigma)
            if not is_valid:
                raise ValueError(f"Invalid sigma: {msg}")
            sigma_structure = self.detect_sigma_structure(sigma)
        
        if sigma_structure == 'isotropic':
            # Efficient analytic computation for σ = I/D
            # ρ_ε = (1-ε)|Ψ⟩⟨Ψ| + ε I/D has eigenvalues:
            #   λ₀ = 1 - ε + ε/D (the Bell state direction)
            #   λ_⊥ = ε/D (all D-1 orthogonal directions)
            # This works for any n_pairs!
            eigvals_bell, U = eigh(rho_bell)
            idx_max = int(np.argmax(eigvals_bell.real))
            
            # ε may underflow for very negative log_eps; in that limit λ₀→1 so
            # log(λ₀)→0, while log(λ_⊥)=log_eps-log(D) remains well-defined.
            min_log_eps = float(np.log(np.finfo(float).tiny))
            eps_val = float(np.exp(log_eps)) if log_eps >= min_log_eps else 0.0
            factor = eps_val * (1.0 - 1.0 / D)
            log_lambda0 = np.log1p(-factor) if factor < 1.0 else np.log(np.finfo(float).tiny)
            log_lambda_perp = log_eps - np.log(D)
            
            log_diag = np.full(D, log_lambda_perp, dtype=float)
            log_diag[idx_max] = log_lambda0
            log_rho = U @ np.diag(log_diag) @ U.conj().T
            
        elif sigma_structure == 'product' and self.n_pairs > 1:
            # Product sigma: efficient per-pair computation!
            # For σ = σ₁⊗...⊗σₙ and |Ψ⟩ = |Φ⟩⊗...⊗|Φ⟩:
            # The marginal on pair k is: ρₖ_ε = (1-ε)|Φ⟩⟨Φ| + ε σₖ
            # We compute θ^(k) from log(ρₖ_ε) directly.
            # Complexity: O(n × d⁶) instead of O(d^(6n))
            
            # If sigma_per_pair wasn't provided, extract marginals from sigma
            if sigma_per_pair is None:
                sigma_per_pair = [
                    self._partial_trace_to_pair(sigma, k) 
                    for k in range(self.n_pairs)
                ]
            
            # Need ε in float64 arithmetic in the per-pair path; clamp to avoid
            # underflow producing singular logs.
            min_log_eps = float(np.log(np.finfo(float).tiny))
            log_eps_clamped = max(log_eps, min_log_eps)
            eps_val = float(np.exp(log_eps_clamped))

            return self._bell_parameters_product_sigma(
                eps_val, log_eps, sigma_per_pair, bell_indices
            )
            
        else:
            # General case: compute ρ_ε and take matrix log
            # This is O(D³) but handles arbitrary σ
            import warnings
            if sigma_structure == 'general' and self.n_pairs > 1:
                warnings.warn(
                    f"Using general σ (structure: {sigma_structure}) with {self.n_pairs} pairs. "
                    "This loses O(n) efficiency.",
                    UserWarning
                )
            
            # Need ε in float64 arithmetic; clamp to avoid underflow producing
            # a singular pure state and -inf in the log.
            min_log_eps = float(np.log(np.finfo(float).tiny))
            log_eps_clamped = max(log_eps, min_log_eps)
            eps_val = float(np.exp(log_eps_clamped))

            rho_eps = (1 - eps_val) * rho_bell + eps_val * sigma
            
            # Ensure Hermitian and compute log via eigendecomposition
            rho_eps = (rho_eps + rho_eps.conj().T) / 2
            eigvals, U = eigh(rho_eps)
            
            # Clamp small eigenvalues for numerical stability
            eigvals = np.maximum(eigvals, np.finfo(float).tiny)
            log_eigvals = np.log(eigvals)
            log_rho = U @ np.diag(log_eigvals) @ U.conj().T
        
        # Extract natural parameters by projection onto operator basis
        # θₐ = Tr(log(ρ) Fₐ) / Tr(FₐFₐ)
        theta = np.zeros(self.n_params)
        for a, F_a in enumerate(self.operators):
            numerator = np.real(np.trace(log_rho @ F_a))
            denominator = np.real(np.trace(F_a @ F_a))
            if denominator > 0:
                theta[a] = numerator / denominator
            else:
                theta[a] = 0.0
        
        return theta




__all__ = [
    "pauli_basis",
    "gell_mann_matrices",
    "qutrit_basis",
    "create_operator_basis",
    "MatrixExponentialFamily",
    "QuantumExponentialFamily",  # backward-compatible alias
]

# Backward-compatible alias
QuantumExponentialFamily = MatrixExponentialFamily