Skip to content

API Reference

hNMF.model

hnmf.model.HierarchicalNMF

HierarchicalNMF(
    k: int,
    unbalanced: float = 0.1,
    init: Literal[
        None, "random", "nndsvd", "nndsvda", "nndsvdar"
    ] = None,
    solver: Literal["cd", "mu"] = "cd",
    beta_loss: Literal["FRO", 0, "KL", 1, "IS", 2] = 0,
    alpha_W: float = 0.0,
    alpha_H: Literal["same"] | float = "same",
    random_state: int = 42,
    trial_allowance: int = 100,
    tol: float = 1e-06,
    maxiter: int = 10000,
    dtype: DTypeLike = np.float64,
)

Bases: BaseEstimator

Methods:

Name Description
cluster_features

Returns a mapping of node IDs to the set of feature indices assigned to each node.

fit

Fit HierarchicalNMF to data

top_discriminative_samples_in_node

Returns samples with the largest weight difference between the specified node and the mean of all other nodes, identifying samples most characteristic of this node.

top_features_in_node

Returns list of (feature_idx, weight) tuples showing the top N features for a given node, sorted descending by weight from Hs_ matrix.

top_features_in_nodes

Returns mapping of each node to its top N features ranked by weight. Each node maps to a list of (feature_idx, weight) tuples sorted descending by weight from Hs_ matrix.

top_nodes_in_feature

Returns the top nodes for a specified feature

top_nodes_in_samples

Returns mapping of each sample to its top N nodes ranked by weight. Each sample maps to a list of (node_id, weight) tuples sorted descending.

top_samples_in_nodes

Returns mapping of each node to its top N samples ranked by weight. Each node maps to a list of (sample_id, weight) tuples sorted descending. Based on soft ranking from Ws_ matrix.

Source code in src/hnmf/model.py
def __init__(
    self,
    k: int,
    unbalanced: float = 0.1,
    init: Literal[None, "random", "nndsvd", "nndsvda", "nndsvdar"] = None,
    solver: Literal["cd", "mu"] = "cd",
    beta_loss: Literal["FRO", 0, "KL", 1, "IS", 2] = 0,
    alpha_W: float = 0.0,
    alpha_H: Literal["same"] | float = "same",
    random_state: int = 42,
    trial_allowance: int = 100,
    tol: float = 1e-6,
    maxiter: int = 10000,
    dtype: npt.DTypeLike = np.float64,
):
    self.k = k
    self.unbalanced = unbalanced
    self.init = init
    self.solver = solver
    self.beta_loss = beta_loss
    self.alpha_W = alpha_W
    self.alpha_H = alpha_H
    self.random_state = np.random.RandomState(seed=random_state)
    self.trial_allowance = trial_allowance
    self.tol = tol
    self.maxiter = maxiter
    self.dtype = dtype

    self.n_samples_ = None
    self.n_features_ = None
    self.n_nodes_ = 0
    self.n_leaves_ = 0
    self.tree_ = None
    self.splits_ = None
    self.is_leaf_ = None
    self.clusters_ = None
    self.Ws_ = None
    self.Hs_ = None
    self.W_buffer_ = None
    self.H_buffer_ = None
    self.priorities_ = None

cluster_features

cluster_features(
    leaves_only: bool = True, include_outliers: bool = True
) -> dict[int, set[int]]

Returns a mapping of node IDs to the set of feature indices assigned to each node.

Parameters:

Name Type Description Default
leaves_only bool

Whether to return only leaf nodes. Defaults to True

True
include_outliers bool

If True, features without a node assignment are returned under the key -1. Defaults to True

True
Source code in src/hnmf/model.py
def cluster_features(
    self,
    leaves_only: bool = True,
    include_outliers: bool = True,
) -> dict[int, set[int]]:
    """
    Returns a mapping of node IDs to the set of feature indices assigned to each node.

    Parameters
    ----------
    leaves_only
        Whether to return only leaf nodes. Defaults to True
    include_outliers
        If True, features without a node assignment are returned under the key -1. Defaults to True

    """

    if self.clusters_ is None:
        raise ValueError("Model not fitted, clusters_ is None")

    output = defaultdict(set)

    # Positions of leaf nodes within the full node ordering
    node_leaf_idx = np.where(self.is_leaf_ == 1)[0]

    clusters = self.clusters_[node_leaf_idx] if leaves_only else self.clusters_

    # (row, feature) pairs where a feature is assigned to a node
    assignments = np.argwhere(clusters)

    for row_idx, feature_idx in assignments:
        # BUG FIX: when leaves_only is True the rows of `clusters` are
        # positions within the leaf subset, not node IDs; map them back to
        # real node IDs so the keys match the documented contract (and the
        # keying used by top_features_in_nodes).
        node_id = int(node_leaf_idx[row_idx]) if leaves_only else int(row_idx)
        output[node_id].add(int(feature_idx))

    if include_outliers:
        # Features with no assignment in any (selected) node
        outliers = np.where(clusters.sum(axis=0) == 0)[0]
        output[-1] = {int(o) for o in outliers}

    return dict(output)

fit

fit(X: NDArray) -> Self

Fit HierarchicalNMF to data

Source code in src/hnmf/model.py
def fit(self, X: npt.NDArray) -> Self:
    """
    Fit `HierarchicalNMF` to data

    Grows a binary tree of rank-2 NMF splits: starting from a single root
    over all features, the leaf with the highest split priority is
    repeatedly factorized into two children until ``k`` leaf clusters
    exist or no leaf has a positive priority (early stop).

    Parameters
    ----------
    X
        Data matrix of shape (n_samples, n_features).

    Returns
    -------
    Self
        The fitted estimator, with ``tree_``, ``splits_``, ``is_leaf_``,
        ``clusters_``, ``Ws_``, ``Hs_``, ``W_buffer_``, ``H_buffer_`` and
        ``priorities_`` populated.
    """
    shape: tuple[int, int] = X.shape
    n_samples, n_features = shape
    self.n_samples_ = n_samples
    self.n_features_ = n_features

    # TODO Expect different sized ranks
    # k leaves require k-1 splits, each creating 2 nodes -> 2*(k-1) slots;
    # unused slots are trimmed if splitting stops early.
    clusters: list[npt.NDArray[np.int64] | None] = [None] * (2 * (self.k - 1))
    Ws = [None] * (2 * (self.k - 1))
    Hs = [None] * (2 * (self.k - 1))
    W_buffer = [None] * (2 * (self.k - 1))
    H_buffer = [None] * (2 * (self.k - 1))
    priorities = np.zeros(2 * (self.k - 1), dtype=self.dtype)
    # NOTE(review): np.bool is only valid on NumPy >= 2.0 (the alias was
    # removed in the 1.2x series) — confirm the project's NumPy pin.
    is_leaf = np.zeros(2 * (self.k - 1), dtype=np.bool)  # No leaves at start
    # tree[:, parent] holds the two child node ids of `parent`
    tree = np.zeros((2, 2 * (self.k - 1)), dtype=np.int64)
    # splits[i] records which node was split at step i (-1 = unused slot)
    splits = -np.ones(self.k - 1, dtype=np.int64)

    # Where X has at least one non-zero
    term_subset = np.flatnonzero(np.sum(X, axis=1))

    W, H = self._init_fit(X, term_subset)

    # Number of node slots consumed so far
    result_used = 0

    with ProgressTree() as pt:
        for i in range(self.k - 1):
            if i == 0:
                # First split: partition the (implicit) root over all features
                split_node = 0
                new_nodes = [0, 1]
                min_priority = 1e40
                split_subset = np.arange(n_features)
            else:
                leaves = np.where(is_leaf == 1)[0]
                temp_priority = priorities[leaves]

                if len(np.where(temp_priority > 0)[0]) > 0:
                    min_priority = np.min(temp_priority[temp_priority > 0])
                    # Split the leaf with the highest priority next
                    split_node = np.argmax(temp_priority)
                else:  # There are no more candidates stop early
                    min_priority = -1
                    split_node = 0

                if temp_priority[split_node] < 0 or min_priority == -1:
                    logger.warning(
                        f"Cannot generate all {self.k} leaf clusters, stopping at {i} leaf clusters"
                    )

                    # Drop unused (None) slots before materializing arrays
                    Ws = [i for i in Ws if i is not None]
                    W_buffer = [i for i in W_buffer if i is not None]

                    Hs = [i for i in Hs if i is not None]
                    H_buffer = [i for i in H_buffer if i is not None]

                    # Resize attributes
                    tree = tree[:, :result_used]
                    splits = splits[:result_used]
                    is_leaf = is_leaf[:result_used]
                    clusters = clusters[:result_used]
                    priorities = priorities[:result_used]

                    self.tree_ = tree.T
                    self.splits_ = splits
                    self.is_leaf_ = is_leaf
                    self.n_nodes_ = self.is_leaf_.shape[0]
                    self.n_leaves_ = int(np.count_nonzero(self.is_leaf_))
                    self.clusters_ = self._stack_clusters(clusters)
                    self.Ws_ = np.array(Ws)
                    self.Hs_ = np.array(Hs)
                    self.W_buffer_ = np.array(W_buffer)
                    self.H_buffer_ = self._stack_H_buffer(H_buffer)
                    self.priorities_ = priorities
                    return self

                split_node = leaves[split_node]  # Attempt to split this node
                is_leaf[split_node] = 0
                # Resume from the trial factorization cached for this node
                W = W_buffer[split_node]
                H = H_buffer[split_node]

                # Find which features are clustered on this node
                split_subset = clusters[split_node]
                new_nodes = [result_used, result_used + 1]
                tree[:, split_node] = new_nodes

            result_used += 2
            # For each row find where it is more greatly represented
            cluster_subset = np.argmax(H, axis=0)

            subset_0 = np.flatnonzero(cluster_subset == 0)
            subset_1 = np.flatnonzero(cluster_subset == 1)
            ls0 = len(subset_0)
            ls1 = len(subset_1)

            if i == 0:
                pt.add_branch("Root", new_nodes[0], ls0)
                pt.add_branch("Root", new_nodes[1], ls1)
            else:
                pt.add_branch(split_node, new_nodes[0], ls0)
                pt.add_branch(split_node, new_nodes[1], ls1)

            # Children inherit the parent's features, partitioned by argmax
            clusters[new_nodes[0]] = split_subset[subset_0]
            clusters[new_nodes[1]] = split_subset[subset_1]
            Ws[new_nodes[0]] = W[:, 0]
            Ws[new_nodes[1]] = W[:, 1]

            # These will not have shape of (2, n_features) because they are fitting a subset
            # Create zero filled array of shape (2, n_features)
            h_temp = np.zeros(shape=(2, self.n_features_), dtype=self.dtype)
            # Which features are present in H

            h_temp[0, split_subset] = H[0]
            h_temp[1, split_subset] = H[1]

            Hs[new_nodes[0]] = h_temp[0]
            Hs[new_nodes[1]] = h_temp[1]

            splits[i] = split_node
            is_leaf[new_nodes] = 1

            # Trial-split each new child: computes its split priority and
            # caches the rank-2 factors for when/if it is split later.
            subset = clusters[new_nodes[0]]
            (
                subset,
                W_buffer_one,
                H_buffer_one,
                priority_one,
            ) = trial_split_sklearn(
                min_priority=min_priority,
                X=X,
                subset=subset,  # ty:ignore[invalid-argument-type]
                W_parent=W[:, 0],
                random_state=self.random_state,
                trial_allowance=self.trial_allowance,
                unbalanced=self.unbalanced,
                dtype=self.dtype,
                tol=self.tol,
                maxiter=self.maxiter,
                init=self.init,
                alpha_W=self.alpha_W,
                alpha_H=self.alpha_H,
            )
            clusters[new_nodes[0]] = subset
            W_buffer[new_nodes[0]] = W_buffer_one
            H_buffer[new_nodes[0]] = H_buffer_one
            priorities[new_nodes[0]] = priority_one

            # Same trial split for the second child
            subset = clusters[new_nodes[1]]
            (
                subset,
                W_buffer_one,
                H_buffer_one,
                priority_one,
            ) = trial_split_sklearn(
                min_priority=min_priority,
                X=X,
                subset=subset,  # ty:ignore[invalid-argument-type]
                W_parent=W[:, 1],
                random_state=self.random_state,
                trial_allowance=self.trial_allowance,
                unbalanced=self.unbalanced,
                dtype=self.dtype,
                tol=self.tol,
                maxiter=self.maxiter,
                init=self.init,
                alpha_W=self.alpha_W,
                alpha_H=self.alpha_H,
            )
            clusters[new_nodes[1]] = subset
            W_buffer[new_nodes[1]] = W_buffer_one
            H_buffer[new_nodes[1]] = H_buffer_one
            priorities[new_nodes[1]] = priority_one
    self.tree_ = tree.T
    self.splits_ = splits
    self.is_leaf_ = is_leaf
    self.clusters_ = self._stack_clusters(clusters)
    self.Ws_ = np.array(Ws)
    self.Hs_ = np.array(Hs)
    self.W_buffer_ = np.array(W_buffer)
    self.H_buffer_ = self._stack_H_buffer(H_buffer)
    self.priorities_ = priorities
    self.n_nodes_ = self.is_leaf_.shape[0]
    self.n_leaves_ = int(np.count_nonzero(self.is_leaf_))
    return self

top_discriminative_samples_in_node

top_discriminative_samples_in_node(
    node: int,
    n: int = 10,
    sign: Literal["positive", "negative", "abs"] = "abs",
) -> list[DiscriminatedSample]

Returns samples with the largest weight difference between the specified node and the mean of all other nodes, identifying samples most characteristic of this node.

Parameters:

Name Type Description Default
node int

The index of the node to compute discriminative samples for

required
n int

The number of samples to return

10
sign Literal['positive', 'negative', 'abs']

One of ['positive', 'negative', 'abs'].

'abs'
Source code in src/hnmf/model.py
def top_discriminative_samples_in_node(
    self,
    node: int,
    n: int = 10,
    sign: Literal["positive", "negative", "abs"] = "abs",
) -> "list[DiscriminatedSample]":
    """Returns samples with the largest weight difference between the specified node and the mean of all other nodes, identifying samples most characteristic of this node.

    Parameters
    ----------
    node
        The index of the node to compute discriminative samples for
    n
        The number of samples to return
    sign
        One of `['positive', 'negative', 'abs']`. `'abs'` ranks by absolute
        difference, `'positive'` by (node - others), `'negative'` by
        (others - node).

    """

    if self.Ws_ is None:
        raise ValueError("Model not fitted, Ws_ is None")
    if sign not in ("positive", "negative", "abs"):
        raise ValueError("Sign must be one of 'positive', 'negative' or 'abs'")

    # Masks: this node vs every other node
    member_mask = np.array(node, dtype=np.int64)
    non_member_mask = np.array(
        [x for x in np.arange(0, self.n_nodes_) if x != node]
    )

    member_values = self.Ws_[member_mask].ravel()
    other_means = self.Ws_[non_member_mask].mean(axis=0)

    # BUG FIX: the original conditional tested `sign == "positive"` twice, so
    # 'positive' got the absolute difference while 'abs' and 'negative' both
    # fell through to (others - node). Branch explicitly per mode instead.
    if sign == "abs":
        diffs = np.abs(member_values - other_means)
    elif sign == "positive":
        diffs = member_values - other_means
    else:  # "negative"
        diffs = other_means - member_values

    # Indices of the n largest differences, descending
    diff_tops = diffs.argsort()[::-1][:n]

    return [
        DiscriminatedSample(
            sample=diff,
            node=node,
            node_value=member_values[diff],
            others_value=other_means[diff],
        )
        for diff in diff_tops
    ]

top_features_in_node

top_features_in_node(
    node: int, n: int = 10
) -> list[tuple[int, float]]

Returns list of (feature_idx, weight) tuples showing the top N features for a given node, sorted descending by weight from Hs_ matrix.

Source code in src/hnmf/model.py
def top_features_in_node(self, node: int, n: int = 10) -> list[tuple[int, float]]:
    """Return the top ``n`` features of ``node`` as ``(feature_idx, weight)``
    tuples, highest weight first, taken from the ``Hs_`` matrix; zero-weight
    features are omitted."""

    if self.Hs_ is None:
        raise ValueError("Model not fitted, Hs_ is None")

    weights = self.Hs_[node]
    # Feature indices ordered by descending weight, truncated to n
    order = np.argsort(weights)[::-1]
    top = order[:n]
    return [(int(idx), float(weights[idx])) for idx in top if weights[idx] > 0]

top_features_in_nodes

top_features_in_nodes(
    n: int = 10,
    leaves_only: bool = True,
    min_samples: int = 0,
) -> dict[int, list[tuple[int, float]]]

Returns mapping of each node to its top N features ranked by weight. Each node maps to a list of (feature_idx, weight) tuples sorted descending by weight from Hs_ matrix.

Source code in src/hnmf/model.py
def top_features_in_nodes(self, n: int = 10, leaves_only: bool = True, min_samples: int = 0) -> dict[int, list[tuple[int, float]]]:
    """Returns mapping of each node to its top N features ranked by weight. Each node maps to a list of (feature_idx, weight) tuples sorted descending by weight from Hs_ matrix."""

    if self.Hs_ is None:
        raise ValueError("Model not fitted, Hs_ is None")

    node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
    output = {}
    weights = self.Hs_

    node_tops = weights.argsort()[:, ::-1][:, :n]
    node_top_weights = np.take_along_axis(weights, node_tops, axis=1)

    for node_idx, (feature_ids, feature_weights) in enumerate(
        zip(node_tops, node_top_weights, strict=True)
    ):
        if leaves_only and node_idx not in node_leaf_idx:
            continue

        if min_samples > 0:
            sample_count = np.count_nonzero(self.Ws_[node_idx])
            if sample_count < min_samples:
                continue

        tops = [
            (int(feature_id), float(weight))
            for feature_id, weight in zip(feature_ids, feature_weights, strict=True)
            if weight > 0
        ]
        tops.sort(key=itemgetter(1), reverse=True)
        output[node_idx] = tops

    return output

top_nodes_in_feature

top_nodes_in_feature(
    feature_idx: int | str,
    n: int = 10,
    leaves_only: bool = True,
) -> list[tuple]

Returns the top nodes for a specified feature

Source code in src/hnmf/model.py
def top_nodes_in_feature(
    self,
    feature_idx: int | str,
    n: int = 10,
    leaves_only: bool = True,
) -> list[tuple]:
    """
    Returns the top nodes for a specified feature
    """
    if self.Hs_ is None:
        raise ValueError("Model not fitted, Hs_ is None")

    node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
    node_weights = self.Hs_.T[feature_idx]
    ranks = node_weights.argsort()[::-1]
    if leaves_only:
        ranks = ranks[np.isin(ranks, node_leaf_idx)]

    ranks = ranks[:n]

    return [(i, node_weights[i]) for i in ranks if node_weights[i] > 0]

top_nodes_in_samples

top_nodes_in_samples(
    n: int = 10, leaves_only: bool = True
) -> dict[int, list[tuple[int, float]]]

Returns mapping of each sample to its top N nodes ranked by weight. Each sample maps to a list of (node_id, weight) tuples sorted descending. Based on soft ranking from Ws_ matrix.

Source code in src/hnmf/model.py
def top_nodes_in_samples(
    self, n: int = 10, leaves_only: bool = True
) -> dict[int, list[tuple[int, float]]]:
    """Returns mapping of each sample to its top N nodes ranked by weight. Each sample maps to a list of (node_id, weight) tuples sorted descending.
    Based on soft ranking from Ws_ matrix."""

    if self.Ws_ is None or self.n_nodes_ is None:
        raise ValueError("Model not fitted, Ws_ is None")

    # Idx of leaves
    node_leaf_idx = np.where(self.is_leaf_ == 1)[0]
    # Keep map of enumerated -> actual cluster
    if leaves_only:
        node_map = dict(enumerate(node_leaf_idx))
    else:
        node_map = dict(enumerate(range(self.n_nodes_)))

    # A dictionary of {sample : [top_nodes]}

    output = {}

    # Ws_ is shape n_nodes, n_samples
    # Transpose weights so it has samples as rows, nodes as columns

    weights = self.Ws_.T[node_leaf_idx].T if leaves_only else self.Ws_.T

    # The ellipsis indicates that the selection is done row wise
    sample_tops = weights.argsort()[:, ::-1][:, :n]

    # Create an array with samples as rows, top n weights as columns
    sample_top_weights = np.take_along_axis(weights, sample_tops, axis=1)

    for sample_idx, (node_ids, node_weights) in enumerate(
        zip(sample_tops, sample_top_weights, strict=True)
    ):
        tops = [
            (node_map[node_id], weight)
            for node_id, weight in zip(node_ids, node_weights, strict=True)
            if weight > 0
        ]
        tops.sort(key=itemgetter(1), reverse=True)
        output[sample_idx] = tops

    return output

top_samples_in_nodes

top_samples_in_nodes(
    n: int = 10, leaves_only: bool = True
) -> dict[int, list[tuple[int, float]]]

Returns mapping of each node to its top N samples ranked by weight. Each node maps to a list of (sample_id, weight) tuples sorted descending. Based on soft ranking from Ws_ matrix.

Source code in src/hnmf/model.py
def top_samples_in_nodes(self, n: int = 10, leaves_only: bool = True) -> dict[int, list[tuple[int, float]]]:
    """Returns mapping of each node to its top N samples ranked by weight. Each node maps to a list of (sample_id, weight) tuples sorted descending. Based on soft ranking from Ws_ matrix."""

    if self.Ws_ is None:
        raise ValueError("Model not fitted, Ws_ is None")

    # Idx of leaves
    node_leaf_idx = np.where(self.is_leaf_ == 1)[0]

    # A dictionary of {nodes : [sample]}

    output = {}

    # Ws_ is shape n_nodes, n_samples

    weights = self.Ws_

    # The ellipsis indicates that the selection is done row wise
    node_tops = weights.argsort()[:, ::-1][:, :n]

    # Create an array with samples as rows, top n weights as columns
    node_top_weights = np.take_along_axis(weights, node_tops, axis=1)

    for node_idx, (sample_ids, sample_weights) in enumerate(
        zip(node_tops, node_top_weights, strict=True)
    ):
        if leaves_only and node_idx not in node_leaf_idx:
            continue
        tops = [
            (sample_id, weight)
            for sample_id, weight in zip(sample_ids, sample_weights, strict=True)
            if weight > 0
        ]
        tops.sort(key=itemgetter(1), reverse=True)
        # Decode samples if available

        output[node_idx] = tops

    return output

hNMF.helpers

hnmf.helpers

nmfsh_comb_rank2

nmfsh_comb_rank2(
    A: NDArray,
    Winit: NDArray,
    Hinit: NDArray,
    anls_alg: AnlsAlgorithm,
    vec_norm: float,
    normW: bool,
    tol: float,
    maxiter: int,
    dtype: DTypeLike,
) -> tuple[npt.NDArray, npt.NDArray]
Source code in src/hnmf/helpers.py
def nmfsh_comb_rank2(
    A: npt.NDArray,
    Winit: npt.NDArray,
    Hinit: npt.NDArray,
    anls_alg: AnlsAlgorithm,
    vec_norm: float,
    normW: bool,
    tol: float,
    maxiter: int,
    dtype: npt.DTypeLike,
) -> tuple[npt.NDArray, npt.NDArray]:
    """Rank-2 NMF of ``A`` via alternating nonnegative least squares.

    Parameters
    ----------
    A
        Matrix to factorize, shape (m, n); may be scipy-sparse.
    Winit, Hinit
        Initial factors of shape (m, 2) and (2, n).
    anls_alg
        ANLS solver called as ``anls_alg(gram, rhs, current, dtype)`` for
        each alternating update.
    vec_norm
        Exponent of the norm used to rescale the factors after the loop;
        0 disables rescaling.
    normW
        If True, normalize W's columns (scaling H's rows to compensate);
        otherwise normalize H's rows.
    tol
        Relative projected-gradient tolerance for early stopping.
    maxiter
        Maximum number of alternating iterations.
    dtype
        dtype for freshly allocated factor matrices.

    Returns
    -------
    tuple[npt.NDArray, npt.NDArray]
        The factors ``(W, H)``.
    """
    eps = 1e-6
    shape: tuple[int, int] = A.shape
    m, n = shape
    W, H = Winit, Hinit

    if W.shape[1] != 2:
        warnings.warn(
            f"Error: Wrong size of W! Expected shape of (n, 2) but received W of shape ({W.shape[0]}, {W.shape[1]})",
            stacklevel=2,
        )

    if H.shape[0] != 2:
        warnings.warn(
            f"Error: Wrong size of H! Expected shape of (2, n) but received H of shape ({H.shape[0]}, {H.shape[1]})",
            stacklevel=2,
        )

    def _rank1_svd_fallback() -> tuple[npt.NDArray, npt.NDArray]:
        # The 2x2 Gram matrix is singular, so the rank-2 ANLS update is
        # ill-posed; fall back to a rank-1 factorization built from the
        # leading singular vectors (second component left as zeros).
        W_f = np.zeros((m, 2), dtype=dtype)
        H_f = np.zeros((2, n), dtype=dtype)
        if sp.issparse(A):
            U, _S, V = svd(A.toarray(), full_matrices=False)  # type: ignore[attr-defined]  # A can be sparse
        else:
            U, _S, V = svd(A, full_matrices=False)
        U, V = U[:, 0], V[0, :]
        # Flip signs so the leading component is predominantly nonnegative
        if sum(U) < 0:
            U, V = -U, -V

        W_f[:, 0] = U
        H_f[0, :] = V
        return W_f, H_f

    left = H.dot(H.T)
    right = A.dot(H.T)
    # Reference gradient norm captured on the first iteration.
    # BUG FIX: this was previously reset to 1 at the top of every iteration,
    # so the stopping test compared against 1 instead of the initial
    # projected-gradient norm, making `tol` effectively absolute.
    initgrad = 1.0
    for iter_ in range(maxiter):
        if matrix_rank(left) < 2:
            return _rank1_svd_fallback()

        W = anls_alg(left, right, W, dtype)
        norms_W = norm(W, axis=0)
        if np.min(norms_W) < eps:
            logger.warning("Error: Some column of W is essentially zero")

        # Normalize W's columns before updating H
        W *= 1.0 / norms_W
        left = W.T.dot(W)
        right = A.T.dot(W)
        if matrix_rank(left) < 2:
            return _rank1_svd_fallback()

        H = anls_alg(left, right, H.T, dtype).T
        gradH = left.dot(H) - right.T
        left = H.dot(H.T)
        right = A.dot(H.T)
        gradW = W.dot(left) - right
        # Projected gradient norm restricted to the feasible/active set
        gradW_square = np.sum(np.power(gradW[np.logical_or(gradW <= 0, W > 0)], 2))
        gradH_square = np.sum(np.power(gradH[np.logical_or(gradH <= 0, H > 0)], 2))
        if iter_ == 0:
            initgrad = np.sqrt(gradW_square + gradH_square)
            continue
        projnorm = np.sqrt(gradW_square + gradH_square)

        if projnorm < tol * initgrad:
            break

    if vec_norm != 0:
        if normW:
            # Normalize W's columns, pushing the scale into H's rows
            norms = np.power(np.sum(np.power(W, vec_norm), axis=0), 1 / vec_norm)
            W /= norms
            H *= norms[:, None]
        else:
            # Normalize H's rows, pushing the scale into W's columns.
            # BUG FIX: `H /= norms` divided a (2, n) array by a (2,) vector,
            # which only broadcasts when n == 2; divide row-wise instead.
            norms = np.power(np.sum(np.power(H, vec_norm), axis=1), 1 / vec_norm)
            W *= norms[None, :]
            H /= norms[:, None]

    return W, H