Source code for seqme.models.pca

from collections.abc import Callable

import numpy as np
import sklearn.decomposition


class PCA:
    """Utility projecting sequence embeddings onto principal components."""

    def __init__(self, embedder: Callable[[list[str]], np.ndarray], reference: list[str], n_components: int):
        """Fit a principal component analysis on embedded reference sequences.

        Args:
            embedder: Function mapping a list of sequences to an array of embeddings.
            reference: Sequences whose embeddings the PCA is fitted on.
            n_components: Number of principal components to keep.

        Raises:
            ValueError: If ``n_components`` is larger than the size of the last
                axis of the reference embeddings.
        """
        self.embedder = embedder

        fit_data = self.embedder(reference)
        embedding_dim = fit_data.shape[-1]
        if embedding_dim < n_components:
            raise ValueError("n_components cannot exceed embedding dimensionality")

        # random_state is pinned to 0 for the underlying scikit-learn estimator.
        estimator = sklearn.decomposition.PCA(n_components=n_components, random_state=0)
        self.pca = estimator.fit(fit_data)

    def __call__(self, sequences: list[str]) -> np.ndarray:
        """Embed sequences and project them into the fitted PCA space.

        Args:
            sequences: Sequences to embed.

        Returns:
            A NumPy array of shape (n_sequences, n_components) containing the projected embeddings.
        """
        embeddings = self.embedder(sequences)
        return self.pca.transform(embeddings)

    @property
    def variance_explained(self) -> np.ndarray:
        """Explained variance ratio of each fitted component."""
        fitted = self.pca
        return fitted.explained_variance_ratio_