Source code for seqme.models.kmers

import numpy as np


class KmerFrequencyEmbedding:
    """Computes normalized k-mer frequency embeddings for sequences.

    Each sequence is mapped to a vector with one entry per distinct k-mer
    in the vocabulary. An entry holds the number of times that k-mer occurs
    in the sequence divided by the total number of length-``k`` windows in
    the sequence, so windows that are not in the vocabulary still count
    towards the denominator.
    """

    def __init__(self, kmers: list[str]):
        """Initialize model.

        Args:
            kmers: List of valid k-mers (same length). Repeated k-mers are
                collapsed onto the column of their first occurrence.

        Raises:
            ValueError: If ``kmers`` is empty or the k-mers differ in length.
        """
        ks = {len(s) for s in kmers}
        if not ks:
            # Without at least one k-mer there is no window size to use.
            raise ValueError("At least one kmer is required")
        if len(ks) > 1:
            raise ValueError("Not all kmers have the same length")

        self.k = next(iter(ks))
        # Number the *unique* k-mers in first-seen order so that every index
        # is a valid column of the embedding, whose width is the number of
        # unique k-mers. Enumerating the raw list would leave gaps (and
        # out-of-range indices) whenever a k-mer is repeated.
        self.kmer_to_idx = {
            kmer: idx for idx, kmer in enumerate(dict.fromkeys(kmers))
        }

    def __call__(self, sequences: list[str]) -> np.ndarray:
        """Embed a list of sequences as k-mer frequency vectors.

        Args:
            sequences: Sequences to embed.

        Returns:
            A NumPy array of shape (n_sequences, total_kmers) containing
            the embeddings. An empty input yields shape (0, total_kmers).
        """
        rows = [self._embed(seq, self.kmer_to_idx, self.k) for seq in sequences]
        if not rows:
            # np.array([]) would be 1-D with shape (0,); keep the 2-D contract.
            return np.zeros((0, len(self.kmer_to_idx)))
        return np.array(rows)

    def _embed(self, sequence: str, kmer_to_idx: dict[str, int], k: int) -> np.ndarray:
        """Embed one sequence as a k-mer frequency vector.

        Args:
            sequence: Sequence to embed.
            kmer_to_idx: Mapping from k-mer to its column in the embedding.
            k: Window length.

        Returns:
            A 1-D array of length ``len(kmer_to_idx)``. It is all zeros when
            the sequence has no length-``k`` window.
        """
        embedding = np.zeros(len(kmer_to_idx))
        # Number of sliding windows; clamped to zero for short sequences.
        total = max(len(sequence) - k + 1, 0)
        if total == 0:
            return embedding

        for start in range(total):
            idx = kmer_to_idx.get(sequence[start : start + k])
            if idx is not None:
                embedding[idx] += 1
        # Normalize by all windows, including those outside the vocabulary.
        return embedding / total