Source code for seqme.metrics.jaccard_similarity

from typing import Literal

from seqme.core.base import Metric, MetricResult


class NGramJaccardSimilarity(Metric):
    r"""
    Average Jaccard similarity between each generated sequence and a reference corpus,
    based on n-grams of size ``n``, using \|A ∩ R\| / \|A ∪ R\|.

    ``A`` is the set of n-grams of a single evaluated sequence and ``R`` is the set of
    all n-grams found in the reference corpus. The per-sequence similarities are
    averaged over all evaluated sequences.

    You can choose to ``'minimize'`` (novelty) or ``'maximize'`` (overlap) via the
    ``objective`` parameter.
    """

    def __init__(
        self,
        reference: list[str],
        n: int,
        *,
        objective: Literal["minimize", "maximize"] = "minimize",
        name: str = "Jaccard-similarity",
    ):
        """Initialize the metric.

        Args:
            reference: list of strings to build the reference n-gram set.
            n: size of the n-grams.
            objective: ``"minimize"`` to reward novelty, ``"maximize"`` to reward overlap.
            name: Metric name.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("Expected n >= 1.")

        # ``self.n`` must be assigned before the reference set is built, because
        # ``_make_ngram_set`` reads it through ``_ngrams``.
        self.n = n
        self._objective = objective
        self.reference_ngrams = self._make_ngram_set(reference)
        self._name = name

    def _make_ngram_set(self, corpus: list[str]) -> set[str]:
        """Return the union of the n-gram sets of every sequence in ``corpus``."""
        all_ngrams: set[str] = set()
        for seq in corpus:
            all_ngrams |= self._ngrams(seq)
        return all_ngrams

    def _ngrams(self, seq: str) -> set[str]:
        """Return the set of distinct length-``n`` substrings of ``seq``.

        A sequence shorter than ``n`` has no n-grams and yields an empty set.
        """
        size = self.n
        # For len(seq) < size the range is empty, so the result is an empty set.
        return {seq[i : i + size] for i in range(len(seq) - size + 1)}

    def __call__(self, sequences: list[str]) -> MetricResult:
        """Compute the average Jaccard similarity between each generated sequence and a
        reference corpus, based on n-grams of size ``n``.

        A sequence contributes a similarity of 0 when it shares no n-gram with the
        reference, including the case where both its own n-gram set and the reference
        set are empty. Such sequences still count towards the average.

        Args:
            sequences: Sequences to evaluate.

        Returns:
            MetricResult: Jaccard similarity. The value is 0.0 when ``sequences`` is empty.
        """
        total = len(sequences)
        if total == 0:
            return MetricResult(0.0)

        reference = self.reference_ngrams
        reference_size = len(reference)  # loop-invariant, so computed once

        sim_sum = 0.0
        for seq in sequences:
            ngrams = self._ngrams(seq)
            shared = len(ngrams & reference)
            # |A ∪ R| = |A| + |R| - |A ∩ R|; this avoids materialising a copy of the
            # whole reference set for every evaluated sequence.
            union_size = len(ngrams) + reference_size - shared
            if union_size == 0:
                # both A and R empty → define similarity = 0
                continue
            sim_sum += shared / union_size

        return MetricResult(sim_sum / total)

    @property
    def name(self) -> str:
        """Name of the metric, as passed to the constructor."""
        return self._name

    @property
    def objective(self) -> Literal["minimize", "maximize"]:
        """Optimization direction, as passed to the constructor."""
        return self._objective