Source code for seqme.metrics.diversity
from typing import Literal
import numpy as np
from polyleven import levenshtein
from seqme.core.base import Metric, MetricResult
[docs]
class Diversity(Metric):
    """
    Diversity of a set of sequences, scored by normalized pairwise Levenshtein distance.

    The score reflects how far apart the sequences are from one another in sequence
    space: a higher score means the sequences differ more, a lower score means they
    are more similar or redundant.
    """

    def __init__(
        self,
        k: int | None = None,
        *,
        seed: int = 0,
        name: str = "Diversity",
    ):
        """
        Set up the metric.

        Args:
            k: If not ``None``, each sequence is compared against ``k`` randomly sampled other sequences.
            seed: Seed that makes the sampling deterministic. Only used if ``k`` is not ``None``.
            name: Metric name.

        Raises:
            ValueError: If ``k`` is given and is not strictly positive.
        """
        self.k = k
        self.seed = seed
        self._name = name

        if k is not None and k <= 0:
            raise ValueError("Expected k > 0.")

    def __call__(self, sequences: list[str]) -> MetricResult:
        """
        Score the diversity of ``sequences``.

        Note: For a large number of ``sequences``, a small value for ``k`` (e.g., 10) usually provides a stable approximation of the diversity.

        Args:
            sequences: Sequences to evaluate.

        Returns:
            MetricResult: Diversity score.
        """
        return MetricResult(compute_diversity(sequences, k=self.k, seed=self.seed))

    @property
    def name(self) -> str:
        """Name this metric was configured with."""
        return self._name

    @property
    def objective(self) -> Literal["minimize", "maximize"]:
        """Optimization direction; diversity is to be maximized."""
        return "maximize"
def compute_diversity(
sequences: list[str],
*,
k: int | None = None,
seed: int = 0,
) -> float:
"""
Compute diversity.
Args:
sequences: Sequences to compute diversity on.
k: If not ``None`` randomly sample ``k`` other sequences to compute diversity against.
seed: For deterministic sampling. Only used if k is not ``None``.
Returns:
Diversity.
"""
if k:
rng = np.random.default_rng(seed)
divs = []
for i, sequence in enumerate(sequences):
others = sequences[:i] + sequences[i + 1 :]
if k and k < len(others):
idxs = rng.choice(np.arange(len(others)), size=k, replace=False)
others = [others[i] for i in idxs]
norms = np.maximum(len(sequence), [len(seq) for seq in others])
edits = np.array([levenshtein(sequence, seq) for seq in others])
norm_edits = edits / norms
div = norm_edits.mean()
divs.append(div)
return np.mean(divs).item()