Source code for seqme.utils.sequences

import random

import numpy as np


[docs] def shuffle_characters(sequences: list[str], seed: int | None = 0) -> list[str]: """ Randomly shuffle characters within each sequence. Args: sequences: Sequences to shuffle. seed: Local seed when sampling. If ``None``, no fixed local seed is used. Returns: A new list where each sequences characters have been shuffled. """ rng = random.Random(seed) shuffled = [] for seq in sequences: chars = list(seq) rng.shuffle(chars) shuffled.append("".join(chars)) return shuffled
[docs] def subsample( sequences: list[str], n_samples: int, *, return_indices: bool = False, seed: int | None = 0, ) -> list[str] | tuple[list[str], np.ndarray]: """ Sample a subset of the sequences without replacement. Args: sequences: Sequences to sample from. n_samples: Number of sequences to sample. return_indices: If ``True``, return a tuple of the sequence subset and indices else return only the sequence subset. seed: Local seed when sampling. If ``None``, no fixed local seed is used. Returns: A list of ``n_samples`` randomly chosen sequences. Optionally, including the indices. Raises: ValueError: If ``n_samples`` exceeds the number of available sequences. """ if n_samples > len(sequences): raise ValueError(f"Cannot sample {n_samples} sequences from a list of length {len(sequences)}.") rng = np.random.default_rng(seed) indices = rng.choice(np.arange(len(sequences), dtype=int), size=n_samples, replace=False) subset = [sequences[idx] for idx in indices] if return_indices: return subset, indices return subset