Source code for seqme.utils.sequences
import random
import numpy as np
[docs]
def shuffle_characters(sequences: list[str], seed: int | None = 0) -> list[str]:
"""
Randomly shuffle characters within each sequence.
Args:
sequences: Sequences to shuffle.
seed: Local seed when sampling. If ``None``, no fixed local seed is used.
Returns:
A new list where each sequences characters have been shuffled.
"""
rng = random.Random(seed)
shuffled = []
for seq in sequences:
chars = list(seq)
rng.shuffle(chars)
shuffled.append("".join(chars))
return shuffled
[docs]
def subsample(
sequences: list[str],
n_samples: int,
*,
return_indices: bool = False,
seed: int | None = 0,
) -> list[str] | tuple[list[str], np.ndarray]:
"""
Sample a subset of the sequences without replacement.
Args:
sequences: Sequences to sample from.
n_samples: Number of sequences to sample.
return_indices: If ``True``, return a tuple of the sequence subset and indices else return only the sequence subset.
seed: Local seed when sampling. If ``None``, no fixed local seed is used.
Returns:
A list of ``n_samples`` randomly chosen sequences. Optionally, including the indices.
Raises:
ValueError: If ``n_samples`` exceeds the number of available sequences.
"""
if n_samples > len(sequences):
raise ValueError(f"Cannot sample {n_samples} sequences from a list of length {len(sequences)}.")
rng = np.random.default_rng(seed)
indices = rng.choice(np.arange(len(sequences), dtype=int), size=n_samples, replace=False)
subset = [sequences[idx] for idx in indices]
if return_indices:
return subset, indices
return subset