Beyond proteins primary structure#
In this notebook, we show how to evaluate (and visualize) sequences when we also have access to their 3D-structure.
Protein’s 3D-structure may be available from an online database and stored in a PDB-file. However, since the primary purpose of seqme is to evaluate novel proteins, we will assume that the 3D-structure is not available. Thus, we will use ESM-fold to predict the protein’s 3D-structure. ESM-fold is included in seqme.
# !pip seqme[esmfold, esmif1] BioPython py3Dmol
from collections.abc import Callable
from functools import partial
from typing import Literal
import numpy as np
import py3Dmol
from Bio.Align import PairwiseAligner
import seqme as sm
Let’s define a metric which uses atomic positions. Here we use RMSD.
class RMSD(sm.Metric):
"""Root mean square deviation of atomic positions."""
def __init__(self, reference: str, folder: Callable[[list[str]], np.ndarray]):
self.reference = reference
self.folder = folder
def __call__(self, sequences: list[str]) -> sm.MetricResult:
ref_coords = self.folder([self.reference])[0]
sequences_coords = self.folder(sequences)
scores = np.array(
[
compute_rmsd(seq_coords, ref_coords, seq, self.reference)
for seq, seq_coords in zip(sequences, sequences_coords, strict=True)
]
)
return sm.MetricResult(scores.mean().item())
@property
def name(self) -> str:
return "RMSD"
@property
def objective(self) -> Literal["minimize", "maximize"]:
return "minimize"
Let’s define our protein folding model.
cache = sm.Cache(models={"esm-fold": partial(sm.models.ESMFold().fold, convention="atom37", compute_ptm=True)})
Note: We will only extract the position of the amino-acids “Cα” from the fold prediction to compute the RMSD metric.
esm_fold = cache.model("esm-fold", stack=False)
ptm_fn = lambda sequences: np.array([fold["ptm"] for fold in esm_fold(sequences)])
positions_fn = lambda sequences: [fold["positions"][:, 1, :] for fold in esm_fold(sequences)] # CA's index = 1
plddt_fn = lambda sequences: np.array([fold["plddt"].mean() for fold in esm_fold(sequences)])
Let’s also compute the self-consistency perplexity (scPerplexity). To do so, we need a folding model (we again use ESM-fold) and an inverse-folding model (we now also use ESM-IF1).
ESM-IF1 expects the coordinates of the proteins amino-acids atoms: N, CA, C. So we extract those from ESM-Fold’s predictions below first.
# Protein folding
atom_indices = [0, 1, 2] # atoms: N, CA, C
folder = lambda sequences: [fold["positions"][:, atom_indices, :] for fold in esm_fold(sequences)]
# Inverse protein folding
inverse_folder = sm.models.ESMIF1().compute_perplexity
cache.add("scPerplexity", lambda sequences: inverse_folder(folder(sequences), sequences))
/Users/rasmus.larsen/work/hackathon-2025/seqme/.conda/lib/python3.11/site-packages/esm/pretrained.py:215: UserWarning: Regression weights not found, predicting contacts will not produce correct results.
warnings.warn(
Notice esm-fold is stored in the cache and we reuse it for ptm, plddt, positions and scPerplexity. Hence, we only call esm-fold once per sequence in total!
Let’s create the metric and sequences. We will evaluate 4 proteins.
sequences = {
"SAV_STRAV": [
"MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSAPATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNNGNPLDAVQQ"
],
"AVID_CHICK": [
"MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQPTFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE"
],
"GNAT1_HUMAN": [
"MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"
],
}
sequences["GNAT1_HUMAN (shuffled)"] = sm.utils.shuffle_characters(sequences["GNAT1_HUMAN"])
metrics = [
RMSD(reference=sequences["SAV_STRAV"][0], folder=positions_fn),
sm.metrics.ID(predictor=ptm_fn, name="pTM", objective="maximize"),
sm.metrics.ID(predictor=plddt_fn, name="pLDDT", objective="maximize"),
sm.metrics.ID(predictor=cache.model("scPerplexity"), name="scPerplexity", objective="minimize"),
]
Let’s compute the metrics.
df = sm.evaluate(sequences, metrics)
100%|██████████| 16/16 [11:21<00:00, 42.60s/it, data=GNAT1_HUMAN (shuffled), metric=scPerplexity]
sm.show(df, color_style="bar")
| RMSD↓ | pTM↑ | pLDDT↑ | scPerplexity↓ | |
|---|---|---|---|---|
| SAV_STRAV | 0.00 | 0.74 | 0.82 | 4.38 |
| AVID_CHICK | 15.98 | 0.71 | 0.79 | 5.19 |
| GNAT1_HUMAN | 20.24 | 0.95 | 0.97 | 2.74 |
| GNAT1_HUMAN (shuffled) | 20.08 | 0.16 | 0.27 | 13.51 |
Let’s visualize the 3D-structure of a protein.
def visualize_structure(
pdb: str,
min_plddt: float = 0.0,
max_plddt: float = 1.0,
rotation: float = 0,
zoom: float | None = None,
width: int = 400,
height: int = 400,
):
"""Visualize 3D-structure and color amino-acids by confidence (plddt)."""
view = py3Dmol.view(js="https://3dmol.org/build/3Dmol.js", width=width, height=height)
view.addModel(pdb, "pdb")
view.setStyle({"cartoon": {"colorscheme": {"prop": "b", "gradient": "roygb", "min": min_plddt, "max": max_plddt}}})
view.rotate(rotation)
view.zoom(zoom) if zoom is not None else view.zoomTo()
view.show()
name = "GNAT1_HUMAN"
folds = {name: esm_fold(seqs) for name, seqs in sequences.items()}
pdb = folds[name][0]["pdb"]
visualize_structure(pdb, rotation=90, min_plddt=0.8)
print(f"{pdb[:600]} \t ... etc.")
PARENT N/A
ATOM 1 N MET A 1 6.652 17.555 65.585 1.00 0.84 N
ATOM 2 CA MET A 1 5.417 17.036 65.006 1.00 0.87 C
ATOM 3 C MET A 1 5.213 17.569 63.591 1.00 0.85 C
ATOM 4 CB MET A 1 4.216 17.401 65.880 1.00 0.79 C
ATOM 5 O MET A 1 4.859 18.735 63.408 1.00 0.74 O
ATOM 6 CG MET A 1 4.219 16.721 67.240 1.00 0.72 C
ATOM 7 SD MET A 1 2.775 17.198 68.267 1.00 0.77 S
ATOM 8 CE MET A ... etc.
You have reached the end of this tutorial!
Last thing. To get a better idea of what metrics to use for evaluating protein sequences check out: Towards Robust Evaluation of Protein Generative Models: A Systematic Analysis of Metrics .