Beyond proteins primary structure

Beyond proteins primary structure#

In this notebook, we show how to evaluate (and visualize) sequences when we also have access to their 3D-structure.

Protein’s 3D-structure may be available from an online database and stored in a PDB-file. However, since the primary purpose of seqme is to evaluate novel proteins, we will assume that the 3D-structure is not available. Thus, we will use ESM-fold to predict the protein’s 3D-structure. ESM-fold is included in seqme.

# !pip seqme[esmfold, esmif1] BioPython py3Dmol
from collections.abc import Callable
from functools import partial
from typing import Literal

import numpy as np
import py3Dmol
from Bio.Align import PairwiseAligner

import seqme as sm

Hide code cell content

# IMPORTANT! Don't use this to compute RMSD (we use it to stay BSD 3-clause compatible).


def transform(coords1: np.ndarray, coords2: np.ndarray, *, with_scale: bool = True, return_params: bool = False):
    """Align coords2 to coords1 (both arrays shape (N,3) or (N,d)).

    Minimizes least-squares error for rotation, translation and optional uniform scale.

    Args:
        coords1: target points, shape (N, d)
        coords2: source points to transform, shape (N, d)
        with_scale: if True, estimate uniform scale; if False, force scale=1
        return_params: if True, return (aligned_coords2, R, t, s); otherwise just aligned coords

    Returns:
        aligned_coords2 (and optionally) rotation matrix R (dxd), translation vector t (d,),
        scale s (float).
    """
    A = np.asarray(coords1, dtype=float)
    B = np.asarray(coords2, dtype=float)

    if A.shape != B.shape:
        raise ValueError("coords1 and coords2 must have the same shape")

    N, d = A.shape
    if N == 0:
        raise ValueError("need at least one point")

    # centroids
    mu_A = A.mean(axis=0)
    mu_B = B.mean(axis=0)

    AA = A - mu_A
    BB = B - mu_B

    # covariance / cross-covariance
    cov = (AA.T @ BB) / N

    # SVD
    U, D, Vt = np.linalg.svd(cov)
    V = Vt.T

    # handle reflection
    S = np.eye(d)
    if np.linalg.det(U) * np.linalg.det(V) < 0:
        S[-1, -1] = -1

    R = U @ S @ V.T

    if with_scale:
        var_B = (BB * BB).sum() / N
        # trace(D * S) is sum(D * diag(S))
        s = float(np.sum(D * np.diag(S)) / var_B)
    else:
        s = 1.0

    t = mu_A - s * (R @ mu_B)

    transformed = (s * (R @ B.T)).T + t

    if return_params:
        return transformed, R, t, s
    return transformed


def rmsd(a: np.ndarray, b: np.ndarray) -> float:
    a = np.asarray(a)
    b = np.asarray(b)
    return float(np.sqrt(((a - b) ** 2).sum() / a.shape[0]))


def indices(s1: str, s2: str) -> np.ndarray:
    res = []
    i = 0
    for c1, c2 in zip(s1, s2, strict=True):
        if c1 != "-":
            if c2 != "-":
                res.append(i)
            i += 1
    return np.array(res, dtype=np.int32)


def compute_rmsd(coords1: np.ndarray, coords2: np.ndarray, seq1: str, seq2: str) -> float:
    align = PairwiseAligner().align(seq1, seq2)[0]
    a_seq1, a_seq2 = align[0], align[1]
    coords1 = coords1[indices(a_seq1, a_seq2)]
    coords2 = coords2[indices(a_seq2, a_seq1)]

    coords2_aligned = transform(coords1, coords2)
    return rmsd(coords1, coords2_aligned)

Let’s define a metric which uses atomic positions. Here we use RMSD.

class RMSD(sm.Metric):
    """Root mean square deviation of atomic positions."""

    def __init__(self, reference: str, folder: Callable[[list[str]], np.ndarray]):
        self.reference = reference
        self.folder = folder

    def __call__(self, sequences: list[str]) -> sm.MetricResult:
        ref_coords = self.folder([self.reference])[0]
        sequences_coords = self.folder(sequences)

        scores = np.array(
            [
                compute_rmsd(seq_coords, ref_coords, seq, self.reference)
                for seq, seq_coords in zip(sequences, sequences_coords, strict=True)
            ]
        )

        return sm.MetricResult(scores.mean().item())

    @property
    def name(self) -> str:
        return "RMSD"

    @property
    def objective(self) -> Literal["minimize", "maximize"]:
        return "minimize"

Let’s define our protein folding model.

cache = sm.Cache(models={"esm-fold": partial(sm.models.ESMFold().fold, convention="atom37", compute_ptm=True)})

Hide code cell output

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Note: We will only extract the position of the amino-acids “Cα” from the fold prediction to compute the RMSD metric.

esm_fold = cache.model("esm-fold", stack=False)

ptm_fn = lambda sequences: np.array([fold["ptm"] for fold in esm_fold(sequences)])
positions_fn = lambda sequences: [fold["positions"][:, 1, :] for fold in esm_fold(sequences)]  # CA's index = 1
plddt_fn = lambda sequences: np.array([fold["plddt"].mean() for fold in esm_fold(sequences)])

Let’s also compute the self-consistency perplexity (scPerplexity). To do so, we need a folding model (we again use ESM-fold) and an inverse-folding model (we now also use ESM-IF1).

ESM-IF1 expects the coordinates of the proteins amino-acids atoms: N, CA, C. So we extract those from ESM-Fold’s predictions below first.

# Protein folding
atom_indices = [0, 1, 2]  # atoms: N, CA, C
folder = lambda sequences: [fold["positions"][:, atom_indices, :] for fold in esm_fold(sequences)]

# Inverse protein folding
inverse_folder = sm.models.ESMIF1().compute_perplexity

cache.add("scPerplexity", lambda sequences: inverse_folder(folder(sequences), sequences))
/Users/rasmus.larsen/work/hackathon-2025/seqme/.conda/lib/python3.11/site-packages/esm/pretrained.py:215: UserWarning: Regression weights not found, predicting contacts will not produce correct results.
  warnings.warn(

Notice esm-fold is stored in the cache and we reuse it for ptm, plddt, positions and scPerplexity. Hence, we only call esm-fold once per sequence in total!

Let’s create the metric and sequences. We will evaluate 4 proteins.

sequences = {
    "SAV_STRAV": [
        "MRKIVVAAIAVSLTTVSITASASADPSKDSKAQVSAAEAGITGTWYNQLGSTFIVTAGADGALTGTYESAVGNAESRYVLTGRYDSAPATDGSGTALGWTVAWKNNYRNAHSATTWSGQYVGGAEARINTQWLLTSGTTEANAWKSTLVGHDTFTKVKPSAASIDAAKKAGVNNGNPLDAVQQ"
    ],
    "AVID_CHICK": [
        "MVHATSPLLLLLLLSLALVAPGLSARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTINKRTQPTFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE"
    ],
    "GNAT1_HUMAN": [
        "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF"
    ],
}

sequences["GNAT1_HUMAN (shuffled)"] = sm.utils.shuffle_characters(sequences["GNAT1_HUMAN"])
metrics = [
    RMSD(reference=sequences["SAV_STRAV"][0], folder=positions_fn),
    sm.metrics.ID(predictor=ptm_fn, name="pTM", objective="maximize"),
    sm.metrics.ID(predictor=plddt_fn, name="pLDDT", objective="maximize"),
    sm.metrics.ID(predictor=cache.model("scPerplexity"), name="scPerplexity", objective="minimize"),
]

Let’s compute the metrics.

df = sm.evaluate(sequences, metrics)
100%|██████████| 16/16 [11:21<00:00, 42.60s/it, data=GNAT1_HUMAN (shuffled), metric=scPerplexity]
sm.show(df, color_style="bar")
  RMSD↓ pTM↑ pLDDT↑ scPerplexity↓
SAV_STRAV 0.00 0.74 0.82 4.38
AVID_CHICK 15.98 0.71 0.79 5.19
GNAT1_HUMAN 20.24 0.95 0.97 2.74
GNAT1_HUMAN (shuffled) 20.08 0.16 0.27 13.51

Let’s visualize the 3D-structure of a protein.

def visualize_structure(
    pdb: str,
    min_plddt: float = 0.0,
    max_plddt: float = 1.0,
    rotation: float = 0,
    zoom: float | None = None,
    width: int = 400,
    height: int = 400,
):
    """Visualize 3D-structure and color amino-acids by confidence (plddt)."""
    view = py3Dmol.view(js="https://3dmol.org/build/3Dmol.js", width=width, height=height)
    view.addModel(pdb, "pdb")
    view.setStyle({"cartoon": {"colorscheme": {"prop": "b", "gradient": "roygb", "min": min_plddt, "max": max_plddt}}})
    view.rotate(rotation)
    view.zoom(zoom) if zoom is not None else view.zoomTo()
    view.show()
name = "GNAT1_HUMAN"

folds = {name: esm_fold(seqs) for name, seqs in sequences.items()}
pdb = folds[name][0]["pdb"]

visualize_structure(pdb, rotation=90, min_plddt=0.8)
print(f"{pdb[:600]} \t ... etc.")
PARENT N/A
ATOM      1  N   MET A   1       6.652  17.555  65.585  1.00  0.84           N  
ATOM      2  CA  MET A   1       5.417  17.036  65.006  1.00  0.87           C  
ATOM      3  C   MET A   1       5.213  17.569  63.591  1.00  0.85           C  
ATOM      4  CB  MET A   1       4.216  17.401  65.880  1.00  0.79           C  
ATOM      5  O   MET A   1       4.859  18.735  63.408  1.00  0.74           O  
ATOM      6  CG  MET A   1       4.219  16.721  67.240  1.00  0.72           C  
ATOM      7  SD  MET A   1       2.775  17.198  68.267  1.00  0.77           S  
ATOM      8  CE  MET A 	 ... etc.

You have reached the end of this tutorial!

Last thing. To get a better idea of what metrics to use for evaluating protein sequences check out: Towards Robust Evaluation of Protein Generative Models: A Systematic Analysis of Metrics .