Source code for molecular_simulations.simulate.cph_simulation

"""Constant pH molecular dynamics simulations using OpenMM.

This module provides classes and functions for running constant pH simulations
with replica exchange across different pH values. It uses Parsl for distributed
execution of simulation replicas.
"""

from datetime import datetime
import json
import logging
import MDAnalysis as mda
import numpy as np
from openmm import LangevinIntegrator
from openmm.app import (AmberPrmtopFile,
                        AmberInpcrdFile,
                        CutoffNonPeriodic, 
                        HBonds,
                        ForceField, 
                        PDBFile, 
                        PME,
                        Topology)
from openmm.unit import (amu,
                         kelvin, 
                         kilojoules_per_mole,
                         nanometers, 
                         picosecond)
import parsl
from parsl import Config, python_app
from pathlib import Path
from typing import Any, Optional
import uuid
from .constantph.constantph import ConstantPH
from .constantph.logging import setup_task_logger

@python_app
def run_cph_sim(params: dict[str, Any],
                temperature: kelvin,
                n_cycles: int,
                n_steps: int,
                log_params: dict[str, str],
                path: str) -> None:
    """Run a constant pH simulation as a Parsl python app.

    This function executes a single constant pH simulation replica,
    performing Monte Carlo titration steps between MD propagation cycles.

    Args:
        params: Dictionary of ConstantPH parameters including topology,
            coordinates, residue variants, and reference energies.
        temperature: Simulation temperature with OpenMM units.
        n_cycles: Number of MD/MC cycles to perform.
        n_steps: Number of MD steps per cycle.
        log_params: Dictionary containing logging configuration with
            run_id, task_id, and log_dir.
        path: Path to the simulation directory for logging purposes.
    """
    variants = params['residueVariants']

    logger = setup_task_logger(**log_params)
    
    cph = ConstantPH(**params)
    cph.simulation.minimizeEnergy()
    cph.simulation.context.setVelocitiesToTemperature(temperature)

    resids = list(variants.keys())
    
    logger.info(f'Running {n_cycles} cycles of constant pH simulation!',
                extra={'path': path, 'pH': None, 'step': None, 'resids': resids})
    for i in range(n_cycles):
        cph.simulation.step(n_steps)
        cph.attemptMCStep(temperature)
        pH = cph.pH[cph.currentPHIndex]
        current_variants = [variants[index][cph.titrations[index].currentIndex] 
                            for index in variants]
        logger.info(f'Step complete {i}!',
                    extra={'path': path, 'pH': pH, 'step': i, 'resnames': current_variants})

[docs] def setup_worker_logger(self, worker_id: str, log_dir: Path) -> logging.Logger: """Set up a JSON logger for a simulation worker. Creates a logger that writes JSON-formatted log entries to a worker-specific file. Args: worker_id: Unique identifier for the worker. log_dir: Directory path where log files will be written. Returns: Configured logging.Logger instance for the worker. """ log_path = log_dir / f'{worker_id}.jsonl' logger = logging.getLogger(f'task.{task_id}') logger.setLevel(logging.INFO) logger.handlers.clear() handler = logging.FileHandler(log_path) handler.setFormatter(JsonFormatter(task_id=worker_id)) logger.addHandler(handler) return logger
[docs] class ConstantPHEnsemble: """Orchestrator for running constant pH simulation ensembles. Manages distributed execution of multiple constant pH simulation replicas using Parsl. Each replica can sample across a range of pH values with Monte Carlo titration moves. """
[docs] def __init__(self, paths: list[Path], reference_energies: dict[str, list[float]], parsl_config: Config, log_dir: Path, pHs: list[float]=[x+0.5 for x in range(14)], temperature: float=300., variant_sel: Optional[str]=None,): """Initialize the constant pH ensemble. Args: paths: List of paths to simulation directories, each containing system.prmtop and system.inpcrd files. reference_energies: Dictionary mapping residue names to lists of reference free energies for each protonation state. parsl_config: Parsl configuration for distributed execution. log_dir: Directory path for writing simulation logs. pHs: List of pH values to sample. Defaults to [0.5, 1.5, ..., 13.5]. temperature: Simulation temperature in Kelvin. Defaults to 300. variant_sel: Optional MDAnalysis selection string to limit which titratable residues are included. Defaults to None. """ self.paths = paths self.ref_energies = reference_energies self.parsl_config = parsl_config self.dfk = None self.log_dir = log_dir self.pHs = pHs self.variant_sel = variant_sel self.temperature = temperature * kelvin self.run_id = datetime.now().strftime('%Y%m%d_%H%M%S')
[docs] def initialize(self) -> None: """Initialize Parsl for distributed simulation execution.""" self.dfk = parsl.load(self.parsl_config)
[docs] def shutdown(self) -> None: """Clean up Parsl resources after simulation completion.""" if self.dfk: self.dfk.cleanup() self.dfk = None parsl.clear()
[docs] def load_files(self, path: Path) -> tuple[Topology, np.ndarray]: """Load topology and coordinates from AMBER files. Args: path: Directory containing system.prmtop and system.inpcrd files. Returns: Tuple of (OpenMM Topology, atomic positions array). """ prmtop = AmberPrmtopFile(str(path / 'system.prmtop')) inpcrd = AmberInpcrdFile(str(path / 'system.inpcrd')) return prmtop.topology, inpcrd.positions
[docs] def build_dicts(self, path: Path, top: Topology) -> tuple[dict[str, list[str]], dict[int, list[float]]]: """Build residue variant and reference energy dictionaries. Identifies titratable residues in the topology and maps them to their possible protonation states and corresponding reference energies. Args: path: Path to the structure file for MDAnalysis loading. top: OpenMM Topology object containing residue information. Returns: Tuple of (variants dict, reference_energies dict) where variants maps residue indices to lists of residue name variants and reference_energies maps residue indices to lists of reference free energies in kJ/mol. """ _variants = { 'CYS': ['CYS', 'CYX'], 'ASP': ['ASH', 'ASP'], 'GLU': ['GLH', 'GLU'], 'LYS': ['LYS', 'LYN'], 'HIS': ['HIP', 'HID', 'HIE'], } names = list(_variants.keys()) variants = {} reference_energies = {} for residue in top.residues(): if residue.name in names: variants[residue.index] = _variants[residue.name] reference_energies[residue.index] = [x * kilojoules_per_mole for x in self.ref_energies[residue.name]] u = mda.Universe(str(path / 'system.prmtop'), str(path / 'system.inpcrd')) sel = u.select_atoms('protein') bad_keys = [sel[0].resid, sel[-1].resid] # termini if self.variant_sel is not None: var = sel.select_atoms(self.variant_sel) bad_keys = [resid for resid in sel.residues.resids if resid not in var.residues.resids] for bad_key in bad_keys: if bad_key in variants: del variants[bad_key] if bad_key in reference_energies: del reference_energies[bad_key] return variants, reference_energies
[docs] def run(self, n_cycles: int=500, n_steps: int=500) -> None: """Run the constant pH simulation ensemble. Distributes simulation replicas across available resources using Parsl and waits for all replicas to complete. Args: n_cycles: Number of MD/MC cycles per replica. Defaults to 500. n_steps: Number of MD steps per cycle. Defaults to 500. """ futures = [] for i, path in enumerate(self.paths): top, pos = self.load_files(path) variants, reference_energies = self.build_dicts(path, top) cph_params = self.get_params(path) cph_params.update({ 'residueVariants': variants, 'referenceEnergies': reference_energies, }) log_params = { 'run_id': self.run_id, 'task_id': f'{i:05d}', 'log_dir': self.log_dir, } futures.append( run_cph_sim(cph_params, self.temperature, n_cycles, n_steps, log_params, str(path)) ) _ = [x.result() for x in futures]
[docs] def get_params(self, path: Path) -> dict[str, Any]: """Build the parameter dictionary for ConstantPH initialization. Args: path: Directory containing system.prmtop and system.inpcrd files. Returns: Dictionary containing all parameters needed to initialize a ConstantPH simulation including file paths, pH values, integrators, and force field parameters for both explicit and implicit solvent. """ expl_params = dict(nonbondedMethod=PME, nonbondedCutoff=0.9*nanometers, constraints=HBonds, hydrogenMass=1.5*amu) impl_params = dict(nonbondedMethod=CutoffNonPeriodic, nonbondedCutoff=2.0*nanometers, constraints=HBonds) integrator = LangevinIntegrator(self.temperature, 1.0/picosecond, 0.004*picosecond) relaxation_integrator = LangevinIntegrator(self.temperature, 10.0/picosecond, 0.002*picosecond) params = { 'prmtop_file': path / 'system.prmtop', 'inpcrd_file': path / 'system.inpcrd', 'pH': self.pHs, 'relaxationSteps': 1000, 'explicitArgs': expl_params, 'implicitArgs': impl_params, 'integrator': integrator, 'relaxationIntegrator': relaxation_integrator, } return params