diff --git a/.gitignore b/.gitignore index 005142f1..cdeca9e2 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,9 @@ ipython_config.py dmypy.json +# IDE +.idea + # vim [._]*.s[a-v][a-z] [._]*.sw[a-p] diff --git a/setup.py b/setup.py index e0204a18..a43ca67b 100644 --- a/setup.py +++ b/setup.py @@ -121,6 +121,7 @@ def extensions(): "tqdm", "pandas", "tables", + "importlib-resources;python_version<'3.10'", ] EXTRAS_REQUIRE = { @@ -145,7 +146,7 @@ def extensions(): entry_points={'console_scripts': console_scripts}, install_requires=INSTALL_REQUIRES, extras_require=EXTRAS_REQUIRE, - package_data={}, + package_data={"westpa": ["data/*.xml"]}, packages=find_packages(where='src'), package_dir={"": "src"}, description='WESTPA is a package for constructing and running stochastic simulations using the "weighted ensemble" approach of Huber and Kim (1996).', diff --git a/src/westpa/core/data_manager.py b/src/westpa/core/data_manager.py index 0b6dbecf..54d80e17 100644 --- a/src/westpa/core/data_manager.py +++ b/src/westpa/core/data_manager.py @@ -47,9 +47,13 @@ import sys import threading import time +import re import builtins from operator import attrgetter -from os.path import relpath, dirname +from os.path import relpath, dirname, exists +from os import remove +from shutil import copyfile, move +from subprocess import run, CalledProcessError import h5py from h5py import h5s @@ -247,8 +251,17 @@ def process_config(self): ['west', 'data', 'aux_compression_threshold'], self.default_aux_compression_threshold ) self.flush_period = config.get(['west', 'data', 'flush_period'], self.default_flush_period) - self.iter_ref_h5_template = config.get(['west', 'data', 'data_refs', 'iteration'], None) - self.store_h5 = self.iter_ref_h5_template is not None + + # Path to per-iter h5 file + self.iter_ref_h5_path_template = config.get(['west', 'data', 'data_refs', 'iteration'], None) + try: + # Generating path to a template file for per-iter h5 file + self.iter_ref_h5_template = re.sub(r'\{(.*?)\}', 'template', self.iter_ref_h5_path_template) + except TypeError: + self.iter_ref_h5_template = None + + # If not provided, turn HDF5 Framework off. + self.store_h5 = self.iter_ref_h5_path_template is not None # Process dataset options dsopts_list = config.get(['west', 'data', 'datasets']) or [] @@ -280,8 +293,10 @@ def __init__(self, rc=None): self.last_flush = 0 self._system = None - self.iter_ref_h5_template = None - self.store_h5 = False + self.iter_ref_h5_path_template = None # Template for per-iter H5 file Path + self.iter_ref_h5_template = None # Path to per-iter H5 template file + self.store_h5 = False # Indicates HDF5 Framework is activated or not + self.template_copy_flag = False # Flag indicating the template file was made this iteration self.dataset_options = {} self.process_config() @@ -569,13 +584,37 @@ def update_iter_h5file(self, n_iter, segments): return west_h5_file = makepath(self.we_h5filename) - iter_ref_h5_file = makepath(self.iter_ref_h5_template, {'n_iter': n_iter}) + iter_ref_h5_file = makepath(self.iter_ref_h5_path_template, {'n_iter': n_iter}) iter_ref_rel_path = relpath(iter_ref_h5_file, dirname(west_h5_file)) + if self.iter_ref_h5_template: + # Make path to per-iter H5 File + iter_ref_h5_file_template = makepath(self.iter_ref_h5_template, {'n_iter': n_iter}) + + # Copy the template per-iter H5 file with topology + if exists(iter_ref_h5_file_template) and not exists(iter_ref_h5_file): + copyfile(iter_ref_h5_file_template, iter_ref_h5_file) with h5io.WESTIterationFile(iter_ref_h5_file, 'a') as outf: for segment in segments: outf.write_segment(segment, True) + if self.iter_ref_h5_template and not exists(iter_ref_h5_file_template): + # If template per-iter H5 file does not exist, copy and scrub out old data + copyfile(iter_ref_h5_file, iter_ref_h5_file_template) + with h5io.WESTIterationFile(iter_ref_h5_file_template, 'a') as outf: + outf.scrub_data() + + # Launch a subprocess to repack the file to reclaim space, replacing template with smaller file + try: + run( + f'h5repack {iter_ref_h5_file_template} {iter_ref_h5_file_template + "_repacked"}', shell=True + ).check_returncode() + move(f'{iter_ref_h5_file_template}_repacked', iter_ref_h5_file_template) + except CalledProcessError as e: # Unsuccessful in repacking file + log.warning(f'Unable to repack into {iter_ref_h5_file_template}_repacked.h5: {e}') + if exists(f'{iter_ref_h5_file_template+"_repacked.h5"}'): + remove(f'{iter_ref_h5_file_template+"_repacked.h5"}') + iter_group = self.get_iter_group(n_iter) if 'trajectories' not in iter_group: @@ -983,7 +1022,7 @@ def update_segments(self, n_iter, segments): si_fsel.select_hyperslab((seg_id,), (1,), op=op) pc_fsel.select_hyperslab((seg_id, 0, 0), (1, pcoord_len, pcoord_ndim), op=op) - # read summary data so that we have valud parent and weight transfer information + # read summary data so that we have value, parent and weight transfer information si_dsid.read(si_msel, si_fsel, seg_index_entries) for iseg, (segment, ientry) in enumerate(zip(segments, seg_index_entries)): @@ -1161,7 +1200,7 @@ def prepare_segment_restarts(self, segments, basis_states=None, initial_states=N parent = Segment(n_iter=segment.n_iter - 1, seg_id=segment.parent_id) try: - parent_iter_ref_h5_file = makepath(self.iter_ref_h5_template, {'n_iter': parent.n_iter}) + parent_iter_ref_h5_file = makepath(self.iter_ref_h5_path_template, {'n_iter': parent.n_iter}) with h5io.WESTIterationFile(parent_iter_ref_h5_file, 'r') as outf: outf.read_restart(parent) diff --git a/src/westpa/core/h5io.py b/src/westpa/core/h5io.py index 2d018f5e..f662b853 100644 --- a/src/westpa/core/h5io.py +++ b/src/westpa/core/h5io.py @@ -447,15 +447,15 @@ def get_iter_group(self, n_iter, group=None): class WESTIterationFile(HDF5TrajectoryFile): def __init__(self, file, mode='r', force_overwrite=True, compression='zlib', link=None): - if isinstance(file, str): + if isinstance(file, str): # Create new file from string super(WESTIterationFile, self).__init__(file, mode, force_overwrite, compression) else: try: - self._init_from_handle(file) + self._init_from_handle(file) # If a WESTIterationFile object, just make sure it's open correctly except AttributeError: raise ValueError('unknown input type: %s' % str(type(file))) - def _init_from_handle(self, handle): + def _init_from_handle(self, handle: HDF5TrajectoryFile): self._handle = handle self._open = handle.isopen != 0 self.mode = mode = handle.mode # the mode in which the file was opened? @@ -483,6 +483,14 @@ def _init_from_handle(self, handle): self._frame_index = 0 self._needs_initialization = False + def __contains__(self, path): + try: + self._get_node('/', path) + except self.tables.NoSuchNodeError: + return False + + return True + def read(self, frame_indices=None, atom_indices=None): _check_mode(self.mode, ('r',)) @@ -666,9 +674,17 @@ def write_segment(self, segment, pop=False): restart = get_data('iterh5/restart', None) slog = get_data('iterh5/log', None) + # topology + if self.mode == 'a': + if not self.has_topology(): + self.topology = traj.topology + elif self.mode == 'w': + self.topology = traj.topology + if traj is not None: - # create trajectory object - traj = WESTTrajectory(traj, iter_labels=n_iter, seg_labels=segment.seg_id) + # create trajectory object or if already is, skip. + if not isinstance(traj, WESTTrajectory): + traj = WESTTrajectory(traj, iter_labels=n_iter, seg_labels=segment.seg_id) if traj.n_frames == 0: # we may consider logging warnings instead throwing errors for later. # right now this is good for debugging purposes @@ -702,13 +718,6 @@ def write_segment(self, segment, pop=False): cell_angles=traj.unitcell_angles, ) - # topology - if self.mode == 'a': - if not self.has_topology(): - self.topology = traj.topology - elif self.mode == 'w': - self.topology = traj.topology - # restart if restart is not None: if self.has_restart(segment): @@ -734,6 +743,17 @@ def write_segment(self, segment, pop=False): createparents=True, ) + def scrub_data(self): + '''Method to remove existing coordinates, pointers etc. while preserving topology''' + for node in ['log', 'restart', 'time', 'coordinates', 'pointer', 'cell_angles', 'cell_lengths']: + try: + self._remove_node('/', node, recursive=True) + except self.tables.exceptions.NoSuchNodeError: + pass + self._frame_index = 0 + self.root._v_attrs.n_iter = 0 + self.flush() + @property def _create_group(self): if self.tables.__version__ >= '3.0.0': diff --git a/src/westpa/core/propagators/executable.py b/src/westpa/core/propagators/executable.py index 6c8045f7..6d151127 100644 --- a/src/westpa/core/propagators/executable.py +++ b/src/westpa/core/propagators/executable.py @@ -20,7 +20,7 @@ from westpa.core.segment import Segment from westpa.core.yamlcfg import check_bool -from westpa.core.trajectory import load_trajectory +from westpa.core.trajectory import load_mdtraj, load_netcdf, load_mda from westpa.core.h5io import safe_extract log = logging.getLogger(__name__) @@ -52,9 +52,8 @@ def pcoord_loader(fieldname, pcoord_return_filename, destobj, single_point): pcoord.shape = expected_shape if pcoord.shape != expected_shape: raise ValueError( - 'progress coordinate data has incorrect shape {!r} [expected {!r}] Check pcoord.err or seg_logs for more information.'.format( - pcoord.shape, expected_shape - ) + 'progress coordinate data has incorrect shape {!r} [expected {!r}] Check pcoord.err or seg_logs for more ' + 'information.'.format(pcoord.shape, expected_shape) ) destobj.pcoord = pcoord @@ -83,17 +82,41 @@ def pickle_data_loader(fieldname, coord_file, segment, single_point): raise ValueError('could not read any data for {}'.format(fieldname)) -def trajectory_loader(fieldname, coord_folder, segment, single_point): - '''Load data from the trajectory return. ``coord_folder`` should be the path to a folder +def mdtraj_trajectory_loader(fieldname, coord_folder, segment, single_point): + '''Load data from the trajectory return using MDTraj. ``coord_folder`` should be the path to a folder containing trajectory files. ``segment`` is the ``Segment`` object that the data is associated with. Please see ``load_trajectory`` for more details. ``single_point`` is not used by this loader.''' try: - data = load_trajectory(coord_folder) + data = load_mdtraj(coord_folder) segment.data['iterh5/trajectory'] = data except Exception as e: log.warning('could not read any {} data for HDF5 Framework: {}'.format(fieldname, str(e))) +def netcdf_trajectory_loader(fieldname, coord_folder, segment, single_point): + '''Load Amber .nc data from the trajectory return. ``coord_folder`` should be the path to a folder + containing trajectory files. ``segment`` is the ``Segment`` object that the data is associated with. + Please see ``load_netcdf`` for more details. ``single_point`` is not used by this loader.''' + try: + data = load_netcdf(coord_folder) + segment.data['iterh5/trajectory'] = data + except Exception as e: + log.warning('Falling back to default loader for {}: {}'.format(fieldname, str(e))) + mdtraj_trajectory_loader(fieldname, coord_folder, segment, single_point) + + +def mda_trajectory_loader(fieldname, coord_folder, segment, single_point): + '''Load data from the trajectory return. ``coord_folder`` should be the path to a folder + containing trajectory files. ``segment`` is the ``Segment`` object that the data is associated with. + Please see ``load_mda`` for more details. ``single_point`` is not used by this loader.''' + try: + data = load_mda(coord_folder) + segment.data['iterh5/trajectory'] = data + except Exception as e: + log.warning('Falling back to default loader for {}: {}'.format(fieldname, str(e))) + mdtraj_trajectory_loader(fieldname, coord_folder, segment, single_point) + + def restart_loader(fieldname, restart_folder, segment, single_point): '''Load data from the restart return. The loader will tar all files in ``restart_folder`` and store it in the per-iteration HDF5 file. ``segment`` is the ``Segment`` object that @@ -154,7 +177,7 @@ def seglog_loader(fieldname, log_file, segment, single_point): d.close() -# Dictionary with all the possible loaders +# Dictionary with all the possible aux dataset loaders data_loaders = { 'default': aux_data_loader, 'auxdata_loader': aux_data_loader, @@ -165,6 +188,19 @@ def seglog_loader(fieldname, log_file, segment, single_point): 'pickle_data_loader': pickle_data_loader, } +# Dictionary with all the possible trajectory loaders +trajectory_loaders = { + 'default': mdtraj_trajectory_loader, + 'trajectory_loader': mdtraj_trajectory_loader, + 'MDTraj_trajectory_loader': mdtraj_trajectory_loader, + 'mdtraj_trajectory_loader': mdtraj_trajectory_loader, + 'amber_trajectory_loader': netcdf_trajectory_loader, + 'netcdf_trajectory_loader': netcdf_trajectory_loader, + 'mda_trajectory_loader': mda_trajectory_loader, + 'MDAnalysis_trajectory_loaderr': mda_trajectory_loader, + 'mdanalysis_trajectory_loaderr': mda_trajectory_loader, +} + class ExecutablePropagator(WESTPropagator): ENV_CURRENT_ITER = 'WEST_CURRENT_ITER' @@ -191,6 +227,21 @@ class ExecutablePropagator(WESTPropagator): ENV_RAND128 = 'WEST_RAND128' ENV_RANDFLOAT = 'WEST_RANDFLOAT' + @staticmethod + def makepath(template, template_args=None, expanduser=True, expandvars=True, abspath=False, realpath=False): + template_args = template_args or {} + path = template.format(**template_args) + if expandvars: + path = os.path.expandvars(path) + if expanduser: + path = os.path.expanduser(path) + if realpath: + path = os.path.realpath(path) + if abspath: + path = os.path.abspath(path) + path = os.path.normpath(path) + return path + def __init__(self, rc=None): super().__init__(rc) @@ -209,8 +260,7 @@ def __init__(self, rc=None): # A mapping of data set name ('pcoord', 'coord', 'com', etc) to a dictionary of # attributes like 'loader', 'dtype', etc - self.data_info = {} - self.data_info['pcoord'] = {} + self.data_info = {'pcoord': {}} # Validate configuration config = self.rc.config @@ -261,9 +311,10 @@ def __init__(self, rc=None): # Load configuration items relating to dataset input self.data_info['pcoord'] = {'name': 'pcoord', 'loader': pcoord_loader, 'enabled': True, 'filename': None, 'dir': False} + self.data_info['trajectory'] = { 'name': 'trajectory', - 'loader': trajectory_loader, + 'loader': mdtraj_trajectory_loader, 'enabled': store_h5, 'filename': None, 'dir': True, @@ -292,15 +343,23 @@ def __init__(self, rc=None): check_bool(dsinfo.setdefault('enabled', True)) loader_directive = dsinfo.get('loader', None) - if callable(loader_directive): + if 'module_path' in dsinfo: + dspath = self.makepath(dsinfo['module_path']) + else: + dspath = None + + if callable(loader_directive): # If directly callable, then use it loader = loader_directive - elif loader_directive in data_loaders.keys(): - if dsname not in ['pcoord', 'seglog', 'restart', 'trajectory']: + elif dsname in ['trajectory']: # Special dataset for saving trajectory coordinates in HDF5 Framework + if loader_directive in trajectory_loaders: + loader = trajectory_loaders[loader_directive] + else: + loader = get_object(loader_directive, path=dspath) + elif dsname not in ['pcoord', 'seglog', 'restart']: # If not a "protected" dataset names + if loader_directive in data_loaders: loader = data_loaders[loader_directive] else: - loader = get_object(loader_directive) - elif dsname not in ['pcoord', 'seglog', 'restart', 'trajectory']: - loader = aux_data_loader + loader = get_object(loader_directive, path=dspath) else: # YOLO. Or maybe it wasn't specified. loader = loader_directive @@ -311,21 +370,6 @@ def __init__(self, rc=None): log.debug('data_info: {!r}'.format(self.data_info)) - @staticmethod - def makepath(template, template_args=None, expanduser=True, expandvars=True, abspath=False, realpath=False): - template_args = template_args or {} - path = template.format(**template_args) - if expandvars: - path = os.path.expandvars(path) - if expanduser: - path = os.path.expanduser(path) - if realpath: - path = os.path.realpath(path) - if abspath: - path = os.path.abspath(path) - path = os.path.normpath(path) - return path - def random_val_env_vars(self): '''Return a set of environment variables containing random seeds. These are returned as a dictionary, suitable for use in ``os.environ.update()`` or as the ``env`` argument to diff --git a/src/westpa/core/sim_manager.py b/src/westpa/core/sim_manager.py index f45a17c7..dfd5fc8a 100644 --- a/src/westpa/core/sim_manager.py +++ b/src/westpa/core/sim_manager.py @@ -55,19 +55,17 @@ def __init__(self, rc=None): # A table of function -> list of (priority, name, callback) tuples self._callback_table = {} - self._valid_callbacks = set( - ( - self.prepare_run, - self.finalize_run, - self.prepare_iteration, - self.finalize_iteration, - self.pre_propagation, - self.post_propagation, - self.pre_we, - self.post_we, - self.prepare_new_iteration, - ) - ) + self._valid_callbacks = { + self.prepare_run, + self.finalize_run, + self.prepare_iteration, + self.finalize_iteration, + self.pre_propagation, + self.post_propagation, + self.pre_we, + self.post_we, + self.prepare_new_iteration, + } self._callbacks_by_name = {fn.__name__: fn for fn in self._valid_callbacks} self.n_propagated = 0 diff --git a/src/westpa/core/trajectory.py b/src/westpa/core/trajectory.py index 2a6ef962..fdf371b7 100644 --- a/src/westpa/core/trajectory.py +++ b/src/westpa/core/trajectory.py @@ -1,16 +1,114 @@ import numpy as np import os -from mdtraj import Trajectory, load as load_traj, FormatRegistry, formats as mdformats -from mdtraj.core.trajectory import _TOPOLOGY_EXTS, _get_extension as get_extension +from mdtraj import Trajectory + + +def parseResidueAtoms(residue, map): + '''Parse all atoms from residue. Taken from MDTraj.''' + for atom in residue.findall('Atom'): + name = atom.attrib['name'] + for id in atom.attrib: + map[atom.attrib[id]] = name + + +def loadNameReplacementTables(): + '''Load the list of atom and residue name replacements. Taken from MDTraj.''' + + # importing things here because they're only used in this function + try: + from importlib.resources import files + except ImportError: + from importlib_resources import files + + import xml.etree.ElementTree as etree + from copy import copy + + residueNameReplacements = {} + atomNameReplacements = {} + + # This XML file is a to map all sorts of atom names/ residue names to the PDB 3.0 convention. + # Taken from MDTraj. + tree = etree.parse(files('westpa') / 'data/pdbNames.xml') + allResidues = {} + proteinResidues = {} + nucleicAcidResidues = {} + for residue in tree.getroot().findall('Residue'): + name = residue.attrib['name'] + if name == 'All': + parseResidueAtoms(residue, allResidues) + elif name == 'Protein': + parseResidueAtoms(residue, proteinResidues) + elif name == 'Nucleic': + parseResidueAtoms(residue, nucleicAcidResidues) + for atom in allResidues: + proteinResidues[atom] = allResidues[atom] + nucleicAcidResidues[atom] = allResidues[atom] + for residue in tree.getroot().findall('Residue'): + name = residue.attrib['name'] + for id in residue.attrib: + if id == 'name' or id.startswith('alt'): + residueNameReplacements[residue.attrib[id]] = name + if 'type' not in residue.attrib: + atoms = copy(allResidues) + elif residue.attrib['type'] == 'Protein': + atoms = copy(proteinResidues) + elif residue.attrib['type'] == 'Nucleic': + atoms = copy(nucleicAcidResidues) + else: + atoms = copy(allResidues) + parseResidueAtoms(residue, atoms) + atomNameReplacements[name] = atoms + + return residueNameReplacements, atomNameReplacements + + +def convert_mda_top_to_mdtraj(universe): + '''Convert a MDAnalysis Universe object's topology to a ``mdtraj.Topology`` object.''' + + from mdtraj import Topology + from mdtraj.core.element import get_by_symbol + from MDAnalysis.exceptions import NoDataError + + top = Topology() # Empty topology object + residueNameReplacements, atomNameReplacements = loadNameReplacementTables() + + # Add in all the chains (called segments in MDAnalysis) + for chain_segment in universe.segments: + top.add_chain() + + all_chains = list(top.chains) + + # Add in all the residues + for residue in universe.residues: + try: + resname = residueNameReplacements[residue.resname] + except KeyError: + resname = residue.resname + + top.add_residue(name=resname, chain=all_chains[residue.segindex], resSeq=residue.resid) -FormatRegistry.loaders['.rst'] = mdformats.amberrst.load_restrt -FormatRegistry.fileobjects['.rst'] = mdformats.AmberRestartFile + all_residues = list(top.residues) -TRAJECTORY_EXTS = list(FormatRegistry.loaders.keys()) -TOPOLOGY_EXTS = list(_TOPOLOGY_EXTS) -for ext in [".h5", ".hdf5", ".lh5"]: - TOPOLOGY_EXTS.remove(ext) + # Add in all the atoms + for atom, resid in zip(universe.atoms, universe.atoms.resindices): + try: + atomname = residueNameReplacements[atom.resname][atom.name] + except (KeyError, TypeError): + atomname = atom.name + + top.add_atom(name=atomname, element=get_by_symbol(atom.element), residue=all_residues[resid]) + + all_atoms = list(top.atoms) + + # Add in all the bonds. Depending on the topology type (e.g., pdb), there might not be bond information. + try: + for b_idx in universe.bonds._bix: + top.add_bond(all_atoms[b_idx[0]], all_atoms[b_idx[1]]) + except NoDataError: + top.create_standard_bonds() + + return top class WESTTrajectory(Trajectory): @@ -286,13 +384,48 @@ def slice(self, key, copy=True): return traj -def load_trajectory(folder): - '''Load trajectory from ``folder`` using ``mdtraj`` and return a ``mdtraj.Trajectory`` - object. The folder should contain a trajectory and a topology file (with a recognizable - extension) that is supported by ``mdtraj``. The topology file is optional if the - trajectory file contains topology data (e.g., HDF5 format). +def get_extension(filename): + '''A function to get the format extension of a file.''' + (base, extension) = os.path.splitext(filename) + + # Return the other part of the extension as well if it's a gzip. + if extension == '.gz': + return os.path.splitext(base)[1] + extension + + return extension + + +def find_top_traj_file(folder, eligible_top, eligible_traj): + '''A general (reusable) function for identifying and returning the appropriate + file names in ``folder`` which are toplogy and trajectory. Useful when writing custom loaders. + Note that it's possible that the topology_file and trajectory_file are identical. + + Parameters + ---------- + folder : str or os.Pathlike + A string or Pathlike to the folder to search. + + eligible_top : list of strings + A list of accepted topology extensions. + + eligible_traj : list of strings + A list of accepted topology extensions. + + + Returns + ------- + top_file : str + Path to topology file + + traj_file : str + Path to trajectory file + ''' - traj_file = top_file = None + + # Setting up the return variables + top_file = traj_file = None + + # Extract a list of all files, ignoring hidden files that start with a '.' file_list = [f_name for f_name in os.listdir(folder) if not f_name.startswith('.')] for filename in file_list: @@ -302,15 +435,15 @@ def load_trajectory(folder): ext = get_extension(filename).lower() # Catching trajectory formats that can be topology and trajectories at the same time. - # Only activates when there is a single file. - if len(file_list) < 2 and ext in TOPOLOGY_EXTS and ext in TRAJECTORY_EXTS: + # Only activates when there is a single file in the folder. + if len(file_list) < 2 and ext in eligible_top and ext in eligible_traj: top_file = filename traj_file = filename # Assuming topology file is copied first. - if ext in TOPOLOGY_EXTS and top_file is None: + if ext in eligible_top and top_file is None: top_file = filename - elif ext in TRAJECTORY_EXTS and traj_file is None: + elif ext in eligible_traj and traj_file is None: traj_file = filename if top_file is not None and traj_file is not None: @@ -321,10 +454,125 @@ def load_trajectory(folder): traj_file = os.path.join(folder, traj_file) - kwargs = {} if top_file is not None: top_file = os.path.join(folder, top_file) - kwargs['top'] = top_file - traj = load_traj(traj_file, **kwargs) + return top_file, traj_file + + +def mdtraj_supported_extensions(): + from mdtraj import FormatRegistry, formats as mdformats + from mdtraj.core.trajectory import _TOPOLOGY_EXTS + + FormatRegistry.loaders['.rst'] = mdformats.amberrst.load_restrt + FormatRegistry.fileobjects['.rst'] = mdformats.AmberRestartFile + FormatRegistry.loaders['.ncrst'] = mdformats.amberrst.load_ncrestrt + FormatRegistry.fileobjects['.ncrst'] = mdformats.AmberRestartFile + + TRAJECTORY_EXTS = list(FormatRegistry.loaders.keys()) + TOPOLOGY_EXTS = list(_TOPOLOGY_EXTS) + + for ext in [".h5", ".hdf5", ".lh5"]: + TOPOLOGY_EXTS.remove(ext) + + return TOPOLOGY_EXTS, TRAJECTORY_EXTS + + +def mdanalysis_supported_extensions(): + import MDAnalysis as mda + + TRAJECTORY_EXTS = [reader.format if isinstance(reader.format, list) else [reader.format] for reader in mda._READERS.values()] + TRAJECTORY_EXTS = list(set(f'.{ext.lower()}' for ilist in TRAJECTORY_EXTS for ext in ilist)) + + TOPOLOGY_EXTS = [parser.format if isinstance(parser.format, list) else [parser.format] for parser in mda._PARSERS.values()] + TOPOLOGY_EXTS = list(set(f'.{ext.lower()}' for ilist in TOPOLOGY_EXTS for ext in ilist)) + + return TOPOLOGY_EXTS, TRAJECTORY_EXTS + + +def load_mdtraj(folder): + '''Load trajectory from ``folder`` using ``mdtraj`` and return a ``mdtraj.Trajectory`` + object. The folder should contain a trajectory and a topology file (with a recognizable + extension) that is supported by ``mdtraj``. The topology file is optional if the + trajectory file contains topology data (e.g., HDF5 format). + ''' + from mdtraj import load as load_traj + + TOPOLOGY_EXTS, TRAJECTORY_EXTS = mdtraj_supported_extensions() + + top_file, traj_file = find_top_traj_file(folder, TOPOLOGY_EXTS, TRAJECTORY_EXTS) + + # MDTraj likes the (optional) topology part to be provided within a dictionary + traj = load_traj(traj_file, **{'top': top_file}) + + return traj + + +def load_netcdf(folder): + '''Load netcdf file from ``folder`` using ``scipy.io`` and return a ``mdtraj.Trajectory`` + object. The folder should contain a Amber trajectory file with extensions `.nc` or `.ncdf`. + + Note coordinates and box lengths are all divided by 10 to change from Angstroms to nanometers. + ''' + from scipy.io import netcdf_file + + _, traj_file = find_top_traj_file(folder, [], ['.nc', '.ncdf']) + + # Extracting these datasets + datasets = {'coordinates': None, 'cell_lengths': None, 'cell_angles': None, 'time': None} + convert = ['coordinates', 'cell_lengths'] # Length-based datasets that need to be converted from Å to nm + + with netcdf_file(traj_file) as rootgrp: + for key, val in datasets.items(): + if key in convert and key in rootgrp.variables: + datasets[key] = rootgrp.variables[key][:].copy() / 10 # From Å to nm + else: + datasets[key] = rootgrp.variables[key][:].copy() # noqa: F841 + + map_dataset = { + 'coordinates': datasets['coordinates'], + 'unitcell_lengths': datasets['cell_lengths'], + 'unitcell_angles': datasets['cell_angles'], + 'time': datasets['time'], + } + + return WESTTrajectory(**map_dataset) + + +def load_mda(folder): + '''Load a file from ``folder`` using ``MDAnalysis`` and return a ``mdtraj.Trajectory`` + object. The folder should contain a trajectory and a topology file (with a recognizable + extension) that is supported by ``MDAnalysis``. The topology file is optional if the + trajectory file contains topology data (e.g., H5MD format). + + Note coordinates and box lengths are all divided by 10 to change from Angstroms to nanometers. + ''' + import MDAnalysis as mda + + TOPOLOGY_EXTS, TRAJECTORY_EXTS = mdanalysis_supported_extensions() + + top_file, traj_file = find_top_traj_file(folder, TOPOLOGY_EXTS, TRAJECTORY_EXTS) + + u = mda.Universe(top_file, traj_file) + + tot_frames = len(u.trajectory) + coords = np.zeros((tot_frames, len(u.atoms), 3)) + cell_lengths = np.zeros((tot_frames, 3)) + cell_angles = np.zeros((tot_frames, 3)) + time = np.zeros((tot_frames)) + + convert = [coords, cell_lengths] # Length-based datasets that need to be converted + + # Loop through each frame and add that frame to the relevant datasets + for iframe, frame in enumerate(u.trajectory): + coords[iframe] = frame._pos + cell_lengths[iframe] = frame.dimensions[:3] + cell_angles[iframe] = frame.dimensions[3:] + time[iframe] = frame.time + + for dset in convert: + dset = mda.units.convert(dset, 'angstrom', 'nanometer') + + traj = WESTTrajectory(coordinates=coords, unitcell_lengths=cell_lengths, unitcell_angles=cell_angles, time=time) + return traj diff --git a/src/westpa/data/pdbNames.xml b/src/westpa/data/pdbNames.xml new file mode 100644 index 00000000..6fb222dc --- /dev/null +++ b/src/westpa/data/pdbNames.xml @@ -0,0 +1,277 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +