Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hdf5 speedup #28

Open
wants to merge 29 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d983cc4
modifications to allow template files + netcdf loader
jeremyleung521 Nov 22, 2023
bcb53b6
Merge branch 'develop' of github.com:westpa/westpa into hdf5-speedup
jeremyleung521 Nov 22, 2023
f54398d
Update load_netcdf to output WESTTrajectory class objects, which supp…
jeremyleung521 Nov 26, 2023
45260c9
fix declaration of valid_callbacks
jeremyleung521 Nov 27, 2023
bfb1352
Merge executable_mod into the main executable.py
jeremyleung521 Nov 30, 2023
7558fe8
Create trajectory/restart/log datasets regardless
jeremyleung521 Dec 8, 2023
56167ad
lint and additional checks for h5repack
jeremyleung521 Dec 13, 2023
b609fd1
Fix tables import error in h5io
jeremyleung521 Dec 13, 2023
c1518ad
Units! Make sure everything is in nm.
jeremyleung521 Dec 15, 2023
76a1c11
Bug fixes for the units conversion thing.
jeremyleung521 Jan 23, 2024
f612ad6
A cleaner check for unit conversion
jeremyleung521 Jan 26, 2024
4ae0ce6
Merge branch 'develop' of github.com:westpa/westpa into hdf5-speedup
jeremyleung521 Feb 13, 2024
8b4ce70
Merge branch 'develop' of github.com:westpa/westpa into hdf5-speedup
jeremyleung521 Feb 19, 2024
5ad4a6c
remove need for netCDF4, add in mda_trajectory_loader
jeremyleung521 Apr 11, 2024
9300805
new load_netcdf should now work properly
jeremyleung521 Apr 12, 2024
ba7d999
convert mda topology to mdtraj topology, tested on pdb, prmtop
jeremyleung521 Apr 15, 2024
935cac0
better docstrings
jeremyleung521 Apr 15, 2024
3516257
remove a lengthy mdtraj.formats import
jeremyleung521 Apr 15, 2024
9bdfb93
Merge branch 'develop' into hdf5-speedup
jeremyleung521 May 13, 2024
16a6d3c
shoddy merge
jeremyleung521 May 13, 2024
7235ed0
fix loader specification
jeremyleung521 May 13, 2024
b7c1ba2
ignore NaturalNameWarning (#416)
jeremyleung521 May 14, 2024
6b1f2c1
fix summary tables in westpa.analysis (#418)
jeremyleung521 May 22, 2024
d84ed74
fix w_states --replace reporting (#421)
jeremyleung521 May 22, 2024
926018e
README update (#420)
jeremyleung521 May 22, 2024
f663e8e
Merge branch 'develop' of github.com:jeremyleung521/westpa into hdf5-…
jeremyleung521 Jun 10, 2024
0b194ab
Merge branch 'develop' of github.com:westpa/westpa into hdf5-speedup
jeremyleung521 Sep 4, 2024
d61f31f
Merge branch 'develop' of ssh://github.com/westpa/westpa into hdf5-sp…
jeremyleung521 Oct 30, 2024
2d08058
Merge branch 'develop' of ssh://github.com/westpa/westpa into hdf5-sp…
jeremyleung521 Nov 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ ipython_config.py
dmypy.json


# IDE
.idea

# vim
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def extensions():
"tqdm",
"pandas",
"tables",
"importlib-resources;python_version<'3.10'",
]

EXTRAS_REQUIRE = {
Expand All @@ -145,7 +146,7 @@ def extensions():
entry_points={'console_scripts': console_scripts},
install_requires=INSTALL_REQUIRES,
extras_require=EXTRAS_REQUIRE,
package_data={},
package_data={"westpa": ["data/*.xml"]},
packages=find_packages(where='src'),
package_dir={"": "src"},
description='WESTPA is a package for constructing and running stochastic simulations using the "weighted ensemble" approach of Huber and Kim (1996).',
Expand Down
55 changes: 47 additions & 8 deletions src/westpa/core/data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@
import sys
import threading
import time
import re
import builtins
from operator import attrgetter
from os.path import relpath, dirname
from os.path import relpath, dirname, exists
from os import remove
from shutil import copyfile, move
from subprocess import run, CalledProcessError

import h5py
from h5py import h5s
Expand Down Expand Up @@ -247,8 +251,17 @@ def process_config(self):
['west', 'data', 'aux_compression_threshold'], self.default_aux_compression_threshold
)
self.flush_period = config.get(['west', 'data', 'flush_period'], self.default_flush_period)
self.iter_ref_h5_template = config.get(['west', 'data', 'data_refs', 'iteration'], None)
self.store_h5 = self.iter_ref_h5_template is not None

# Path to per-iter h5 file
self.iter_ref_h5_path_template = config.get(['west', 'data', 'data_refs', 'iteration'], None)
try:
# Generating path to a template file for per-iter h5 file
self.iter_ref_h5_template = re.sub(r'\{(.*?)\}', 'template', self.iter_ref_h5_path_template)
except TypeError:
self.iter_ref_h5_template = None

# If not provided, turn HDF5 Framework off.
self.store_h5 = self.iter_ref_h5_path_template is not None

# Process dataset options
dsopts_list = config.get(['west', 'data', 'datasets']) or []
Expand Down Expand Up @@ -280,8 +293,10 @@ def __init__(self, rc=None):
self.last_flush = 0

self._system = None
self.iter_ref_h5_template = None
self.store_h5 = False
self.iter_ref_h5_path_template = None # Template for per-iter H5 file Path
self.iter_ref_h5_template = None # Path to per-iter H5 template file
self.store_h5 = False # Indicates HDF5 Framework is activated or not
self.template_copy_flag = False # Flag indicating the template file was made this iteration

self.dataset_options = {}
self.process_config()
Expand Down Expand Up @@ -569,13 +584,37 @@ def update_iter_h5file(self, n_iter, segments):
return

west_h5_file = makepath(self.we_h5filename)
iter_ref_h5_file = makepath(self.iter_ref_h5_template, {'n_iter': n_iter})
iter_ref_h5_file = makepath(self.iter_ref_h5_path_template, {'n_iter': n_iter})
iter_ref_rel_path = relpath(iter_ref_h5_file, dirname(west_h5_file))
if self.iter_ref_h5_template:
# Make path to per-iter H5 File
iter_ref_h5_file_template = makepath(self.iter_ref_h5_template, {'n_iter': n_iter})

# Copy the template per-iter H5 file with topology
if exists(iter_ref_h5_file_template) and not exists(iter_ref_h5_file):
copyfile(iter_ref_h5_file_template, iter_ref_h5_file)

with h5io.WESTIterationFile(iter_ref_h5_file, 'a') as outf:
for segment in segments:
outf.write_segment(segment, True)

if self.iter_ref_h5_template and not exists(iter_ref_h5_file_template):
# If template per-iter H5 file does not exist, copy and scrub out old data
copyfile(iter_ref_h5_file, iter_ref_h5_file_template)
with h5io.WESTIterationFile(iter_ref_h5_file_template, 'a') as outf:
outf.scrub_data()

# Launch a subprocess to repack the file to reclaim space, replacing template with smaller file
try:
run(
f'h5repack {iter_ref_h5_file_template} {iter_ref_h5_file_template + "_repacked"}', shell=True
).check_returncode()
move(f'{iter_ref_h5_file_template}_repacked', iter_ref_h5_file_template)
except CalledProcessError as e: # Unsuccessful in repacking file
log.warning(f'Unable to repack into {iter_ref_h5_file_template}_repacked.h5: {e}')
if exists(f'{iter_ref_h5_file_template+"_repacked.h5"}'):
remove(f'{iter_ref_h5_file_template+"_repacked.h5"}')

iter_group = self.get_iter_group(n_iter)

if 'trajectories' not in iter_group:
Expand Down Expand Up @@ -983,7 +1022,7 @@ def update_segments(self, n_iter, segments):
si_fsel.select_hyperslab((seg_id,), (1,), op=op)
pc_fsel.select_hyperslab((seg_id, 0, 0), (1, pcoord_len, pcoord_ndim), op=op)

# read summary data so that we have valud parent and weight transfer information
# read summary data so that we have value, parent and weight transfer information
si_dsid.read(si_msel, si_fsel, seg_index_entries)

for iseg, (segment, ientry) in enumerate(zip(segments, seg_index_entries)):
Expand Down Expand Up @@ -1161,7 +1200,7 @@ def prepare_segment_restarts(self, segments, basis_states=None, initial_states=N
parent = Segment(n_iter=segment.n_iter - 1, seg_id=segment.parent_id)

try:
parent_iter_ref_h5_file = makepath(self.iter_ref_h5_template, {'n_iter': parent.n_iter})
parent_iter_ref_h5_file = makepath(self.iter_ref_h5_path_template, {'n_iter': parent.n_iter})

with h5io.WESTIterationFile(parent_iter_ref_h5_file, 'r') as outf:
outf.read_restart(parent)
Expand Down
44 changes: 32 additions & 12 deletions src/westpa/core/h5io.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,15 +447,15 @@ def get_iter_group(self, n_iter, group=None):

class WESTIterationFile(HDF5TrajectoryFile):
def __init__(self, file, mode='r', force_overwrite=True, compression='zlib', link=None):
if isinstance(file, str):
if isinstance(file, str): # Create new file from string
super(WESTIterationFile, self).__init__(file, mode, force_overwrite, compression)
else:
try:
self._init_from_handle(file)
self._init_from_handle(file) # If a WESTIterationFile object, just make sure it's open correctly
except AttributeError:
raise ValueError('unknown input type: %s' % str(type(file)))

def _init_from_handle(self, handle):
def _init_from_handle(self, handle: HDF5TrajectoryFile):
self._handle = handle
self._open = handle.isopen != 0
self.mode = mode = handle.mode # the mode in which the file was opened?
Expand Down Expand Up @@ -483,6 +483,14 @@ def _init_from_handle(self, handle):
self._frame_index = 0
self._needs_initialization = False

def __contains__(self, path):
    """Report whether ``path`` names an existing node under the file root.

    The node is looked up with ``self._get_node('/', path)``. If that lookup
    raises the tables module's ``NoSuchNodeError`` the answer is ``False``;
    otherwise it is ``True``. Any other exception propagates to the caller.
    """
    try:
        self._get_node('/', path)
    except self.tables.NoSuchNodeError:
        found = False
    else:
        found = True

    return found

def read(self, frame_indices=None, atom_indices=None):
_check_mode(self.mode, ('r',))

Expand Down Expand Up @@ -666,9 +674,17 @@ def write_segment(self, segment, pop=False):
restart = get_data('iterh5/restart', None)
slog = get_data('iterh5/log', None)

# topology
if self.mode == 'a':
if not self.has_topology():
self.topology = traj.topology
elif self.mode == 'w':
self.topology = traj.topology

if traj is not None:
# create trajectory object
traj = WESTTrajectory(traj, iter_labels=n_iter, seg_labels=segment.seg_id)
# create trajectory object or if already is, skip.
if not isinstance(traj, WESTTrajectory):
traj = WESTTrajectory(traj, iter_labels=n_iter, seg_labels=segment.seg_id)
if traj.n_frames == 0:
# we may consider logging warnings instead throwing errors for later.
# right now this is good for debugging purposes
Expand Down Expand Up @@ -702,13 +718,6 @@ def write_segment(self, segment, pop=False):
cell_angles=traj.unitcell_angles,
)

# topology
if self.mode == 'a':
if not self.has_topology():
self.topology = traj.topology
elif self.mode == 'w':
self.topology = traj.topology

# restart
if restart is not None:
if self.has_restart(segment):
Expand All @@ -734,6 +743,17 @@ def write_segment(self, segment, pop=False):
createparents=True,
)

def scrub_data(self):
    '''Strip stored per-segment data from the file while preserving topology.

    Each of the data nodes listed below is removed recursively from the
    root; nodes that are not present are skipped. Afterwards the frame
    index and the root ``n_iter`` attribute are reset to 0 and the file is
    flushed.
    '''
    data_nodes = ('log', 'restart', 'time', 'coordinates', 'pointer', 'cell_angles', 'cell_lengths')

    for node_name in data_nodes:
        try:
            self._remove_node('/', node_name, recursive=True)
        except self.tables.exceptions.NoSuchNodeError:
            # Node was never written to this file; nothing to remove.
            continue

    # Reset bookkeeping so the scrubbed file starts from a clean state.
    self._frame_index = 0
    self.root._v_attrs.n_iter = 0
    self.flush()

@property
def _create_group(self):
if self.tables.__version__ >= '3.0.0':
Expand Down
Loading
Loading