From 74a45d93a731d56e730821466c38d2f3a8fd3a31 Mon Sep 17 00:00:00 2001
From: Logan Ward
Date: Fri, 22 Nov 2024 10:36:30 -0500
Subject: [PATCH] Add ability to append to existing datasets (#119)

* Add method for writing table

* Add functions for appending tables

* Document new features

* Improve test coverage
---
 battdat/io/hdf.py           | 61 +++++++++++++++++++++++++++++++------
 docs/user-guide/formats.rst | 40 ++++++++++++++++++++++++
 tests/io/test_hdf.py        | 36 +++++++++++++++++++++-
 3 files changed, 127 insertions(+), 10 deletions(-)

diff --git a/battdat/io/hdf.py b/battdat/io/hdf.py
index 517c985..71788ae 100644
--- a/battdat/io/hdf.py
+++ b/battdat/io/hdf.py
@@ -235,22 +235,65 @@ def write_to_hdf(self, dataset: BatteryDataset, file: File, prefix: Optional[str
         root_node._f_setattr('json_schema', dataset.metadata.model_json_schema())
         root_node._f_setattr('battdat_version', __version__)
 
-        # Move to the group in which to store the data
+        # Create the group if needed
         if prefix is not None:
-            group: Group = file.create_group('/', name=prefix)
-        else:
-            group = file.root
+            file.create_group('/', name=prefix)
 
         # Store the various datasets
         #  Note that we use the "table" format to allow for partial reads / querying
-        filters = Filters(complevel=self.complevel, complib=self.complib)
         for key, schema in dataset.schemas.items():
             if (data := dataset.tables.get(key)) is not None:
-                table = write_df_to_table(file, group, key, data, filters=filters)
+                self.add_table(file, key, data, schema, prefix)
+
+    def add_table(self, file: File, name: str, data: pd.DataFrame, schema: ColumnSchema, prefix: Optional[str] = None):
+        """Add a table to an existing dataset
+
+        Args:
+            file: HDF file open via pytables
+            name: Name of the data table
+            data: Data table to be saved
+            schema: Description of the columns in battdat format
+            prefix: Prefix of the battery dataset if saving multiple per file
+        """
+        # Write dataset
+        group = file.root if prefix is None else file.get_node('/' + prefix)
+        filters = Filters(complevel=self.complevel, complib=self.complib)
+        table = write_df_to_table(file, group, name, data, filters=filters)
+
+        # Write the schema, mark as dataset
+        table.attrs.metadata = schema.model_dump_json()
+        table.attrs.json_schema = schema.model_json_schema()
+
+    def append_to_table(self, file: File, name: str, data: pd.DataFrame, prefix: Optional[str] = None):
+        """Add to an existing table
 
-                # Write the schema, mark as dataset
-                table.attrs.metadata = schema.model_dump_json()
-                table.attrs.json_schema = schema.model_json_schema()
+        Args:
+            file: HDF file open via pytables
+            name: Name of the data table
+            data: Data table to be saved
+            prefix: Prefix of the battery dataset if saving multiple per file
+        """
+
+        # Get the table
+        if prefix is None:
+            group = file.root
+        else:
+            if '/' + prefix not in file:
+                raise ValueError(f'No data available for prefix: {prefix}')
+            group: Group = file.get_node('/' + prefix)
+        table: Table = group[name]
+
+        # Check tables
+        new_dtype = make_numpy_dtype_from_pandas(data)
+        cur_dtype = table.dtype
+        if new_dtype != cur_dtype:
+            raise ValueError(f'Existing and new data types differ. Existing={cur_dtype}, New={new_dtype}')
+
+        row = np.empty((1,), dtype=cur_dtype)  # TODO (wardlt): Consider a batched write (pytables might batch internally)
+        for _, df_row in data.iterrows():
+            for c in cur_dtype.names:
+                row[c] = df_row[c]
+            table.append(row)
 
     def export(self, dataset: BatteryDataset, path: PathLike):
         with File(path, mode='w') as file:
diff --git a/docs/user-guide/formats.rst b/docs/user-guide/formats.rst
index 9cd9529..83f6dc6 100644
--- a/docs/user-guide/formats.rst
+++ b/docs/user-guide/formats.rst
@@ -135,6 +135,46 @@ Load all cells by iterating over them:
     for name, cell in BatteryDataset.all_cells_from_hdf('test.h5'):
         do_some_processing(cell)
 
+
+Appending to Existing File
+++++++++++++++++++++++++++
+
+The :class:`~battdat.io.hdf.HDF5Writer` class facilitates adding to existing datasets.
+Start by creating the writer with the desired compression settings.
+
+.. code-block:: python
+
+    from battdat.io.hdf import HDF5Writer
+
+    writer = HDF5Writer(complevel=9)
+
+Add a new table to an existing dataset with :meth:`~battdat.io.hdf.HDF5Writer.add_table`,
+which requires the name of a data table and a column schema (:class:`~battdat.schemas.column.ColumnSchema`).
+
+.. code-block:: python
+
+    import pandas as pd
+    import tables
+    from battdat.schemas.column import ColumnSchema
+
+    # Make a dataset and describe its single column
+    df = pd.DataFrame({'a': [1., 0.]})
+    schema = ColumnSchema()
+    schema.add_column('a', 'A column')
+
+    with tables.open_file('example.h5', mode='a') as file:
+        writer.add_table(file, 'example_table', df, schema)
+
+Add data to an existing table with :meth:`~battdat.io.hdf.HDF5Writer.append_to_table`.
+
+.. code-block:: python
+
+    with tables.open_file('example.h5', mode='a') as file:
+        writer.append_to_table(file, 'example_table', df)
+
+The new data must match the existing table's column names and data types exactly.
+Any compression settings or metadata from the existing table will be re-used.
+
 Parquet
 -------
 
diff --git a/tests/io/test_hdf.py b/tests/io/test_hdf.py
index a0261f1..53ddb40 100644
--- a/tests/io/test_hdf.py
+++ b/tests/io/test_hdf.py
@@ -1,8 +1,12 @@
+from pathlib import Path
+
+from pytest import raises, mark
 import numpy as np
 import pandas as pd
 import tables
 
-from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table
+from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table, HDF5Writer
+from battdat.schemas.column import ColumnSchema
 
 example_df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge'], 'array': [[[1.]], [[0.]]]})
 
@@ -22,3 +26,33 @@ def test_store_df(tmpdir):
     df_copy = read_df_from_table(table)
     assert (df_copy.columns == ['a', 'b', 'c', 'array']).all()
     assert np.allclose(df_copy['b'], [1., 3.])
+
+
+@mark.parametrize('prefix', [None, 'a'])
+def test_append(tmpdir, prefix):
+    writer = HDF5Writer()
+    out_file = Path(tmpdir) / 'example.h5'
+
+    # Write the initial data
+    with tables.open_file(out_file, mode='w') as file:
+        if prefix is not None:
+            file.create_group(file.root, prefix)
+
+        writer.add_table(file, 'example_table', example_df, ColumnSchema(), prefix)
+
+    # Append the data again
+    with tables.open_file(out_file, mode='a') as file:
+        writer.append_to_table(file, 'example_table', example_df, prefix)
+
+        table = file.get_node('/example_table' if prefix is None else f'/{prefix}/example_table')
+        df_copy = read_df_from_table(table)
+        assert len(df_copy) == len(example_df) * 2
+        assert np.allclose(df_copy['a'], [1, 2, 1, 2])
+
+        # Test data check
+        with raises(ValueError, match='Existing and new'):
+            writer.append_to_table(file, 'example_table', pd.DataFrame({'a': [1., 2.]}), prefix)
+
+        # Test bad prefix
+        with raises(ValueError, match='No data available for prefix'):
+            writer.append_to_table(file, 'example_table', pd.DataFrame({'a': [1., 2.]}), prefix='b')
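
A minimal end-to-end sketch of the API this patch adds, shown below the diff for reference. It is not part of the commit; the file name 'example.h5', the table name 'example_table', and the one-column schema are illustrative only.

    import pandas as pd
    import tables

    from battdat.io.hdf import HDF5Writer, read_df_from_table
    from battdat.schemas.column import ColumnSchema

    # Build a small table and a schema describing its single column
    df = pd.DataFrame({'a': [1., 0.]})
    schema = ColumnSchema()
    schema.add_column('a', 'A column')

    writer = HDF5Writer(complevel=9)
    with tables.open_file('example.h5', mode='a') as file:
        # Create the table, then grow it; appended rows must match
        # the existing columns and dtypes exactly
        writer.add_table(file, 'example_table', df, schema)
        writer.append_to_table(file, 'example_table', df)

        # Read back: the table now holds both copies of the rows
        table = file.get_node('/example_table')
        assert len(read_df_from_table(table)) == 2 * len(df)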