From 74a45d93a731d56e730821466c38d2f3a8fd3a31 Mon Sep 17 00:00:00 2001
From: Logan Ward
Date: Fri, 22 Nov 2024 10:36:30 -0500
Subject: [PATCH] Add ability to append to existing datasets (#119)

* Add method for writing table

* Add functions for appending tables

* Document new features

* Improve test coverage
---
 battdat/io/hdf.py           | 61 +++++++++++++++++++++++++++++++------
 docs/user-guide/formats.rst | 40 ++++++++++++++++++++++++
 tests/io/test_hdf.py        | 36 +++++++++++++++++++++-
 3 files changed, 127 insertions(+), 10 deletions(-)

diff --git a/battdat/io/hdf.py b/battdat/io/hdf.py
index 517c985..71788ae 100644
--- a/battdat/io/hdf.py
+++ b/battdat/io/hdf.py
@@ -235,22 +235,65 @@ def write_to_hdf(self, dataset: BatteryDataset, file: File, prefix: Optional[str
         root_node._f_setattr('json_schema', dataset.metadata.model_json_schema())
         root_node._f_setattr('battdat_version', __version__)
 
-        # Move to the group in which to store the data
+        # Create the group if needed
         if prefix is not None:
-            group: Group = file.create_group('/', name=prefix)
-        else:
-            group = file.root
+            file.create_group('/', name=prefix)
 
         # Store the various datasets
         #  Note that we use the "table" format to allow for partial reads / querying
-        filters = Filters(complevel=self.complevel, complib=self.complib)
         for key, schema in dataset.schemas.items():
             if (data := dataset.tables.get(key)) is not None:
-                table = write_df_to_table(file, group, key, data, filters=filters)
+                self.add_table(file, key, data, schema, prefix)
+
+    def add_table(self, file: File, name: str, data: pd.DataFrame, schema: ColumnSchema, prefix: Optional[str] = None):
+        """Add a table to an existing dataset
+
+        Args:
+            file: HDF file open via pytables
+            name: Name of the data table
+            data: Data table to be saved
+            schema: Description of the columns in battdat format
+            prefix: Prefix of the battery dataset if saving multiple per file
+        """
+        # Write dataset
+        group = file.root if prefix is None else file.get_node('/' + prefix)
+        filters = Filters(complevel=self.complevel, complib=self.complib)
+        table = write_df_to_table(file, group, name, data, filters=filters)
+
+        # Write the schema, mark as dataset
+        table.attrs.metadata = schema.model_dump_json()
+        table.attrs.json_schema = schema.model_json_schema()
+
+    def append_to_table(self, file: File, name: str, data: pd.DataFrame, prefix: Optional[str] = None):
+        """Add to an existing table
 
-                # Write the schema, mark as dataset
-                table.attrs.metadata = schema.model_dump_json()
-                table.attrs.json_schema = schema.model_json_schema()
+        Args:
+            file: HDF file open via pytables
+            name: Name of the data table
+            data: Data table to be saved
+            prefix: Prefix of the battery dataset if saving multiple per file
+        """
+
+        # Get the table
+        if prefix is None:
+            group = file.root
+        else:
+            if '/' + prefix not in file:
+                raise ValueError(f'No data available for prefix: {prefix}')
+            group: Group = file.get_node('/' + prefix)
+        table: Table = group[name]
+
+        # Check tables
+        new_dtype = make_numpy_dtype_from_pandas(data)
+        cur_dtype = table.dtype
+        if new_dtype != cur_dtype:
+            raise ValueError(f'Existing and new data types differ. Existing={cur_dtype}, New={new_dtype}')
+
+        row = np.empty((1,), dtype=cur_dtype)  # TODO (wardlt): Consider a batched write (pytables might batch internally)
+        for _, df_row in data.iterrows():
+            for c in cur_dtype.names:
+                row[c] = df_row[c]
+            table.append(row)
 
     def export(self, dataset: BatteryDataset, path: PathLike):
         with File(path, mode='w') as file:
diff --git a/docs/user-guide/formats.rst b/docs/user-guide/formats.rst
index 9cd9529..83f6dc6 100644
--- a/docs/user-guide/formats.rst
+++ b/docs/user-guide/formats.rst
@@ -135,6 +135,46 @@ Load all cells by iterating over them:
     for name, cell in BatteryDataset.all_cells_from_hdf('test.h5'):
         do_some_processing(cell)
 
+
+Appending to Existing File
+++++++++++++++++++++++++++
+
+The :class:`~battdat.io.hdf.HDF5Writer` class facilitates adding to existing datasets.
+Start by creating the writer with the desired compression settings.
+
+.. code-block:: python
+
+    from battdat.io.hdf import HDF5Writer
+
+    writer = HDF5Writer(complevel=9)
+
+Add a new table to an existing dataset with :meth:`~battdat.io.hdf.HDF5Writer.add_table`,
+which requires the name of a data table and a column schema (:class:`~battdat.schemas.column.ColumnSchema`).
+
+.. code-block:: python
+
+    import pandas as pd
+    import tables
+    from battdat.schemas.column import ColumnSchema
+
+    # Make a dataset and describe its single column
+    df = pd.DataFrame({'a': [1., 0.]})
+    schema = ColumnSchema()
+    schema.add_column('a', 'A column')
+
+    with tables.open_file('example.h5', mode='a') as file:
+        writer.add_table(file, 'example_table', df, schema)
+
+Add data to an existing table with :meth:`~battdat.io.hdf.HDF5Writer.append_to_table`.
+
+.. code-block:: python
+
+    with tables.open_file('example.h5', mode='a') as file:
+        writer.append_to_table(file, 'example_table', df)
+
+The new data must match the existing table's column names and data types exactly.
+Any compression settings or metadata from the existing table will be re-used.
+
 Parquet
 -------
 
diff --git a/tests/io/test_hdf.py b/tests/io/test_hdf.py
index a0261f1..53ddb40 100644
--- a/tests/io/test_hdf.py
+++ b/tests/io/test_hdf.py
@@ -1,8 +1,12 @@
+from pathlib import Path
+
+from pytest import raises, mark
 import numpy as np
 import pandas as pd
 import tables
 
-from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table
+from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table, HDF5Writer
+from battdat.schemas.column import ColumnSchema
 
 example_df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge'], 'array': [[[1.]], [[0.]]]})
 
@@ -22,3 +26,33 @@ def test_store_df(tmpdir):
     df_copy = read_df_from_table(table)
     assert (df_copy.columns == ['a', 'b', 'c', 'array']).all()
     assert np.allclose(df_copy['b'], [1., 3.])
+
+
+@mark.parametrize('prefix', [None, 'a'])
+def test_append(tmpdir, prefix):
+    writer = HDF5Writer()
+    out_file = Path(tmpdir) / 'example.h5'
+
+    # Write the initial data
+    with tables.open_file(out_file, mode='w') as file:
+        if prefix is not None:
+            file.create_group(file.root, prefix)
+
+        writer.add_table(file, 'example_table', example_df, ColumnSchema(), prefix)
+
+    # Append the data again
+    with tables.open_file(out_file, mode='a') as file:
+        writer.append_to_table(file, 'example_table', example_df, prefix)
+
+        table = file.get_node('/example_table' if prefix is None else f'/{prefix}/example_table')
+        df_copy = read_df_from_table(table)
+        assert len(df_copy) == len(example_df) * 2
+        assert np.allclose(df_copy['a'], [1, 2, 1, 2])
+
+        # Test data check
+        with raises(ValueError, match='Existing and new'):
+            writer.append_to_table(file, 'example_table', pd.DataFrame({'a': [1., 2.]}), prefix)
+
+        # Test bad prefix
+        with raises(ValueError, match='No data available for prefix'):
+            writer.append_to_table(file, 'example_table', pd.DataFrame({'a': [1., 2.]}), prefix='b')
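
A minimal end-to-end sketch of the API this patch adds, shown below the diff for reference. It is not part of the commit; the file name 'example.h5', the table name 'example_table', and the one-column schema are illustrative only.

    import pandas as pd
    import tables

    from battdat.io.hdf import HDF5Writer, read_df_from_table
    from battdat.schemas.column import ColumnSchema

    # Build a small table and a schema describing its single column
    df = pd.DataFrame({'a': [1., 0.]})
    schema = ColumnSchema()
    schema.add_column('a', 'A column')

    writer = HDF5Writer(complevel=9)
    with tables.open_file('example.h5', mode='a') as file:
        # Create the table, then grow it; appended rows must match
        # the existing columns and dtypes exactly
        writer.add_table(file, 'example_table', df, schema)
        writer.append_to_table(file, 'example_table', df)

        # Read back: the table now holds both copies of the rows
        table = file.get_node('/example_table')
        assert len(read_df_from_table(table)) == 2 * len(df)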