Add ability to append to existing datasets (#119)
* Add method for writing table

* Add functions for appending tables

* Document new features

* Improve test coverage
WardLT authored Nov 22, 2024
1 parent bb4a87a commit 74a45d9
Showing 3 changed files with 127 additions and 10 deletions.
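In brief, the commit gives ``HDF5Writer`` two new public methods: ``add_table``, which writes a single table with its column schema, and ``append_to_table``, which adds rows to a table that already exists. A minimal sketch of how they combine, assembled from the diff and documentation below (the file name ``example.h5`` and table name are illustrative):

.. code-block:: python

    import pandas as pd
    import tables

    from battdat.io.hdf import HDF5Writer
    from battdat.schemas.column import ColumnSchema

    writer = HDF5Writer(complevel=9)
    df = pd.DataFrame({'a': [1., 0.]})

    with tables.open_file('example.h5', mode='a') as file:
        # Create the table with its column schema, then append matching rows
        writer.add_table(file, 'example_table', df, ColumnSchema())
        writer.append_to_table(file, 'example_table', df)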
61 changes: 52 additions & 9 deletions battdat/io/hdf.py
@@ -235,22 +235,65 @@ def write_to_hdf(self, dataset: BatteryDataset, file: File, prefix: Optional[str
         root_node._f_setattr('json_schema', dataset.metadata.model_json_schema())
         root_node._f_setattr('battdat_version', __version__)
 
-        # Move to the group in which to store the data
+        # Create the group if needed
         if prefix is not None:
-            group: Group = file.create_group('/', name=prefix)
-        else:
-            group = file.root
+            file.create_group('/', name=prefix)
 
         # Store the various datasets
         #  Note that we use the "table" format to allow for partial reads / querying
-        filters = Filters(complevel=self.complevel, complib=self.complib)
         for key, schema in dataset.schemas.items():
             if (data := dataset.tables.get(key)) is not None:
-                table = write_df_to_table(file, group, key, data, filters=filters)
-
-                # Write the schema, mark as dataset
-                table.attrs.metadata = schema.model_dump_json()
-                table.attrs.json_schema = schema.model_json_schema()
+                self.add_table(file, key, data, schema, prefix)
+
+    def add_table(self, file: File, name: str, data: pd.DataFrame, schema: ColumnSchema, prefix: Optional[str] = None):
+        """Add a table to an existing dataset
+
+        Args:
+            file: HDF file open via pytables
+            name: Name of the data table
+            data: Data table to be saved
+            schema: Description of the columns in battdat format
+            prefix: Prefix of the battery dataset if saving multiple per file
+        """
+        # Write dataset
+        group = file.root if prefix is None else file.get_node('/' + prefix)
+        filters = Filters(complevel=self.complevel, complib=self.complib)
+        table = write_df_to_table(file, group, name, data, filters=filters)
+
+        # Write the schema, mark as dataset
+        table.attrs.metadata = schema.model_dump_json()
+        table.attrs.json_schema = schema.model_json_schema()
+
+    def append_to_table(self, file: File, name: str, data: pd.DataFrame, prefix: Optional[str] = None):
+        """Add to an existing table
+
+        Args:
+            file: HDF file open via pytables
+            name: Name of the data table
+            data: Data table to be saved
+            prefix: Prefix of the battery dataset if saving multiple per file
+        """
+        # Get the table
+        if prefix is None:
+            group = file.root
+        else:
+            if '/' + prefix not in file:
+                raise ValueError(f'No data available for prefix: {prefix}')
+            group: Group = file.get_node('/' + prefix)
+        table: Table = group[name]
+
+        # Check that the new data's columns match the existing table
+        new_dtype = make_numpy_dtype_from_pandas(data)
+        cur_dtype = table.dtype
+        if new_dtype != cur_dtype:
+            raise ValueError(f'Existing and new data types differ. Existing={cur_dtype}, New={new_dtype}')
+
+        row = np.empty((1,), dtype=cur_dtype)  # TODO (wardlt): Consider a batched write (pytables might batch internally)
+        for _, df_row in data.iterrows():
+            for c in cur_dtype.names:
+                row[c] = df_row[c]
+            table.append(row)
 
     def export(self, dataset: BatteryDataset, path: PathLike):
         with File(path, mode='w') as file:
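The ``TODO`` in ``append_to_table`` above notes that a batched write may be faster than appending one row at a time. As a sketch only (``append_batched`` is a hypothetical helper, not part of this commit), the same copy loop could stage all rows in one structured array, since PyTables ``Table.append`` also accepts arrays:

.. code-block:: python

    import numpy as np
    import pandas as pd
    from tables import Table

    def append_batched(table: Table, data: pd.DataFrame):
        """Hypothetical batched variant of the per-row loop above"""
        # Stage every incoming row in a single structured array...
        rows = np.empty(len(data), dtype=table.dtype)
        for i, (_, df_row) in enumerate(data.iterrows()):
            for c in table.dtype.names:
                rows[c][i] = df_row[c]
        # ...then hand the whole batch to PyTables in one call
        table.append(rows)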
40 changes: 40 additions & 0 deletions docs/user-guide/formats.rst
@@ -135,6 +135,46 @@ Load all cells by iterating over them:
     for name, cell in BatteryDataset.all_cells_from_hdf('test.h5'):
         do_some_processing(cell)
 
+Appending to Existing File
+++++++++++++++++++++++++++
+
+The :class:`~battdat.io.hdf.HDF5Writer` class facilitates adding to existing datasets.
+Start by creating the writer with the desired compression settings:
+
+.. code-block:: python
+
+    from battdat.io.hdf import HDF5Writer
+
+    writer = HDF5Writer(complevel=9)
+
+Add a new table to an existing dataset with :meth:`~battdat.io.hdf.HDF5Writer.add_table`,
+which requires the name of the table, the data to be stored, and a `column schema <schemas/column-schema.html>`_.
+
+.. code-block:: python
+
+    import pandas as pd
+    import tables
+
+    from battdat.schemas.column import ColumnSchema
+
+    # Make dataset and column schema
+    df = pd.DataFrame({'a': [1., 0.]})
+    schema = ColumnSchema()
+    schema.add_column('a', 'A column')
+
+    with tables.open_file('example.h5', mode='a') as file:
+        writer.add_table(file, 'example_table', df, schema)
+
+Add data to an existing table with :meth:`~battdat.io.hdf.HDF5Writer.append_to_table`:
+
+.. code-block:: python
+
+    with tables.open_file('example.h5', mode='a') as file:
+        writer.append_to_table(file, 'example_table', df)
+
+The new data must match the existing table's column names and data types exactly.
+Any compression settings or metadata from the existing table will be re-used.
 
 Parquet
 -------
 
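To confirm an append took effect, the table can be read back with ``read_df_from_table``, the same helper the tests below use. A short sketch, assuming the ``example.h5`` file built in the documentation above:

.. code-block:: python

    import tables

    from battdat.io.hdf import read_df_from_table

    with tables.open_file('example.h5', mode='r') as file:
        df = read_df_from_table(file.get_node('/example_table'))
        print(len(df))  # the row count grows with every append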
36 changes: 35 additions & 1 deletion tests/io/test_hdf.py
@@ -1,8 +1,12 @@
 from pathlib import Path
 
+from pytest import raises, mark
 import numpy as np
 import pandas as pd
 import tables
 
-from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table
+from battdat.io.hdf import make_numpy_dtype_from_pandas, write_df_to_table, read_df_from_table, HDF5Writer
+from battdat.schemas.column import ColumnSchema
 
 example_df = pd.DataFrame({'a': [1, 2], 'b': [1., 3.], 'c': ['charge', 'discharge'], 'array': [[[1.]], [[0.]]]})
 
@@ -22,3 +26,33 @@ def test_store_df(tmpdir):
     df_copy = read_df_from_table(table)
     assert (df_copy.columns == ['a', 'b', 'c', 'array']).all()
     assert np.allclose(df_copy['b'], [1., 3.])
+
+
+@mark.parametrize('prefix', [None, 'a'])
+def test_append(tmpdir, prefix):
+    writer = HDF5Writer()
+    out_file = Path(tmpdir) / 'example.h5'
+
+    # Write the initial data
+    with tables.open_file(out_file, mode='w') as file:
+        if prefix is not None:
+            file.create_group(file.root, prefix)
+
+        writer.add_table(file, 'example_table', example_df, ColumnSchema(), prefix)
+
+    # Append the data again
+    with tables.open_file(out_file, mode='a') as file:
+        writer.append_to_table(file, 'example_table', example_df, prefix)
+
+        table = file.get_node('/example_table' if prefix is None else f'/{prefix}/example_table')
+        df_copy = read_df_from_table(table)
+        assert len(df_copy) == len(example_df) * 2
+        assert np.allclose(df_copy['a'], [1, 2, 1, 2])
+
+        # Test data check
+        with raises(ValueError, match='Existing and new'):
+            writer.append_to_table(file, 'example_table', pd.DataFrame({'a': [1., 2.]}), prefix)
+
+        # Test bad prefix
+        with raises(ValueError, match='No data available for prefix'):
+            writer.append_to_table(file, 'example_table', pd.DataFrame({'a': [1., 2.]}), prefix='b')
