diff --git a/setup.cfg b/setup.cfg index d1c7494..c6c1195 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,9 +49,9 @@ package_dir = # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - dolomite-base + dolomite-base>=0.2.0-alpha5 h5py - delayedarray>=0.3.2 + delayedarray>=0.3.3 numpy filebackedarray @@ -71,6 +71,7 @@ testing = pytest pytest-cov scipy + dask [options.entry_points] # Add here console scripts like: diff --git a/src/dolomite_matrix/DelayedMask.py b/src/dolomite_matrix/DelayedMask.py new file mode 100644 index 0000000..8a94f8e --- /dev/null +++ b/src/dolomite_matrix/DelayedMask.py @@ -0,0 +1,136 @@ +from typing import Tuple, Optional, Sequence +import delayedarray +import numpy + +class DelayedMask(delayedarray.DelayedOp): + """ + Delayed mask to replace the missing value placeholder with a NumPy masked array. + """ + + def __init__(self, seed, placeholder, dtype: Optional[numpy.dtype] = None): + """ + Args: + seed: + Any object that satisfies the seed contract, + see :py:class:`~delayedarray.DelayedArray.DelayedArray` for details. + + placeholder: + Placeholder value for defining masked values, of the same type + as ``seed.dtype`` (or coercible into that type). All values + equal to the placeholder are considered to be missing. + + dtype: + Desired type of the masked output, defaults to ``seed.dtype``. + """ + self._seed = seed + + if numpy.issubdtype(seed.dtype, numpy.str_) and isinstance(placeholder, bytes): + self._placeholder = numpy.str_(placeholder.decode("UTF8")) + else: + self._placeholder = seed.dtype.type(placeholder) + + if dtype is None: + dtype = seed.dtype + self._dtype = dtype + + @property + def shape(self) -> Tuple[int, ...]: + """ + Returns: + Tuple of integers specifying the extent of each dimension of this + object. This is the same as the ``seed`` object. + """ + return self._seed.shape + + @property + def dtype(self) -> numpy.dtype: + """ + Returns: + NumPy type for the contents after masking. + """ + return self._dtype + + @property + def seed(self): + """ + Returns: + The seed object. + """ + return self._seed + + @property + def placeholder(self): + """ + Returns: + The placeholder value. + """ + return self._placeholder + + +def _create_mask(x: numpy.ndarray, placeholder): + if numpy.issubdtype(placeholder.dtype, numpy.floating) and numpy.isnan(placeholder): + return numpy.isnan(x) + else: + return (x == placeholder) + + +@delayedarray.extract_dense_array.register +def extract_dense_array_DelayedMask(x: DelayedMask, subset: Optional[Tuple[Sequence[int], ...]] = None): + """See :py:meth:`~delayedarray.extract_dense_array.extract_dense_array`.""" + out = delayedarray.extract_dense_array(x._seed, subset) + mask = _create_mask(out, x._placeholder) # do this before type coercion, as the placeholder is assumed to be of the same underlying seed type. + out = out.astype(x._dtype, copy=False) + if mask.any(): + out = numpy.ma.MaskedArray(out, mask=mask) + return out + + +def _mask_SparseNdarray(contents, placeholder, dtype): + if not isinstance(contents, list): + indices, values = contents + mask = _create_mask(values, placeholder) # do this before type coercion, again. 
+ values = values.astype(dtype, copy=False) + if mask.any(): + values = numpy.ma.MaskedArray(values, mask=mask) + return indices, values + else: + output = [] + for val in contents: + if val is None: + output.append(val) + else: + output.append(_mask_SparseNdarray(val, placeholder, dtype)) + return output + + +@delayedarray.extract_sparse_array.register +def extract_sparse_array_DelayedMask(x: DelayedMask, subset: Optional[Tuple[Sequence[int], ...]] = None): + """See :py:meth:`~delayedarray.extract_sparse_array.extract_sparse_array`.""" + out = delayedarray.extract_sparse_array(x._seed, subset) + contents = out.contents + if contents is not None: + contents = _mask_SparseNdarray(contents, x._placeholder, x._dtype) + return delayedarray.SparseNdarray(x.shape, contents, dtype=x._dtype, index_dtype=out.index_dtype, check=False) + + +@delayedarray.create_dask_array.register +def create_dask_array_DelayedMask(x: DelayedMask): + """See :py:meth:`~delayedarray.create_dask_array.create_dask_array`.""" + target = delayedarray.create_dask_array(x._seed) + mask = (target == x._placeholder) + target = target.astype(x._dtype) + import dask.array + return dask.array.ma.masked_array(target, mask=mask) + + +@delayedarray.chunk_shape.register +def chunk_shape_DelayedMask(x: DelayedMask): + """See :py:meth:`~delayedarray.chunk_shape.chunk_shape`.""" + return delayedarray.chunk_shape(x._seed) + + +@delayedarray.is_sparse.register +def is_sparse_DelayedMask(x: DelayedMask): + """See :py:meth:`~delayedarray.is_sparse.is_sparse`.""" + return delayedarray.is_sparse(x._seed) + diff --git a/src/dolomite_matrix/ReloadedArray.py b/src/dolomite_matrix/ReloadedArray.py new file mode 100644 index 0000000..7c99225 --- /dev/null +++ b/src/dolomite_matrix/ReloadedArray.py @@ -0,0 +1,124 @@ +import delayedarray +from dolomite_base import save_object +import os +import shutil + +from .WrapperArraySeed import WrapperArraySeed +from .save_compressed_sparse_matrix import _save_compressed_sparse_matrix +from .save_dense_array import _save_dense_array + + +class ReloadedArraySeed(WrapperArraySeed): + """ + Seed for the :py:class:`~ReloadedArray` class. This is a subclass + of :py:class:`~dolomite_matrix.WrapperArraySeed.WrapperArraySeed`. + """ + + def __init__(self, seed, path: str): + """ + Args: + seed: The contents of the reloaded array. + path: Path to the directory containing the on-disk representation. + """ + super(ReloadedArraySeed, self).__init__(seed) + self._path = path + + @property + def path(self) -> str: + """ + Returns: + Path to the directory containing the on-disk representation. + """ + return self._path + + +class ReloadedArray(delayedarray.DelayedArray): + """ + An array that was reloaded from disk by the + :py:func:`~dolomite_base.read_object.read_object` function, and remembers + the path from which it was loaded. This class allows methods to refer to + the existing on-disk representation by inspecting the path. For example, + :py:func:`~dolomite_base.save_object.save_object` can just copy/link to the + existing files instead of repeating the saving process. + """ + + def __init__(self, seed, path: str): + """ + To construct a ``ReloadedArray`` from an existing + :py:class:`~ReloadedArraySeed`, use :py:meth:`~delayedarray.wrap.wrap` + instead. + + Args: + seed: The contents of the reloaded array. + path: Path to the directory containing the on-disk representation. 
+ """ + if not isinstance(seed, ReloadedArraySeed): + seed = ReloadedArraySeed(seed, path) + super(ReloadedArray, self).__init__(seed) + + @property + def path(self) -> str: + """ + Returns: + Path to the directory containing the on-disk representation. + """ + return self.seed._path + + +@delayedarray.wrap.register +def wrap_ReloadedArraySeed(x: ReloadedArraySeed) -> ReloadedArray: + """See :py:func:`~delayedarray.wrap.wrap`.""" + return ReloadedArray(x) + + +@save_object.register +def save_object_ReloadedArray(x: ReloadedArray, path: str, reloaded_array_reuse_mode: str = "link", **kwargs): + """ + Method for saving :py:class:`~ReloadedArray.ReloadedArray` objects to disk, + see :py:meth:`~dolomite_base.save_object.save_object` for details. + + Args: + x: Object to be saved. + + path: Path to a directory to save ``x``. + + reloaded_array_reuse_mode: + How the files in ``x.path`` should be re-used when populating + ``path``. This can be ``"link"``, to create a hard link to each + file; ``"symlink"``, to create a symbolic link to each file; + ``"copy"``, to create a copy of each file; or ``"none"``, to + perform a fresh save of ``x`` without relying on ``x.path``. + + kwargs: Further arguments, ignored. + + Returns: + ``x`` is saved to ``path``. + """ + if reloaded_array_reuse_mode == "none": + if delayedarray.is_sparse(x): + return _save_compressed_sparse_matrix(x, path, **kwargs) + else: + return _save_dense_array(x, path, **kwargs) + + if reloaded_array_reuse_mode == "link": + def FUN(src, dest): + try: + os.link(src, dest) + except: + shutil.copyfile(src, dest) + elif reloaded_array_reuse_mode == "symlink": + def FUN(src, dest): + try: + os.symlink(src, dest) + except: + shutil.copyfile(src, dest) + elif reloaded_array_reuse_mode == "copy": + FUN = shutil.copyfile + else: + raise ValueError("invalid reuse mode '" + reloaded_array_reuse_mode + "'") + + for root, dirs, files in os.walk(x.path): + newpath = os.path.join(path, os.path.relpath(root, x.path)) + os.makedirs(newpath) + for f in files: + FUN(os.path.join(root, f), os.path.join(newpath, f)) diff --git a/src/dolomite_matrix/WrapperArraySeed.py b/src/dolomite_matrix/WrapperArraySeed.py new file mode 100644 index 0000000..0066d37 --- /dev/null +++ b/src/dolomite_matrix/WrapperArraySeed.py @@ -0,0 +1,73 @@ +from typing import Tuple, Sequence, Optional +import numpy +import delayedarray + + +class WrapperArraySeed: + """ + Wrapper for a DelayedArray seed, which forwards all of the required + operations to the seed object. This is expected to be used as a base for + concrete subclasses that attach more provenance-tracking information - see + :py:class:`~dolomite_base.ReloadedArray.ReloadedArray` for an example. + """ + + def __init__(self, seed): + """ + Args: + seed: The underlying seed instance to be wrapped. + """ + self._seed = seed + + @property + def seed(self): + """ + Returns: + The underlying seed instance. + """ + return self._seed + + @property + def shape(self) -> Tuple[int, ...]: + """ + Returns: + The shape of the seed. + """ + return self._seed.shape + + @property + def dtype(self) -> numpy.dtype: + """ + Returns: + The type of the seed. 
+ """ + return self._seed.dtype + + +@delayedarray.is_sparse.register +def is_sparse_WrapperArraySeed(x: WrapperArraySeed) -> bool: + """See :py:func:`~delayedarray.is_sparse.is_sparse` for details.""" + return delayedarray.is_sparse(x._seed) + + +@delayedarray.chunk_shape.register +def chunk_shape_WrapperArraySeed(x: WrapperArraySeed) -> Tuple[int, ...]: + """See :py:func:`~delayedarray.chunk_shape.chunk_shape` for details.""" + return delayedarray.chunk_shape(x._seed) + + +@delayedarray.extract_dense_array.register +def extract_dense_array_WrapperArraySeed(x: WrapperArraySeed, subset: Optional[Tuple[Sequence[int], ...]] = None) -> numpy.ndarray: + """See :py:func:`~delayedarray.extract_dense_array.extract_dense_array` for details.""" + return delayedarray.extract_dense_array(x._seed, subset) + + +@delayedarray.extract_sparse_array.register +def extract_sparse_array_WrapperArraySeed(x: WrapperArraySeed, subset: Optional[Tuple[Sequence[int], ...]] = None) -> delayedarray.SparseNdarray: + """See :py:func:`~delayedarray.extract_sparse_array.extract_sparse_array` for details.""" + return delayedarray.extract_sparse_array(x._seed, subset) + + +@delayedarray.create_dask_array.register +def create_dask_array_WrapperArraySeed(x: WrapperArraySeed): + """See :py:func:`~delayedarray.create_dask_array.create_dask_array` for details.""" + return delayedarray.create_dask_array(x._seed) diff --git a/src/dolomite_matrix/__init__.py b/src/dolomite_matrix/__init__.py index 6ce14bc..fa06f3e 100644 --- a/src/dolomite_matrix/__init__.py +++ b/src/dolomite_matrix/__init__.py @@ -16,10 +16,13 @@ del version, PackageNotFoundError -from .choose_dense_chunk_sizes import choose_dense_chunk_sizes -from .stage_ndarray import stage_ndarray -from .stage_DelayedArray import stage_DelayedArray -from .load_hdf5_dense_array import load_hdf5_dense_array -from .write_sparse_matrix import write_sparse_matrix -from .stage_sparse import * -from .load_hdf5_sparse_matrix import load_hdf5_sparse_matrix +from .choose_chunk_dimensions import choose_chunk_dimensions +from .save_dense_array import save_dense_array_from_ndarray +from .read_dense_array import read_dense_array +from .save_compressed_sparse_matrix import * +from .read_compressed_sparse_matrix import read_compressed_sparse_matrix +from .save_delayed_array import save_delayed_array + +from .DelayedMask import DelayedMask +from .WrapperArraySeed import WrapperArraySeed +from .ReloadedArray import ReloadedArray, ReloadedArraySeed diff --git a/src/dolomite_matrix/_optimize_storage.py b/src/dolomite_matrix/_optimize_storage.py new file mode 100644 index 0000000..68bc01b --- /dev/null +++ b/src/dolomite_matrix/_optimize_storage.py @@ -0,0 +1,624 @@ +from delayedarray import SparseNdarray, is_sparse, apply_over_blocks +from typing import Callable, Any, Optional, Union, Tuple +from functools import singledispatch, reduce +from collections import namedtuple +from dataclasses import dataclass +import numpy +import dolomite_base as dl +import h5py + + +has_scipy = False +try: + import scipy.sparse + has_scipy = True +except: + pass + + +def _aggregate_any(collated: list, name: str): + for y in collated: + val = getattr(y, name) + if val is not None and val: + return True + return False + + +def _aggregate_min(collated: list, name: str): + mval = None + for y in collated: + val = getattr(y, name) + if val is not None: + if mval is None or mval > val: + mval = val + return mval + + +def _aggregate_max(collated: list, name: str): + mval = None + for y in collated: + val = getattr(y, 
name)
+        if val is not None:
+            if mval is None or mval < val:
+                mval = val
+    return mval
+
+
+def _aggregate_sum(collated: list, name: str):
+    mval = 0
+    for y in collated:
+        val = getattr(y, name)
+        if val is not None:
+            mval += val
+    return mval
+
+
+def _blockwise_any(x: numpy.ndarray, condition: Callable):
+    y = x.ravel()
+    step = 100000
+    limit = len(y)
+    for i in range(0, limit, step):
+        if condition(y[i : min(limit, i + step)]).any():
+            return True
+    return False
+
+
+def _collect_from_Sparse2darray(contents, fun: Callable, dtype: numpy.dtype):
+    if contents is None:
+        attrs = fun(numpy.array([], dtype=dtype))
+        attrs.non_zero = 0
+        return [attrs]
+
+    output = []
+    for i, node in enumerate(contents):
+        if node is None:
+            val = numpy.array([], dtype=dtype)
+        else:
+            val = node[1]
+        attrs = fun(val)
+        attrs.non_zero = len(val)
+        output.append(attrs)
+
+    return output
+
+
+_OptimizedStorageParameters = namedtuple("_OptimizedStorageParameters", ["type", "placeholder", "non_zero"])
+
+
+def _unique_values_from_ndarray(position: Tuple, contents: numpy.ndarray) -> set:
+    if not numpy.ma.is_masked(contents):
+        return set(contents)
+    output = set()
+    for y in contents:
+        if not numpy.ma.is_masked(y):
+            output.add(y)
+    return output
+
+
+def _unique_values_from_Sparse2darray(position: Tuple, contents: SparseNdarray) -> set:
+    output = set()
+    if contents is not None:
+        for i, node in enumerate(contents):
+            if node is not None:
+                output |= _unique_values_from_ndarray(position, node[1])
+    return output
+
+
+def _unique_values(x) -> set:
+    if is_sparse(x):
+        uniq_sets = apply_over_blocks(x, _unique_values_from_Sparse2darray, allow_sparse=True)
+    else:
+        uniq_sets = apply_over_blocks(x, _unique_values_from_ndarray)
+    return reduce(lambda a, b : a | b, uniq_sets)
+
+
+###################################################
+###################################################
+
+
+@singledispatch
+def collect_integer_attributes(x: Any):
+    if is_sparse(x):
+        collated = apply_over_blocks(x, lambda pos, block : _collect_integer_attributes_from_Sparse2darray(block), allow_sparse=True)
+    else:
+        collated = apply_over_blocks(x, lambda pos, block : _collect_integer_attributes_from_ndarray(block))
+    return _combine_integer_attributes(collated)
+
+
+@dataclass
+class _IntegerAttributes:
+    minimum: Optional[int]
+    maximum: Optional[int]
+    missing: bool
+    non_zero: int = 0
+
+
+def _simple_integer_collector(x: numpy.ndarray) -> _IntegerAttributes:
+    if x.size == 0:
+        return _IntegerAttributes(minimum = None, maximum = None, missing = False)
+
+    missing = False
+    if numpy.ma.is_masked(x):
+        if x.mask.all():
+            return _IntegerAttributes(minimum = None, maximum = None, missing = True)
+        if x.mask.any():
+            missing = True
+
+    return _IntegerAttributes(minimum=x.min(), maximum=x.max(), missing=missing)
+
+
+def _combine_integer_attributes(x: list[_IntegerAttributes]) -> _IntegerAttributes:
+    return _IntegerAttributes(
+        minimum=_aggregate_min(x, "minimum"),
+        maximum=_aggregate_max(x, "maximum"),
+        missing=_aggregate_any(x, "missing"),
+        non_zero=_aggregate_sum(x, "non_zero"),
+    )
+
+
+@collect_integer_attributes.register
+def _collect_integer_attributes_from_ndarray(x: numpy.ndarray) -> _IntegerAttributes:
+    return _simple_integer_collector(x)
+
+
+@collect_integer_attributes.register
+def _collect_integer_attributes_from_Sparse2darray(x: SparseNdarray) -> _IntegerAttributes:
+    collected = _collect_from_Sparse2darray(x.contents, _simple_integer_collector, x.dtype)
+    return _combine_integer_attributes(collected)
+
+
+if has_scipy:
@collect_integer_attributes.register + def _collect_integer_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix): + output = _simple_integer_collector(x.data) + output.non_zero = int(x.data.shape[0]) + return output + + + @collect_integer_attributes.register + def _collect_integer_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix): + output = _simple_integer_collector(x.data) + output.non_zero = int(x.data.shape[0]) + return output + + + @collect_integer_attributes.register + def _collect_integer_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix): + output = _simple_integer_collector(x.data) + output.non_zero = int(x.data.shape[0]) + return output + + +def optimize_integer_storage(x) -> _OptimizedStorageParameters: + attr = collect_integer_attributes(x) + lower = attr.minimum + upper = attr.maximum + has_missing = attr.missing + + if has_missing: + # If it's None, that means that there are only missing values in + # 'x', otherwise there should have been at least one finite value + # available. In any case, it means we can just do whatever we want so + # we'll just use the smallest type. + if lower is None: + return _OptimizedStorageParameters(type="i1", placeholder=-2**7, non_zero=attr.non_zero) + + if lower < 0: + if lower > -2**7 and upper < 2**7: + return _OptimizedStorageParameters(type="i1", placeholder=-2**7, non_zero=attr.non_zero) + elif lower > -2**15 and upper < 2**15: + return _OptimizedStorageParameters(type="i2", placeholder=-2**15, non_zero=attr.non_zero) + elif lower > -2**31 and upper < 2**31: + return _OptimizedStorageParameters(type="i4", placeholder=-2**31, non_zero=attr.non_zero) + else: + if upper < 2**8 - 1: + return _OptimizedStorageParameters(type="u1", placeholder=2**8-1, non_zero=attr.non_zero) + elif upper < 2**16 - 1: + return _OptimizedStorageParameters(type="u2", placeholder=2**16-1, non_zero=attr.non_zero) + elif upper < 2**31 - 1: # Yes, this is deliberate, as integer storage maxes out at 32-bit signed integers. + return _OptimizedStorageParameters(type="i4", placeholder=2**31-1, non_zero=attr.non_zero) + + return _OptimizedStorageParameters(type="f8", placeholder=numpy.NaN, non_zero=attr.non_zero) + + else: + # If it's infinite, that means that 'x' is of length zero, otherwise + # there should have been at least one finite value available. Here, + # the type doesn't matter, so we'll just use the smallest. + if lower is None: + return _OptimizedStorageParameters(type="i1", placeholder=None, non_zero=attr.non_zero) + + if lower < 0: + if lower >= -2**7 and upper < 2**7: + return _OptimizedStorageParameters(type="i1", placeholder=None, non_zero=attr.non_zero) + elif lower >= -2**15 and upper < 2**15: + return _OptimizedStorageParameters(type="i2", placeholder=None, non_zero=attr.non_zero) + elif lower >= -2**31 and upper < 2**31: + return _OptimizedStorageParameters(type="i4", placeholder=None, non_zero=attr.non_zero) + else: + if upper < 2**8: + return _OptimizedStorageParameters(type="u1", placeholder=None, non_zero=attr.non_zero) + elif upper < 2**16: + return _OptimizedStorageParameters(type="u2", placeholder=None, non_zero=attr.non_zero) + elif upper < 2**31: # Yes, this is deliberate, as integer storage maxes out at 32-bit signed integers. 
+ return _OptimizedStorageParameters(type="i4", placeholder=None, non_zero=attr.non_zero) + + return _OptimizedStorageParameters(type="f8", placeholder=None, non_zero=attr.non_zero) + + +################################################### +################################################### + + +@dataclass +class _FloatAttributes: + minimum: Optional[int] + maximum: Optional[int] + missing: bool + non_integer: bool + has_nan: bool + has_positive_inf: bool + has_negative_inf: bool + has_zero: Optional[bool] + has_lowest: Optional[bool] + has_highest: Optional[bool] + non_zero: int = 0 + + +@singledispatch +def collect_float_attributes(x: Any, no_missing: bool) -> _FloatAttributes: + if is_sparse(x): + collated = apply_over_blocks(x, lambda pos, block : _collect_float_attributes_from_Sparse2darray(block, no_missing), allow_sparse=True) + else: + collated = apply_over_blocks(x, lambda pos, block : _collect_float_attributes_from_ndarray(block, no_missing)) + return _combine_float_attributes(collated) + + +def _simple_float_collector(x: numpy.ndarray, no_missing: bool) -> _FloatAttributes: + output = _FloatAttributes( + minimum = None, + maximum = None, + missing = False, + non_integer = False, + has_nan = False, + has_positive_inf = False, + has_negative_inf = False, + has_zero = None, + has_lowest = None, + has_highest = None + ) + + if x.size == 0: + return output + + if not no_missing: + if numpy.ma.is_masked(x): + if x.mask.all(): + output.missing = True + return output + if x.mask.any(): + output.missing = True + + # While these are technically only used if there are missing values, we + # still need them to obtain 'non_integer', so we just compute them. + output.has_nan = _blockwise_any(x, numpy.isnan) + output.has_positive_inf = numpy.inf in x + output.has_negative_inf = -numpy.inf in x + + if output.has_nan or output.has_positive_inf or output.has_negative_inf: + output.non_integer = True + else: + output.non_integer = _blockwise_any(x, lambda b : (b % 1 != 0)) + + # Minimum and maximum are only used if all floats contain integers. + if not output.non_integer: + output.minimum = x.min() + output.maximum = x.max() + + # Highest/lowest are only used when there might be missing values. 
+ if not no_missing: + fstats = numpy.finfo(x.dtype) + output.has_lowest = fstats.min in x + output.has_highest = fstats.max in x + output.has_zero = 0 in x + + return output + + +@collect_float_attributes.register +def _collect_float_attributes_from_ndarray(x: numpy.ndarray, no_missing: bool) -> _FloatAttributes: + return _simple_float_collector(x, no_missing) + + +@collect_float_attributes.register +def _collect_float_attributes_from_Sparse2darray(x: SparseNdarray, no_missing: bool) -> _FloatAttributes: + collected = _collect_from_Sparse2darray(x.contents, lambda block : _simple_float_collector(block, no_missing), x.dtype) + return _combine_float_attributes(collected) + + +if has_scipy: + @collect_float_attributes.register + def _collect_float_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix, no_missing: bool): + output = _simple_float_collector(x.data, no_missing) + output.non_zero = int(x.data.shape[0]) + return output + + + @collect_float_attributes.register + def _collect_float_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix, no_missing: bool): + output = _simple_float_collector(x.data, no_missing) + output.non_zero = int(x.data.shape[0]) + return output + + + @collect_float_attributes.register + def _collect_float_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix, no_missing: bool): + output = _simple_float_collector(x.data, no_missing) + output.non_zero = int(x.data.shape[0]) + return output + + +def _combine_float_attributes(x: list[_FloatAttributes]) -> _FloatAttributes: + return _FloatAttributes( + minimum=_aggregate_min(x, "minimum"), + maximum=_aggregate_max(x, "maximum"), + non_integer=_aggregate_any(x, "non_integer"), + missing=_aggregate_any(x, "missing"), + has_nan=_aggregate_any(x, "has_nan"), + has_positive_inf=_aggregate_any(x, "has_positive_inf"), + has_negative_inf=_aggregate_any(x, "has_negative_inf"), + has_lowest=_aggregate_any(x, "has_lowest"), + has_highest=_aggregate_any(x, "has_highest"), + has_zero=_aggregate_any(x, "has_zero"), + non_zero=_aggregate_sum(x, "non_zero"), + ) + + +def optimize_float_storage(x) -> _OptimizedStorageParameters: + attr = collect_float_attributes(x, isinstance(x, numpy.ndarray) and not numpy.ma.is_masked(x)) + + if attr.missing: + if not attr.non_integer: + lower = attr.minimum + upper = attr.maximum + + # See logic in optimize_integer_storage(). 
+ if lower is None: + return _OptimizedStorageParameters(type="i1", placeholder=-2**7, non_zero=attr.non_zero) + + if lower < 0: + if lower > -2**7 and upper < 2**7: + return _OptimizedStorageParameters(type="i1", placeholder=-2**7, non_zero=attr.non_zero) + elif lower > -2**15 and upper < 2**15: + return _OptimizedStorageParameters(type="i2", placeholder=-2**15, non_zero=attr.non_zero) + elif lower > -2**31 and upper < 2**31: + return _OptimizedStorageParameters(type="i4", placeholder=-2**31, non_zero=attr.non_zero) + else: + if upper < 2**8 - 1: + return _OptimizedStorageParameters(type="u1", placeholder=2**8-1, non_zero=attr.non_zero) + elif upper < 2**16 - 1: + return _OptimizedStorageParameters(type="u2", placeholder=2**16-1, non_zero=attr.non_zero) + elif upper < 2**32 - 1: + return _OptimizedStorageParameters(type="u4", placeholder=2**32-1, non_zero=attr.non_zero) + + placeholder = None + if not attr.has_nan: + placeholder = numpy.NaN + elif not attr.has_positive_inf: + placeholder = numpy.inf + elif not attr.has_negative_inf: + placeholder = -numpy.inf + elif not attr.has_lowest: + placeholder = numpy.finfo(x.dtype).min + elif not attr.has_highest: + placeholder = numpy.finfo(x.dtype).max + elif not attr.has_zero: + placeholder = 0 + + # Fallback that just goes through and pulls out all unique values. + # This does involve a coercion to 64-bit floats, though; that's + # just how 'choose_missing_float_placeholder' works currently. + if placeholder is None: + uniq = _unique_values(x) + placeholder = dl.choose_missing_float_placeholder(uniq) + return _OptimizedStorageParameters(type="f8", placeholder=placeholder, non_zero=attr.non_zero) + + if x.dtype == numpy.float32: + return _OptimizedStorageParameters(type="f4", placeholder=placeholder, non_zero=attr.non_zero) + else: + return _OptimizedStorageParameters(type="f8", placeholder=placeholder, non_zero=attr.non_zero) + + else: + if not attr.non_integer: + lower = attr.minimum + upper = attr.maximum + + # See logic in optimize_integer_storage(). 
+            if lower is None:
+                return _OptimizedStorageParameters(type="i1", placeholder=None, non_zero=attr.non_zero)
+
+            if lower < 0:
+                if lower >= -2**7 and upper < 2**7:
+                    return _OptimizedStorageParameters(type="i1", placeholder=None, non_zero=attr.non_zero)
+                elif lower >= -2**15 and upper < 2**15:
+                    return _OptimizedStorageParameters(type="i2", placeholder=None, non_zero=attr.non_zero)
+                elif lower >= -2**31 and upper < 2**31:
+                    return _OptimizedStorageParameters(type="i4", placeholder=None, non_zero=attr.non_zero)
+            else:
+                if upper < 2**8:
+                    return _OptimizedStorageParameters(type="u1", placeholder=None, non_zero=attr.non_zero)
+                elif upper < 2**16:
+                    return _OptimizedStorageParameters(type="u2", placeholder=None, non_zero=attr.non_zero)
+                elif upper < 2**32:
+                    return _OptimizedStorageParameters(type="u4", placeholder=None, non_zero=attr.non_zero)
+
+        if x.dtype == numpy.float32:
+            return _OptimizedStorageParameters(type="f4", placeholder=None, non_zero=attr.non_zero)
+        else:
+            return _OptimizedStorageParameters(type="f8", placeholder=None, non_zero=attr.non_zero)
+
+
+###################################################
+###################################################
+
+
+@dataclass
+class _StringAttributes:
+    has_na1: bool
+    has_na2: bool
+    missing: bool
+    max_len: int
+
+
+def _simple_string_collector(x: numpy.ndarray) -> _StringAttributes:
+    if x.size == 0:
+        return _StringAttributes(
+            has_na1 = False,
+            has_na2 = False,
+            missing = False,
+            max_len = 0,
+        )
+
+    missing = False
+    if numpy.ma.is_masked(x):
+        if x.mask.all():
+            return _StringAttributes(
+                has_na1=False,
+                has_na2=False,
+                missing=True,
+                max_len=0,
+            )
+        if x.mask.any():
+            missing = True
+
+    max_len = 0
+    if missing:
+        for y in x.ravel():
+            if not numpy.ma.is_masked(y):
+                candidate = len(y.encode("UTF8"))
+                if max_len < candidate:
+                    max_len = candidate
+    else:
+        max_len = max(len(y.encode("UTF8")) for y in x.ravel())
+
+    return _StringAttributes(
+        has_na1=x.dtype.type("NA") in x,
+        has_na2=x.dtype.type("NA_") in x,
+        missing=missing,
+        max_len=max_len,
+    )
+
+
+@singledispatch
+def collect_string_attributes(x: Any) -> _StringAttributes:
+    collected = apply_over_blocks(x, lambda pos, block : _collect_string_attributes_from_ndarray(block))
+    return _combine_string_attributes(collected)
+
+
+def _combine_string_attributes(x: list[_StringAttributes]) -> _StringAttributes:
+    return _StringAttributes(
+        has_na1 = _aggregate_any(x, "has_na1"),
+        has_na2 = _aggregate_any(x, "has_na2"),
+        missing = _aggregate_any(x, "missing"),
+        max_len = _aggregate_max(x, "max_len"),
+    )
+
+
+@collect_string_attributes.register
+def _collect_string_attributes_from_ndarray(x: numpy.ndarray) -> _StringAttributes:
+    return _simple_string_collector(x)
+
+
+def optimize_string_storage(x) -> _OptimizedStorageParameters:
+    attr = collect_string_attributes(x)
+    attr.max_len = max(1, attr.max_len)
+
+    placeholder = None
+    if attr.missing:
+        if not attr.has_na1:
+            placeholder = "NA"
+        elif not attr.has_na2:
+            placeholder = "NA_"
+        else:
+            uniq = _unique_values(x)
+            placeholder = dl.choose_missing_string_placeholder(uniq)
+        attr.max_len = max(len(placeholder.encode("UTF8")), attr.max_len)
+
+    strtype = h5py.string_dtype(encoding = "utf8", length = attr.max_len)
+    return _OptimizedStorageParameters(type = strtype, placeholder = placeholder, non_zero = 0)
+
+
+###################################################
+###################################################
+
+
+@dataclass
+class _BooleanAttributes:
+    missing: bool
+    non_zero: int = 0
+
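For illustration (not part of the patch), the type-narrowing rules above can be exercised directly; this sketch assumes the private module path `dolomite_matrix._optimize_storage` stays as defined in this diff:

    import numpy
    from dolomite_matrix._optimize_storage import optimize_integer_storage, optimize_float_storage

    # Unmasked values in [-100, 100] fit into a signed byte; no placeholder is needed.
    x = numpy.array([-100, 0, 100])
    print(optimize_integer_storage(x))  # type="i1", placeholder=None

    # Masking an element reserves the most negative 8-bit value as the placeholder.
    m = numpy.ma.MaskedArray(x, mask=[False, True, False])
    print(optimize_integer_storage(m))  # type="i1", placeholder=-128

    # Floats that are all integral are demoted to integer storage.
    f = numpy.array([1.0, 2.0, 3.0])
    print(optimize_float_storage(f))  # type="u1" (non-negative integral values), placeholder=None
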
+@singledispatch +def collect_boolean_attributes(x: Any) -> _BooleanAttributes: + if is_sparse(x): + collated = apply_over_blocks(x, lambda pos, block : _collect_boolean_attributes_from_Sparse2darray(block), allow_sparse=True) + else: + collated = apply_over_blocks(x, lambda pos, block : _collect_boolean_attributes_from_ndarray(block)) + return _combine_boolean_attributes(collated) + + +@collect_boolean_attributes.register +def _collect_boolean_attributes_from_ndarray(x: numpy.ndarray) -> _BooleanAttributes: + return _simple_boolean_collector(x) + + +@collect_boolean_attributes.register +def _collect_boolean_attributes_from_Sparse2darray(x: SparseNdarray) -> _BooleanAttributes: + collected = _collect_from_Sparse2darray(x.contents, _simple_boolean_collector, x.dtype) + return _combine_boolean_attributes(collected) + + +def _simple_boolean_collector(x: numpy.ndarray) -> _BooleanAttributes: + missing = False + if x.size: + if numpy.ma.is_masked(x): + if x.mask.any(): + missing = True + return _BooleanAttributes(non_zero = 0, missing = missing) + + +def _combine_boolean_attributes(x: list[_BooleanAttributes]) -> _BooleanAttributes: + return _BooleanAttributes( + missing = _aggregate_any(x, "missing"), + non_zero = _aggregate_sum(x, "non_zero") + ) + + +if has_scipy: + @collect_boolean_attributes.register + def _collect_boolean_attributes_from_scipy_csc(x: scipy.sparse.csc_matrix): + output = _simple_boolean_collector(x.data) + output.non_zero = int(x.data.shape[0]) + return output + + + @collect_boolean_attributes.register + def _collect_boolean_attributes_from_scipy_csr(x: scipy.sparse.csr_matrix): + output = _simple_boolean_collector(x.data) + output.non_zero = int(x.data.shape[0]) + return output + + + @collect_boolean_attributes.register + def _collect_boolean_attributes_from_scipy_coo(x: scipy.sparse.coo_matrix): + output = _simple_boolean_collector(x.data) + output.non_zero = int(x.data.shape[0]) + return output + + +def optimize_boolean_storage(x) -> _OptimizedStorageParameters: + attr = collect_boolean_attributes(x) + if attr.missing: + return _OptimizedStorageParameters(type="i1", placeholder=-1, non_zero=attr.non_zero) + else: + return _OptimizedStorageParameters(type="i1", placeholder=None, non_zero=attr.non_zero) diff --git a/src/dolomite_matrix/_utils.py b/src/dolomite_matrix/_utils.py index fbeaa34..ea1441e 100644 --- a/src/dolomite_matrix/_utils.py +++ b/src/dolomite_matrix/_utils.py @@ -1,48 +1,11 @@ -from numpy import issubdtype, integer, floating, bool_, uint8, prod, ceil -from h5py import File -from typing import Tuple -from delayedarray import DelayedArray, chunk_shape - - -def _translate_array_type(dtype): - if issubdtype(dtype, integer): - return "integer" - if issubdtype(dtype, floating): - return "number" - if dtype == bool_: - return "boolean" - raise NotImplementedError("staging of '" + str(type(dtype)) + "' arrays not yet supported") - - -def _choose_file_dtype(dtype): - if dtype == bool_: - return uint8 - return dtype - - -def _open_writeable_hdf5_handle(path: str, cache_size: int, num_slots: int = 1000003): - # using a prime for the number of slots to avoid collisions in the cache. - return File(path, "w", rdcc_nbytes = cache_size, rdcc_nslots = num_slots) - - -def _choose_block_shape(x: DelayedArray, block_size: int) -> Tuple[int, ...]: - # Block shapes are calculated by scaling up the chunks (from last to first, - # i.e., the fastest changing to the slowest) until the block size is exceeded. 
-    full_shape = x.shape
-    ndim = len(full_shape)
-
-    block_shape = list(chunk_shape(x))
-    block_elements = int(block_size / x.dtype.itemsize)
-
-    for i in range(ndim - 1, -1, -1):
-        current_elements = prod(block_shape) # just recompute it, avoid potential overflow issues.
-        if current_elements >= block_elements:
-            break
-        scaling = int(block_elements / current_elements)
-        if scaling == 1:
-            break
-        block_shape[i] = min(full_shape[i], scaling * block_shape[i])
-
-    return (*block_shape,)
+import numpy
+
+def sanitize_for_writing(x, placeholder):
+    if not numpy.ma.is_masked(x):
+        return x
+    if not x.mask.any():
+        return x.data  # nothing is actually masked, so the underlying data can be used directly.
+    copy = x.data.copy()
+    copy[x.mask] = placeholder  # replace masked entries with the chosen placeholder.
+    return copy
diff --git a/src/dolomite_matrix/choose_dense_chunk_sizes.py b/src/dolomite_matrix/choose_chunk_dimensions.py
similarity index 85%
rename from src/dolomite_matrix/choose_dense_chunk_sizes.py
rename to src/dolomite_matrix/choose_chunk_dimensions.py
index bb5d6e6..dbf19ea 100644
--- a/src/dolomite_matrix/choose_dense_chunk_sizes.py
+++ b/src/dolomite_matrix/choose_chunk_dimensions.py
@@ -1,8 +1,9 @@
-from typing import Tuple
+from typing import Tuple, Any
 
 
-def choose_dense_chunk_sizes(shape: Tuple[int, ...], size: int, min_extent: int = 100, memory: int = 1e7) -> Tuple[int, ...]:
-    """Choose some chunk sizes to use for a dense HDF5 dataset. For each
+def choose_chunk_dimensions(shape: Tuple[int, ...], size: int, min_extent: int = 100, memory: int = 1e7) -> Tuple[int, ...]:
+    """
+    Choose chunk dimensions to use for a dense HDF5 dataset. For each
     dimension, we consider a slice of the array that consists of the full
     extent of all other dimensions. We want this slice to occupy less than
     ``memory`` in memory, and we resize the slice along the current dimension
diff --git a/src/dolomite_matrix/load_hdf5_dense_array.py b/src/dolomite_matrix/load_hdf5_dense_array.py
deleted file mode 100644
index 2dd395c..0000000
--- a/src/dolomite_matrix/load_hdf5_dense_array.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from typing import Any
-from numpy import bool_
-from filebackedarray import Hdf5DenseArray
-from dolomite_base import acquire_file
-
-
-def load_hdf5_dense_array(meta: dict[str, Any], project, **kwargs) -> Hdf5DenseArray:
-    """
-    Load a HDF5-backed dense array. In general, this function should be
-    called via :py:meth:`~dolomite_base.load_object.load_object`.
-
-    Args:
-        meta: Metadata for this HDF5 array.
-
-        project: Value specifying the project of interest. This is most
-            typically a string containing a file path to a staging directory
-            but may also be an application-specific object that works with
-            :py:meth:`~dolomite_base.acquire_file.acquire_file`.
-
-        kwargs: Further arguments, ignored.
-
-    Returns:
-        A HDF5-backed dense array.
-    """
-    fpath = acquire_file(project, meta["path"])
-    name = meta["hdf5_dense_array"]["dataset"]
-
-    dtype = None
-    if meta["array"]["type"] == "boolean":
-        dtype = bool_
-
-    return Hdf5DenseArray(fpath, name, dtype=dtype, native_order=False)
diff --git a/src/dolomite_matrix/load_hdf5_sparse_matrix.py b/src/dolomite_matrix/load_hdf5_sparse_matrix.py
deleted file mode 100644
index 99b4dae..0000000
--- a/src/dolomite_matrix/load_hdf5_sparse_matrix.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from typing import Any
-from numpy import bool_
-from filebackedarray import Hdf5CompressedSparseMatrix
-from dolomite_base import acquire_file
-
-
-def load_hdf5_sparse_matrix(meta: dict[str, Any], project, **kwargs) -> Hdf5CompressedSparseMatrix:
-    """
-    Load a HDF5-backed sparse matrix.
In general, this function should be - called via :py:meth:`~dolomite_base.load_object.load_object`. - - Args: - meta: Metadata for this HDF5 array. - - project: Value specifying the project of interest. This is most - typically a string containing a file path to a staging directory - but may also be an application-specific object that works with - :py:meth:`~dolomite_base.acquire_file.acquire_file`. - - kwargs: Further arguments, ignored. - - Returns: - A HDF5-backed sparse matrix. - """ - fpath = acquire_file(project, meta["path"]) - name = meta["hdf5_sparse_matrix"]["group"] - - dtype = None - if meta["array"]["type"] == "boolean": - dtype = bool_ - - return Hdf5CompressedSparseMatrix(fpath, name, shape=(*meta["array"]["dimensions"],), by_column=True, dtype=dtype) diff --git a/src/dolomite_matrix/read_compressed_sparse_matrix.py b/src/dolomite_matrix/read_compressed_sparse_matrix.py new file mode 100644 index 0000000..d8bab24 --- /dev/null +++ b/src/dolomite_matrix/read_compressed_sparse_matrix.py @@ -0,0 +1,54 @@ +import numpy +import os +import h5py +from delayedarray import DelayedArray +from filebackedarray import Hdf5CompressedSparseMatrixSeed + +from .DelayedMask import DelayedMask +from .ReloadedArray import ReloadedArray + + +def read_compressed_sparse_matrix(path: str, **kwargs) -> DelayedArray: + """ + Read a compressed sparse matrix from its on-disk representation. In + general, this function should not be called directly but instead be + dispatched via :py:meth:`~dolomite_base.read_object.read_object`. + + Args: + path: Path to the directory containing the object. + + kwargs: Further arguments, ignored. + + Returns: + A HDF5-backed compressed sparse matrix. + """ + fpath = os.path.join(path, "matrix.h5") + name = "compressed_sparse_matrix" + + with h5py.File(fpath, "r") as handle: + ghandle = handle[name] + dhandle = ghandle["data"] + + tt = ghandle.attrs["type"] + dtype = None + if tt == "boolean": + dtype = numpy.dtype("bool") + elif tt == "float": + if not numpy.issubdtype(dhandle.dtype, numpy.floating): + dtype = numpy.dtype("float64") + + layout = ghandle.attrs["layout"] + shape = (*[int(y) for y in ghandle["shape"]],) + + placeholder = None + if "missing-value-placeholder" in dhandle.attrs: + placeholder = dhandle.attrs["missing-value-placeholder"] + + bycol = (layout == "CSC") + if placeholder is None: + seed = Hdf5CompressedSparseMatrixSeed(fpath, name, shape=shape, by_column = bycol, dtype = dtype) + else: + core = Hdf5CompressedSparseMatrixSeed(fpath, name, shape=shape, by_column = bycol) + seed = DelayedMask(core, placeholder=placeholder, dtype=dtype) + + return ReloadedArray(seed, path) diff --git a/src/dolomite_matrix/read_dense_array.py b/src/dolomite_matrix/read_dense_array.py new file mode 100644 index 0000000..68b47cd --- /dev/null +++ b/src/dolomite_matrix/read_dense_array.py @@ -0,0 +1,59 @@ +import numpy +import os +import h5py +from delayedarray import DelayedArray +from filebackedarray import Hdf5DenseArraySeed + +from .DelayedMask import DelayedMask +from .ReloadedArray import ReloadedArray + + +def read_dense_array(path: str, **kwargs) -> DelayedArray: + """ + Read a dense array from its on-disk representation. In general, this + function should not be called directly but instead be dispatched via + :py:meth:`~dolomite_base.read_object.read_object`. + + Args: + path: Path to the directory containing the object. + + kwargs: Further arguments, ignored. + + Returns: + A HDF5-backed dense array. 
+ """ + fpath = os.path.join(path, "array.h5") + name = "dense_array" + + with h5py.File(fpath, "r") as handle: + ghandle = handle[name] + dhandle = ghandle["data"] + + tt = ghandle.attrs["type"] + dtype = None + if tt == "boolean": + dtype = numpy.dtype("bool") + elif tt == "string": + dtype_name = "U" + str(dhandle.dtype.itemsize) + dtype = numpy.dtype(dtype_name) + elif tt == "float": + if not numpy.issubdtype(dhandle.dtype, numpy.floating): + dtype = numpy.dtype("float64") + + transposed = False + if "transposed" in ghandle: + transposed = (ghandle["transposed"][()] != 0) + + placeholder = None + if "missing-value-placeholder" in dhandle.attrs: + placeholder = dhandle.attrs["missing-value-placeholder"] + + dname = name + "/data" + is_native = not transposed + if placeholder is None: + seed = Hdf5DenseArraySeed(fpath, dname, dtype=dtype, native_order=is_native) + else: + core = Hdf5DenseArraySeed(fpath, dname, native_order=is_native) + seed = DelayedMask(core, placeholder=placeholder, dtype=dtype) + + return ReloadedArray(seed, path) diff --git a/src/dolomite_matrix/save_compressed_sparse_matrix.py b/src/dolomite_matrix/save_compressed_sparse_matrix.py new file mode 100644 index 0000000..2a0b01a --- /dev/null +++ b/src/dolomite_matrix/save_compressed_sparse_matrix.py @@ -0,0 +1,327 @@ +from typing import Any +from functools import singledispatch +from dolomite_base import save_object, validate_saves +from delayedarray import SparseNdarray, chunk_shape +import os +import h5py +import numpy +import delayedarray + +from . import _utils as ut +from . import _optimize_storage as optim + + +has_scipy = False +try: + import scipy.sparse + has_scipy = True +except: + pass + + +def _choose_index_type(n: int) -> str: + if n < 2**8: + return "u1" + elif n < 2**16: + return "u2" + elif n < 2**32: + return "u4" + else: + return "u8" + + +############################################### +############################################### + + +@singledispatch +def _h5_write_sparse_matrix(x: Any, handle, details, compressed_sparse_matrix_buffer_size, compressed_sparse_matrix_chunk_size): + chunks = chunk_shape(x) + chunks_per_col = x.shape[0] / chunks[0] + chunks_per_row = x.shape[1] / chunks[1] + + # If we have to extract fewer chunks per column, we iterate by column to + # create a CSC matrix. Otherwise we make a CSR matrix. 
+    if chunks_per_col < chunks_per_row:
+        primary = 1
+        secondary = 0
+        handle.attrs["layout"] = "CSC"
+    else:
+        primary = 0
+        secondary = 1
+        handle.attrs["layout"] = "CSR"
+
+    compressed_sparse_matrix_chunk_size = min(compressed_sparse_matrix_chunk_size, details.non_zero)
+    dhandle = handle.create_dataset("data", shape = details.non_zero, dtype = details.type, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size)
+    if details.placeholder is not None:
+        dhandle.attrs.create("missing-value-placeholder", data = details.placeholder, dtype = details.type)
+
+    itype = _choose_index_type(x.shape[secondary])
+    ihandle = handle.create_dataset("indices", shape = details.non_zero, dtype = itype, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size)
+    indptrs = numpy.zeros(x.shape[primary] + 1, dtype = numpy.uint64)
+    counter = 0
+
+    block_size = delayedarray.choose_block_size_for_1d_iteration(x, dimension=primary, memory=compressed_sparse_matrix_buffer_size)
+    limit = x.shape[primary]
+    subset = [None] * 2
+    subset[secondary] = range(x.shape[secondary])
+
+    for start in range(0, limit, block_size):
+        end = min(limit, start + block_size)
+        subset[primary] = range(start, end)
+        block = delayedarray.extract_sparse_array(x, (*subset,))
+
+        if block.contents is not None:
+            # Sparse2darrays are always CSC, so if we need to save it as CSR,
+            # we transpose it before we extract the contents.
+            if primary == 0:
+                block = block.T
+
+            original = counter
+            icollected = []
+            dcollected = []
+
+            for i, b in enumerate(block.contents):
+                if b is not None:
+                    counter += len(b[0])
+                    icollected.append(b[0])
+                    dcollected.append(ut.sanitize_for_writing(b[1], details.placeholder))
+                indptrs[start + i + 1] = counter
+
+            # Collecting everything in memory for a single write operation, to avoid
+            # potential issues with writing/reloading partial chunks.
+            ihandle[original : counter] = numpy.concatenate(icollected)
+            dhandle[original : counter] = numpy.concatenate(dcollected)
+
+    handle.create_dataset("indptr", data=indptrs, dtype="u8", compression="gzip", chunks=True)
+
+
+if has_scipy:
+    def _write_compressed_sparse_matrix(x: Any, handle, details, compressed_sparse_matrix_buffer_size, compressed_sparse_matrix_chunk_size, by_column):
+        if by_column:
+            primary = 1
+            secondary = 0
+            handle.attrs["layout"] = "CSC"
+        else:
+            primary = 0
+            secondary = 1
+            handle.attrs["layout"] = "CSR"
+
+        compressed_sparse_matrix_chunk_size = min(compressed_sparse_matrix_chunk_size, details.non_zero)
+        itype = _choose_index_type(x.shape[secondary])
+        handle.create_dataset("indices", data = x.indices, dtype = itype, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size)
+        handle.create_dataset("indptr", data = x.indptr, dtype = "u8", compression = "gzip", chunks = True)
+
+        if not numpy.ma.is_masked(x.data):
+            handle.create_dataset("data", data = x.data, dtype = details.type, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size)
+        elif not x.data.mask.any():
+            handle.create_dataset("data", data = x.data.data, dtype = details.type, compression = "gzip", chunks = compressed_sparse_matrix_chunk_size)
+        else:
+            dhandle = handle.create_dataset("data", shape = details.non_zero, dtype = details.type, compression="gzip", chunks = compressed_sparse_matrix_chunk_size)
+            if details.placeholder is not None:
+                dhandle.attrs.create("missing-value-placeholder", data = details.placeholder, dtype = details.type)
+
+            step = max(1, int(compressed_sparse_matrix_buffer_size / compressed_sparse_matrix_chunk_size)) * compressed_sparse_matrix_chunk_size
+            for i in range(0, details.non_zero, step):
+                end = min(details.non_zero, i + step)
+                block = x.data[i : end] # might be a view, so sanitization (and possible copying) is necessary.
+ dhandle[i : end] = ut.sanitize_for_writing(block, details.placeholder) + + + @_h5_write_sparse_matrix.register + def _h5_write_sparse_matrix_from_csc_matrix(x: scipy.sparse.csc_matrix, handle, details, compressed_sparse_matrix_buffer_size, compressed_sparse_matrix_chunk_size): + _write_compressed_sparse_matrix( + x, + handle, + details, + compressed_sparse_matrix_buffer_size = compressed_sparse_matrix_buffer_size, + compressed_sparse_matrix_chunk_size = compressed_sparse_matrix_chunk_size, + by_column = True + ) + + + @_h5_write_sparse_matrix.register + def _h5_write_sparse_matrix_from_csr_matrix(x: scipy.sparse.csr_matrix, handle, details, compressed_sparse_matrix_buffer_size, compressed_sparse_matrix_chunk_size): + _write_compressed_sparse_matrix( + x, + handle, + details, + compressed_sparse_matrix_buffer_size = compressed_sparse_matrix_buffer_size, + compressed_sparse_matrix_chunk_size = compressed_sparse_matrix_chunk_size, + by_column = False + ) + + +############################################### +############################################### + + +def _save_compressed_sparse_matrix(x: Any, path: str, compressed_sparse_matrix_chunk_size: int = 10000, compressed_sparse_matrix_buffer_size: int = 1e8, **kwargs): + os.mkdir(path) + if len(x.shape) != 2: + raise ValueError("only 2-dimensional sparse arrays are currently supported") + + with h5py.File(os.path.join(path, "matrix.h5"), "w") as handle: + ghandle = handle.create_group("compressed_sparse_matrix") + + if numpy.issubdtype(x.dtype, numpy.integer): + tt = "integer" + opts = optim.optimize_integer_storage(x) + elif numpy.issubdtype(x.dtype, numpy.floating): + tt = "number" + opts = optim.optimize_float_storage(x) + elif x.dtype == numpy.bool_: + tt = "boolean" + opts = optim.optimize_boolean_storage(x) + else: + raise NotImplementedError("cannot save sparse matrix of type '" + x.dtype.name + "'") + + ghandle.attrs["type"] = tt + ghandle.create_dataset("shape", data = x.shape, dtype = "u8") + _h5_write_sparse_matrix( + x, + handle = ghandle, + details = opts, + compressed_sparse_matrix_buffer_size = compressed_sparse_matrix_buffer_size, + compressed_sparse_matrix_chunk_size = compressed_sparse_matrix_chunk_size + ) + + with open(os.path.join(path, "OBJECT"), "w") as handle: + handle.write('{ "type": "compressed_sparse_matrix", "compressed_sparse_matrix": { "version": "1.0" } }') + + +@save_object.register +@validate_saves +def save_compresssed_sparse_matrix_from_Sparse2darray(x: SparseNdarray, path: str, compressed_sparse_matrix_chunk_size: int = 10000, compressed_sparse_matrix_buffer_size: int = 1e8, **kwargs): + """ + Method for saving a :py:class:`~delayedarray.SparseNdarray.SparseNdarray` + to disk, see :py:meth:`~dolomite_base.save_object.save_object` for details. + + Args: + x: Object to be saved. + + path: Path to a directory to save ``x``. + + compressed_sparse_matrix_chunk_size + Chunk size for the data and indices. Larger values improve compression + at the potential cost of reducing random access efficiency. + + compressed_sparse_matrix_buffer_size: + Size of the buffer in bytes, for blockwise processing and writing + to file. Larger values improve speed at the cost of memory. + + kwargs: Further arguments, ignored. + + Returns: + ``x`` is saved to ``path``. 
+ """ + _save_compressed_sparse_matrix( + x, + path, + compressed_sparse_matrix_chunk_size = compressed_sparse_matrix_chunk_size, + compressed_sparse_matrix_buffer_size = compressed_sparse_matrix_buffer_size, + **kwargs + ) + + +if has_scipy: + @save_object.register + @validate_saves + def save_compresssed_sparse_matrix_from_scipy_csc_matrix(x: scipy.sparse.csc_matrix, path: str, compressed_sparse_matrix_chunk_size: int = 10000, compressed_sparse_matrix_buffer_size: int = 1e8, **kwargs): + """ + Method for saving :py:class:`~scipy.sparse.csc_matrix` objects to disk, + see :py:meth:`~dolomite_base.stage_object.stage_object` for details. + + Args: + x: Matrix to be saved. + + path: Path to a directory to save ``x``. + + compressed_sparse_matrix_chunk_size + Chunk size for the data and indices. Larger values improve compression + at the potential cost of reducing random access efficiency. + + compressed_sparse_matrix_cache_size: + Size of the buffer in bytes, for blockwise processing and writing + to file. Larger values improve speed at the cost of memory. + + kwargs: Further arguments, ignored. + + Returns: + ``x`` is saved to ``path``. + """ + return _save_compressed_sparse_matrix( + x, + path, + compressed_sparse_matrix_chunk_size = compressed_sparse_matrix_chunk_size, + compressed_sparse_matrix_buffer_size = compressed_sparse_matrix_buffer_size, + **kwargs + ) + + + @save_object.register + @validate_saves + def save_compresssed_sparse_matrix_from_scipy_csr_matrix(x: scipy.sparse.csr_matrix, path: str, compressed_sparse_matrix_chunk_size: int = 10000, compressed_sparse_matrix_buffer_size: int = 1e8, **kwargs): + """ + Method for saving :py:class:`~scipy.sparse.csr_matrix` objects to disk, + see :py:meth:`~dolomite_base.stage_object.stage_object` for details. + + Args: + x: Matrix to be saved. + + path: Path to a directory to save ``x``. + + compressed_sparse_matrix_chunk_size + Chunk size for the data and indices. Larger values improve compression + at the potential cost of reducing random access efficiency. + + compressed_sparse_matrix_cache_size: + Size of the buffer in bytes, for blockwise processing and writing + to file. Larger values improve speed at the cost of memory. + + kwargs: Further arguments, ignored. + + Returns: + ``x`` is saved to ``path``. + """ + return _save_compressed_sparse_matrix( + x, + path, + compressed_sparse_matrix_chunk_size = compressed_sparse_matrix_chunk_size, + compressed_sparse_matrix_buffer_size = compressed_sparse_matrix_buffer_size, + **kwargs + ) + + + @save_object.register + @validate_saves + def save_compresssed_sparse_matrix_from_scipy_coo_matrix(x: scipy.sparse.coo_matrix, path: str, compressed_sparse_matrix_chunk_size: int = 10000, compressed_sparse_matrix_buffer_size: int = 1e8, **kwargs): + """ + Method for saving :py:class:`~scipy.sparse.coo_matrix` objects to disk, + see :py:meth:`~dolomite_base.stage_object.stage_object` for details. + + Args: + x: Matrix to be saved. + + path: Path to a directory to save ``x``. + + compressed_sparse_matrix_chunk_size + Chunk size for the data and indices. Larger values improve compression + at the potential cost of reducing random access efficiency. + + compressed_sparse_matrix_cache_size: + Size of the buffer in bytes, for blockwise processing and writing + to file. Larger values improve speed at the cost of memory. + + kwargs: Further arguments, ignored. + + Returns: + ``x`` is saved to ``path``. 
+ """ + return _save_compressed_sparse_matrix( + x, + path, + compressed_sparse_matrix_chunk_size = compressed_sparse_matrix_chunk_size, + compressed_sparse_matrix_buffer_size = compressed_sparse_matrix_buffer_size, + **kwargs + ) diff --git a/src/dolomite_matrix/save_delayed_array.py b/src/dolomite_matrix/save_delayed_array.py new file mode 100644 index 0000000..70fec5b --- /dev/null +++ b/src/dolomite_matrix/save_delayed_array.py @@ -0,0 +1,48 @@ +from typing import Tuple, Optional, Any, Union +from dolomite_base import save_object, validate_saves +import delayedarray +import numpy +import os + +from .save_compressed_sparse_matrix import _save_compressed_sparse_matrix +from .save_dense_array import _save_dense_array + + +@save_object.register +@validate_saves +def save_delayed_array(x: delayedarray.DelayedArray, path: str, delayed_array_preserve_operations: bool = False, **kwargs): + """ + Method to save :py:class:`~delayedarray.DelayedArray.DelayedArray` objects + to disk, see :py:meth:`~dolomite_base.save_object.save_object` for details. + + If the array is pristine, we attempt to use the ``save_object`` method of + the seed. If ``delayed_array_preserve_operations = False``, we save the + ``DelayedArray`` as a dense array or a compressed sparse matrix. + + Args: + x: Object to be saved. + + path: Path to a directory to save ``x``. + + delayed_array_preserve_operations: + Whether to preserve delayed operations via the **chihaya** specification. + Currently not supported. + + kwargs: + Further arguments, passed to the ``save_object`` methods for dense + arrays and compressed sparse matrices. + + Returns: + ``x`` is saved to ``path``. + """ + if delayedarray.is_pristine(x): + candidate = save_object.dispatch(type(x.seed)) + if save_object.dispatch(object) != candidate: + return candidate(x.seed, path, delayed_array_preserve_operations=delayed_array_preserve_operations, **kwargs) + + if not delayed_array_preserve_operations: + if delayedarray.is_sparse(x): + return _save_compressed_sparse_matrix(x, path, **kwargs) + return _save_dense_array(x, path, **kwargs) + + raise NotImplementedError("no support yet for delayed operations, this will be coming soon") diff --git a/src/dolomite_matrix/save_dense_array.py b/src/dolomite_matrix/save_dense_array.py new file mode 100644 index 0000000..78e9648 --- /dev/null +++ b/src/dolomite_matrix/save_dense_array.py @@ -0,0 +1,183 @@ +from typing import Tuple, Optional, Any, Dict +import numpy +from dolomite_base import save_object, validate_saves +import delayedarray +import os +import h5py + +from .choose_chunk_dimensions import choose_chunk_dimensions +from . import _optimize_storage as optim +from . import _utils as ut + + +################################################### +################################################### + + +# We use a mock class with the properties of the HDF5 dataset. This allows us +# to use choose_block_shape_for_iteration to pick dimensions that align with +# the HDF5 chunks; these may or may not be suitable for the input array, but +# we'll take the chance that the input array is already in memory. 
+class _DenseArrayOutputMock: + def __init__(self, shape: Tuple, dtype: numpy.dtype, chunks: Tuple): + self.shape = shape + self.dtype = dtype + self.chunks = chunks + + +@delayedarray.chunk_shape.register +def _chunk_shape_DenseArrayOutputMock(x: _DenseArrayOutputMock): + return x.chunks + + +def _blockwise_write_to_hdf5(dhandle: h5py.Dataset, chunk_shape: Tuple, x: Any, placeholder: Any, is_string: bool, memory: int): + mock = _DenseArrayOutputMock(x.shape, x.dtype, chunk_shape) + block_shape = delayedarray.choose_block_shape_for_iteration(mock, memory=memory) + if placeholder is not None: + placeholder = x.dtype.type(placeholder) + + def _blockwise_dense_writer(pos: Tuple, block): + block = ut.sanitize_for_writing(block, placeholder) + + # h5py doesn't want to convert from numpy's Unicode type to bytes + # automatically, and fails: so fine, we'll do it ourselves. + if is_string: + block = block.astype(dhandle.dtype, copy=False) + + # Block processing is inherently Fortran-order based (i.e., first + # dimension is assumed to change the fastest), and the blocks + # themselves are also in F-contiguous layout (i.e., column-major). By + # comparison HDF5 uses C order. To avoid any rearrangement of data + # by h5py, we save it as a transposed array for efficiency. + coords = [slice(start, end) for start, end in reversed(pos)] + dhandle[(*coords,)] = block.T + + delayedarray.apply_over_blocks(x, _blockwise_dense_writer, block_shape = block_shape) + return + + +################################################### +################################################### + + +def _save_dense_array( + x: numpy.ndarray, + path: str, + dense_array_chunk_dimensions: Optional[Tuple[int, ...]] = None, + dense_array_chunk_args: Dict = {}, + dense_array_buffer_size: int = 1e8, + **kwargs +): + os.mkdir(path) + + # Coming up with a decent chunk size. + if dense_array_chunk_dimensions is None: + dense_array_chunk_dimensions = choose_chunk_dimensions(x.shape, x.dtype.itemsize, **dense_array_chunk_args) + else: + capped = [] + for i, d in enumerate(x.shape): + capped.append(min(d, dense_array_chunk_dimensions[i])) + dense_array_chunk_dimensions = (*capped,) + + # Choosing the smallest data type that we can use. + tt = None + blockwise = False + if numpy.issubdtype(x.dtype, numpy.integer): + tt = "integer" + opts = optim.optimize_integer_storage(x) + elif numpy.issubdtype(x.dtype, numpy.floating): + tt = "number" + opts = optim.optimize_float_storage(x) + elif x.dtype == numpy.bool_: + tt = "boolean" + opts = optim.optimize_boolean_storage(x) + elif numpy.issubdtype(x.dtype, numpy.str_): + tt = "string" + opts = optim.optimize_string_storage(x) + blockwise = True + else: + raise NotImplementedError("cannot save dense array of type '" + x.dtype.name + "'") + + if opts.placeholder is not None: + blockwise = True + if not isinstance(x, numpy.ndarray): + blockwise = True + + fpath = os.path.join(path, "array.h5") + with h5py.File(fpath, "w") as handle: + ghandle = handle.create_group("dense_array") + ghandle.attrs["type"] = tt + + if not blockwise: + # Saving it in transposed form if it's in Fortran order (i.e., first dimensions are fastest). + # This avoids the need for any data reorganization inside h5py itself. 
+ if x.flags.f_contiguous: + x = x.T + dense_array_chunk_dimensions = (*reversed(dense_array_chunk_dimensions),) + ghandle.create_dataset("transposed", data=1, dtype="i1") + else: + ghandle.create_dataset("transposed", data=0, dtype="i1") + dhandle = ghandle.create_dataset("data", data=x, chunks=dense_array_chunk_dimensions, dtype=opts.type, compression="gzip") + else: + # Block processing of a dataset is always Fortran order, but HDF5 uses C order. + # So, we save the blocks in transposed form for efficiency. + ghandle.create_dataset("transposed", data=1, dtype="i1") + dhandle = ghandle.create_dataset("data", shape=(*reversed(x.shape),), chunks=(*reversed(dense_array_chunk_dimensions),), dtype=opts.type, compression="gzip") + _blockwise_write_to_hdf5(dhandle, chunk_shape=dense_array_chunk_dimensions, x=x, placeholder=opts.placeholder, is_string=(tt == "string"), memory=dense_array_buffer_size) + if opts.placeholder is not None: + dhandle.attrs.create("missing-value-placeholder", data=opts.placeholder, dtype=opts.type) + + with open(os.path.join(path, "OBJECT"), "w") as handle: + handle.write('{ "type": "dense_array", "dense_array": { "version": "1.0" } }') + + +################################################### +################################################### + + +@save_object.register +@validate_saves +def save_dense_array_from_ndarray( + x: numpy.ndarray, + path: str, + dense_array_chunk_dimensions: Optional[Tuple[int, ...]] = None, + dense_array_chunk_args: Dict = {}, + dense_array_buffer_size: int = 1e8, + **kwargs +): + """ + Method for saving :py:class:`~numpy.ndarray` objects to disk, see + :py:meth:`~dolomite_base.save_object.save_object` for details. + + Args: + x: Object to be saved. + + path: Path to a directory to save ``x``. + + dense_array_chunk_dimensions: + Chunk dimensions for the HDF5 dataset. Larger values improve + compression at the potential cost of reducing random access + efficiency. If not provided, we choose some chunk sizes with + :py:meth:`~dolomite_matrix.choose_chunk_dimensions.choose_chunk_dimensions`. + + dense_array_chunk_args: + Arguments to pass to ``choose_chunk_dimensions`` if + ``dense_array_chunk_dimensions`` is not provided. + + dense_array_buffer_size: + Size of the buffer in bytes, for blockwise processing and writing + to file. Larger values improve speed at the cost of memory. + + kwargs: Further arguments, ignored. + + Returns: + ``x`` is saved to ``path``. 
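+
+    Example:
+        A minimal sketch of a round trip; the directory name is arbitrary:
+
+        .. code-block:: python
+
+            import numpy
+            from dolomite_base import save_object, read_object
+
+            x = numpy.random.rand(100, 200)
+            save_object(x, "my_dense_array")
+            roundtrip = read_object("my_dense_array")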
+ """ + _save_dense_array( + x, + path=path, + dense_array_chunk_dimensions=dense_array_chunk_dimensions, + dense_array_chunk_args = dense_array_chunk_args, + dense_array_buffer_size = dense_array_buffer_size, + **kwargs + ) diff --git a/src/dolomite_matrix/stage_DelayedArray.py b/src/dolomite_matrix/stage_DelayedArray.py deleted file mode 100644 index e40854d..0000000 --- a/src/dolomite_matrix/stage_DelayedArray.py +++ /dev/null @@ -1,180 +0,0 @@ -from typing import Tuple, Optional, Any, Union -from delayedarray import DelayedArray, chunk_shape, is_sparse, extract_dense_array, extract_sparse_array, is_pristine -from numpy import ceil, prod -from dolomite_base import stage_object -import os - -from .choose_dense_chunk_sizes import choose_dense_chunk_sizes -from ._utils import _choose_file_dtype, _translate_array_type, _open_writeable_hdf5_handle, _choose_block_shape -from .stage_sparse import _stage_sparse_matrix - - -def _stage_DelayedArray_dense( - x: DelayedArray, - dir: str, - path: str, - is_child: bool = False, - chunks: Optional[Tuple[int, ...]] = None, - cache_size: int = 1e8, - block_size: int = 1e8, - **kwargs -) -> dict[str, Any]: - os.mkdir(os.path.join(dir, path)) - newpath = path + "/array.h5" - - # Coming up with a decent chunk size. - if chunks is None: - chunks = choose_dense_chunk_sizes(x.shape, x.dtype.itemsize) - else: - capped = [] - for i, d in enumerate(x.shape): - capped.append(min(d, chunks[i])) - chunks = (*capped,) - - # Transposing it so that we save it in the right order. - t = x.T - chunks = (*list(reversed(chunks)),) - - # Saving the matrix in a blockwise fashion. We progress along HDF5's - # fastest changing dimension (i.e., the last one), and we shift along the - # other dimensions once we need to wrap around. - full_shape = t.shape - ndim = len(full_shape) - block_shape = _choose_block_shape(t, block_size) - - fpath = os.path.join(dir, newpath) - with _open_writeable_hdf5_handle(fpath, cache_size) as fhandle: - dset = fhandle.create_dataset("data", shape=t.shape, chunks=chunks, dtype=_choose_file_dtype(t.dtype), compression="gzip") - - num_chunks = [] - subset_as_slices = [] - for i, s in enumerate(full_shape): - b = block_shape[i] - num_chunks.append(int(ceil(s / b))) - subset_as_slices.append(slice(0, b)) - - starts = [0] * len(num_chunks) - counter = [0] * len(num_chunks) - subset_as_ranges = [None] * len(num_chunks) - - running = True - while running: - for i, sl in enumerate(subset_as_slices): - subset_as_ranges[i] = range(*(sl.indices(full_shape[i]))) - curblock = extract_dense_array(t, subset_as_ranges) - dset[(*subset_as_slices,)] = curblock - - for i in range(ndim - 1, -1, -1): - starts[i] += 1 - block_extent = block_shape[i] - if starts[i] < num_chunks[i]: - new_start = starts[i] * block_extent - subset_as_slices[i] = slice(new_start, min(new_start + block_extent, full_shape[i])) - break - if i == 0: - running = False - break - starts[i] = 0 - subset_as_slices[i] = slice(0, block_extent) - - return { - "$schema": "hdf5_dense_array/v1.json", - "path": newpath, - "is_child": is_child, - "array": { - "type": _translate_array_type(x.dtype), - "dimensions": list(x.shape), - }, - "hdf5_dense_array": { - "dataset": "data", - } - } - - -@stage_object.register -def stage_DelayedArray( - x: DelayedArray, - dir: str, - path: str, - is_child: bool = False, - chunks: Optional[Union[Tuple[int, ...], int]] = None, - cache_size: int = 1e8, - block_size: int = 1e8, - **kwargs -) -> dict[str, Any]: - """Method for saving 
:py:class:`~delaydearray.DelayedArray.DelayedArray` - objects to file, see :py:meth:`~dolomite_base.stage_object.stage_object` - for details. - - Args: - x: Array to be saved. - - dir: Staging directory. - - path: Relative path inside ``dir`` to save the object. - - is_child: Is ``x`` a child of another object? - - chunks: - For dense ``x``, a tuple of chunk dimensions. If None, we - automatically choose some chunk sizes with - :py:meth:`~dolomite_matrix.choose_dense_chunk_sizes.choose_dense_chunk_sizes`. - - For sparse ``x``, an integer specifying the chunk dimension in - :py:meth:`~dolomite_matrix.write_sparse_matrix.write_sparse_matrix`. - If None, a suitable chunk size is chosen. - - cache_size: - Size of the HDF5 cache size, in bytes. Larger values improve speed - at the cost of memory. - - block_size: - Size of each block in bytes. Staging is performed by iterating over - ``x``, extracting one block at a time, and saving it to the HDF5 - file. Larger values improve speed at the cost of memory. - - kwargs: - Further arguments, ignored for dense ``x``. For sparse ``x``, these - are passed to - :py:meth:`~dolomite_matrix.write_sparse_matrix.write_sparse_matrix`. - - Returns: - Metadata that can be edited by calling methods and then saved with - :py:meth:`~dolomite_base.write_metadata.write_metadata`. - """ - # Seeing if we can call specialized method for the seed in pristine objects. - if is_pristine(x) and isinstance(x, DelayedArray): - candidate = stage_object.dispatch(type(x.seed)) - if stage_object.dispatch(object) != candidate: - return candidate( - x.seed, - dir=dir, - path=path, - is_child=is_child, - chunks=chunks, - cache_size=cache_size, - block_size=block_size, - **kwargs - ) - - if is_sparse(x): - return _stage_sparse_matrix( - x, - dir=dir, - path=path, - is_child=is_child, - chunks=chunks, - block_size=block_size, - **kwargs, - ) - else: - return _stage_DelayedArray_dense( - x, - dir=dir, - path=path, - is_child=is_child, - chunks=chunks, - cache_size=cache_size, - block_size=block_size, - **kwargs - ) diff --git a/src/dolomite_matrix/stage_ndarray.py b/src/dolomite_matrix/stage_ndarray.py deleted file mode 100644 index 41e54b5..0000000 --- a/src/dolomite_matrix/stage_ndarray.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import Tuple, Optional, Any -from numpy import ndarray, bool_, uint8 -from dolomite_base import stage_object -import os - -from .choose_dense_chunk_sizes import choose_dense_chunk_sizes -from ._utils import _translate_array_type, _open_writeable_hdf5_handle, _choose_file_dtype - - -@stage_object.register -def stage_ndarray( - x: ndarray, - dir: str, - path: str, - is_child: bool = False, - chunks: Optional[Tuple[int, ...]] = None, - cache_size: int = 1e8, - **kwargs -) -> dict[str, Any]: - """Method for saving :py:class:`~numpy.ndarray` objects to file, see - :py:meth:`~dolomite_base.stage_object.stage_object` for details. - - Args: - x: Array to be saved. - - dir: Staging directory. - - path: Relative path inside ``dir`` to save the object. - - is_child: Is ``x`` a child of another object? - - chunks: - Chunk dimensions. If not provided, we choose some chunk sizes with - :py:meth:`~dolomite_matrix.choose_dense_chunk_sizes.choose_dense_chunk_sizes`. - - cache_size: - Size of the HDF5 cache size, in bytes. Larger values improve speed - at the cost of memory. - - kwargs: Further arguments, ignored. - - Returns: - Metadata that can be edited by calling methods and then saved with - :py:meth:`~dolomite_base.write_metadata.write_metadata`. 
- """ - os.mkdir(os.path.join(dir, path)) - newpath = path + "/array.h5" - - # Coming up with a decent chunk size. - if chunks is None: - chunks = choose_dense_chunk_sizes(x.shape, x.dtype.itemsize) - else: - capped = [] - for i, d in enumerate(x.shape): - capped.append(min(d, chunks[i])) - chunks = (*capped,) - - # Transposing it so that we save it in the right order. - t = x.T - chunks = (*list(reversed(chunks)),) - - fpath = os.path.join(dir, newpath) - with _open_writeable_hdf5_handle(fpath, cache_size) as fhandle: - fhandle.create_dataset("data", data=t, chunks=chunks, dtype=_choose_file_dtype(t.dtype), compression="gzip") - - return { - "$schema": "hdf5_dense_array/v1.json", - "path": newpath, - "is_child": is_child, - "array": { - "type": _translate_array_type(x.dtype), - "dimensions": list(x.shape), - }, - "hdf5_dense_array": { - "dataset": "data", - } - } diff --git a/src/dolomite_matrix/stage_sparse.py b/src/dolomite_matrix/stage_sparse.py deleted file mode 100644 index 5701468..0000000 --- a/src/dolomite_matrix/stage_sparse.py +++ /dev/null @@ -1,111 +0,0 @@ -from dolomite_base import stage_object -import os - -from .write_sparse_matrix import write_sparse_matrix -from ._utils import _translate_array_type - - -has_scipy = False -try: - import scipy.sparse - has_scipy = True -except: - pass - - -def _stage_sparse_matrix(x, dir: str, path: str, is_child: bool, **kwargs): - os.mkdir(os.path.join(dir, path)) - newpath = path + "/matrix.h5" - fullpath = os.path.join(dir, newpath) - write_sparse_matrix(x, fullpath, "data", **kwargs) - return { - "$schema": "hdf5_sparse_matrix/v1.json", - "path": newpath, - "array": { - "type": _translate_array_type(x.dtype), - "dimensions": list(x.shape), - }, - "hdf5_sparse_matrix": { - "format": "tenx_matrix", - "group": "data" - } - } - - -if has_scipy: - @stage_object.register - def stage_scipy_csc_matrix(x: scipy.sparse.csc_matrix, dir: str, path: str, is_child: bool = False, **kwargs): - """Method for saving :py:class:`~scipy.sparse.csc_matrix` objects to - file, see :py:meth:`~dolomite_base.stage_object.stage_object` for - details. - - Args: - x: Matrix to be saved. - - dir: Staging directory. - - path: Relative path inside ``dir`` to save the object. - - is_child: Is ``x`` a child of another object? - - kwargs: - Further arguments to pass to - :py:meth:`~dolomite_matrix.write_sparse_matrix.write_sparse_matrix`. - - Returns: - Metadata that can be edited by calling methods and then saved with - :py:meth:`~dolomite_base.write_metadata.write_metadata`. - """ - return _stage_sparse_matrix(x, dir, path, is_child, **kwargs) - - - @stage_object.register - def stage_scipy_csr_matrix(x: scipy.sparse.csr_matrix, dir: str, path: str, is_child: bool = False, **kwargs): - """Method for saving :py:class:`~scipy.sparse.csr_matrix` objects to - file, see :py:meth:`~dolomite_base.stage_object.stage_object` for - details. - - Args: - x: Matrix to be saved. - - dir: Staging directory. - - path: Relative path inside ``dir`` to save the object. - - is_child: Is ``x`` a child of another object? - - kwargs: - Further arguments to pass to - :py:meth:`~dolomite_matrix.write_sparse_matrix.write_sparse_matrix`. - - Returns: - Metadata that can be edited by calling methods and then saved with - :py:meth:`~dolomite_base.write_metadata.write_metadata`. 
- """ - return _stage_sparse_matrix(x, dir, path, is_child, **kwargs) - - - @stage_object.register - def stage_scipy_coo_matrix(x: scipy.sparse.coo_matrix, dir: str, path: str, is_child: bool = False, **kwargs): - """Method for saving :py:class:`~scipy.sparse.coo_matrix` objects to - file, see :py:meth:`~dolomite_base.stage_object.stage_object` for - details. - - Args: - x: Matrix to be saved. - - dir: Staging directory. - - path: Relative path inside ``dir`` to save the object. - - is_child: Is ``x`` a child of another object? - - kwargs: - Further arguments to pass to - :py:meth:`~dolomite_matrix.write_sparse_matrix.write_sparse_matrix`. - - Returns: - Metadata that can be edited by calling methods and then saved with - :py:meth:`~dolomite_base.write_metadata.write_metadata`. - """ - return _stage_sparse_matrix(x, dir, path, is_child, **kwargs) diff --git a/src/dolomite_matrix/write_sparse_matrix.py b/src/dolomite_matrix/write_sparse_matrix.py deleted file mode 100644 index 662ff3b..0000000 --- a/src/dolomite_matrix/write_sparse_matrix.py +++ /dev/null @@ -1,333 +0,0 @@ -from functools import singledispatch -from typing import Any, Optional -from collections import namedtuple -from delayedarray import wrap, extract_sparse_array, DelayedArray, SparseNdarray, Combine, guess_iteration_block_size -from h5py import File -import numpy - -from ._utils import _choose_block_shape - - -def _choose_smallest_uint(value): - if value < 2**8: - return numpy.uint8 - elif value < 2**16: - return numpy.uint16 - elif value < 2**32: - return numpy.uint32 - else: - return numpy.uint64 - - -def write_sparse_matrix(x, path: str, name: str, chunks: Optional[int] = None, guess_integer: bool = True, block_size: int = 1e8): - """Write a sparse matrix into a HDF5 file in a compressed-sparse column - format. This creates a HDF5 group containing the ``data``, ``indices`` and - ``indptr`` datasets, containing the corresponding components of the usual - compressed sparse representation. The shape of the matrix is stored in the - ``shape`` dataset inside the same group. - - Args: - x: Any sparse matrix. - - path: Path to the file in which to save ``x``. - - name: Name of the group in the file. - - chunks: - Size of the 1-dimensional chunks for the data and index datasets. - If None, defaults to 10000. - - guess_integer: - Whether to guess a compact integer type for the data. This can reduce - disk usage at the cost of a second pass through the data. - - block_size: - Size of the blocks to use for processing ``x``. Larger values - increase speed at the cost of memory efficiency. - - Returns: - A HDF5 file is created at ``path`` with the sparse data stored in the - ``name`` group. - """ - with File(path, "w") as handle: - ghandle = handle.create_group(name) - ghandle.create_dataset(name="shape", data=numpy.array(x.shape, dtype=numpy.uint64)) - - # Choosing a compact type for the indices. - index_dtype = _choose_smallest_uint(x.shape[0]) - - deets = _extract_details(x, block_size=block_size) - - # Choosing a compact type for the data. 
- if not guess_integer: - dtype = _choose_file_dtype(x.dtype) - else: - if deets.non_integer: - if x.dtype == numpy.float32: - dtype = numpy.float32 - else: - dtype = numpy.float64 - elif deets.minimum < 0: - if deets.minimum >= -2**7 and deets.maximum < 2**7: - dtype = numpy.int8 - elif deets.minimum >= -2**15 and deets.maximum < 2**15: - dtype = numpy.int16 - elif deets.minimum >= -2**31 and deets.maximum < 2**31: - dtype = numpy.int32 - else: - dtype = numpy.int64 - else: - dtype = _choose_smallest_uint(deets.maximum) - - # Doing the dump. - if chunks is None: - chunks = 10000 - ihandle = ghandle.create_dataset(name="indices", shape=deets.count, dtype=index_dtype, chunks=chunks, compression="gzip") - dhandle = ghandle.create_dataset(name="data", shape=deets.count, dtype=dtype, chunks=chunks, compression="gzip") - saved = _dump_sparse_matrix(x, ihandle, dhandle, block_size=block_size, offset=0) - - pointers = numpy.zeros(x.shape[1] + 1, dtype=numpy.uint64) - for i, s in enumerate(saved): - pointers[i + 1] += pointers[i] + s - ghandle.create_dataset(name="indptr", data=pointers, compression="gzip") - - -has_scipy = False -try: - import scipy.sparse - has_scipy = True -except: - pass - - -#################################################### -#################################################### - - -_SparseMatrixDetails = namedtuple("_SparseMatrixDetails", [ "minimum", "maximum", "non_integer", "count" ]) - - -def _extract_details_by_block(x, block_size: int) -> _SparseMatrixDetails: - full_shape = x.shape - block_shape = _choose_block_shape(x, block_size) - - num_row_blocks = int(numpy.ceil(full_shape[0] / block_shape[0])) - num_col_blocks = int(numpy.ceil(full_shape[1] / block_shape[1])) - - minimum = None - maximum = None - count = 0 - non_integer = False - - for c in range(num_col_blocks): - col_start = c * block_shape[1] - col_range = range(col_start, min(col_start + block_shape[1], full_shape[1])) - - for r in range(num_row_blocks): - row_start = c * block_shape[0] - row_range = range(row_start, min(row_start + block_shape[0], full_shape[0])) - - block = extract_sparse_array(x, (row_range, col_range)) - deets = _extract_details(block) - - if minimum is None or deets.minimum < minimum: - minimum = deets.minimum - if maximum is None or deets.maximum > maximum: - maximum = deets.maximum - if not non_integer: - non_integer = deets.non_integer - count += deets.count - - return _SparseMatrixDetails( - minimum = minimum, - maximum = maximum, - non_integer = non_integer, - count = count, - ) - - -@singledispatch -def _extract_details(x: Any, block_size: int, **kwargs) -> _SparseMatrixDetails: - return _extract_details_by_block(x, block_size) - - -def _has_non_integer(data: numpy.ndarray): - if numpy.issubdtype(data.dtype, numpy.floating): - for y in data: - if not y.is_integer(): - return True - return False - - -@_extract_details.register -def _extract_details_SparseNdarray(x: SparseNdarray, **kwargs) -> _SparseMatrixDetails: - minimum = None - maximum = None - count = 0 - non_integer = False - - if x.contents is not None: - for idx, val in x.contents: - candidate_min = min(val) - if minimum is None or candidate_min < minimum: - minimum = candidate_min - - candidate_max = max(val) - if maximum is None or candidate_max > maximum: - maximum = candidate_max - - if not non_integer: - non_integer = _has_non_integer(val) - - count += len(val) - else: - minimum = 0 - maximum = 0 - - return _SparseMatrixDetails( - minimum = minimum, - maximum = maximum, - non_integer = non_integer, - count = count, 
- ) - - - -@_extract_details.register -def _extract_details_Combine(x: Combine, **kwargs) -> _SparseMatrixDetails: - minimum = None - maximum = None - count = 0 - non_integer = False - - for s in x.seeds: - deets = _extract_details(s, **kwargs) - if minimum is None or deets.minimum < minimum: - minimum = deets.minimum - if maximum is None or deets.maximum > maximum: - maximum = deets.maximum - if not non_integer: - non_integer = deets.non_integer - count += deets.count - - return _SparseMatrixDetails( - minimum = minimum, - maximum = maximum, - non_integer = non_integer, - count = count, - ) - - -@_extract_details.register -def _extract_details_DelayedArray(x: DelayedArray, block_size: int, **kwargs) -> _SparseMatrixDetails: - candidate = _extract_details.dispatch(type(x.seed)) - if _extract_details.dispatch(object) != candidate: - return candidate(x.seed, block_size = block_size, **kwargs) - else: - return _extract_details_by_block(x, block_size) - - -if has_scipy: - def _extract_from_compressed_data(data: numpy.ndarray): - return _SparseMatrixDetails( - minimum = min(data), - maximum = max(data), - non_integer = _has_non_integer(data), - count = len(data), - ) - - - @_extract_details.register - def _extract_details_scipy_csc(x: scipy.sparse.csc_matrix, **kwargs): - return _extract_from_compressed_data(x.data) - - - @_extract_details.register - def _extract_details_scipy_csr(x: scipy.sparse.csr_matrix, **kwargs): - return _extract_from_compressed_data(x.data) - - - @_extract_details.register - def _extract_details_scipy_coo(x: scipy.sparse.coo_matrix, **kwargs): - return _extract_from_compressed_data(x.data) - - -#################################################### -#################################################### - -def _dump_sparse_matrix_as_SparseNdarray(x, indices_handle, data_handle, offset: int): - saved = numpy.zeros(x.shape[1], dtype=numpy.uint64) - if x.contents is not None: - for s, con in enumerate(x.contents): - if con is not None: - idx, val = con - indices_handle[offset:offset + len(idx)] = idx - data_handle[offset:offset + len(val)] = val - offset += len(idx) - saved[s] = len(idx) - return saved - - -def _dump_sparse_matrix_by_block(x, indices_handle, data_handle, offset: int, block_size: int): - block_size = guess_iteration_block_size(x, 1, block_size) - start = 0 - row_range = range(x.shape[0]) - all_saved = [numpy.zeros(0, dtype=numpy.uint64)] - - while start < x.shape[1]: - end = min(start + block_size, x.shape[1]) - col_range = range(start, end) - block = extract_sparse_array(x, (row_range, col_range)) - saved = _dump_sparse_matrix_as_SparseNdarray(block, indices_handle, data_handle, offset=offset) - offset += saved.sum() - all_saved.append(saved) - start = end - - return numpy.concatenate(all_saved) - - -@singledispatch -def _dump_sparse_matrix(x: Any, indices_handle, data_handle, offset: int, block_size: int, **kwargs) -> numpy.ndarray: - return _dump_sparse_matrix_by_block(x, indices_handle, data_handle, offset=offset, block_size=block_size) - - -@_dump_sparse_matrix.register -def _dump_sparse_matrix_SparseNdarray(x: SparseNdarray, indices_handle, data_handle, offset: int, block_size: int, **kwargs): - return _dump_sparse_matrix_by_block(x, indices_handle, data_handle, offset=offset, block_size=block_size) - - -@_dump_sparse_matrix.register -def _dump_sparse_matrix_Combine(x: Combine, indices_handle, data_handle, offset: int, block_size: int, **kwargs): - if x.along != 1: - return _dump_sparse_matrix_by_block(x, indices_handle, data_handle, offset=offset, 
block_size=block_size) - - all_saved = [numpy.zeros(0, dtype=numpy.uint64)] - offset = 0 - for s in x.seeds: - saved = _dump_sparse_matrix(s, indices_handle, data_handle, offset=offset, block_size=block_size, **kwargs) - all_saved.append(saved) - offset += int(saved.sum()) - - return numpy.concatenate(all_saved) - - -@_dump_sparse_matrix.register -def _dump_sparse_matrix_DelayedArray(x: DelayedArray, indices_handle, data_handle, offset: int, block_size: int, **kwargs): - candidate = _dump_sparse_matrix.dispatch(type(x.seed)) - if _dump_sparse_matrix.dispatch(object) != candidate: - return candidate(x.seed, indices_handle, data_handle, offset=offset, block_size = block_size, **kwargs) - else: - return _dump_sparse_matrix_by_block(x, indices_handle, data_handle, offset=offset, block_size=block_size) - - -if has_scipy: - # All other representations can undergo block processing. - @_dump_sparse_matrix.register - def _dump_sparse_matrix_scipy_csc(x: scipy.sparse.csc_matrix, indices_handle, data_handle, offset: int, block_size: int, **kwargs): - n = len(x.data) - indices_handle[offset:offset + n] = x.indices - data_handle[offset:offset + n] = x.data - saved = numpy.zeros(x.shape[1], dtype=numpy.uint64) - for i in range(x.shape[1]): - saved[i] = x.indptr[i+1] - x.indptr[i] - return saved diff --git a/tests/test_DelayedMask.py b/tests/test_DelayedMask.py new file mode 100644 index 0000000..a275074 --- /dev/null +++ b/tests/test_DelayedMask.py @@ -0,0 +1,115 @@ +import dolomite_matrix as dm +import delayedarray +import numpy + + +def test_DelayedMask_dense(): + y = numpy.array([[1,2,3],[4,5,6]]) + + m = dm.DelayedMask(y, 1) + assert m.dtype == y.dtype + assert m.shape == y.shape + assert m.placeholder == 1 + assert not delayedarray.is_sparse(m) + assert delayedarray.chunk_shape(m) == (1, 3) + + block = delayedarray.extract_dense_array(m) + assert numpy.ma.is_masked(block) + assert numpy.ma.is_masked(block[0,0]) + assert block[1,2] == 6 + + # A no-op block. + block = delayedarray.extract_dense_array(m, (range(0, 2), range(1, 3))) + assert not numpy.ma.is_masked(block) + + +def test_DelayedMask_dense_NaN(): + y = numpy.array([[1,2,numpy.NaN],[4,5,6]]) + m = dm.DelayedMask(y, numpy.NaN) + block = delayedarray.extract_dense_array(m) + assert numpy.ma.is_masked(block) + assert numpy.ma.is_masked(block[0,2]) + assert block[0,0] == 1 + + +def test_DelayedMask_strings(): + # Check that the placeholders work with different types of strings being compared. + y = numpy.array(["A", "B", "C"]) + m = dm.DelayedMask(y, "C") + block = delayedarray.extract_dense_array(m) + assert numpy.ma.is_masked(block) + assert numpy.ma.is_masked(block[2]) + assert block[0] == "A" + + m = dm.DelayedMask(y, "ASDASD") + block = delayedarray.extract_dense_array(m) + assert not numpy.ma.is_masked(block) + assert m.placeholder == "ASDASD" + + m = dm.DelayedMask(y, b"B") + block = delayedarray.extract_dense_array(m) + assert numpy.ma.is_masked(block) + assert numpy.ma.is_masked(block[1]) + assert m.placeholder == "B" + + m = dm.DelayedMask(y, numpy.bytes_("C")) + block = delayedarray.extract_dense_array(m) + assert numpy.ma.is_masked(block) + assert m.placeholder == "C" + + # Trying with byte strings now. 
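+    # (DelayedMask coerces the placeholder to the seed's dtype, so the str
+    # placeholders below are encoded to bytes, mirroring the decoding above.)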
+ y = numpy.array([b"A", b"B", b"C"]) + m = dm.DelayedMask(y, b"A") + block = delayedarray.extract_dense_array(m) + assert numpy.ma.is_masked(block) + assert numpy.ma.is_masked(block[0]) + assert block[2] == b"C" + + m = dm.DelayedMask(y, "ASDASD") + block = delayedarray.extract_dense_array(m) + assert not numpy.ma.is_masked(block) + assert m.placeholder == b"ASDASD" + + m = dm.DelayedMask(y, numpy.str_("B")) + block = delayedarray.extract_dense_array(m) + assert numpy.ma.is_masked(block) + assert m.placeholder == b"B" + + +def test_DelayedMask_dense_type(): + y = numpy.array([[1,2,10000],[4,5,6]]) + m = dm.DelayedMask(y, 10000, dtype=numpy.dtype(numpy.int8)) + + block = delayedarray.extract_dense_array(m) + assert block.dtype == numpy.int8 + assert numpy.ma.is_masked(block) + assert numpy.ma.is_masked(block[0,2]) + assert block[0,0] == 1 + + +def test_DelayedMask_sparse(): + contents = [None, (numpy.array([1, 8]), numpy.array([2, 10])), None, None, (numpy.array([0, 3, 9]), numpy.array([3, 4, 6]))] + y = delayedarray.SparseNdarray((10, 5), contents) + + m = dm.DelayedMask(y, 4) + assert m.dtype == y.dtype + assert m.shape == y.shape + assert m.placeholder == 4 + assert delayedarray.is_sparse(m) + assert delayedarray.chunk_shape(m) == (10, 1) + + block = delayedarray.extract_sparse_array(m) + assert block.dtype == y.dtype + assert numpy.ma.is_masked(block.contents[4][1]) + #assert numpy.ma.is_masked(block[3,4]) # TODO: fix SparseNdarray subsetting to support masked arrays. + assert block[1,1] == 2 + + +def test_DelayedMask_dense_dask(): + y = numpy.array([[1,2,10000],[4,5,6]]) + m = dm.DelayedMask(y, 10000, dtype=numpy.dtype(numpy.int8)) + da = delayedarray.create_dask_array(m).compute() + assert numpy.ma.is_masked(da) + assert numpy.ma.is_masked(da[0,2]) + assert da[1,2] == 6 + diff --git a/tests/test_ReloadedArray.py b/tests/test_ReloadedArray.py new file mode 100644 index 0000000..8a30e63 --- /dev/null +++ b/tests/test_ReloadedArray.py @@ -0,0 +1,78 @@ +import numpy +from dolomite_base import save_object +import dolomite_matrix as dm +import os +import scipy.sparse +import filebackedarray +import delayedarray +from tempfile import mkdtemp + + +def test_ReloadedArray_basic(): + y = numpy.random.rand(100, 200) + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = dm.read_dense_array(dir) + + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.floating) + assert isinstance(roundtrip, dm.ReloadedArray) + assert roundtrip.path == dir + assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5DenseArraySeed) + + assert (delayedarray.extract_dense_array(roundtrip) == y).all() + + # Checking re-saves. + dir2 = os.path.join(mkdtemp(), "foobar2") + save_object(roundtrip, dir2) + assert os.path.samefile(os.path.join(dir, "OBJECT"), os.path.join(dir2, "OBJECT")) + assert os.path.samefile(os.path.join(dir, "array.h5"), os.path.join(dir2, "array.h5")) + + dir2 = os.path.join(mkdtemp(), "foobar2") + save_object(roundtrip, dir2, reloaded_array_reuse_mode="symlink") + assert os.readlink(os.path.join(dir2, "OBJECT")) == os.path.join(dir, "OBJECT") + assert os.readlink(os.path.join(dir2, "array.h5")) == os.path.join(dir, "array.h5") + + # Copies along the tracer. 
+ with open(os.path.join(dir, "_TRACER"), "w") as handle: + handle.write("YAY") + dir2 = os.path.join(mkdtemp(), "foobar2") + save_object(roundtrip, dir2, reloaded_array_reuse_mode="copy") + assert os.stat(os.path.join(dir2, "OBJECT")).st_size == os.stat(os.path.join(dir, "OBJECT")).st_size + assert os.stat(os.path.join(dir2, "array.h5")).st_size == os.stat(os.path.join(dir, "array.h5")).st_size + assert os.stat(os.path.join(dir2, "_TRACER")).st_size == os.stat(os.path.join(dir, "_TRACER")).st_size + + # New save ignores the tracer. + dir2 = os.path.join(mkdtemp(), "foobar2") + save_object(roundtrip, dir2, reloaded_array_reuse_mode="none") + assert os.path.exists(os.path.join(dir2, "OBJECT")) + assert os.path.exists(os.path.join(dir2, "array.h5")) + assert not os.path.exists(os.path.join(dir2, "_TRACER")) + + +def test_ReloadedArray_sparse(): + y = (scipy.sparse.random(1000, 200, 0.1) > 0.5).tocsc() + dir = os.path.join(mkdtemp(),"foobar") + save_object(y, dir) + roundtrip = dm.read_compressed_sparse_matrix(dir) + + assert roundtrip.shape == y.shape + assert roundtrip.dtype == numpy.bool_ + assert isinstance(roundtrip, dm.ReloadedArray) + assert roundtrip.path == dir + assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5CompressedSparseMatrixSeed) + + as_sparse = delayedarray.extract_sparse_array(roundtrip) + assert isinstance(as_sparse, delayedarray.SparseNdarray) + assert (numpy.array(as_sparse) == y.toarray()).all() + + # Checking re-saves. + dir2 = os.path.join(mkdtemp(), "foobar2") + save_object(roundtrip, dir2) + assert os.path.samefile(os.path.join(dir, "OBJECT"), os.path.join(dir2, "OBJECT")) + assert os.path.samefile(os.path.join(dir, "matrix.h5"), os.path.join(dir2, "matrix.h5")) + + dir2 = os.path.join(mkdtemp(), "foobar2") + save_object(roundtrip, dir2, reloaded_array_reuse_mode="none") + assert os.path.exists(os.path.join(dir2, "OBJECT")) + assert os.path.exists(os.path.join(dir2, "matrix.h5")) diff --git a/tests/test_choose_chunk_dimensions.py b/tests/test_choose_chunk_dimensions.py new file mode 100644 index 0000000..2395bb7 --- /dev/null +++ b/tests/test_choose_chunk_dimensions.py @@ -0,0 +1,16 @@ +from dolomite_matrix import choose_chunk_dimensions + + +def test_choose_chunk_dimensions(): + assert choose_chunk_dimensions((1000, 100), 4, min_extent = 0, memory=8000) == (20, 2) + assert choose_chunk_dimensions((1000, 100), 4, min_extent = 10, memory=8000) == (20, 10) + assert choose_chunk_dimensions((1000, 100), 4, min_extent = 10, memory=80000) == (200, 20) + assert choose_chunk_dimensions((1000, 100), 4, min_extent = 0, memory=800000) == (1000, 100) + assert choose_chunk_dimensions((1000, 100), 8, min_extent = 0, memory=80000) == (100, 10) + assert choose_chunk_dimensions((1000, 100), 1, min_extent = 0, memory=1000) == (10, 1) + + +def test_choose_chunk_dimensions_3d(): + assert choose_chunk_dimensions((1000, 100, 10), 4, min_extent = 0, memory=400000) == (100, 10, 1) + assert choose_chunk_dimensions((1000, 100, 10), 1, min_extent = 0, memory=400000) == (400, 40, 4) + assert choose_chunk_dimensions((1000, 100, 10), 1, min_extent = 0, memory=1e8) == (1000, 100, 10) diff --git a/tests/test_choose_dense_chunk_sizes.py b/tests/test_choose_dense_chunk_sizes.py deleted file mode 100644 index 9460e12..0000000 --- a/tests/test_choose_dense_chunk_sizes.py +++ /dev/null @@ -1,16 +0,0 @@ -from dolomite_matrix import choose_dense_chunk_sizes - - -def test_choose_dense_chunk_sizes(): - assert choose_dense_chunk_sizes((1000, 100), 4, min_extent = 0, memory=8000) == (20, 2) - 
assert choose_dense_chunk_sizes((1000, 100), 4, min_extent = 10, memory=8000) == (20, 10) - assert choose_dense_chunk_sizes((1000, 100), 4, min_extent = 10, memory=80000) == (200, 20) - assert choose_dense_chunk_sizes((1000, 100), 4, min_extent = 0, memory=800000) == (1000, 100) - assert choose_dense_chunk_sizes((1000, 100), 8, min_extent = 0, memory=80000) == (100, 10) - assert choose_dense_chunk_sizes((1000, 100), 1, min_extent = 0, memory=1000) == (10, 1) - - -def test_choose_dense_chunk_sizes_3d(): - assert choose_dense_chunk_sizes((1000, 100, 10), 4, min_extent = 0, memory=400000) == (100, 10, 1) - assert choose_dense_chunk_sizes((1000, 100, 10), 1, min_extent = 0, memory=400000) == (400, 40, 4) - assert choose_dense_chunk_sizes((1000, 100, 10), 1, min_extent = 0, memory=1e8) == (1000, 100, 10) diff --git a/tests/test_compressed_sparse_matrix.py b/tests/test_compressed_sparse_matrix.py new file mode 100644 index 0000000..778f337 --- /dev/null +++ b/tests/test_compressed_sparse_matrix.py @@ -0,0 +1,88 @@ +import scipy.sparse +import dolomite_base as dl +import dolomite_matrix as dm +from tempfile import mkdtemp +import numpy +import delayedarray +import filebackedarray +import os + + +def test_compressed_sparse_matrix_csc(): + y = scipy.sparse.random(1000, 200, 0.1).tocsc() + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dm.read_compressed_sparse_matrix(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert isinstance(roundtrip, dm.ReloadedArray) + assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5CompressedSparseMatrixSeed) + assert (numpy.array(roundtrip) == y.toarray()).all() + + +def test_compressed_sparse_matrix_csr(): + y = scipy.sparse.random(1000, 200, 0.1) + y = y.tocsr() + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dm.read_compressed_sparse_matrix(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.floating) + assert (numpy.array(roundtrip) == y.toarray()).all() + + +def test_compressed_sparse_matrix_coo(): + y = scipy.sparse.random(1000, 200, 0.1) + y = y.tocoo() + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dm.read_compressed_sparse_matrix(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.floating) + assert (numpy.array(roundtrip) == y.toarray()).all() + + +def test_compressed_sparse_matrix_SparseNdarray(): + y = delayedarray.SparseNdarray( + (10, 5), + [ + None, + (numpy.array([0, 8]), numpy.array([1, 20])), + None, + (numpy.array([2, 9]), numpy.array([0, 5000])), + None + ] + ) + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dm.read_compressed_sparse_matrix(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.integer) + assert (numpy.array(roundtrip) == numpy.array(y)).all() + + +def test_compressed_sparse_matrix_integer(): + y = (scipy.sparse.random(1000, 200, 0.1) * 10).tocsc() + y = y.astype(numpy.int32) + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = dm.read_compressed_sparse_matrix(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.integer) + assert (numpy.array(roundtrip) == y.toarray()).all() + + +def test_compressed_sparse_matrix_boolean(): + y = (scipy.sparse.random(1000, 200, 0.1) > 0).tocsc() + dir = os.path.join(mkdtemp(),"foobar") + dl.save_object(y, dir) + + roundtrip = 
dm.read_compressed_sparse_matrix(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert (numpy.array(roundtrip) == y.toarray()).all() diff --git a/tests/test_delayed_array.py b/tests/test_delayed_array.py new file mode 100644 index 0000000..bfef9da --- /dev/null +++ b/tests/test_delayed_array.py @@ -0,0 +1,129 @@ +import numpy +import dolomite_base as dl +import dolomite_matrix as dm +import delayedarray as da +import os +import h5py +import filebackedarray +from tempfile import mkdtemp +import scipy.sparse + + +def test_save_delayed_array_simple(): + x = numpy.random.rand(100, 200) + y = da.wrap(x) + 1 + assert isinstance(y, da.DelayedArray) + + dir = os.path.join(mkdtemp(), "foobar") + dl.save_object(y, dir) + + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert isinstance(roundtrip, dm.ReloadedArray) + assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5DenseArraySeed) + assert (numpy.array(roundtrip) == x + 1).all() + + +def test_save_delayed_array_booleans(): + x1 = numpy.random.rand(100, 200) > 0 + x2 = numpy.random.rand(100, 200) > 0 + y = numpy.logical_and(da.wrap(x1), da.wrap(x2)) + assert isinstance(y, da.DelayedArray) + + dir = os.path.join(mkdtemp(), "foobar") + dl.save_object(y, dir) + + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == numpy.bool_ + assert (numpy.array(roundtrip) == numpy.logical_and(x1, x2)).all() + + +######################################################## +######################################################## + + +class _ChunkyBoi: + def __init__(self, core, chunks): + self._core = core + self._chunks = chunks + + @property + def dtype(self): + return self._core.dtype + + @property + def shape(self): + return self._core.shape + +@da.extract_dense_array.register +def extract_dense_array_ChunkyBoi(x: _ChunkyBoi, subsets): + return da.extract_dense_array(x._core, subsets) + +@da.chunk_shape.register +def chunk_shape_ChunkyBoi(x: _ChunkyBoi): + return x._chunks + + +def test_delayed_array_custom_chunks(): + # Chunky boi (I) + x = numpy.random.rand(100, 20, 30) + y = da.wrap(_ChunkyBoi(x, (10, 10, 10))) + dir = os.path.join(mkdtemp(), "foobar") + dl.save_object(y, dir, dense_array_buffer_size=20 * 5000, dense_array_chunk_dimensions=(5, 20, 5)) + + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert (numpy.array(roundtrip) == x).all() + + # Chunky boi (II) + y = da.wrap(_ChunkyBoi(x, (1, 1, x.shape[2]))) + dir = os.path.join(mkdtemp(), "foobar2") + dl.save_object(y, dir, dense_array_buffer_size=20 * 5000, dense_array_chunk_dimensions=(5, 1, 10)) + + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert (numpy.array(roundtrip) == x).all() + + +def test_delayed_array_low_block_size_C_contiguous(): + x = numpy.random.rand(100, 200) + y = da.wrap(x) + 1 + dir = os.path.join(mkdtemp(), "foobar") + dl.save_object(y, dir, dense_array_buffer_size=8 * 1000) + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert (numpy.array(roundtrip) == x + 1).all() + + +def test_delayed_array_low_block_size_F_contiguous(): + x = numpy.asfortranarray(numpy.random.rand(100, 200)) + y = da.wrap(x) + 1 + dir = os.path.join(mkdtemp(), "foobar") + dl.save_object(y, dir, dense_array_buffer_size=8 * 1000) + roundtrip = dm.read_dense_array(dir) + 
assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert (numpy.array(roundtrip) == x + 1).all() + + +######################################################## +######################################################## + + +def test_delayed_array_sparse(): + x = scipy.sparse.random(1000, 200, 0.1).tocsc() + y = da.wrap(x) * 10 + + dir = os.path.join(mkdtemp(), "foobar") + dl.save_object(y, dir) + roundtrip = dm.read_compressed_sparse_matrix(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == y.dtype + assert isinstance(roundtrip, dm.ReloadedArray) + assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5CompressedSparseMatrixSeed) + assert (numpy.array(roundtrip) == x.toarray() * 10).all() diff --git a/tests/test_dense_array.py b/tests/test_dense_array.py new file mode 100644 index 0000000..2f9cf53 --- /dev/null +++ b/tests/test_dense_array.py @@ -0,0 +1,101 @@ +import numpy +import random +from dolomite_base import save_object +import dolomite_matrix as dm +import os +import h5py +import filebackedarray +import delayedarray +from tempfile import mkdtemp + + +def test_dense_array_number(): + y = numpy.random.rand(100, 200) + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.floating) + assert isinstance(roundtrip, dm.ReloadedArray) + assert isinstance(roundtrip.seed.seed, filebackedarray.Hdf5DenseArraySeed) + assert (numpy.array(roundtrip) == y).all() + + +def test_dense_array_integer(): + y = numpy.random.rand(150, 250) * 10 + y = y.astype(numpy.int32) + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.integer) + assert (numpy.array(roundtrip) == y).all() + + +def test_dense_array_boolean(): + y = numpy.random.rand(99, 75) > 0 + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert roundtrip.dtype == numpy.bool_ + assert (numpy.array(roundtrip) == y).all() + + +def test_dense_array_string(): + y = numpy.array(["Sumire", "Kanon", "Chisato", "Ren", "Keke"]) + dir = os.path.join(mkdtemp(), "foobar") + save_object(y, dir) + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == y.shape + assert numpy.issubdtype(roundtrip.dtype, numpy.str_) + assert (numpy.array(roundtrip) == y).all() + + +######################################################## +######################################################## + + +#def test_dense_array_number_mask(): +# y0 = numpy.random.rand(100, 200) +# mask = y0 > 0.9 +# y = numpy.ma.MaskedArray(y0, mask=mask) +# +# dir = os.path.join(mkdtemp(), "foobar") +# save_object(y, dir) +# roundtrip = dm.read_dense_array(dir) +# assert roundtrip.shape == y.shape +# assert numpy.issubdtype(roundtrip.dtype, numpy.floating) +# +# dump = delayedarray.extract_dense_array(roundtrip) +# assert (dump.mask == mask).all() +# assert (dump == y).all() + + +######################################################## +######################################################## + + +def test_dense_array_F_contiguous(): + x = numpy.asfortranarray(numpy.random.rand(100, 200)) + dir = os.path.join(mkdtemp(), "foobar") + save_object(x, dir) + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == x.shape + assert roundtrip.dtype == x.dtype + assert (numpy.array(roundtrip) == 
x).all() + + +def test_dense_array_block_size(): + x = numpy.ndarray([100, 200], dtype="U1") + choices = "ABCDE" + for i in range(x.shape[0]): + for j in range(x.shape[1]): + x[i,j] = random.choice(choices) + + dir = os.path.join(mkdtemp(), "foobar") + save_object(x, dir, dense_array_buffer_size= x.dtype.itemsize * 1000) + roundtrip = dm.read_dense_array(dir) + assert roundtrip.shape == x.shape + assert roundtrip.dtype == x.dtype + assert (numpy.array(roundtrip) == x).all() diff --git a/tests/test_optimize_storage.py b/tests/test_optimize_storage.py new file mode 100644 index 0000000..a2e978b --- /dev/null +++ b/tests/test_optimize_storage.py @@ -0,0 +1,660 @@ +import dolomite_matrix._optimize_storage as optim +import numpy +import delayedarray + + +################################################### +################################################### + + +def test_optimize_integer_storage_dense(): + # Unsigned integers + y = numpy.array([1,2,3]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "u1" + assert opt.placeholder is None + + y = numpy.array([1,2,300]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "u2" + assert opt.placeholder is None + + y = numpy.array([1,2,300000]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i4" + assert opt.placeholder is None + + y = numpy.array([1,2,3000000000]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "f8" + assert opt.placeholder is None + + # Signed integers + y = numpy.array([-1,2,3]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + y = numpy.array([-1,2,200]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i2" + assert opt.placeholder is None + + y = numpy.array([-1,2,200000]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i4" + assert opt.placeholder is None + + y = numpy.array([-1,2,-20000000000]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "f8" + assert opt.placeholder is None + + # Empty + y = numpy.array([], dtype=numpy.int32) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + +def test_optimize_integer_storage_dense_MaskedArray(): + # Unsigned integers + y = numpy.ma.MaskedArray(numpy.array([1,2,3]), mask=[True, False, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "u1" + assert opt.placeholder == 2**8 - 1 + + y = numpy.ma.MaskedArray(numpy.array([1,2,300]), mask=[True, False, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "u2" + assert opt.placeholder == 2**16 - 1 + + y = numpy.ma.MaskedArray(numpy.array([1,2,3000000]), mask=[True, False, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i4" + assert opt.placeholder == 2**31 - 1 + + y = numpy.ma.MaskedArray(numpy.array([1,2,3000000000]), mask=[True, False, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "f8" + assert numpy.isnan(opt.placeholder) + + # Signed integers + y = numpy.ma.MaskedArray(numpy.array([-1,2,3]), mask=[False, True, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i1" + assert opt.placeholder == -2**7 + + y = numpy.ma.MaskedArray(numpy.array([-1,2,200]), mask=[False, True, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i2" + assert opt.placeholder == -2**15 + + y = numpy.ma.MaskedArray(numpy.array([-1,2,200000]), mask=[False, True, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i4" 
+ assert opt.placeholder == -2**31 + + y = numpy.ma.MaskedArray(numpy.array([-1,2,200000000000]), mask=[False, True, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "f8" + assert numpy.isnan(opt.placeholder) + + # Masked large values. + y = numpy.ma.MaskedArray(numpy.array([1000,2,3]), mask=[True, False, False]) + opt = optim.optimize_integer_storage(y) + assert opt.type == "u1" + assert opt.placeholder == 2**8 - 1 + + # Masked but no op. + y = numpy.ma.MaskedArray(numpy.array([1000,2,3]), mask=False) + opt = optim.optimize_integer_storage(y) + assert opt.type == "u2" + assert opt.placeholder is None + + # Fully masked. + y = numpy.ma.MaskedArray([1,2,3], mask=True) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i1" + assert opt.placeholder == -128 + + +def test_optimize_integer_storage_Sparse2darray(): + y = delayedarray.SparseNdarray([10, 5], None, dtype=numpy.int32, index_dtype=numpy.int8) + opt = optim.optimize_integer_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + y = delayedarray.SparseNdarray( + [10, 5], + [ + None, + (numpy.array([0, 8]), numpy.array([1, 20])), + None, + (numpy.array([2, 9]), numpy.array([0, 5000])), + None + ] + ) + + opt = optim.optimize_integer_storage(y) + assert opt.type == "u2" + assert opt.placeholder is None + assert opt.non_zero == 4 + + y = delayedarray.SparseNdarray( + [10, 5], + [ + None, + (numpy.array([0, 8]), numpy.ma.MaskedArray(numpy.array([1, 20]), mask=True)), + None, + (numpy.array([1, 7, 9]), numpy.ma.MaskedArray(numpy.array([-1, -1000, 500000]), mask=False)), + None + ] + ) + + opt = optim.optimize_integer_storage(y) + assert opt.type == "i4" + assert opt.placeholder == -2**31 + assert opt.non_zero == 5 + + +def test_optimize_integer_storage_scipy(): + import scipy + y = scipy.sparse.coo_matrix( + ( + [1,-200,3,-4,500], + ( + [1,2,3,4,5], + [1,2,3,4,5] + ) + ), + [10, 10] + ) + + opt = optim.optimize_integer_storage(y) + assert opt.type == "i2" + assert opt.placeholder is None + assert opt.non_zero == 5 + + opt = optim.optimize_integer_storage(y.tocsc()) + assert opt.type == "i2" + assert opt.placeholder is None + assert opt.non_zero == 5 + + opt = optim.optimize_integer_storage(y.tocsr()) + assert opt.type == "i2" + assert opt.placeholder is None + assert opt.non_zero == 5 + + +def test_optimize_integer_storage_Any(): + y = delayedarray.DelayedArray(numpy.array([[1,2,3],[4,5,6]])) + opt = optim.optimize_integer_storage(y * 200000) + assert opt.type == "i4" + assert opt.placeholder is None + + y = delayedarray.SparseNdarray( + [10, 5], + [ + None, + (numpy.array([0, 8]), numpy.array([1, 20])), + None, + (numpy.array([2, 9]), numpy.array([0, 5000])), + None + ] + ) + y = delayedarray.DelayedArray(y) + opt = optim.optimize_integer_storage(y * 2) + assert opt.type == "u2" + assert opt.placeholder is None + + +################################################### +################################################### + + +def test_optimize_float_storage_dense(): + # Unsigned integers. 
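+    # (Float data that only holds integer values is stored with the smallest
+    # sufficient integer type.)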
+ y = numpy.array([1.0,2,3]) + opt = optim.optimize_float_storage(y) + assert opt.type == "u1" + assert opt.placeholder is None + + y = numpy.array([1.0,2,300]) + opt = optim.optimize_float_storage(y) + assert opt.type == "u2" + assert opt.placeholder is None + + y = numpy.array([1.0,2,300000]) + opt = optim.optimize_float_storage(y) + assert opt.type == "u4" + assert opt.placeholder is None + + y = numpy.array([1.0,2,30000000000]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder is None + + # Signed integers. + y = numpy.array([-1.0,2,3]) + opt = optim.optimize_float_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + y = numpy.array([-1.0,2,200]) + opt = optim.optimize_float_storage(y) + assert opt.type == "i2" + assert opt.placeholder is None + + y = numpy.array([-1.0,2,200000]) + opt = optim.optimize_float_storage(y) + assert opt.type == "i4" + assert opt.placeholder is None + + y = numpy.array([-1.0,2,-20000000000]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder is None + + # Empty + y = numpy.array([], dtype=numpy.float64) + opt = optim.optimize_float_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + # Actual floating point values. + y = numpy.array([-1.5,2,3]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder is None + + y = numpy.array([numpy.NaN,2,3]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder is None + + y = numpy.array([numpy.inf,2,3]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder is None + + # 32-bit floating point values. + y = numpy.array([-1.5,2,3], dtype=numpy.float32) + opt = optim.optimize_float_storage(y) + assert opt.type == "f4" + assert opt.placeholder is None + + +def test_optimize_float_storage_dense_MaskedArray(): + # Unsigned floats + y = numpy.ma.MaskedArray(numpy.array([1.0,2,3]), mask=[True, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "u1" + assert opt.placeholder == 2**8 - 1 + + y = numpy.ma.MaskedArray(numpy.array([1.0,2,300]), mask=[True, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "u2" + assert opt.placeholder == 2**16 - 1 + + y = numpy.ma.MaskedArray(numpy.array([1.0,2,3000000]), mask=[True, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "u4" + assert opt.placeholder == 2**32 - 1 + + y = numpy.ma.MaskedArray(numpy.array([1.0,2,30000000000]), mask=[True, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert numpy.isnan(opt.placeholder) + + # Signed floats + y = numpy.ma.MaskedArray(numpy.array([-1.0,2,3]), mask=[False, True, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "i1" + assert opt.placeholder == -2**7 + + y = numpy.ma.MaskedArray(numpy.array([-1.0,2,200]), mask=[False, True, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "i2" + assert opt.placeholder == -2**15 + + y = numpy.ma.MaskedArray(numpy.array([-1.0,2,200000]), mask=[False, True, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "i4" + assert opt.placeholder == -2**31 + + y = numpy.ma.MaskedArray(numpy.array([-1.0,2,200000000000]), mask=[False, True, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert numpy.isnan(opt.placeholder) + + # Masked large values. 
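+    # (Masked entries are excluded when choosing the storage range, so the
+    # masked 1000.0 does not force a wider type.)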
+ y = numpy.ma.MaskedArray(numpy.array([1000.0,2,3]), mask=[True, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "u1" + assert opt.placeholder == 2**8 - 1 + + # Masked but no op. + y = numpy.ma.MaskedArray(numpy.array([1000.0,2,3]), mask=False) + opt = optim.optimize_float_storage(y) + assert opt.type == "u2" + assert opt.placeholder is None + + # Fully masked. + y = numpy.ma.MaskedArray([1.0,2,3], mask=True) + opt = optim.optimize_float_storage(y) + assert opt.type == "i1" + assert opt.placeholder == -128 + + # Actual floating point values. + y = numpy.ma.MaskedArray([-1.5,2,3], mask=[False, True, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert numpy.isnan(opt.placeholder) + + y = numpy.ma.MaskedArray([numpy.NaN,2,3], mask=[False, True, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder == numpy.inf + + y = numpy.ma.MaskedArray([numpy.NaN,2,numpy.inf], mask=[False, True, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder == -numpy.inf + + fstats = numpy.finfo(numpy.float64) + y = numpy.ma.MaskedArray([numpy.NaN, 2, numpy.inf, -numpy.inf], mask=[False, True, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder == fstats.min + + y = numpy.ma.MaskedArray([numpy.NaN, 2, numpy.inf, -numpy.inf, fstats.min], mask=[False, True, False, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder == fstats.max + + y = numpy.ma.MaskedArray([numpy.NaN, 2, numpy.inf, -numpy.inf, fstats.min, fstats.max], mask=[False, True, False, False, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder == 0 + + y = numpy.ma.MaskedArray([numpy.NaN, 2, numpy.inf, -numpy.inf, fstats.min, fstats.max, 0], mask=[False, True, False, False, False, False, False]) + opt = optim.optimize_float_storage(y) + assert opt.type == "f8" + assert opt.placeholder == fstats.min / 2 + + # 32-bit floating point values. 
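+    # (float32 inputs keep their "f4" storage type, with NaN as the placeholder.)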
+    y = numpy.ma.MaskedArray([-1.5,2.2,3], mask=[True, False, False], dtype=numpy.float32)
+    opt = optim.optimize_float_storage(y)
+    assert opt.type == "f4"
+    assert numpy.isnan(opt.placeholder)
+
+
+def test_optimize_float_storage_Sparse2darray():
+    y = delayedarray.SparseNdarray([10, 5], None, dtype=numpy.float32, index_dtype=numpy.int8)
+    opt = optim.optimize_float_storage(y)
+    assert opt.type == "i1"
+    assert opt.placeholder is None
+
+    y = delayedarray.SparseNdarray(
+        [10, 5],
+        [
+            None,
+            (numpy.array([0, 8]), numpy.array([1.0, 20])),
+            None,
+            (numpy.array([2, 9]), numpy.array([0, 5000.5])),
+            None
+        ]
+    )
+
+    opt = optim.optimize_float_storage(y)
+    assert opt.type == "f8"
+    assert opt.placeholder is None
+    assert opt.non_zero == 4
+
+    y = delayedarray.SparseNdarray(
+        [10, 5],
+        [
+            None,
+            (numpy.array([0, 8]), numpy.ma.MaskedArray(numpy.array([1, 2.0]), mask=True)),
+            None,
+            (numpy.array([1, 7, 9]), numpy.ma.MaskedArray(numpy.array([-1.0, -1000, 500000]), mask=False)),
+            None
+        ]
+    )
+
+    opt = optim.optimize_float_storage(y)
+    assert opt.type == "i4"
+    assert opt.placeholder == -2**31
+    assert opt.non_zero == 5
+
+
+def test_optimize_float_storage_scipy():
+    import scipy
+    y = scipy.sparse.coo_matrix(
+        (
+            [1.0,-200.0,3,-4,500],
+            (
+                [1,2,3,4,5],
+                [1,2,3,4,5]
+            )
+        ),
+        [10, 10]
+    )
+    assert y.dtype == numpy.float64
+
+    opt = optim.optimize_float_storage(y)
+    assert opt.type == "i2"
+    assert opt.placeholder is None
+    assert opt.non_zero == 5
+
+    opt = optim.optimize_float_storage(y.tocsc())
+    assert opt.type == "i2"
+    assert opt.placeholder is None
+    assert opt.non_zero == 5
+
+    opt = optim.optimize_float_storage(y.tocsr())
+    assert opt.type == "i2"
+    assert opt.placeholder is None
+    assert opt.non_zero == 5
+
+
+def test_optimize_float_storage_Any():
+    y = delayedarray.DelayedArray(numpy.array([[1,2,3],[4,5,6]]))
+    y = y * 20000.000
+    assert y.dtype == numpy.float64
+
+    opt = optim.optimize_float_storage(y)
+    assert opt.type == "u4"
+    assert opt.placeholder is None
+
+    y = delayedarray.SparseNdarray(
+        [10, 5],
+        [
+            None,
+            (numpy.array([0, 8]), numpy.array([1, 2.0])),
+            None,
+            (numpy.array([2, 9]), numpy.array([0, 500.0])),
+            None
+        ]
+    )
+    y = delayedarray.DelayedArray(y)
+    opt = optim.optimize_float_storage(y * 2)
+    assert opt.type == "u2"
+    assert opt.placeholder is None
+
+
+###################################################
+###################################################
+
+
+def test_optimize_string_storage_dense():
+    y = numpy.array(["A","BB","CCC"])
+    opt = optim.optimize_string_storage(y)
+    assert opt.type == "S3"
+    assert opt.placeholder is None
+
+    # Empty
+    y = numpy.array([], dtype=numpy.str_)
+    opt = optim.optimize_string_storage(y)
+    assert opt.type == "S1"
+    assert opt.placeholder is None
+
+
+def test_optimize_string_storage_dense_MaskedArray():
+    y = numpy.ma.MaskedArray(["A","BB","CCC"], mask=[True,False,False])
+    opt = optim.optimize_string_storage(y)
+    assert opt.type == "S3"
+    assert opt.placeholder == "NA"
+
+    y = numpy.ma.MaskedArray(["A","NA","CCC"], mask=[True,False,False])
+    opt = optim.optimize_string_storage(y)
+    assert opt.type == "S3"
+    assert opt.placeholder == "NA_"
+
+    y = numpy.ma.MaskedArray(["A","NA","NA_","CCC"], mask=[True,False,False,False])
+    opt = optim.optimize_string_storage(y)
+    assert opt.type == "S4"
+    assert opt.placeholder == "NA__"
+
+    # Masked again; the placeholder defaults to "NA", extended with underscores
+    # until it no longer collides with any observed string.
+ y = numpy.ma.MaskedArray(["A","BB","CCC"], mask=[True,False,False]) + opt = optim.optimize_string_storage(y) + assert opt.type == "S3" + assert opt.placeholder == "NA" + + +def test_optimize_string_storage_Any(): + y = delayedarray.DelayedArray(numpy.array([["A","BB","CCC"],["DDDD","EEEEE","FFFFFF"]])) + opt = optim.optimize_string_storage(y) + assert opt.type == "S6" + assert opt.placeholder is None + + +################################################### +################################################### + + +def test_optimize_boolean_storage_dense(): + y = numpy.array([True,False,True]) + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + # Empty + y = numpy.array([], dtype=numpy.bool_) + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + +def test_optimize_boolean_storage_dense_MaskedArray(): + # Unsigned booleans + y = numpy.ma.MaskedArray(numpy.array([True,False,True]), mask=[True, False, False]) + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder == -1 + + # Masked but no op. + y = numpy.ma.MaskedArray(numpy.array([True,False,True]), mask=False) + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + # Fully masked. + y = numpy.ma.MaskedArray([True,False,False], mask=True) + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder == -1 + + +def test_optimize_boolean_storage_Sparse2darray(): + y = delayedarray.SparseNdarray([10, 5], None, dtype=numpy.bool_, index_dtype=numpy.int8) + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + y = delayedarray.SparseNdarray( + [10, 5], + [ + None, + (numpy.array([0, 8]), numpy.array([True, False])), + None, + (numpy.array([2, 9]), numpy.array([False, True])), + None + ] + ) + + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + assert opt.non_zero == 4 + + y = delayedarray.SparseNdarray( + [10, 5], + [ + None, + (numpy.array([0, 8]), numpy.ma.MaskedArray(numpy.array([True, False]), mask=True)), + None, + (numpy.array([1, 7, 9]), numpy.ma.MaskedArray(numpy.array([False, False, True]), mask=False)), + None + ] + ) + + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder == -1 + assert opt.non_zero == 5 + + +def test_optimize_boolean_storage_scipy(): + import scipy + y = scipy.sparse.coo_matrix( + ( + [True,False,True,False,True], + ( + [1,2,3,4,5], + [1,2,3,4,5] + ) + ), + [10, 10] + ) + + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + assert opt.non_zero == 5 + + opt = optim.optimize_boolean_storage(y.tocsc()) + assert opt.type == "i1" + assert opt.placeholder is None + assert opt.non_zero == 5 + + opt = optim.optimize_boolean_storage(y.tocsr()) + assert opt.type == "i1" + assert opt.placeholder is None + assert opt.non_zero == 5 + + +def test_optimize_boolean_storage_Any(): + y = delayedarray.DelayedArray(numpy.array([[True,False,True],[False,True,False]])) + opt = optim.optimize_boolean_storage(y) + assert opt.type == "i1" + assert opt.placeholder is None + + y = delayedarray.SparseNdarray( + [10, 5], + [ + None, + (numpy.array([0, 8]), numpy.array([True, False])), + None, + (numpy.array([2, 9]), numpy.array([False, True])), + None + ] + ) + y = delayedarray.DelayedArray(y) + opt = optim.optimize_boolean_storage(y) + assert 
opt.type == "i1" + assert opt.placeholder is None + + diff --git a/tests/test_stage_DelayedArray.py b/tests/test_stage_DelayedArray.py deleted file mode 100644 index 2878de8..0000000 --- a/tests/test_stage_DelayedArray.py +++ /dev/null @@ -1,178 +0,0 @@ -import numpy -from dolomite_base import stage_object, write_metadata, load_object -from delayedarray import wrap -import delayedarray as da -import dolomite_matrix -from dolomite_matrix._utils import _choose_block_shape -import os -import h5py -import filebackedarray -from tempfile import mkdtemp -import scipy.sparse - - -def test_stage_DelayedArray_simple(): - x = numpy.random.rand(100, 200) - y = wrap(x) + 1 - assert isinstance(y, da.DelayedArray) - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "number" - - fpath = os.path.join(dir, meta["path"]) - handle = h5py.File(fpath, "r") - dset = handle[meta["hdf5_dense_array"]["dataset"]] - - copy = dset[:].T - assert copy.dtype == y.dtype - assert (copy == x + 1).all() - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == x + 1).all() - - -def test_stage_DelayedArray_booleans(): - x1 = numpy.random.rand(100, 200) > 0 - x2 = numpy.random.rand(100, 200) > 0 - y = numpy.logical_and(wrap(x1), wrap(x2)) - assert isinstance(y, da.DelayedArray) - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "boolean" - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == numpy.bool_ - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == numpy.logical_and(x1, x2)).all() - - -######################################################## -######################################################## - - -class _ChunkyBoi: - def __init__(self, core, chunks): - self._core = core - self._chunks = chunks - - @property - def dtype(self): - return self._core.dtype - - @property - def shape(self): - return self._core.shape - -@da.extract_dense_array.register -def extract_dense_array_ChunkyBoi(x: _ChunkyBoi, subsets): - return da.extract_dense_array(x._core, subsets) - -@da.chunk_shape.register -def chunk_shape_ChunkyBoi(x: _ChunkyBoi): - return x._chunks - - -######################################################## -######################################################## - - -def test_stage_DelayedArray_choose_block_shape(): - y = _ChunkyBoi(numpy.random.rand(100, 200), (10, 10)) - assert _choose_block_shape(y, 2000 * 8) == (10, 200) - assert _choose_block_shape(y, 5000 * 8) == (20, 200) - - y = _ChunkyBoi(numpy.random.rand(100, 200), (100, 10)) - assert _choose_block_shape(y, 2000 * 8) == (100, 20) - assert _choose_block_shape(y, 5000 * 8) == (100, 50) - - y = _ChunkyBoi(numpy.random.rand(100, 200, 300), (10, 10, 10)) - assert _choose_block_shape(y, 2000 * 8) == (10, 10, 20) - - y = _ChunkyBoi(numpy.random.rand(100, 200, 300), (1, 1, 300)) - assert _choose_block_shape(y, 5000 * 8) == (1, 16, 300) - - -def test_stage_DelayedArray_low_block_size_C_contiguous(): - x = numpy.random.rand(100, 200) - y = wrap(x) + 1 - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar", block_size=8000) - - fpath = os.path.join(dir, meta["path"]) - handle = h5py.File(fpath, "r") - dset = 
handle[meta["hdf5_dense_array"]["dataset"]] - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == x + 1).all() - - -def test_stage_DelayedArray_low_block_size_F_contiguous(): - x = numpy.asfortranarray(numpy.random.rand(100, 200)) - y = wrap(x) + 1 - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar", block_size=8000) - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == x + 1).all() - - -def test_stage_DelayedArray_custom_chunks(): - # Chunky boi (I) - x = numpy.random.rand(100, 200, 300) - - y = wrap(_ChunkyBoi(x, (10, 10, 10))) - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar", block_size=8 * 5000) - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == x).all() - - # Chunky boi (II) - y = wrap(_ChunkyBoi(x, (1, 1, x.shape[2]))) - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar", block_size=8 * 5000) - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == x).all() - - -######################################################## -######################################################## - - -def test_stage_DelayedArray_sparse(): - x = scipy.sparse.random(1000, 200, 0.1).tocsc() - y = wrap(x) * 10 - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "number" - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5CompressedSparseMatrix) - assert (numpy.array(roundtrip) == x.toarray() * 10).all() diff --git a/tests/test_stage_ndarray.py b/tests/test_stage_ndarray.py deleted file mode 100644 index 80c150a..0000000 --- a/tests/test_stage_ndarray.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy -from dolomite_base import stage_object, write_metadata, load_object -import dolomite_matrix -import os -import h5py -import filebackedarray -from tempfile import mkdtemp - - -def test_stage_ndarray_number(): - y = numpy.random.rand(100, 200) - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "number" - - fpath = os.path.join(dir, meta["path"]) - handle = h5py.File(fpath, "r") - dset = handle[meta["hdf5_dense_array"]["dataset"]] - - assert dset.shape[0] == 200 # transposed, as expected. 
- assert dset.shape[1] == 100 - assert dset.dtype == numpy.float64 - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == y).all() - - -def test_stage_ndarray_integer(): - y = numpy.random.rand(150, 250) * 10 - y = y.astype(numpy.int32) - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "integer" - - fpath = os.path.join(dir, meta["path"]) - handle = h5py.File(fpath, "r") - dset = handle[meta["hdf5_dense_array"]["dataset"]] - - assert dset.shape[0] == 250 # transposed, as expected. - assert dset.shape[1] == 150 - assert dset.dtype == numpy.int32 - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == y).all() - - -def test_stage_ndarray_boolean(): - y = numpy.random.rand(99, 75) > 0 - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "boolean" - - fpath = os.path.join(dir, meta["path"]) - handle = h5py.File(fpath, "r") - dset = handle[meta["hdf5_dense_array"]["dataset"]] - - assert dset.shape[0] == 75 # transposed, as expected. - assert dset.shape[1] == 99 - assert dset.dtype == numpy.uint8 - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5DenseArray) - assert (numpy.array(roundtrip) == y).all() diff --git a/tests/test_stage_sparse.py b/tests/test_stage_sparse.py deleted file mode 100644 index 67fc709..0000000 --- a/tests/test_stage_sparse.py +++ /dev/null @@ -1,54 +0,0 @@ -import scipy.sparse -from dolomite_base import stage_object, write_metadata, load_object -import dolomite_matrix -from tempfile import mkdtemp -import filebackedarray -import numpy - - -def test_stage_scipy_csc(): - y = scipy.sparse.random(1000, 200, 0.1).tocsc() - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "number" - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5CompressedSparseMatrix) - assert (numpy.array(roundtrip) == y.toarray()).all() - - -def test_stage_scipy_csr(): - y = scipy.sparse.random(1000, 200, 0.1) * 10 - y = y.astype(numpy.int32) - y = y.tocsr() - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "integer" - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert numpy.issubdtype(roundtrip.dtype, numpy.integer) - assert isinstance(roundtrip, filebackedarray.Hdf5CompressedSparseMatrix) - assert (numpy.array(roundtrip) == y.toarray()).all() - - -def test_stage_ndarray_boolean(): - y = scipy.sparse.random(1000, 200, 0.1) > 0 - y = y.tocoo() - - dir = mkdtemp() - meta = stage_object(y, dir=dir, path="foobar") - write_metadata(meta, dir=dir) - assert meta["array"]["type"] == "boolean" - - roundtrip = load_object(meta, dir) - assert roundtrip.shape == y.shape - assert roundtrip.dtype == y.dtype - assert isinstance(roundtrip, filebackedarray.Hdf5CompressedSparseMatrix) - assert (numpy.array(roundtrip) == y.toarray()).all() 
diff --git a/tests/test_write_sparse_matrix.py b/tests/test_write_sparse_matrix.py deleted file mode 100644 index c010094..0000000 --- a/tests/test_write_sparse_matrix.py +++ /dev/null @@ -1,134 +0,0 @@ -import scipy.sparse -import dolomite_matrix -from tempfile import mkstemp -from filebackedarray import Hdf5CompressedSparseMatrix -from delayedarray import wrap, Combine -import delayedarray -import h5py -import numpy - - -def test_write_sparse_matrix_scipy_csc(): - out = scipy.sparse.random(1000, 200, 0.1).tocsc() - _, fpath = mkstemp(suffix=".h5") - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - - with h5py.File(fpath, "r") as handle: - dset = handle["foo/data"] - assert dset.dtype == numpy.float64 - iset = handle["foo/indices"] - assert iset.dtype == numpy.uint16 - assert list(handle["foo/shape"][:]) == [1000, 200] - - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.float64 - assert (numpy.array(reloaded) == out.toarray()).all() - - -def test_write_sparse_matrix_scipy_other(): - out = scipy.sparse.random(1000, 200, 0.1).tocoo() - _, fpath = mkstemp(suffix=".h5") - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - - with h5py.File(fpath, "r") as handle: - dset = handle["foo/data"] - assert dset.dtype == numpy.float64 - iset = handle["foo/indices"] - assert iset.dtype == numpy.uint16 - assert list(handle["foo/shape"][:]) == [1000, 200] - - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.float64 - assert (numpy.array(reloaded) == out.toarray()).all() - - -def test_write_sparse_matrix_DelayedArray(): - raw = scipy.sparse.random(1000, 200, 0.1).tocsc() - out = wrap(raw) * 10 - _, fpath = mkstemp(suffix=".h5") - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - - with h5py.File(fpath, "r") as handle: - dset = handle["foo/data"] - assert dset.dtype == numpy.float64 - iset = handle["foo/indices"] - assert iset.dtype == numpy.uint16 - assert list(handle["foo/shape"][:]) == [1000, 200] - - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.float64 - assert (numpy.array(reloaded) == raw.toarray() * 10).all() - - # Trying again with a pristine object. - out = wrap(raw) - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.float64 - assert (numpy.array(reloaded) == raw.toarray()).all() - - -def test_write_sparse_matrix_Combine(): - raw1 = scipy.sparse.random(1000, 200, 0.1).tocsc() - raw2 = scipy.sparse.random(1000, 100, 0.1).tocsc() - out = numpy.concatenate([wrap(raw1), wrap(raw2)], axis=1) - assert isinstance(out.seed, Combine) - - _, fpath = mkstemp(suffix=".h5") - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 300), by_column=True) - assert reloaded.dtype == numpy.float64 - assert (numpy.array(reloaded) == numpy.concatenate([raw1.toarray(), raw2.toarray()], axis=1)).all() - - # Now combining it in the other dimension. 
- raw2b = scipy.sparse.random(150, 200, 0.1).tocsc() - out = numpy.concatenate([wrap(raw1), wrap(raw2b)], axis=0) - - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1150, 200), by_column=True) - assert reloaded.dtype == numpy.float64 - assert (numpy.array(reloaded) == numpy.concatenate([raw1.toarray(), raw2b.toarray()], axis=0)).all() - - -def test_write_sparse_matrix_type_guess(): - raw = scipy.sparse.random(1000, 200, 0.1) - raw.data *= (-1)**(numpy.random.rand(len(raw.data)) > 0.5) - - # Small signed integer. - out = numpy.round(raw * 10) - out = out.tocsc() - _, fpath = mkstemp(suffix=".h5") - - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.int8 - assert (numpy.array(reloaded) == out.toarray()).all() - - # Trying a larger integer. - out = numpy.round(raw * 10000) - out = out.tocsc() - - _, fpath = mkstemp(suffix=".h5") - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.int16 - assert (numpy.array(reloaded) == out.toarray()).all() - - # Checking the unsigned choices. - out = numpy.round(numpy.abs(raw) * 2**15) - out = out.tocsc() - - _, fpath = mkstemp(suffix=".h5") - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.uint16 - assert (numpy.array(reloaded) == out.toarray()).all() - - # Ramping up to 32 bits. - out = numpy.round(raw * 2**30) - out = out.tocsc() - - _, fpath = mkstemp(suffix=".h5") - dolomite_matrix.write_sparse_matrix(out, fpath, "foo") - reloaded = Hdf5CompressedSparseMatrix(fpath, "foo", shape=(1000, 200), by_column=True) - assert reloaded.dtype == numpy.int32 - assert (numpy.array(reloaded) == out.toarray()).all()