Skip to content

Commit

Permalink
Switch to the new array formats corresponding to takane validators in…
Browse files Browse the repository at this point in the history
… base. (#3)

This also includes (some of) the functionality in alabaster.matrix, namely the optimization
of the types and the blockwise processing of dense/sparse matrices.
  • Loading branch information
LTLA authored Jan 25, 2024
1 parent f596d9e commit db09526
Show file tree
Hide file tree
Showing 31 changed files with 2,841 additions and 1,282 deletions.
5 changes: 3 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ package_dir =
# For more information, check out https://semver.org/.
install_requires =
importlib-metadata; python_version<"3.8"
dolomite-base
dolomite-base>=0.2.0-alpha5
h5py
delayedarray>=0.3.2
delayedarray>=0.3.3
numpy
filebackedarray

Expand All @@ -71,6 +71,7 @@ testing =
pytest
pytest-cov
scipy
dask

[options.entry_points]
# Add here console scripts like:
Expand Down
136 changes: 136 additions & 0 deletions src/dolomite_matrix/DelayedMask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
from typing import Tuple, Optional, Sequence
import delayedarray
import numpy

class DelayedMask(delayedarray.DelayedOp):
    """
    Delayed operation that flags every occurrence of a missing-value
    placeholder in the seed, so that extracted blocks are returned as NumPy
    masked arrays wherever a placeholder is present.
    """

    def __init__(self, seed, placeholder, dtype: Optional[numpy.dtype] = None):
        """
        Args:
            seed:
                Any object that satisfies the seed contract,
                see :py:class:`~delayedarray.DelayedArray.DelayedArray` for details.

            placeholder:
                Value that marks a missing entry. It should be of the same
                type as ``seed.dtype`` (or coercible into that type); every
                entry equal to it is treated as missing.

            dtype:
                Type of the masked output. If ``None``, ``seed.dtype`` is used.
        """
        self._seed = seed
        seed_dtype = seed.dtype

        # A ``bytes`` placeholder paired with a string seed is decoded as
        # UTF-8; everything else is coerced to the seed's scalar type.
        needs_decoding = isinstance(placeholder, bytes) and numpy.issubdtype(seed_dtype, numpy.str_)
        if needs_decoding:
            normalized = numpy.str_(placeholder.decode("UTF8"))
        else:
            normalized = seed_dtype.type(placeholder)
        self._placeholder = normalized

        self._dtype = seed_dtype if dtype is None else dtype

    @property
    def shape(self) -> Tuple[int, ...]:
        """
        Returns:
            Extent of each dimension of this object, identical to the shape
            of the ``seed``.
        """
        return self._seed.shape

    @property
    def dtype(self) -> numpy.dtype:
        """
        Returns:
            NumPy type of the contents once masking has been applied.
        """
        return self._dtype

    @property
    def seed(self):
        """
        Returns:
            The seed object that is being masked.
        """
        return self._seed

    @property
    def placeholder(self):
        """
        Returns:
            The placeholder value, after coercion in the constructor.
        """
        return self._placeholder


def _create_mask(x: numpy.ndarray, placeholder):
if numpy.issubdtype(placeholder.dtype, numpy.floating) and numpy.isnan(placeholder):
return numpy.isnan(x)
else:
return (x == placeholder)


@delayedarray.extract_dense_array.register
def extract_dense_array_DelayedMask(x: DelayedMask, subset: Optional[Tuple[Sequence[int], ...]] = None):
    """See :py:meth:`~delayedarray.extract_dense_array.extract_dense_array`."""
    raw = delayedarray.extract_dense_array(x._seed, subset)

    # The mask is computed on the raw values, before any type coercion,
    # because the placeholder is expressed in the seed's own type.
    missing = _create_mask(raw, x._placeholder)
    converted = raw.astype(x._dtype, copy=False)

    # Only wrap in a masked array when at least one entry is actually missing.
    if not missing.any():
        return converted
    return numpy.ma.MaskedArray(converted, mask=missing)


def _mask_SparseNdarray(contents, placeholder, dtype):
if not isinstance(contents, list):
indices, values = contents
mask = _create_mask(values, placeholder) # do this before type coercion, again.
values = values.astype(dtype, copy=False)
if mask.any():
values = numpy.ma.MaskedArray(values, mask=mask)
return indices, values
else:
output = []
for val in contents:
if val is None:
output.append(val)
else:
output.append(_mask_SparseNdarray(val, placeholder, dtype))
return output


@delayedarray.extract_sparse_array.register
def extract_sparse_array_DelayedMask(x: DelayedMask, subset: Optional[Tuple[Sequence[int], ...]] = None):
    """See :py:meth:`~delayedarray.extract_sparse_array.extract_sparse_array`."""
    out = delayedarray.extract_sparse_array(x._seed, subset)
    contents = out.contents
    if contents is not None:
        contents = _mask_SparseNdarray(contents, x._placeholder, x._dtype)

    # Use the shape of the extracted block rather than ``x.shape``: when a
    # subset is requested the block is smaller than the full array, and
    # ``check=False`` would otherwise let the mismatch go unnoticed.
    return delayedarray.SparseNdarray(
        out.shape,
        contents,
        dtype=x._dtype,
        index_dtype=out.index_dtype,
        check=False,
    )


@delayedarray.create_dask_array.register
def create_dask_array_DelayedMask(x: DelayedMask):
    """See :py:meth:`~delayedarray.create_dask_array.create_dask_array`."""
    import dask.array

    target = delayedarray.create_dask_array(x._seed)

    # Reuse the same masking rule as the dense/sparse extractors so that a NaN
    # placeholder is detected via isnan(); a plain ``==`` comparison never
    # matches NaN. As before, the mask is computed before type coercion.
    mask = _create_mask(target, x._placeholder)
    target = target.astype(x._dtype)
    return dask.array.ma.masked_array(target, mask=mask)


@delayedarray.chunk_shape.register
def chunk_shape_DelayedMask(x: DelayedMask):
    """See :py:meth:`~delayedarray.chunk_shape.chunk_shape`."""
    # The chunk shape is simply that of the underlying seed.
    underlying = x._seed
    return delayedarray.chunk_shape(underlying)


@delayedarray.is_sparse.register
def is_sparse_DelayedMask(x: DelayedMask):
    """See :py:meth:`~delayedarray.is_sparse.is_sparse`."""
    # Sparsity is reported as whatever the underlying seed reports.
    underlying = x._seed
    return delayedarray.is_sparse(underlying)

124 changes: 124 additions & 0 deletions src/dolomite_matrix/ReloadedArray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import delayedarray
from dolomite_base import save_object
import os
import shutil

from .WrapperArraySeed import WrapperArraySeed
from .save_compressed_sparse_matrix import _save_compressed_sparse_matrix
from .save_dense_array import _save_dense_array


class ReloadedArraySeed(WrapperArraySeed):
    """
    Seed for the :py:class:`~ReloadedArray` class. It extends
    :py:class:`~dolomite_matrix.WrapperArraySeed.WrapperArraySeed` by also
    storing the path of the on-disk representation.
    """

    def __init__(self, seed, path: str):
        """
        Args:
            seed: The contents of the reloaded array.

            path: Path to the directory containing the on-disk representation.
        """
        super().__init__(seed)
        self._path = path

    @property
    def path(self) -> str:
        """
        Returns:
            Path to the directory containing the on-disk representation.
        """
        return self._path


class ReloadedArray(delayedarray.DelayedArray):
    """
    An array that was reloaded from disk by the
    :py:func:`~dolomite_base.read_object.read_object` function and that keeps
    track of the path it was loaded from. Methods can inspect that path to
    refer to the existing on-disk representation; for example,
    :py:func:`~dolomite_base.save_object.save_object` can copy or link to the
    existing files instead of repeating the saving process.
    """

    def __init__(self, seed, path: str):
        """
        To construct a ``ReloadedArray`` from an existing
        :py:class:`~ReloadedArraySeed`, use :py:meth:`~delayedarray.wrap.wrap`
        instead.

        Args:
            seed:
                The contents of the reloaded array. If this is already a
                :py:class:`~ReloadedArraySeed`, it is used as-is and ``path``
                is not consulted.

            path: Path to the directory containing the on-disk representation.
        """
        wrapped = seed if isinstance(seed, ReloadedArraySeed) else ReloadedArraySeed(seed, path)
        super().__init__(wrapped)

    @property
    def path(self) -> str:
        """
        Returns:
            Path to the directory containing the on-disk representation.
        """
        return self.seed.path


@delayedarray.wrap.register
def wrap_ReloadedArraySeed(x: ReloadedArraySeed) -> ReloadedArray:
    """See :py:func:`~delayedarray.wrap.wrap`."""
    # ReloadedArray's constructor requires a ``path`` argument even when given
    # an existing seed, so forward the path stored on the seed; omitting it
    # raises a TypeError.
    return ReloadedArray(x, x.path)


@save_object.register
def save_object_ReloadedArray(x: ReloadedArray, path: str, reloaded_array_reuse_mode: str = "link", **kwargs):
    """
    Method for saving :py:class:`~ReloadedArray.ReloadedArray` objects to disk,
    see :py:meth:`~dolomite_base.save_object.save_object` for details.

    Args:
        x: Object to be saved.

        path: Path to a directory to save ``x``.

        reloaded_array_reuse_mode:
            How the files in ``x.path`` should be re-used when populating
            ``path``. This can be ``"link"``, to create a hard link to each
            file; ``"symlink"``, to create a symbolic link to each file;
            ``"copy"``, to create a copy of each file; or ``"none"``, to
            perform a fresh save of ``x`` without relying on ``x.path``.
            For ``"link"`` and ``"symlink"``, a file is copied instead if
            the link cannot be created.

        kwargs:
            Further arguments. These are forwarded to the dense/sparse saving
            function when ``reloaded_array_reuse_mode="none"``, and are
            ignored otherwise.

    Returns:
        ``x`` is saved to ``path``.

    Raises:
        ValueError: If ``reloaded_array_reuse_mode`` is not a supported mode.
    """
    if reloaded_array_reuse_mode == "none":
        if delayedarray.is_sparse(x):
            return _save_compressed_sparse_matrix(x, path, **kwargs)
        else:
            return _save_dense_array(x, path, **kwargs)

    if reloaded_array_reuse_mode == "link":
        def FUN(src, dest):
            # Best effort: fall back to a copy if hard-linking fails, e.g.
            # across filesystems. Only link-related errors are caught so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            try:
                os.link(src, dest)
            except (OSError, NotImplementedError):
                shutil.copyfile(src, dest)
    elif reloaded_array_reuse_mode == "symlink":
        def FUN(src, dest):
            # A relative symlink target is resolved relative to the directory
            # holding the link, not the current working directory, so the
            # source is made absolute to avoid creating a dangling link.
            try:
                os.symlink(os.path.abspath(src), dest)
            except (OSError, NotImplementedError):
                shutil.copyfile(src, dest)
    elif reloaded_array_reuse_mode == "copy":
        FUN = shutil.copyfile
    else:
        raise ValueError(f"invalid reuse mode '{reloaded_array_reuse_mode}'")

    # Mirror the directory tree rooted at ``x.path`` into ``path``.
    for root, dirs, files in os.walk(x.path):
        newpath = os.path.join(path, os.path.relpath(root, x.path))
        os.makedirs(newpath)
        for f in files:
            FUN(os.path.join(root, f), os.path.join(newpath, f))
73 changes: 73 additions & 0 deletions src/dolomite_matrix/WrapperArraySeed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from typing import Tuple, Sequence, Optional
import numpy
import delayedarray


class WrapperArraySeed:
    """
    Thin wrapper around a DelayedArray seed that forwards all of the required
    operations to the wrapped object. It is intended as a base class for
    concrete subclasses that attach extra provenance-tracking information - see
    :py:class:`~dolomite_matrix.ReloadedArray.ReloadedArray` for an example.
    """

    def __init__(self, seed):
        """
        Args:
            seed: The underlying seed instance to be wrapped.
        """
        self._seed = seed

    @property
    def shape(self) -> Tuple[int, ...]:
        """
        Returns:
            The shape of the wrapped seed.
        """
        return self._seed.shape

    @property
    def dtype(self) -> numpy.dtype:
        """
        Returns:
            The type of the wrapped seed.
        """
        return self._seed.dtype

    @property
    def seed(self):
        """
        Returns:
            The wrapped seed instance.
        """
        return self._seed


@delayedarray.is_sparse.register
def is_sparse_WrapperArraySeed(x: WrapperArraySeed) -> bool:
    """See :py:func:`~delayedarray.is_sparse.is_sparse` for details."""
    # Delegate to the wrapped seed.
    wrapped = x._seed
    return delayedarray.is_sparse(wrapped)


@delayedarray.chunk_shape.register
def chunk_shape_WrapperArraySeed(x: WrapperArraySeed) -> Tuple[int, ...]:
    """See :py:func:`~delayedarray.chunk_shape.chunk_shape` for details."""
    # Delegate to the wrapped seed.
    wrapped = x._seed
    return delayedarray.chunk_shape(wrapped)


@delayedarray.extract_dense_array.register
def extract_dense_array_WrapperArraySeed(x: WrapperArraySeed, subset: Optional[Tuple[Sequence[int], ...]] = None) -> numpy.ndarray:
    """See :py:func:`~delayedarray.extract_dense_array.extract_dense_array` for details."""
    # Delegate to the wrapped seed, passing the requested subset through.
    wrapped = x._seed
    return delayedarray.extract_dense_array(wrapped, subset)


@delayedarray.extract_sparse_array.register
def extract_sparse_array_WrapperArraySeed(x: WrapperArraySeed, subset: Optional[Tuple[Sequence[int], ...]] = None) -> delayedarray.SparseNdarray:
    """See :py:func:`~delayedarray.extract_sparse_array.extract_sparse_array` for details."""
    # Delegate to the wrapped seed, passing the requested subset through.
    wrapped = x._seed
    return delayedarray.extract_sparse_array(wrapped, subset)


@delayedarray.create_dask_array.register
def create_dask_array_WrapperArraySeed(x: WrapperArraySeed):
    """See :py:func:`~delayedarray.create_dask_array.create_dask_array` for details."""
    # Delegate to the wrapped seed.
    wrapped = x._seed
    return delayedarray.create_dask_array(wrapped)
17 changes: 10 additions & 7 deletions src/dolomite_matrix/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@
del version, PackageNotFoundError


from .choose_dense_chunk_sizes import choose_dense_chunk_sizes
from .stage_ndarray import stage_ndarray
from .stage_DelayedArray import stage_DelayedArray
from .load_hdf5_dense_array import load_hdf5_dense_array
from .write_sparse_matrix import write_sparse_matrix
from .stage_sparse import *
from .load_hdf5_sparse_matrix import load_hdf5_sparse_matrix
from .choose_chunk_dimensions import choose_chunk_dimensions
from .save_dense_array import save_dense_array_from_ndarray
from .read_dense_array import read_dense_array
from .save_compressed_sparse_matrix import *
from .read_compressed_sparse_matrix import read_compressed_sparse_matrix
from .save_delayed_array import save_delayed_array

from .DelayedMask import DelayedMask
from .WrapperArraySeed import WrapperArraySeed
from .ReloadedArray import ReloadedArray, ReloadedArraySeed
Loading

0 comments on commit db09526

Please sign in to comment.