-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Switch to the new array formats corresponding to takane validators in base. (#3)
This also includes (some of) the functionality in alabaster.matrix, namely the optimization of the types and the blockwise processing of dense/sparse matrices.
- Loading branch information
Showing
31 changed files
with
2,841 additions
and
1,282 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
from typing import Tuple, Optional, Sequence | ||
import delayedarray | ||
import numpy | ||
|
||
class DelayedMask(delayedarray.DelayedOp):
    """
    Delayed mask to replace the missing value placeholder with a NumPy masked array.

    Instances only record the seed, the placeholder and the desired output
    type; the extraction methods registered for this class read these
    attributes to build the mask.
    """

    def __init__(self, seed, placeholder, dtype: Optional[numpy.dtype] = None):
        """
        Args:
            seed:
                Any object that satisfies the seed contract,
                see :py:class:`~delayedarray.DelayedArray.DelayedArray` for details.

            placeholder:
                Placeholder value for defining masked values, of the same type
                as ``seed.dtype`` (or coercible into that type). All values
                equal to the placeholder are considered to be missing.

            dtype:
                Desired type of the masked output, defaults to ``seed.dtype``.
                Anything accepted by the :py:class:`numpy.dtype` constructor
                may be supplied.
        """
        self._seed = seed

        # Byte-string placeholders for string seeds are decoded as UTF-8
        # before being converted into a NumPy string scalar; everything else
        # is coerced with the scalar type of the seed.
        if numpy.issubdtype(seed.dtype, numpy.str_) and isinstance(placeholder, bytes):
            self._placeholder = numpy.str_(placeholder.decode("UTF8"))
        else:
            self._placeholder = seed.dtype.type(placeholder)

        if dtype is None:
            dtype = seed.dtype
        # Normalize so that the ``dtype`` property always yields a
        # ``numpy.dtype`` instance, even if a scalar type or string was given.
        self._dtype = numpy.dtype(dtype)

    @property
    def shape(self) -> Tuple[int, ...]:
        """
        Returns:
            Tuple of integers specifying the extent of each dimension of this
            object. This is the same as the ``seed`` object.
        """
        return self._seed.shape

    @property
    def dtype(self) -> numpy.dtype:
        """
        Returns:
            NumPy type for the contents after masking.
        """
        return self._dtype

    @property
    def seed(self):
        """
        Returns:
            The seed object.
        """
        return self._seed

    @property
    def placeholder(self):
        """
        Returns:
            The placeholder value, after coercion to a NumPy scalar.
        """
        return self._placeholder
|
||
|
||
def _create_mask(x: numpy.ndarray, placeholder): | ||
if numpy.issubdtype(placeholder.dtype, numpy.floating) and numpy.isnan(placeholder): | ||
return numpy.isnan(x) | ||
else: | ||
return (x == placeholder) | ||
|
||
|
||
@delayedarray.extract_dense_array.register
def extract_dense_array_DelayedMask(x: DelayedMask, subset: Optional[Tuple[Sequence[int], ...]] = None):
    """
    Extract a dense array from the seed and mask its placeholder values,
    see :py:meth:`~delayedarray.extract_dense_array.extract_dense_array`.

    Returns:
        The extracted values coerced to the output type. This is wrapped in a
        :py:class:`numpy.ma.MaskedArray` only if at least one value is masked.
    """
    raw = delayedarray.extract_dense_array(x._seed, subset)

    # The mask is computed before type coercion, as the placeholder is
    # assumed to be of the same underlying type as the seed.
    missing = _create_mask(raw, x._placeholder)
    coerced = raw.astype(x._dtype, copy=False)

    if not missing.any():
        return coerced
    return numpy.ma.MaskedArray(coerced, mask=missing)
|
||
|
||
def _mask_SparseNdarray(contents, placeholder, dtype):
    """
    Recursively apply the placeholder mask to sparse contents.

    Args:
        contents:
            Either a list whose entries are ``None`` or further contents, or
            a pair of ``(indices, values)``.
        placeholder: NumPy scalar holding the missing value placeholder.
        dtype: Type to which the values are coerced.

    Returns:
        Contents with the same layout as the input. Each ``values`` array is
        coerced to ``dtype`` and becomes a masked array if any of its entries
        equal the placeholder.
    """
    if isinstance(contents, list):
        # Inner node: recurse into every non-empty child, keep the gaps.
        return [
            None if child is None else _mask_SparseNdarray(child, placeholder, dtype)
            for child in contents
        ]

    indices, values = contents
    # Mask before type coercion, as the placeholder has the seed's type.
    missing = _create_mask(values, placeholder)
    values = values.astype(dtype, copy=False)
    if missing.any():
        values = numpy.ma.MaskedArray(values, mask=missing)
    return indices, values
|
||
|
||
@delayedarray.extract_sparse_array.register
def extract_sparse_array_DelayedMask(x: DelayedMask, subset: Optional[Tuple[Sequence[int], ...]] = None):
    """
    Extract a sparse array from the seed and mask its placeholder values,
    see :py:meth:`~delayedarray.extract_sparse_array.extract_sparse_array`.

    Returns:
        A :py:class:`~delayedarray.SparseNdarray.SparseNdarray` of the output
        type, with the same shape and index type as the array extracted from
        the seed.
    """
    out = delayedarray.extract_sparse_array(x._seed, subset)
    contents = out.contents
    if contents is not None:
        contents = _mask_SparseNdarray(contents, x._placeholder, x._dtype)

    # Use the shape of the extracted array rather than ``x.shape``: the two
    # differ when ``subset`` is supplied, and ``check=False`` would otherwise
    # let the mismatch go unnoticed.
    return delayedarray.SparseNdarray(
        out.shape,
        contents,
        dtype=x._dtype,
        index_dtype=out.index_dtype,
        check=False,
    )
|
||
|
||
@delayedarray.create_dask_array.register
def create_dask_array_DelayedMask(x: DelayedMask):
    """
    Create a masked dask array from the seed,
    see :py:meth:`~delayedarray.create_dask_array.create_dask_array`.

    Returns:
        A dask masked array of the output type, where entries matching the
        placeholder are masked.
    """
    import dask.array

    target = delayedarray.create_dask_array(x._seed)
    placeholder = x._placeholder

    # Build the mask before type coercion, as the placeholder has the seed's
    # type. NaN never compares equal to itself, so a NaN placeholder must be
    # detected with isnan() instead of an equality comparison.
    if numpy.issubdtype(placeholder.dtype, numpy.floating) and numpy.isnan(placeholder):
        mask = dask.array.isnan(target)
    else:
        mask = (target == placeholder)

    target = target.astype(x._dtype)
    return dask.array.ma.masked_array(target, mask=mask)
|
||
|
||
@delayedarray.chunk_shape.register
def chunk_shape_DelayedMask(x: DelayedMask):
    """
    Report the chunk shape of the underlying seed,
    see :py:meth:`~delayedarray.chunk_shape.chunk_shape`.
    """
    underlying = x._seed
    return delayedarray.chunk_shape(underlying)
|
||
|
||
@delayedarray.is_sparse.register
def is_sparse_DelayedMask(x: DelayedMask):
    """
    Report whether the underlying seed is sparse,
    see :py:meth:`~delayedarray.is_sparse.is_sparse`.
    """
    underlying = x._seed
    return delayedarray.is_sparse(underlying)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import delayedarray | ||
from dolomite_base import save_object | ||
import os | ||
import shutil | ||
|
||
from .WrapperArraySeed import WrapperArraySeed | ||
from .save_compressed_sparse_matrix import _save_compressed_sparse_matrix | ||
from .save_dense_array import _save_dense_array | ||
|
||
|
||
class ReloadedArraySeed(WrapperArraySeed):
    """
    Seed for the :py:class:`~ReloadedArray` class, which wraps another seed
    and additionally stores the path it was given. This is a subclass of
    :py:class:`~dolomite_matrix.WrapperArraySeed.WrapperArraySeed`.
    """

    def __init__(self, seed, path: str):
        """
        Args:
            seed: The contents of the reloaded array.

            path: Path to the directory containing the on-disk representation.
        """
        super().__init__(seed)
        self._path = path

    @property
    def path(self) -> str:
        """
        Returns:
            Path to the directory containing the on-disk representation,
            as supplied to the constructor.
        """
        return self._path
|
||
|
||
class ReloadedArray(delayedarray.DelayedArray):
    """
    An array that was reloaded from disk by the
    :py:func:`~dolomite_base.read_object.read_object` function, and remembers
    the path from which it was loaded. Methods can inspect this path to refer
    to the existing on-disk representation; for example,
    :py:func:`~dolomite_base.save_object.save_object` can copy or link to the
    existing files instead of repeating the saving process.
    """

    def __init__(self, seed, path: str):
        """
        To construct a ``ReloadedArray`` from an existing
        :py:class:`~ReloadedArraySeed`, use :py:meth:`~delayedarray.wrap.wrap`
        instead.

        Args:
            seed: The contents of the reloaded array.

            path:
                Path to the directory containing the on-disk representation.
                Not used if ``seed`` is already a
                :py:class:`~ReloadedArraySeed`.
        """
        wrapped = seed if isinstance(seed, ReloadedArraySeed) else ReloadedArraySeed(seed, path)
        super().__init__(wrapped)

    @property
    def path(self) -> str:
        """
        Returns:
            Path to the directory containing the on-disk representation,
            as stored in the seed.
        """
        reloaded_seed = self.seed
        return reloaded_seed._path
|
||
|
||
@delayedarray.wrap.register
def wrap_ReloadedArraySeed(x: ReloadedArraySeed) -> ReloadedArray:
    """
    Wrap a seed in a :py:class:`~ReloadedArray`,
    see :py:func:`~delayedarray.wrap.wrap`.
    """
    # The ReloadedArray constructor requires a path argument; forward the one
    # already stored in the seed.
    return ReloadedArray(x, x.path)
|
||
|
||
@save_object.register
def save_object_ReloadedArray(x: ReloadedArray, path: str, reloaded_array_reuse_mode: str = "link", **kwargs):
    """
    Method for saving :py:class:`~ReloadedArray.ReloadedArray` objects to disk,
    see :py:meth:`~dolomite_base.save_object.save_object` for details.

    Args:
        x: Object to be saved.

        path: Path to a directory to save ``x``.

        reloaded_array_reuse_mode:
            How the files in ``x.path`` should be re-used when populating
            ``path``. This can be ``"link"``, to create a hard link to each
            file; ``"symlink"``, to create a symbolic link to each file;
            ``"copy"``, to create a copy of each file; or ``"none"``, to
            perform a fresh save of ``x`` without relying on ``x.path``.
            If a hard or symbolic link cannot be created, the file is
            copied instead.

        kwargs:
            Further arguments. These are forwarded to the dense/sparse saving
            function when ``reloaded_array_reuse_mode = "none"``, and ignored
            otherwise.

    Returns:
        ``x`` is saved to ``path``.

    Raises:
        ValueError: If ``reloaded_array_reuse_mode`` is not a supported mode.
    """
    if reloaded_array_reuse_mode == "none":
        if delayedarray.is_sparse(x):
            return _save_compressed_sparse_matrix(x, path, **kwargs)
        else:
            return _save_dense_array(x, path, **kwargs)

    if reloaded_array_reuse_mode == "link":
        def FUN(src, dest):
            # Only fall back to copying on link failures, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            try:
                os.link(src, dest)
            except (OSError, NotImplementedError):
                shutil.copyfile(src, dest)
    elif reloaded_array_reuse_mode == "symlink":
        def FUN(src, dest):
            # A relative symlink target is resolved relative to the link's
            # own directory, so point at the absolute source location to
            # avoid dangling links when ``x.path`` is relative.
            try:
                os.symlink(os.path.abspath(src), dest)
            except (OSError, NotImplementedError):
                shutil.copyfile(src, dest)
    elif reloaded_array_reuse_mode == "copy":
        FUN = shutil.copyfile
    else:
        raise ValueError(f"invalid reuse mode '{reloaded_array_reuse_mode}'")

    # Mirror the directory tree under ``x.path`` into ``path``, re-using each
    # file according to the requested mode.
    for root, dirs, files in os.walk(x.path):
        newpath = os.path.join(path, os.path.relpath(root, x.path))
        os.makedirs(newpath)
        for f in files:
            FUN(os.path.join(root, f), os.path.join(newpath, f))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
from typing import Tuple, Sequence, Optional | ||
import numpy | ||
import delayedarray | ||
|
||
|
||
class WrapperArraySeed:
    """
    Thin wrapper around a DelayedArray seed that forwards the ``shape`` and
    ``dtype`` queries to the wrapped object. It is intended as a base class
    for subclasses that attach extra provenance-tracking information - see
    :py:class:`~dolomite_matrix.ReloadedArray.ReloadedArraySeed` for an
    example.
    """

    def __init__(self, seed):
        """
        Args:
            seed: The underlying seed instance to be wrapped.
        """
        self._seed = seed

    @property
    def seed(self):
        """
        Returns:
            The underlying seed instance, exactly as passed to the constructor.
        """
        return self._seed

    @property
    def shape(self) -> Tuple[int, ...]:
        """
        Returns:
            The shape reported by the underlying seed.
        """
        wrapped = self._seed
        return wrapped.shape

    @property
    def dtype(self) -> numpy.dtype:
        """
        Returns:
            The type reported by the underlying seed.
        """
        wrapped = self._seed
        return wrapped.dtype
|
||
|
||
@delayedarray.is_sparse.register
def is_sparse_WrapperArraySeed(x: WrapperArraySeed) -> bool:
    """
    Report whether the wrapped seed is sparse,
    see :py:func:`~delayedarray.is_sparse.is_sparse` for details.
    """
    wrapped = x._seed
    return delayedarray.is_sparse(wrapped)
|
||
|
||
@delayedarray.chunk_shape.register
def chunk_shape_WrapperArraySeed(x: WrapperArraySeed) -> Tuple[int, ...]:
    """
    Report the chunk shape of the wrapped seed,
    see :py:func:`~delayedarray.chunk_shape.chunk_shape` for details.
    """
    wrapped = x._seed
    return delayedarray.chunk_shape(wrapped)
|
||
|
||
@delayedarray.extract_dense_array.register
def extract_dense_array_WrapperArraySeed(x: WrapperArraySeed, subset: Optional[Tuple[Sequence[int], ...]] = None) -> numpy.ndarray:
    """
    Extract a dense array from the wrapped seed, passing ``subset`` through
    unchanged; see
    :py:func:`~delayedarray.extract_dense_array.extract_dense_array` for details.
    """
    wrapped = x._seed
    return delayedarray.extract_dense_array(wrapped, subset)
|
||
|
||
@delayedarray.extract_sparse_array.register
def extract_sparse_array_WrapperArraySeed(x: WrapperArraySeed, subset: Optional[Tuple[Sequence[int], ...]] = None) -> delayedarray.SparseNdarray:
    """
    Extract a sparse array from the wrapped seed, passing ``subset`` through
    unchanged; see
    :py:func:`~delayedarray.extract_sparse_array.extract_sparse_array` for details.
    """
    wrapped = x._seed
    return delayedarray.extract_sparse_array(wrapped, subset)
|
||
|
||
@delayedarray.create_dask_array.register
def create_dask_array_WrapperArraySeed(x: WrapperArraySeed):
    """
    Create a dask array from the wrapped seed,
    see :py:func:`~delayedarray.create_dask_array.create_dask_array` for details.
    """
    wrapped = x._seed
    return delayedarray.create_dask_array(wrapped)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.