Skip to content

Commit

Permalink
Switch to NamedList subclasses for atomic_vector support.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Jan 19, 2024
1 parent e6ab8ec commit f695e49
Show file tree
Hide file tree
Showing 6 changed files with 407 additions and 156 deletions.
2 changes: 1 addition & 1 deletion src/dolomite_base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from .save_object import save_object
from .validate_object import validate_object
from .save_string_list import save_string_list
from .save_atomic_vector import save_atomic_vector_from_string_list, save_atomic_vector_from_integer_list, save_atomic_vector_from_float_list, save_atomic_vector_from_boolean_list
from .save_string_factor import save_string_factor
from .save_simple_list import save_simple_list_from_list, save_simple_list_from_dict, save_simple_list_from_NamedList
from .save_data_frame import save_data_frame
Expand Down
145 changes: 118 additions & 27 deletions src/dolomite_base/_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from numpy import ndarray
import numpy
from typing import Union, Sequence, Tuple
from biocutils import StringList
from . import lib_dolomite_base as lib
import h5py


def _is_missing_scalar(x) -> bool:
Expand Down Expand Up @@ -45,7 +45,7 @@ def _determine_save_type(x: Union[numpy.ndarray, numpy.generic]):
raise NotImplementedError("saving a NumPy array of " + str(x.dtype) + " is not supported yet")


def _is_actually_masked(x: numpy.ndarray):
def _is_actually_masked(x: numpy.ndarray) -> bool:
if not numpy.ma.is_masked(x):
return False
if isinstance(x.mask, bool):
Expand All @@ -55,52 +55,143 @@ def _is_actually_masked(x: numpy.ndarray):
return True


def _choose_missing_integer_placeholder(x: numpy.ma.MaskedArray) -> Tuple:
copy = x.data.astype(numpy.int32) # make a copy as we'll be mutating it in C++.
mask = x.mask.astype(numpy.uint8) # use uint8 to avoid problems with ambiguous bool typing.
def list_to_numpy_with_mask(x: Sequence, x_dtype, mask_dtype = numpy.uint8) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """
    Convert a list of numbers or None into NumPy arrays.

    Args:
        x: List of numbers, possibly containing missing entries (as
            detected by ``_is_missing_scalar``).
        x_dtype: Data type to use for the output array.
        mask_dtype: Data type to use for the mask array.

    Returns:
        Tuple containing the contents of ``x`` in a NumPy array, plus another
        array of the same length that is 1 wherever the corresponding element
        of ``x`` was missing and 0 otherwise. Missing values are set to zero
        in the first array.
    """
    # Zero-initialize both outputs so that missing entries in 'arr' and
    # non-missing entries in 'mask' never expose uninitialized memory.
    arr = numpy.zeros(len(x), dtype=x_dtype)
    mask = numpy.zeros(len(x), dtype=mask_dtype)
    for i, y in enumerate(x):
        if _is_missing_scalar(y):
            mask[i] = 1
        else:
            arr[i] = y
    return arr, mask

okay, placeholder = lib.choose_missing_integer_placeholder(copy, mask)
if okay:
return copy, placeholder, int

# In the rare case that it's not okay, we just convert it to a float, which
# gives us some more room to save placeholders.
copy, placeholder = _choose_missing_float_placeholder(x)
return copy, placeholder, float
def choose_missing_integer_placeholder(x: numpy.ndarray, mask: numpy.ndarray, copy: bool = True) -> Tuple:
    """
    Choose a missing placeholder for integer arrays.

    Args:
        x: An integer array.
        mask: An array of the same shape as ``x``, indicating which elements are masked.
        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.

    Returns:
        A tuple containing an array with the contents of ``x``, where all
        masked values are replaced by a placeholder, plus the placeholder
        value itself. The array is int32 when an integer placeholder could be
        chosen; otherwise the float fallback is used and the array is float64.

    Raises:
        ValueError: If the floating-point fallback also fails to find a placeholder.
    """
    xcopy = x.astype(numpy.int32, copy = copy) # make a copy as we'll be mutating it in C++.
    mask = mask.astype(numpy.uint8, copy = False) # use uint8 to avoid problems with ambiguous bool typing.

    okay, placeholder = lib.choose_missing_integer_placeholder(xcopy, mask)
    if okay:
        return xcopy, placeholder

    # In the rare case that it's not okay, we just convert it to a float, which
    # gives us some more room to save placeholders.
    return choose_missing_float_placeholder(x, mask, copy = copy)


def choose_missing_float_placeholder(x: numpy.ndarray, mask: numpy.ndarray, copy: bool = True) -> Tuple:
    """
    Choose a missing placeholder for float arrays.

    Args:
        x: A floating-point array.
        mask: An array of the same shape as ``x``, indicating which elements are masked.
        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.

    Returns:
        A tuple containing a float64 array with the contents of ``x`` where all
        masked values are replaced by a placeholder, plus the placeholder value.

    Raises:
        ValueError: If no suitable floating-point placeholder could be found.
    """
    xcopy = x.astype(numpy.float64, copy = copy) # make a copy as we'll be mutating it in C++.
    mask = mask.astype(numpy.uint8, copy = False) # use uint8 to avoid problems with ambiguous bool typing.

    okay, placeholder = lib.choose_missing_float_placeholder(xcopy, mask)
    if not okay:
        raise ValueError("failed to find an appropriate floating-point missing value placeholder")

    # Return the converted array, not the 'copy' flag.
    return xcopy, placeholder


def choose_missing_boolean_placeholder(x: numpy.ndarray, mask: numpy.ndarray, copy: bool = True) -> Tuple:
    """
    Choose a missing placeholder for boolean arrays.

    Args:
        x: A boolean array (or any numeric array to be interpreted as boolean).
        mask: An array of the same shape as ``x``, indicating which elements are masked.
        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.

    Returns:
        A tuple containing an int8 array with the contents of ``x``, where all
        masked values are replaced by a placeholder, plus the placeholder value.
    """
    xcopy = x.astype(numpy.int8, copy = copy)
    placeholder = numpy.int8(-1)

    # Index with a genuine boolean mask; an integer mask would otherwise be
    # interpreted as positions rather than as per-element flags.
    xcopy[mask.astype(numpy.bool_, copy = False)] = placeholder
    return xcopy, placeholder


def choose_missing_string_placeholder(x: Sequence, copy: bool = True) -> Tuple:
    """
    Choose a missing placeholder for string sequences.

    Args:
        x: A sequence of strings or Nones.
        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.

    Returns:
        A tuple containing a sequence of strings with the contents of ``x``
        where all None values are replaced by a placeholder, plus the
        placeholder.
    """
    # Start from "NA" and keep appending underscores until the candidate
    # does not collide with any value already present in 'x'.
    present = set(x)
    placeholder = "NA"
    while placeholder in present:
        placeholder += "_"

    # Keep the 'copy' flag intact; the working sequence gets its own name.
    xcopy = x[:] if copy else x

    for j, y in enumerate(xcopy):
        if y is None:
            xcopy[j] = placeholder

    return xcopy, placeholder


return copy, placeholder
def save_fixed_length_strings(handle: h5py.Group, name: str, strings: list[str]):
"""
Save a list of strings into a fixed-length string dataset.
Args:
handle: Handle to a HDF5 Group.
name: Name of the dataset to create in ``handle``.
strings: List of strings to save.
def _save_fixed_length_strings(handle, name: str, strings: list[str]):
Returns:
``strings`` is saved into the group as a fixed-length string dataset.
"""
tmp = [ y.encode("UTF8") for y in strings ]
maxed = 1
for b in tmp:
Expand Down
51 changes: 27 additions & 24 deletions src/dolomite_base/read_atomic_vector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Union
from biocutils import StringList
from biocutils import StringList, IntegerList, FloatList, BooleanList
import numpy
import h5py
import os
Expand Down Expand Up @@ -38,29 +38,32 @@ def read_atomic_vector(path: str, metadata: dict, **kwargs) -> Union[StringList,
for i, x in enumerate(output):
if x == placeholder:
output[i] = None

if has_names:
output.set_names([a.decode() for a in ghandle["names"][:]], in_place=True)
return output

if has_names:
warnings.warn("skipping names when reading a numeric 'atomic_vector'")

output = dhandle[:]
if has_none:
placeholder = dhandle.attrs["missing-value-placeholder"]
if numpy.isnan(placeholder):
mask = numpy.isnan(output)
else:
values = dhandle[:]
if not has_none:
output = values
else:
mask = (output == placeholder)
output = [None] * values.shape[0]
placeholder = dhandle.attrs["missing-value-placeholder"]
if numpy.isnan(placeholder):
for i, x in enumerate(values):
if not numpy.isnan(x):
output[i] = x
else:
for i, x in enumerate(values):
if x != placeholder:
output[i] = x

if vectype == "boolean":
output = output.astype(numpy.bool_)
elif vectype == "number":
if not numpy.issubdtype(output.dtype, numpy.floating):
output = output.astype(numpy.double)
if vectype == "integer":
output = IntegerList(output)
elif vectype == "number":
if "_python_original_type" in dhandle.attrs and dhandle.attrs["_python_original_type"] == "biocutils.IntegerList":
output = IntegerList(output)
else:
output = FloatList(output)
elif vectype == "boolean":
output = BooleanList(output)

if has_none:
return numpy.ma.MaskedArray(output, mask=mask)
else:
return output
if has_names:
output.set_names([a.decode() for a in ghandle["names"][:]], in_place=True)
return output
Loading

0 comments on commit f695e49

Please sign in to comment.