Switch to NamedList subclasses for atomic_vector support.

ArtifactDB · Jan 19, 2024 · f695e49 · f695e49
1 parent e6ab8ec
commit f695e49
Show file tree

Hide file tree

Showing 6 changed files with 407 additions and 156 deletions.
diff --git a/src/dolomite_base/__init__.py b/src/dolomite_base/__init__.py
@@ -17,7 +17,7 @@
 
 from .save_object import save_object
 from .validate_object import validate_object
-from .save_string_list import save_string_list
+from .save_atomic_vector import save_atomic_vector_from_string_list, save_atomic_vector_from_integer_list, save_atomic_vector_from_float_list, save_atomic_vector_from_boolean_list
 from .save_string_factor import save_string_factor
 from .save_simple_list import save_simple_list_from_list, save_simple_list_from_dict, save_simple_list_from_NamedList
 from .save_data_frame import save_data_frame

diff --git a/src/dolomite_base/_utils.py b/src/dolomite_base/_utils.py
@@ -1,8 +1,8 @@
 from numpy import ndarray
 import numpy
 from typing import Union, Sequence, Tuple
-from biocutils import StringList
 from . import lib_dolomite_base as lib
+import h5py
 
 
 def _is_missing_scalar(x) -> bool:
@@ -45,7 +45,7 @@ def _determine_save_type(x: Union[numpy.ndarray, numpy.generic]):
         raise NotImplementedError("saving a NumPy array of " + str(x.dtype) + " is not supported yet")
 
 
-def _is_actually_masked(x: numpy.ndarray):
+def _is_actually_masked(x: numpy.ndarray) -> bool:
     if not numpy.ma.is_masked(x):
         return False
     if isinstance(x.mask, bool):
@@ -55,52 +55,143 @@ def _is_actually_masked(x: numpy.ndarray):
     return True
 
 
-def _choose_missing_integer_placeholder(x: numpy.ma.MaskedArray) -> Tuple:
-    copy = x.data.astype(numpy.int32) # make a copy as we'll be mutating it in C++.
-    mask = x.mask.astype(numpy.uint8) # use uint8 to avoid problems with ambiguous bool typing.
+def list_to_numpy_with_mask(x: Sequence, x_dtype, mask_dtype = numpy.uint8) -> Tuple[numpy.ndarray, numpy.ndarray]:
+    """
+    Convert a list of numbers or None into a pair of NumPy arrays.
+
+    Args:
+        x: List of numbers, where missing entries are None or masked scalars.
+        x_dtype: Data type to use for the output array.
+        mask_dtype: Data type to use for the mask array.
+
+    Returns:
+        Tuple containing the contents of ``x`` in a NumPy array, plus another
+        array indicating whether each element of ``x`` was None or masked.
+        (Masked or None values are set to zero in the first array.)
+    """
+    mask = numpy.empty(len(x), dtype=mask_dtype)  # uninitialised; every slot is written below.
+    arr = numpy.empty(len(x), dtype=x_dtype)  # uninitialised; every slot is written below.
+    for i, y in enumerate(x):
+        if _is_missing_scalar(y):
+            arr[i] = 0
+            mask[i] = 1
+        else:
+            arr[i] = y
+            mask[i] = 0
+    return arr, mask
 
-    okay, placeholder = lib.choose_missing_integer_placeholder(copy, mask)
-    if okay:
-        return copy, placeholder, int
 
-    # In the rare case that it's not okay, we just convert it to a float, which
-    # gives us some more room to save placeholders.
-    copy, placeholder = _choose_missing_float_placeholder(x)
-    return copy, placeholder, float
+def choose_missing_integer_placeholder(x: numpy.ndarray, mask: numpy.ndarray, copy: bool = True) -> Tuple:
+    """
+    Choose a missing placeholder for integer arrays.
 
+    Args:
+        x: An integer array.
+        mask: An array of the same shape as ``x``, indicating which elements are masked.
+        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.
 
-def _choose_missing_float_placeholder(x: numpy.ma.MaskedArray) -> Tuple:
-    copy = x.data.astype(numpy.float64) # make a copy as we'll be mutating it in C++.
-    mask = x.mask.astype(numpy.uint8) # use uint8 to avoid problems with ambiguous bool typing.
+    Returns:
+        A tuple containing an array with the contents of ``x``, where all masked
+        values are replaced by a placeholder, plus the placeholder itself. The
+        array is int32 if an integer placeholder is found, otherwise float64.
+    """
+    xcopy = x.astype(numpy.int32, copy = copy) # make a copy as we'll be mutating it in C++.
+    mask = mask.astype(numpy.uint8, copy = False) # use uint8 to avoid problems with ambiguous bool typing.
 
-    okay, placeholder = lib.choose_missing_float_placeholder(copy, mask)
+    okay, placeholder = lib.choose_missing_integer_placeholder(xcopy, mask)
+    if okay:
+        return xcopy, placeholder
+
+    # In the rare case that it's not okay, we just convert it to a float, which
+    # gives us some more room to save placeholders.
+    xcopy, placeholder = choose_missing_float_placeholder(x, mask, copy = copy)
+    return xcopy, placeholder
+
+
+def choose_missing_float_placeholder(x: numpy.ndarray, mask: numpy.ndarray, copy: bool = True) -> Tuple:
+    """
+    Choose a missing placeholder for float arrays.
+
+    Args:
+        x: A floating-point array.
+        mask: An array of the same shape as ``x``, indicating which elements are masked.
+        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.
+
+    Returns:
+        A tuple containing a float64 array with the contents of ``x`` where all
+        masked values are replaced by a placeholder, plus the placeholder value.
+    """
+    xcopy = x.astype(numpy.float64, copy = copy) # make a copy as we'll be mutating it in C++.
+    mask = mask.astype(numpy.uint8, copy = False) # use uint8 to avoid problems with ambiguous bool typing.
+    okay, placeholder = lib.choose_missing_float_placeholder(xcopy, mask)
     if not okay:
         raise ValueError("failed to find an appropriate floating-point missing value placeholder")
-    return copy, placeholder
+    return xcopy, placeholder
+
 
+def choose_missing_boolean_placeholder(x: numpy.ndarray, mask: numpy.ndarray, copy: bool = True):
+    """
+    Choose a missing placeholder for boolean arrays.
 
-def _choose_missing_boolean_placeholder(x: numpy.ma.MaskedArray) -> Tuple:
-    copy = x.data.astype(numpy.int8) 
+    Args:
+        x: A boolean array (or any numeric array to be interpreted as boolean).
+        mask: An array of the same shape as ``x``, indicating which elements are masked.
+        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.
+
+    Returns:
+        A tuple containing an int8 array with the contents of ``x``, where all
+        masked values are replaced by a placeholder, plus the placeholder value.
+    """
+    xcopy = x.astype(numpy.int8, copy = copy) 
     placeholder = numpy.int8(-1)
-    copy[x.mask] = placeholder
-    return copy, placeholder
+    if mask.dtype == numpy.bool_:
+        xcopy[mask] = placeholder
+    else:
+        xcopy[mask != 0] = placeholder
+    return xcopy, placeholder
+
+
+def choose_missing_string_placeholder(x: Sequence, copy: bool = True) -> Tuple:
+    """
+    Choose a missing placeholder for string sequences.
 
+    Args:
+        x: A sequence of strings or Nones.
+        copy: Whether to make a copy of ``x``. If ``False``, this function may mutate it in-place.
 
-def _choose_missing_string_placeholder(x: StringList) -> Tuple:
+    Returns:
+        A tuple containing a list of strings with the contents of ``x`` where
+        all masked values are replaced by a placeholder, plus the placeholder.
+    """
     present = set(x)
     placeholder = "NA"
     while placeholder in present:
         placeholder += "_"
 
-    copy = x[:]
-    for j, y in enumerate(copy):
+    if copy:
+        xcopy = x[:]
+    else:
+        xcopy = x
+
+    for j, y in enumerate(xcopy):
         if y is None:
-            copy[j] = placeholder 
+            xcopy[j] = placeholder 
+
+    return xcopy, placeholder
+
 
-    return copy, placeholder
+def save_fixed_length_strings(handle: h5py.Group, name: str, strings: list[str]):
+    """
+    Save a list of strings into a fixed-length string dataset.
 
+    Args:
+        handle: Handle to a HDF5 Group.
+        name: Name of the dataset to create in ``handle``.
+        strings: List of strings to save.
 
-def _save_fixed_length_strings(handle, name: str, strings: list[str]):
+    Returns:
+        ``strings`` is saved into the group as a fixed-length string dataset.
+    """
     tmp = [ y.encode("UTF8") for y in strings ]
     maxed = 1
     for b in tmp:

diff --git a/src/dolomite_base/read_atomic_vector.py b/src/dolomite_base/read_atomic_vector.py
@@ -1,5 +1,5 @@
 from typing import Union
-from biocutils import StringList
+from biocutils import StringList, IntegerList, FloatList, BooleanList
 import numpy
 import h5py
 import os
@@ -38,29 +38,32 @@ def read_atomic_vector(path: str, metadata: dict, **kwargs) -> Union[StringList,
                 for i, x in enumerate(output):
                     if x == placeholder:
                         output[i] = None
-
-            if has_names:
-                output.set_names([a.decode() for a in ghandle["names"][:]], in_place=True)
-            return output
-
-        if has_names:
-            warnings.warn("skipping names when reading a numeric 'atomic_vector'")
-
-        output = dhandle[:]
-        if has_none:
-            placeholder = dhandle.attrs["missing-value-placeholder"]
-            if numpy.isnan(placeholder):
-                mask = numpy.isnan(output)
+        else:
+            values = dhandle[:]
+            if not has_none:
+                output = values
             else:
-                mask = (output == placeholder)
+                output = [None] * values.shape[0]
+                placeholder = dhandle.attrs["missing-value-placeholder"]
+                if numpy.isnan(placeholder):
+                    for i, x in enumerate(values):
+                        if not numpy.isnan(x):
+                            output[i] = x
+                else:
+                    for i, x in enumerate(values):
+                        if x != placeholder:
+                            output[i] = x
 
-        if vectype == "boolean":
-            output = output.astype(numpy.bool_)
-        elif vectype == "number":
-            if not numpy.issubdtype(output.dtype, numpy.floating):
-                output = output.astype(numpy.double)
+            if vectype == "integer":
+                output = IntegerList(output)
+            elif vectype == "number":
+                if "_python_original_type" in dhandle.attrs and dhandle.attrs["_python_original_type"] == "biocutils.IntegerList":
+                    output = IntegerList(output)
+                else:
+                    output = FloatList(output)
+            elif vectype == "boolean":
+                output = BooleanList(output)
 
-        if has_none:
-            return numpy.ma.MaskedArray(output, mask=mask)
-        else:
-            return output
+        if has_names:
+            output.set_names([a.decode() for a in ghandle["names"][:]], in_place=True)
+        return output