From 42b0c15de78f1abd4226ee448f1f537057f9a164 Mon Sep 17 00:00:00 2001
From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com>
Date: Fri, 2 Aug 2024 19:57:37 +0200
Subject: [PATCH] Simplify Dataset state variables

---
 khiops/sklearn/estimators.py |  93 ++++++++++++++-----------
 khiops/utils/dataset.py      | 129 ++++++++++++++++-------------------
 khiops/utils/helpers.py      |   6 +-
 tests/test_dataset_class.py  |   7 +-
 4 files changed, 119 insertions(+), 116 deletions(-)

diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
index 27892bcb..fb5f4d22 100644
--- a/khiops/sklearn/estimators.py
+++ b/khiops/sklearn/estimators.py
@@ -148,38 +148,51 @@ def _check_categorical_target_type(ds):
     if ds.target_column is None:
         raise ValueError("Target vector is not specified.")
 
-    if ds.is_in_memory() and not (
-        isinstance(ds.target_column_dtype, pd.CategoricalDtype)
-        or pd.api.types.is_string_dtype(ds.target_column_dtype)
-        or pd.api.types.is_integer_dtype(ds.target_column_dtype)
-        or pd.api.types.is_float_dtype(ds.target_column_dtype)
+    if ds.is_in_memory and not (
+        isinstance(ds.target_column.dtype, pd.CategoricalDtype)
+        or pd.api.types.is_string_dtype(ds.target_column.dtype)
+        or pd.api.types.is_integer_dtype(ds.target_column.dtype)
+        or pd.api.types.is_float_dtype(ds.target_column.dtype)
     ):
         raise ValueError(
             f"'y' has invalid type '{ds.target_column_type}'. "
             "Only string, integer, float and categorical types "
             "are accepted for the target."
         )
-    elif not ds.is_in_memory() and ds.target_column_type != "Categorical":
+    elif (
+        not ds.is_in_memory
+        and ds.main_table.khiops_types[ds.target_column_id] != "Categorical"
+    ):
         raise ValueError(
-            f"Target column has invalid type '{ds.target_column_type}'. "
+            "Target column has invalid type "
+            f"'{ds.main_table.khiops_types[ds.target_column_id]}'. "
             "Only Categorical types are accepted for file datasets."
         )
 
 
 def _check_numerical_target_type(ds):
+    # Check that the target column is specified
     if ds.target_column is None:
         raise ValueError("Target vector is not specified.")
-    if ds.is_in_memory():
-        if not pd.api.types.is_numeric_dtype(ds.target_column_dtype):
+
+    # If in-memory: Check that the column is numerical and that the values are finite
+    # The latter is required by sklearn
+    if ds.is_in_memory:
+        if not pd.api.types.is_numeric_dtype(ds.target_column.dtype):
             raise ValueError(
-                f"Unknown label type '{ds.target_column_type}'. "
+                f"Unknown label type '{ds.target_column.dtype}'. "
                 "Expected a numerical type."
             )
         if ds.target_column is not None:
             assert_all_finite(ds.target_column)
-    elif not ds.is_in_memory() and ds.target_column_type != "Numerical":
+    # Otherwise: Check the Khiops type
+    elif (
+        not ds.is_in_memory
+        and ds.main_table.khiops_types[ds.target_column_id] != "Numerical"
+    ):
         raise ValueError(
-            f"Target column has invalid type '{ds.target_column_type}'. "
+            "Target column has invalid type "
+            f"'{ds.main_table.khiops_types[ds.target_column_id]}'. "
             "Only Numerical types are accepted for file datasets."
         )
@@ -384,7 +397,7 @@ def _fit(self, ds, computation_dir, **kwargs):
         ):
             self._fit_training_post_process(ds)
             self.is_fitted_ = True
-            self.is_multitable_model_ = ds.is_multitable()
+            self.is_multitable_model_ = ds.is_multitable
 
     def _fit_check_params(self, ds, **_):
         """Check the model parameters including those data dependent (in kwargs)"""
@@ -395,7 +408,7 @@ def _fit_check_params(self, ds, **_):
         ):
             raise TypeError(type_error_message("key", self.key, str, "list-like"))
 
-        if not ds.is_in_memory() and self.output_dir is None:
+        if not ds.is_in_memory and self.output_dir is None:
             raise ValueError("'output_dir' is not set but dataset is file-based")
 
     def _fit_check_dataset(self, ds):
@@ -529,7 +542,7 @@ def _transform_deploy_model(
         output_data_table_path = fs.get_child_path(output_dir, transformed_file_name)
 
         # Set the format parameters depending on the type of dataset
-        if deployment_ds.is_in_memory():
+        if deployment_ds.is_in_memory:
             field_separator = "\t"
             header_line = True
         else:
@@ -563,7 +576,7 @@ def _transform_deployment_post_process(
         self, deployment_ds, output_table_path, drop_key
     ):
         # Return a dataframe for dataframe based datasets
-        if deployment_ds.is_in_memory():
+        if deployment_ds.is_in_memory:
            # Read the transformed table with the internal table settings
            with io.BytesIO(fs.read(output_table_path)) as output_table_stream:
                output_table_df = read_internal_data_table(output_table_stream)
@@ -572,7 +585,7 @@ def _transform_deployment_post_process(
             # - Reorder the table to the original table order
             # - Because transformed data table file is sorted by key
             # - Drop the key columns if specified
-            if deployment_ds.is_multitable():
+            if deployment_ds.is_multitable:
                 key_df = deployment_ds.main_table.data_source[
                     deployment_ds.main_table.key
                 ]
@@ -822,7 +835,7 @@ def _fit_check_params(self, ds, **kwargs):
             )
 
     def _fit_train_model(self, ds, computation_dir, **kwargs):
-        assert not ds.is_multitable(), "Coclustering not available in multitable"
+        assert not ds.is_multitable, "Coclustering not available in multitable"
 
         # Prepare the table files and dictionary for Khiops
         main_table_path, _ = ds.create_table_files_for_khiops(
@@ -1217,7 +1230,7 @@ def predict(self, X):
             kh.get_runner().root_temp_dir = initial_runner_temp_dir
 
         # Transform to numpy.array for in-memory inputs
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             y_pred = y_pred.to_numpy()
 
         return y_pred
@@ -1235,7 +1248,7 @@ def _transform_check_dataset(self, ds):
         # - They are mono-table only
         # - They are deployed with a multitable model whose main table contain
         #   the keys of the input table and the secondary table is the input table
-        if ds.is_multitable():
+        if ds.is_multitable:
             raise ValueError("Coclustering models not available in multi-table mode")
 
         # The "model dictionary domain" in the coclustering case it is just composed
@@ -1251,14 +1264,14 @@ def _transform_check_dataset(self, ds):
             )
 
     def _transform_create_deployment_dataset(self, ds, computation_dir):
-        assert not ds.is_multitable(), "'dataset' is multitable"
+        assert not ds.is_multitable, "'dataset' is multitable"
 
         # Build the multitable deployment dataset
         keys_table_name = f"keys_{ds.main_table.name}"
         deploy_dataset_spec = {}
         deploy_dataset_spec["main_table"] = keys_table_name
         deploy_dataset_spec["tables"] = {}
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             # Extract the keys from the main table
             keys_table_dataframe = pd.DataFrame(
                 {
@@ -1319,7 +1332,7 @@ def _transform_prepare_deployment_model_for_predict(self, _):
 
     def _transform_deployment_post_process(
         self, deployment_ds, output_table_path, drop_key
     ):
-        assert deployment_ds.is_multitable()
+        assert deployment_ds.is_multitable
         return super()._transform_deployment_post_process(
             deployment_ds, output_table_path, drop_key
         )
@@ -1500,7 +1513,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir):
 
         # Set the format parameters depending on the type of dataset
         kwargs["detect_format"] = False
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             kwargs["field_separator"] = "\t"
             kwargs["header_line"] = True
         else:
@@ -1610,12 +1623,12 @@ def _transform_check_dataset(self, ds):
         super()._transform_check_dataset(ds)
 
         # Check the coherence between thi input table and the model
-        if self.is_multitable_model_ and not ds.is_multitable():
+        if self.is_multitable_model_ and not ds.is_multitable:
             raise ValueError(
                 "You are trying to apply on single-table inputs a model which has "
                 "been trained on multi-table data."
             )
-        if not self.is_multitable_model_ and ds.is_multitable():
+        if not self.is_multitable_model_ and ds.is_multitable:
             raise ValueError(
                 "You are trying to apply on multi-table inputs a model which has "
                 "been trained on single-table data."
@@ -1914,10 +1927,14 @@ def __init__(
         self._predicted_target_meta_data_tag = "Prediction"
 
     def _is_real_target_dtype_integer(self):
-        assert self._original_target_dtype is not None, "Original target type not set"
-        return pd.api.types.is_integer_dtype(self._original_target_dtype) or (
-            isinstance(self._original_target_dtype, pd.CategoricalDtype)
-            and pd.api.types.is_integer_dtype(self._original_target_dtype.categories)
+        return self._original_target_dtype is not None and (
+            pd.api.types.is_integer_dtype(self._original_target_dtype)
+            or (
+                isinstance(self._original_target_dtype, pd.CategoricalDtype)
+                and pd.api.types.is_integer_dtype(
+                    self._original_target_dtype.categories
+                )
+            )
         )
 
     def _sorted_prob_variable_names(self):
@@ -1980,7 +1997,7 @@ def _fit_check_dataset(self, ds):
         super()._fit_check_dataset(ds)
 
         # Check that the target is for classification in in_memory_tables
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             current_type_of_target = type_of_target(ds.target_column)
             if current_type_of_target not in ["binary", "multiclass"]:
                 raise ValueError(
                     "for classification. Maybe you passed a floating point target?"
                 )
         # Check if the target has more than 1 class
-        if ds.is_in_memory() and len(np.unique(ds.target_column)) == 1:
+        if ds.is_in_memory and len(np.unique(ds.target_column)) == 1:
             raise ValueError(
                 f"{self.__class__.__name__} can't train when only one class is present."
             )
@@ -2001,10 +2018,10 @@ def _fit_training_post_process(self, ds):
         super()._fit_training_post_process(ds)
 
         # Save the target datatype
-        if ds.is_in_memory():
-            self._original_target_dtype = ds.target_column_dtype
+        if ds.is_in_memory:
+            self._original_target_dtype = ds.target_column.dtype
         else:
-            self._original_target_dtype = np.dtype("object")
+            self._original_target_dtype = None
 
         # Save class values in the order of deployment
         self.classes_ = []
@@ -2012,7 +2029,7 @@ def _fit_training_post_process(self, ds):
             for key in variable.meta_data.keys:
                 if key.startswith("TargetProb"):
                     self.classes_.append(variable.meta_data.get_value(key))
-        if self._is_real_target_dtype_integer():
+        if ds.is_in_memory and self._is_real_target_dtype_integer():
             self.classes_ = [int(class_value) for class_value in self.classes_]
         self.classes_.sort()
         self.classes_ = column_or_1d(self.classes_)
@@ -2165,7 +2182,7 @@ def predict_proba(self, X):
         # For in-memory datasets:
         # - Reorder the columns to that of self.classes_
         # - Transform to np.ndarray
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             assert isinstance(
                 y_probas, (pd.DataFrame, np.ndarray)
             ), "y_probas is not a Pandas DataFrame nor Numpy array"
@@ -2786,7 +2803,7 @@ def transform(self, X):
         finally:
             self._cleanup_computation_dir(computation_dir)
             kh.get_runner().root_temp_dir = initial_runner_temp_dir
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             return X_transformed.to_numpy(copy=False)
         return X_transformed
diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py
index 66497dd7..500312ae 100644
--- a/khiops/utils/dataset.py
+++ b/khiops/utils/dataset.py
@@ -51,6 +51,9 @@ def check_dataset_spec(ds_spec):
     ValueError
         If there are objects of the spec with invalid values.
     """
+    # Check the spec type
+    if not is_dict_like(ds_spec):
+        raise TypeError(type_error_message("ds_spec", ds_spec, Mapping))
 
     # Check the "tables" field
     if "tables" not in ds_spec:
@@ -118,7 +121,6 @@ def _check_table_key(table_name, key):
 
 
 def _check_multitable_spec(ds_spec):
-    assert len(ds_spec) > 1
     # Check the main table
     if "main_table" not in ds_spec:
         raise ValueError(
@@ -137,9 +139,9 @@ def _check_multitable_spec(ds_spec):
     # Check that all tables have non-None keys
     for table_name, (_, table_key) in ds_spec["tables"].items():
         if table_key is None:
-            table_type = "main" if ds_spec["main_table"] == table_name else "secondary"
+            table_kind = "main" if ds_spec["main_table"] == table_name else "secondary"
             raise ValueError(
-                f"key of {table_type} table '{table_name}' is 'None': "
+                f"key of {table_kind} table '{table_name}' is 'None': "
                 "table keys must be specified in multi-table datasets"
             )
@@ -239,17 +241,21 @@ def _check_hierarchical_keys(
     relation_id, parent_table, parent_table_key, child_table, child_table_key
 ):
     """Check that the parent table's key is contained in the child table's key"""
-    table_key_error = False
+    # Perform the check and save the error status
+    error_found = False
     if isinstance(parent_table_key, str) and isinstance(child_table_key, str):
-        table_key_error = child_table_key != parent_table_key
+        error_found = child_table_key != parent_table_key
     elif isinstance(parent_table_key, str) and is_list_like(child_table_key):
-        table_key_error = parent_table_key not in child_table_key
+        error_found = parent_table_key not in child_table_key
     elif is_list_like(parent_table_key) and is_list_like(child_table_key):
-        table_key_error = not set(parent_table_key).issubset(set(child_table_key))
+        error_found = not set(parent_table_key).issubset(child_table_key)
     elif is_list_like(parent_table_key) and isinstance(child_table_key, str):
-        table_key_error = True
+        error_found = (
+            len(parent_table_key) != 1 or child_table_key not in parent_table_key
+        )
 
-    if table_key_error:
+    # Report any error found
+    if error_found:
         if isinstance(child_table_key, str):
             child_table_key_msg = f"[{child_table_key}]"
         else:
@@ -435,8 +441,6 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
         self.categorical_target = categorical_target
         self.target_column = None
         self.target_column_id = None
-        self.target_column_type = None
-        self.target_column_dtype = None  # Only for in_memory datasets
         self.sep = None
         self.header = None
@@ -535,7 +539,7 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
         assert isinstance(
             self.secondary_tables, list
         ), "'secondary_tables' is not a list after init"
-        assert not self.is_multitable() or len(
+        assert not self.is_multitable or len(
             self.secondary_tables
         ), "'secondary_tables' is empty in a multi-table dataset"
         assert (
@@ -699,6 +703,7 @@ def _init_tables_from_mapping(self, X):
 
     def _init_target_column(self, y):
         assert self.main_table is not None
         assert self.secondary_tables is not None
+        # Check y's type
         # For in memory target columns:
         # - column_or_1d checks *and transforms* to a numpy.array if successful
@@ -722,14 +727,6 @@ def _init_target_column(self, y):
                 type_error_message("y", y, "array-like")
                 + f" (X's tables are of type {type_message})"
             )
-        if isinstance(self.main_table, (SparseTable, NumpyTable)) and isinstance(
-            y_checked, str
-        ):
-            raise TypeError(
-                type_error_message("y", y, "array-like")
-                + " (X's tables are of type numpy.ndarray"
-                + " or scipy.sparse.spmatrix)"
-            )
         if isinstance(self.main_table.data_source, str) and not isinstance(
             y_checked, str
         ):
@@ -742,7 +739,6 @@ def _init_target_column(self, y):
         # Case when y is a memory array
         if hasattr(y_checked, "__array__"):
             self.target_column = y_checked
-            self.target_column_dtype = self.target_column.dtype
 
             # Initialize the id of the target column
             if isinstance(y, pd.Series) and y.name is not None:
@@ -778,14 +774,13 @@ def _init_target_column(self, y):
 
         # Force the target column type from the parameters
         if self.categorical_target:
-            self.main_table.khiops_types[self.target_column] = "Categorical"
-            self.target_column_type = "Categorical"
+            self.main_table.khiops_types[self.target_column_id] = "Categorical"
         else:
-            self.main_table.khiops_types[self.target_column] = "Numerical"
-            self.target_column_type = "Numerical"
+            self.main_table.khiops_types[self.target_column_id] = "Numerical"
 
+    @property
     def is_in_memory(self):
-        """Tests whether the dataset is in-memory
+        """bool : ``True`` if the dataset is in-memory
 
         A dataset is in-memory if it is constituted either of only pandas.DataFrame
         tables, numpy.ndarray, or scipy.sparse.spmatrix tables.
         """
         return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable))
 
+    @property
     def table_type(self):
-        """Returns the table type of the dataset tables
+        """type : The table type of this dataset's tables
 
-        Returns
-        -------
-        type
-            The type of the tables in the dataset. Possible values:
-            - `PandasTable`
-            - `NumpyTable`
-            - `SparseTable`
-            - `FileTable`
+        Possible values:
+
+        - `PandasTable`
+        - `NumpyTable`
+        - `SparseTable`
+        - `FileTable`
         """
         return type(self.main_table)
 
+    @property
     def is_multitable(self):
-        """Tests whether the dataset is a multi-table one
-
-        Returns
-        -------
-        bool
-            ``True`` if the dataset is multi-table.
-        """
+        """bool : ``True`` if the dataset is multitable"""
         return self.secondary_tables is not None and len(self.secondary_tables) > 0
 
     def to_spec(self):
@@ -831,7 +820,7 @@ def to_spec(self):
         if self.relations:
             ds_spec["relations"] = []
             ds_spec["relations"].extend(self.relations)
-        if self.table_type() == FileTable:
+        if self.table_type == FileTable:
             ds_spec["format"] = (self.sep, self.header)
 
         return ds_spec
@@ -880,7 +869,7 @@ def create_khiops_dictionary_domain(self):
         dictionary_domain.add_dictionary(main_dictionary)
 
         # For in-memory datasets: Add the target variable if available
-        if self.is_in_memory() and self.target_column is not None:
+        if self.is_in_memory and self.target_column is not None:
             variable = kh.Variable()
             variable.name = get_khiops_variable_name(self.target_column_id)
             if self.categorical_target:
@@ -945,15 +934,18 @@ def create_table_files_for_khiops(self, output_dir, sort=True):
         # - The caller specifies not to do it (sort = False)
         # - The dataset is mono-table and the main table has no key
         sort_main_table = sort and (
-            self.is_multitable() or self.main_table.key is not None
+            self.is_multitable or self.main_table.key is not None
         )
-        if self.is_in_memory():
+
+        # In-memory dataset: Create the table files and add the target column
+        if self.is_in_memory:
             main_table_path = self.main_table.create_table_file_for_khiops(
                 output_dir,
                 sort=sort_main_table,
                 target_column=self.target_column,
                 target_column_id=self.target_column_id,
             )
+        # File dataset: Create the table files (the target column is in the file)
         else:
             main_table_path = self.main_table.create_table_file_for_khiops(
                 output_dir,
@@ -973,6 +965,9 @@ def __repr__(self):
         return str(self.create_khiops_dictionary_domain())
 
+# pylint: enable=invalid-name
+
+
 class DatasetTable(ABC):
     """A generic dataset table"""
@@ -1046,7 +1041,7 @@ def create_khiops_dictionary(self):
         dictionary = kh.Dictionary()
         dictionary.name = self.name
         if self.key is not None:
-            dictionary.key = list(self.key)
+            dictionary.key = self.key
 
         # For each column add a Khiops variable to the dictionary
         for column_id in self.column_ids:
@@ -1065,18 +1060,16 @@ def create_khiops_dictionary(self):
 
 
 class PandasTable(DatasetTable):
-    """Table encapsulating the features dataframe X and the target labels y
-
-    X is of type pandas.DataFrame. y is array-like.
+    """DatasetTable encapsulating a pandas dataframe
 
     Parameters
     ----------
     name : str
         Name for the table.
     dataframe : `pandas.DataFrame`
-        The data frame to be encapsulated.
-    key : list-like of str, optional
-        The names of the columns composing the key
+        The data frame to be encapsulated. It must be non-empty.
+    key : list of str, optional
+        The names of the columns composing the key.
     """
 
     def __init__(self, name, dataframe, key=None):
@@ -1144,7 +1137,7 @@ def create_table_file_for_khiops(
         output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt")
 
         # Write the output dataframe
-        output_dataframe = self._create_dataframe_copy()
+        output_dataframe = self.data_source.copy()
         output_names = {
             column_id: get_khiops_variable_name(column_id)
             for column_id in self.column_ids
@@ -1173,22 +1166,18 @@ def create_table_file_for_khiops(
 
         return output_table_path
 
-    def _create_dataframe_copy(self):
-        """Creates an in memory copy of the dataframe"""
-        return self.data_source.copy()
-
 
 class NumpyTable(DatasetTable):
-    """Table encapsulating (X,y) pair with types (ndarray, ndarray)
+    """DatasetTable encapsulating a NumPy array
 
     Parameters
     ----------
     name : str
         Name for the table.
-    array : :external:term:`array-like` of shape (n_samples, n_features_in)
+    array : `numpy.ndarray` of shape (n_samples, n_features_in)
         The data frame to be encapsulated.
     key : :external:term`array-like` of int, optional
-        The names of the columns composing the key
+        The names of the columns composing the key.
     """
 
     def __init__(self, name, array, key=None):
@@ -1259,10 +1248,7 @@ def create_table_file_for_khiops(
 
 
 class SparseTable(DatasetTable):
-    """Table encapsulating feature matrix X and target array y
-
-    X is of type scipy.sparse.spmatrix.
-    y is array-like.
+    """DatasetTable encapsulating a SciPy sparse matrix
 
     Parameters
     ----------
@@ -1270,8 +1256,8 @@ class SparseTable(DatasetTable):
         Name for the table.
     matrix : `scipy.sparse.spmatrix`
         The sparse matrix to be encapsulated.
-    key : list-like of str, optional
-        The names of the columns composing the key
+    key : list of str, optional
+        The names of the columns composing the key.
     """
 
     def __init__(self, name, matrix, key=None):
@@ -1405,7 +1391,7 @@ def create_table_file_for_khiops(
 
 
 class FileTable(DatasetTable):
-    """A table representing a delimited text file
+    """DatasetTable encapsulating a delimited text data file
 
     Parameters
    ----------
@@ -1413,12 +1399,12 @@ class FileTable(DatasetTable):
         Name for the table.
     path : str
         Path of the file containing the table.
+    key : list-like of str, optional
+        The names of the columns composing the key.
     sep : str, optional
         Field separator character. If not specified it will be inferred from the file.
     header : bool, optional
-        Indicates if the table
-    key : list-like of str, optional
-        The names of the columns composing the key
+        Indicates if the table file has a header line.
     """
 
     def __init__(
@@ -1471,7 +1457,6 @@ def __init__(
             )
 
         # Set the column names and types
-        assert json_domain["dictionaries"][0]["name"] == self.name
         variables = json_domain["dictionaries"][0]["variables"]
         self.column_ids = [var["name"] for var in variables]
         self.khiops_types = {var["name"]: var["type"] for var in variables}
diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py
index d5c4f76d..e8c4d192 100644
--- a/khiops/utils/helpers.py
+++ b/khiops/utils/helpers.py
@@ -45,7 +45,7 @@ def sort_dataset(ds_spec, output_dir=None):
     ds = Dataset(ds_spec)
 
     # Check special arguments in function of the dataset
-    if ds.table_type() == FileTable and output_dir is None:
+    if ds.table_type == FileTable and output_dir is None:
         raise ValueError("'output_dir' must be specified for file based datasets")
 
     # Make a copy of the dataset (note: data sources are just reference)
@@ -136,7 +136,7 @@ def train_test_split_dataset(
     ds = Dataset(ds_spec)
 
     # Check the parameter coherence
-    if not ds.is_in_memory():
+    if not ds.is_in_memory:
         if target_column is not None:
             raise ValueError("'target_column' cannot be used with file path datasets")
         if output_dir is None:
             raise TypeError(type_error_message("output_dir", output_dir, str))
 
     # Perform the split for each type of dataset
-    if ds.is_in_memory():
+    if ds.is_in_memory:
         # Obtain the keys for the other test_train_split function
         sklearn_split_params = {}
         for param in ("train_size", "random_state", "shuffle", "stratify"):
diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py
index 9662e224..be0034c6 100644
--- a/tests/test_dataset_class.py
+++ b/tests/test_dataset_class.py
@@ -171,7 +171,10 @@ def create_multitable_star_data_files(self, main_table_path, secondary_table_pat
         secondary_table.to_csv(secondary_table_path, sep="\t", index=False)
 
     def create_multitable_snowflake_dataframes(self):
+        # Set the random seed for reproducibility
         np.random.seed(31416)
+
+        # Create the main table
         main_table_data = {
             "User_ID": [
                 "60B2Xk_3Fw",
@@ -189,6 +192,7 @@ def create_multitable_snowflake_dataframes(self):
         }
         main_table = pd.DataFrame(main_table_data)
 
+        # Create the secondary tables
         secondary_table_data_1 = {
             "User_ID": np.random.choice(main_table["User_ID"], 20),
             "VAR_1": np.random.choice(["a", "b", "c", "d"], 20),
@@ -197,7 +201,6 @@ def create_multitable_snowflake_dataframes(self):
             "VAR_4": np.round(np.random.rand(20).tolist(), 2),
         }
         secondary_table_1 = pd.DataFrame(secondary_table_data_1)
-
         secondary_table_data_2 = {
             "User_ID": np.random.choice(
                 main_table["User_ID"], len(main_table), replace=False
@@ -210,7 +213,6 @@ def create_multitable_snowflake_dataframes(self):
             "VAR_4": np.round(np.random.rand(len(main_table)).tolist(), 2),
         }
         secondary_table_2 = pd.DataFrame(secondary_table_data_2)
-
         tertiary_table_data = {
             "User_ID": np.random.choice(main_table["User_ID"], 100),
             "VAR_1": np.random.choice(["a", "b", "c", "d"], 100),
             "VAR_3": np.round(np.random.rand(100).tolist(), 2),
         }
         tertiary_table = pd.DataFrame(tertiary_table_data)
-
         quaternary_table_data = {
             "User_ID": np.random.choice(main_table["User_ID"], 50),
             "VAR_1": np.random.choice(["a", "b", "c", "d"], 50),
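
Below is a minimal usage sketch (not part of the patch) of how the Dataset API reads after this change: is_in_memory, is_multitable and table_type become read-only properties instead of methods, and the removed target_column_dtype attribute is replaced by direct access to target_column.dtype. The toy DataFrame and Series are illustrative assumptions only, not data taken from the khiops test suite.

# Usage sketch, assuming the post-patch khiops package is importable
import pandas as pd

from khiops.utils.dataset import Dataset

# Illustrative single-table input; any in-memory DataFrame/Series pair behaves the same
X = pd.DataFrame({"User_ID": ["a", "b", "c"], "VAR_1": [0.1, 0.2, 0.3]})
y = pd.Series(["yes", "no", "yes"], name="class")

ds = Dataset(X, y=y, categorical_target=True)

# State variables are now properties instead of methods
assert ds.is_in_memory          # was ds.is_in_memory()
assert not ds.is_multitable     # was ds.is_multitable()
print(ds.table_type)            # was ds.table_type(); here the PandasTable class
print(ds.target_column.dtype)   # replaces the removed ds.target_column_dtype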