From 42b0c15de78f1abd4226ee448f1f537057f9a164 Mon Sep 17 00:00:00 2001
From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com>
Date: Fri, 2 Aug 2024 19:57:37 +0200
Subject: [PATCH] Simplify Dataset state variables

---
 khiops/sklearn/estimators.py |  93 ++++++++++++++-----------
 khiops/utils/dataset.py      | 129 ++++++++++++++++-------------------
 khiops/utils/helpers.py      |   6 +-
 tests/test_dataset_class.py  |   7 +-
 4 files changed, 119 insertions(+), 116 deletions(-)

diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
index 27892bcb..fb5f4d22 100644
--- a/khiops/sklearn/estimators.py
+++ b/khiops/sklearn/estimators.py
@@ -148,38 +148,51 @@ def _check_categorical_target_type(ds):
     if ds.target_column is None:
         raise ValueError("Target vector is not specified.")
 
-    if ds.is_in_memory() and not (
-        isinstance(ds.target_column_dtype, pd.CategoricalDtype)
-        or pd.api.types.is_string_dtype(ds.target_column_dtype)
-        or pd.api.types.is_integer_dtype(ds.target_column_dtype)
-        or pd.api.types.is_float_dtype(ds.target_column_dtype)
+    if ds.is_in_memory and not (
+        isinstance(ds.target_column.dtype, pd.CategoricalDtype)
+        or pd.api.types.is_string_dtype(ds.target_column.dtype)
+        or pd.api.types.is_integer_dtype(ds.target_column.dtype)
+        or pd.api.types.is_float_dtype(ds.target_column.dtype)
     ):
         raise ValueError(
             f"'y' has invalid type '{ds.target_column_type}'. "
             "Only string, integer, float and categorical types "
             "are accepted for the target."
         )
-    elif not ds.is_in_memory() and ds.target_column_type != "Categorical":
+    elif (
+        not ds.is_in_memory
+        and ds.main_table.khiops_types[ds.target_column_id] != "Categorical"
+    ):
         raise ValueError(
-            f"Target column has invalid type '{ds.target_column_type}'. "
+            "Target column has invalid type "
+            f"'{ds.main_table.khiops_types[ds.target_column_id]}'. "
             "Only Categorical types are accepted for file datasets."
         )
 
 
 def _check_numerical_target_type(ds):
+    # Check that the target column is specified
     if ds.target_column is None:
         raise ValueError("Target vector is not specified.")
-    if ds.is_in_memory():
-        if not pd.api.types.is_numeric_dtype(ds.target_column_dtype):
+
+    # If in-memory: Check that the column is numerical and that the values are finite
+    # The latter is required by sklearn
+    if ds.is_in_memory:
+        if not pd.api.types.is_numeric_dtype(ds.target_column.dtype):
             raise ValueError(
-                f"Unknown label type '{ds.target_column_type}'. "
+                f"Unknown label type '{ds.target_column.dtype}'. "
                 "Expected a numerical type."
             )
         if ds.target_column is not None:
             assert_all_finite(ds.target_column)
-    elif not ds.is_in_memory() and ds.target_column_type != "Numerical":
+    # Otherwise: Check the Khiops type
+    elif (
+        not ds.is_in_memory
+        and ds.main_table.khiops_types[ds.target_column_id] != "Numerical"
+    ):
         raise ValueError(
-            f"Target column has invalid type '{ds.target_column_type}'. "
+            "Target column has invalid type "
+            f"'{ds.main_table.khiops_types[ds.target_column_id]}'. "
             "Only Numerical types are accepted for file datasets."
         )
@@ -384,7 +397,7 @@ def _fit(self, ds, computation_dir, **kwargs):
         ):
             self._fit_training_post_process(ds)
             self.is_fitted_ = True
-            self.is_multitable_model_ = ds.is_multitable()
+            self.is_multitable_model_ = ds.is_multitable
 
     def _fit_check_params(self, ds, **_):
         """Check the model parameters including those data dependent (in kwargs)"""
@@ -395,7 +408,7 @@ def _fit_check_params(self, ds, **_):
         ):
             raise TypeError(type_error_message("key", self.key, str, "list-like"))
 
-        if not ds.is_in_memory() and self.output_dir is None:
+        if not ds.is_in_memory and self.output_dir is None:
             raise ValueError("'output_dir' is not set but dataset is file-based")
 
     def _fit_check_dataset(self, ds):
@@ -529,7 +542,7 @@ def _transform_deploy_model(
         output_data_table_path = fs.get_child_path(output_dir, transformed_file_name)
 
         # Set the format parameters depending on the type of dataset
-        if deployment_ds.is_in_memory():
+        if deployment_ds.is_in_memory:
             field_separator = "\t"
             header_line = True
         else:
@@ -563,7 +576,7 @@ def _transform_deployment_post_process(
         self, deployment_ds, output_table_path, drop_key
     ):
         # Return a dataframe for dataframe based datasets
-        if deployment_ds.is_in_memory():
+        if deployment_ds.is_in_memory:
            # Read the transformed table with the internal table settings
            with io.BytesIO(fs.read(output_table_path)) as output_table_stream:
                output_table_df = read_internal_data_table(output_table_stream)
@@ -572,7 +585,7 @@ def _transform_deployment_post_process(
             # - Reorder the table to the original table order
             # - Because transformed data table file is sorted by key
             # - Drop the key columns if specified
-            if deployment_ds.is_multitable():
+            if deployment_ds.is_multitable:
                 key_df = deployment_ds.main_table.data_source[
                     deployment_ds.main_table.key
                 ]
@@ -822,7 +835,7 @@ def _fit_check_params(self, ds, **kwargs):
             )
 
     def _fit_train_model(self, ds, computation_dir, **kwargs):
-        assert not ds.is_multitable(), "Coclustering not available in multitable"
+        assert not ds.is_multitable, "Coclustering not available in multitable"
 
         # Prepare the table files and dictionary for Khiops
         main_table_path, _ = ds.create_table_files_for_khiops(
@@ -1217,7 +1230,7 @@ def predict(self, X):
             kh.get_runner().root_temp_dir = initial_runner_temp_dir
 
         # Transform to numpy.array for in-memory inputs
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             y_pred = y_pred.to_numpy()
 
         return y_pred
@@ -1235,7 +1248,7 @@ def _transform_check_dataset(self, ds):
         # - They are mono-table only
         # - They are deployed with a multitable model whose main table contain
         #   the keys of the input table and the secondary table is the input table
-        if ds.is_multitable():
+        if ds.is_multitable:
             raise ValueError("Coclustering models not available in multi-table mode")
 
         # The "model dictionary domain" in the coclustering case it is just composed
@@ -1251,14 +1264,14 @@ def _transform_check_dataset(self, ds):
             )
 
     def _transform_create_deployment_dataset(self, ds, computation_dir):
-        assert not ds.is_multitable(), "'dataset' is multitable"
+        assert not ds.is_multitable, "'dataset' is multitable"
 
         # Build the multitable deployment dataset
         keys_table_name = f"keys_{ds.main_table.name}"
         deploy_dataset_spec = {}
         deploy_dataset_spec["main_table"] = keys_table_name
         deploy_dataset_spec["tables"] = {}
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             # Extract the keys from the main table
             keys_table_dataframe = pd.DataFrame(
                 {
@@ -1319,7 +1332,7 @@ def _transform_prepare_deployment_model_for_predict(self, _):
 
     def _transform_deployment_post_process(
         self, deployment_ds, output_table_path, drop_key
     ):
-        assert deployment_ds.is_multitable()
+        assert deployment_ds.is_multitable
         return super()._transform_deployment_post_process(
             deployment_ds, output_table_path, drop_key
         )
@@ -1500,7 +1513,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir):
 
         # Set the format parameters depending on the type of dataset
         kwargs["detect_format"] = False
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             kwargs["field_separator"] = "\t"
             kwargs["header_line"] = True
         else:
@@ -1610,12 +1623,12 @@ def _transform_check_dataset(self, ds):
         super()._transform_check_dataset(ds)
 
         # Check the coherence between thi input table and the model
-        if self.is_multitable_model_ and not ds.is_multitable():
+        if self.is_multitable_model_ and not ds.is_multitable:
             raise ValueError(
                 "You are trying to apply on single-table inputs a model which has "
                 "been trained on multi-table data."
             )
-        if not self.is_multitable_model_ and ds.is_multitable():
+        if not self.is_multitable_model_ and ds.is_multitable:
             raise ValueError(
                 "You are trying to apply on multi-table inputs a model which has "
                 "been trained on single-table data."
@@ -1914,10 +1927,14 @@ def __init__(
         self._predicted_target_meta_data_tag = "Prediction"
 
     def _is_real_target_dtype_integer(self):
-        assert self._original_target_dtype is not None, "Original target type not set"
-        return pd.api.types.is_integer_dtype(self._original_target_dtype) or (
-            isinstance(self._original_target_dtype, pd.CategoricalDtype)
-            and pd.api.types.is_integer_dtype(self._original_target_dtype.categories)
+        return self._original_target_dtype is not None and (
+            pd.api.types.is_integer_dtype(self._original_target_dtype)
+            or (
+                isinstance(self._original_target_dtype, pd.CategoricalDtype)
+                and pd.api.types.is_integer_dtype(
+                    self._original_target_dtype.categories
+                )
+            )
         )
 
     def _sorted_prob_variable_names(self):
@@ -1980,7 +1997,7 @@ def _fit_check_dataset(self, ds):
         super()._fit_check_dataset(ds)
 
         # Check that the target is for classification in in_memory_tables
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             current_type_of_target = type_of_target(ds.target_column)
             if current_type_of_target not in ["binary", "multiclass"]:
                 raise ValueError(
                     "for classification. Maybe you passed a floating point target?"
                 )
         # Check if the target has more than 1 class
-        if ds.is_in_memory() and len(np.unique(ds.target_column)) == 1:
+        if ds.is_in_memory and len(np.unique(ds.target_column)) == 1:
             raise ValueError(
                 f"{self.__class__.__name__} can't train when only one class is present."
             )
@@ -2001,10 +2018,10 @@ def _fit_training_post_process(self, ds):
         super()._fit_training_post_process(ds)
 
         # Save the target datatype
-        if ds.is_in_memory():
-            self._original_target_dtype = ds.target_column_dtype
+        if ds.is_in_memory:
+            self._original_target_dtype = ds.target_column.dtype
         else:
-            self._original_target_dtype = np.dtype("object")
+            self._original_target_dtype = None
 
         # Save class values in the order of deployment
         self.classes_ = []
@@ -2012,7 +2029,7 @@ def _fit_training_post_process(self, ds):
             for key in variable.meta_data.keys:
                 if key.startswith("TargetProb"):
                     self.classes_.append(variable.meta_data.get_value(key))
-        if self._is_real_target_dtype_integer():
+        if ds.is_in_memory and self._is_real_target_dtype_integer():
             self.classes_ = [int(class_value) for class_value in self.classes_]
         self.classes_.sort()
         self.classes_ = column_or_1d(self.classes_)
@@ -2165,7 +2182,7 @@ def predict_proba(self, X):
         # For in-memory datasets:
         # - Reorder the columns to that of self.classes_
         # - Transform to np.ndarray
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             assert isinstance(
                 y_probas, (pd.DataFrame, np.ndarray)
             ), "y_probas is not a Pandas DataFrame nor Numpy array"
@@ -2786,7 +2803,7 @@ def transform(self, X):
         finally:
             self._cleanup_computation_dir(computation_dir)
             kh.get_runner().root_temp_dir = initial_runner_temp_dir
-        if ds.is_in_memory():
+        if ds.is_in_memory:
             return X_transformed.to_numpy(copy=False)
         return X_transformed
diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py
index 66497dd7..500312ae 100644
--- a/khiops/utils/dataset.py
+++ b/khiops/utils/dataset.py
@@ -51,6 +51,9 @@ def check_dataset_spec(ds_spec):
     ValueError
         If there are objects of the spec with invalid values.
     """
+    # Check the spec type
+    if not is_dict_like(ds_spec):
+        raise TypeError(type_error_message("ds_spec", ds_spec, Mapping))
 
     # Check the "tables" field
     if "tables" not in ds_spec:
@@ -118,7 +121,6 @@ def _check_table_key(table_name, key):
 
 
 def _check_multitable_spec(ds_spec):
-    assert len(ds_spec) > 1
     # Check the main table
     if "main_table" not in ds_spec:
         raise ValueError(
@@ -137,9 +139,9 @@ def _check_multitable_spec(ds_spec):
     # Check that all tables have non-None keys
     for table_name, (_, table_key) in ds_spec["tables"].items():
         if table_key is None:
-            table_type = "main" if ds_spec["main_table"] == table_name else "secondary"
+            table_kind = "main" if ds_spec["main_table"] == table_name else "secondary"
             raise ValueError(
-                f"key of {table_type} table '{table_name}' is 'None': "
+                f"key of {table_kind} table '{table_name}' is 'None': "
                 "table keys must be specified in multi-table datasets"
             )
@@ -239,17 +241,21 @@ def _check_hierarchical_keys(
     relation_id, parent_table, parent_table_key, child_table, child_table_key
 ):
     """Check that the parent table's key is contained in the child table's key"""
-    table_key_error = False
+    # Perform the check and save the error status
+    error_found = False
     if isinstance(parent_table_key, str) and isinstance(child_table_key, str):
-        table_key_error = child_table_key != parent_table_key
+        error_found = child_table_key != parent_table_key
     elif isinstance(parent_table_key, str) and is_list_like(child_table_key):
-        table_key_error = parent_table_key not in child_table_key
+        error_found = parent_table_key not in child_table_key
     elif is_list_like(parent_table_key) and is_list_like(child_table_key):
-        table_key_error = not set(parent_table_key).issubset(set(child_table_key))
+        error_found = not set(parent_table_key).issubset(child_table_key)
     elif is_list_like(parent_table_key) and isinstance(child_table_key, str):
-        table_key_error = True
+        error_found = (
+            len(parent_table_key) != 1 or child_table_key not in parent_table_key
+        )
 
-    if table_key_error:
+    # Report any error found
+    if error_found:
         if isinstance(child_table_key, str):
             child_table_key_msg = f"[{child_table_key}]"
         else:
@@ -435,8 +441,6 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
         self.categorical_target = categorical_target
         self.target_column = None
         self.target_column_id = None
-        self.target_column_type = None
-        self.target_column_dtype = None  # Only for in_memory datasets
         self.sep = None
         self.header = None
@@ -535,7 +539,7 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
         assert isinstance(
             self.secondary_tables, list
         ), "'secondary_tables' is not a list after init"
-        assert not self.is_multitable() or len(
+        assert not self.is_multitable or len(
             self.secondary_tables
         ), "'secondary_tables' is empty in a multi-table dataset"
         assert (
@@ -699,6 +703,7 @@ def _init_tables_from_mapping(self, X):
 
     def _init_target_column(self, y):
         assert self.main_table is not None
         assert self.secondary_tables is not None
+        # Check y's type
         # For in memory target columns:
         # - column_or_1d checks *and transforms* to a numpy.array if successful
@@ -722,14 +727,6 @@ def _init_target_column(self, y):
                 type_error_message("y", y, "array-like")
                 + f" (X's tables are of type {type_message})"
             )
-        if isinstance(self.main_table, (SparseTable, NumpyTable)) and isinstance(
-            y_checked, str
-        ):
-            raise TypeError(
-                type_error_message("y", y, "array-like")
-                + " (X's tables are of type numpy.ndarray"
-                + " or scipy.sparse.spmatrix)"
-            )
         if isinstance(self.main_table.data_source, str) and not isinstance(
             y_checked, str
         ):
@@ -742,7 +739,6 @@ def _init_target_column(self, y):
         # Case when y is a memory array
         if hasattr(y_checked, "__array__"):
             self.target_column = y_checked
-            self.target_column_dtype = self.target_column.dtype
 
             # Initialize the id of the target column
             if isinstance(y, pd.Series) and y.name is not None:
@@ -778,14 +774,13 @@ def _init_target_column(self, y):
 
         # Force the target column type from the parameters
         if self.categorical_target:
-            self.main_table.khiops_types[self.target_column] = "Categorical"
-            self.target_column_type = "Categorical"
+            self.main_table.khiops_types[self.target_column_id] = "Categorical"
         else:
-            self.main_table.khiops_types[self.target_column] = "Numerical"
-            self.target_column_type = "Numerical"
+            self.main_table.khiops_types[self.target_column_id] = "Numerical"
 
+    @property
     def is_in_memory(self):
-        """Tests whether the dataset is in-memory
+        """bool : ``True`` if the dataset is in-memory
 
         A dataset is in-memory if it is constituted either of only pandas.DataFrame
         tables, numpy.ndarray, or scipy.sparse.spmatrix tables.
         """
         return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable))
 
+    @property
     def table_type(self):
-        """Returns the table type of the dataset tables
+        """type : The table type of this dataset's tables
 
-        Returns
-        -------
-        type
-            The type of the tables in the dataset. Possible values:
-            - `PandasTable`
-            - `NumpyTable`
-            - `SparseTable`
-            - `FileTable`
+        Possible values:
+
+        - `PandasTable`
+        - `NumpyTable`
+        - `SparseTable`
+        - `FileTable`
         """
         return type(self.main_table)
 
+    @property
     def is_multitable(self):
-        """Tests whether the dataset is a multi-table one
-
-        Returns
-        -------
-        bool
-            ``True`` if the dataset is multi-table.
-        """
+        """bool : ``True`` if the dataset is multitable"""
         return self.secondary_tables is not None and len(self.secondary_tables) > 0
 
     def to_spec(self):
@@ -831,7 +820,7 @@ def to_spec(self):
         if self.relations:
             ds_spec["relations"] = []
             ds_spec["relations"].extend(self.relations)
-        if self.table_type() == FileTable:
+        if self.table_type == FileTable:
             ds_spec["format"] = (self.sep, self.header)
 
         return ds_spec
@@ -880,7 +869,7 @@ def create_khiops_dictionary_domain(self):
         dictionary_domain.add_dictionary(main_dictionary)
 
         # For in-memory datasets: Add the target variable if available
-        if self.is_in_memory() and self.target_column is not None:
+        if self.is_in_memory and self.target_column is not None:
             variable = kh.Variable()
             variable.name = get_khiops_variable_name(self.target_column_id)
             if self.categorical_target:
@@ -945,15 +934,18 @@ def create_table_files_for_khiops(self, output_dir, sort=True):
         # - The caller specifies not to do it (sort = False)
         # - The dataset is mono-table and the main table has no key
         sort_main_table = sort and (
-            self.is_multitable() or self.main_table.key is not None
+            self.is_multitable or self.main_table.key is not None
         )
-        if self.is_in_memory():
+
+        # In-memory dataset: Create the table files and add the target column
+        if self.is_in_memory:
             main_table_path = self.main_table.create_table_file_for_khiops(
                 output_dir,
                 sort=sort_main_table,
                 target_column=self.target_column,
                 target_column_id=self.target_column_id,
             )
+        # File dataset: Create the table files (the target column is in the file)
         else:
             main_table_path = self.main_table.create_table_file_for_khiops(
                 output_dir,
@@ -973,6 +965,9 @@ def __repr__(self):
         return str(self.create_khiops_dictionary_domain())
 
+# pylint: enable=invalid-name
+
+
 class DatasetTable(ABC):
     """A generic dataset table"""
@@ -1046,7 +1041,7 @@ def create_khiops_dictionary(self):
         dictionary = kh.Dictionary()
         dictionary.name = self.name
         if self.key is not None:
-            dictionary.key = list(self.key)
+            dictionary.key = self.key
 
         # For each column add a Khiops variable to the dictionary
         for column_id in self.column_ids:
@@ -1065,18 +1060,16 @@ def create_khiops_dictionary(self):
 
 
 class PandasTable(DatasetTable):
-    """Table encapsulating the features dataframe X and the target labels y
-
-    X is of type pandas.DataFrame. y is array-like.
+    """DatasetTable encapsulating a pandas dataframe
 
     Parameters
     ----------
     name : str
         Name for the table.
     dataframe : `pandas.DataFrame`
-        The data frame to be encapsulated.
-    key : list-like of str, optional
-        The names of the columns composing the key
+        The data frame to be encapsulated. It must be non-empty.
+    key : list of str, optional
+        The names of the columns composing the key.
     """
 
     def __init__(self, name, dataframe, key=None):
@@ -1144,7 +1137,7 @@ def create_table_file_for_khiops(
         output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt")
 
         # Write the output dataframe
-        output_dataframe = self._create_dataframe_copy()
+        output_dataframe = self.data_source.copy()
         output_names = {
             column_id: get_khiops_variable_name(column_id)
             for column_id in self.column_ids
@@ -1173,22 +1166,18 @@ def create_table_file_for_khiops(
 
         return output_table_path
 
-    def _create_dataframe_copy(self):
-        """Creates an in memory copy of the dataframe"""
-        return self.data_source.copy()
-
 
 class NumpyTable(DatasetTable):
-    """Table encapsulating (X,y) pair with types (ndarray, ndarray)
+    """DatasetTable encapsulating a NumPy array
 
     Parameters
     ----------
     name : str
         Name for the table.
-    array : :external:term:`array-like` of shape (n_samples, n_features_in)
+    array : `numpy.ndarray` of shape (n_samples, n_features_in)
         The data frame to be encapsulated.
     key : :external:term`array-like` of int, optional
-        The names of the columns composing the key
+        The names of the columns composing the key.
     """
 
     def __init__(self, name, array, key=None):
@@ -1259,10 +1248,7 @@ def create_table_file_for_khiops(
 
 
 class SparseTable(DatasetTable):
-    """Table encapsulating feature matrix X and target array y
-
-    X is of type scipy.sparse.spmatrix.
-    y is array-like.
+    """DatasetTable encapsulating a SciPy sparse matrix
 
     Parameters
     ----------
@@ -1270,8 +1256,8 @@ class SparseTable(DatasetTable):
         Name for the table.
     matrix : `scipy.sparse.spmatrix`
         The sparse matrix to be encapsulated.
-    key : list-like of str, optional
-        The names of the columns composing the key
+    key : list of str, optional
+        The names of the columns composing the key.
     """
 
     def __init__(self, name, matrix, key=None):
@@ -1405,7 +1391,7 @@ def create_table_file_for_khiops(
 
 
 class FileTable(DatasetTable):
-    """A table representing a delimited text file
+    """DatasetTable encapsulating a delimited text data file
 
     Parameters
    ----------
@@ -1413,12 +1399,12 @@ class FileTable(DatasetTable):
         Name for the table.
     path : str
         Path of the file containing the table.
+    key : list-like of str, optional
+        The names of the columns composing the key.
     sep : str, optional
         Field separator character. If not specified it will be inferred from the file.
     header : bool, optional
-        Indicates if the table
-    key : list-like of str, optional
-        The names of the columns composing the key
+        Indicates if the table file has a header line.
     """
 
     def __init__(
@@ -1471,7 +1457,6 @@ def __init__(
             )
 
         # Set the column names and types
-        assert json_domain["dictionaries"][0]["name"] == self.name
         variables = json_domain["dictionaries"][0]["variables"]
         self.column_ids = [var["name"] for var in variables]
         self.khiops_types = {var["name"]: var["type"] for var in variables}
diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py
index d5c4f76d..e8c4d192 100644
--- a/khiops/utils/helpers.py
+++ b/khiops/utils/helpers.py
@@ -45,7 +45,7 @@ def sort_dataset(ds_spec, output_dir=None):
     ds = Dataset(ds_spec)
 
     # Check special arguments in function of the dataset
-    if ds.table_type() == FileTable and output_dir is None:
+    if ds.table_type == FileTable and output_dir is None:
         raise ValueError("'output_dir' must be specified for file based datasets")
 
     # Make a copy of the dataset (note: data sources are just reference)
@@ -136,7 +136,7 @@ def train_test_split_dataset(
     ds = Dataset(ds_spec)
 
     # Check the parameter coherence
-    if not ds.is_in_memory():
+    if not ds.is_in_memory:
         if target_column is not None:
             raise ValueError("'target_column' cannot be used with file path datasets")
         if output_dir is None:
             raise TypeError(type_error_message("output_dir", output_dir, str))
 
     # Perform the split for each type of dataset
-    if ds.is_in_memory():
+    if ds.is_in_memory:
         # Obtain the keys for the other test_train_split function
         sklearn_split_params = {}
         for param in ("train_size", "random_state", "shuffle", "stratify"):
diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py
index 9662e224..be0034c6 100644
--- a/tests/test_dataset_class.py
+++ b/tests/test_dataset_class.py
@@ -171,7 +171,10 @@ def create_multitable_star_data_files(self, main_table_path, secondary_table_pat
         secondary_table.to_csv(secondary_table_path, sep="\t", index=False)
 
     def create_multitable_snowflake_dataframes(self):
+        # Set the random seed for reproducibility
         np.random.seed(31416)
+
+        # Create the main table
         main_table_data = {
             "User_ID": [
                 "60B2Xk_3Fw",
@@ -189,6 +192,7 @@ def create_multitable_snowflake_dataframes(self):
         }
         main_table = pd.DataFrame(main_table_data)
 
+        # Create the secondary tables
         secondary_table_data_1 = {
             "User_ID": np.random.choice(main_table["User_ID"], 20),
             "VAR_1": np.random.choice(["a", "b", "c", "d"], 20),
@@ -197,7 +201,6 @@ def create_multitable_snowflake_dataframes(self):
             "VAR_4": np.round(np.random.rand(20).tolist(), 2),
         }
         secondary_table_1 = pd.DataFrame(secondary_table_data_1)
-
         secondary_table_data_2 = {
             "User_ID": np.random.choice(
                 main_table["User_ID"], len(main_table), replace=False
@@ -210,7 +213,6 @@ def create_multitable_snowflake_dataframes(self):
             "VAR_4": np.round(np.random.rand(len(main_table)).tolist(), 2),
         }
         secondary_table_2 = pd.DataFrame(secondary_table_data_2)
-
         tertiary_table_data = {
             "User_ID": np.random.choice(main_table["User_ID"], 100),
             "VAR_1": np.random.choice(["a", "b", "c", "d"], 100),
             "VAR_3": np.round(np.random.rand(100).tolist(), 2),
         }
         tertiary_table = pd.DataFrame(tertiary_table_data)
-
         quaternary_table_data = {
             "User_ID": np.random.choice(main_table["User_ID"], 50),
             "VAR_1": np.random.choice(["a", "b", "c", "d"], 50),
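
Below is a minimal usage sketch (not part of the patch) of how the Dataset API reads after this change: is_in_memory, is_multitable and table_type become read-only properties instead of methods, and the removed target_column_dtype attribute is replaced by direct access to target_column.dtype. The toy DataFrame and Series are illustrative assumptions only, not data taken from the khiops test suite.

# Usage sketch, assuming the post-patch khiops package is importable
import pandas as pd

from khiops.utils.dataset import Dataset

# Illustrative single-table input; any in-memory DataFrame/Series pair behaves the same
X = pd.DataFrame({"User_ID": ["a", "b", "c"], "VAR_1": [0.1, 0.2, 0.3]})
y = pd.Series(["yes", "no", "yes"], name="class")

ds = Dataset(X, y=y, categorical_target=True)

# State variables are now properties instead of methods
assert ds.is_in_memory          # was ds.is_in_memory()
assert not ds.is_multitable     # was ds.is_multitable()
print(ds.table_type)            # was ds.table_type(); here the PandasTable class
print(ds.target_column.dtype)   # replaces the removed ds.target_column_dtype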