diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 006ea669..561319a5 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -50,7 +50,12 @@ is_list_like, type_error_message, ) -from khiops.utils.dataset import Dataset, FileTable, read_internal_data_table +from khiops.utils.dataset import ( + Dataset, + FileTable, + get_khiops_variable_name, + read_internal_data_table, +) # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: @@ -123,14 +128,14 @@ def _check_dictionary_compatibility( def _check_categorical_target_type(ds): - if ds.target_column_type is None: + if ds.target_column is None: raise ValueError("Target vector is not specified.") if ds.is_in_memory() and not ( - isinstance(ds.target_column_type, pd.CategoricalDtype) - or pd.api.types.is_string_dtype(ds.target_column_type) - or pd.api.types.is_integer_dtype(ds.target_column_type) - or pd.api.types.is_float_dtype(ds.target_column_type) + isinstance(ds.target_column_dtype, pd.CategoricalDtype) + or pd.api.types.is_string_dtype(ds.target_column_dtype) + or pd.api.types.is_integer_dtype(ds.target_column_dtype) + or pd.api.types.is_float_dtype(ds.target_column_dtype) ): raise ValueError( f"'y' has invalid type '{ds.target_column_type}'. " @@ -145,16 +150,16 @@ def _check_categorical_target_type(ds): def _check_numerical_target_type(ds): - if ds.target_column_type is None: + if ds.target_column is None: raise ValueError("Target vector is not specified.") if ds.is_in_memory(): - if not pd.api.types.is_numeric_dtype(ds.target_column_type): + if not pd.api.types.is_numeric_dtype(ds.target_column_dtype): raise ValueError( f"Unknown label type '{ds.target_column_type}'. " "Expected a numerical type." ) - if ds.main_table.target_column is not None: - assert_all_finite(ds.main_table.target_column) + if ds.target_column is not None: + assert_all_finite(ds.target_column) elif not ds.is_in_memory() and ds.target_column_type != "Numerical": raise ValueError( f"Target column has invalid type '{ds.target_column_type}'. " @@ -335,12 +340,12 @@ def fit(self, X, y=None, **kwargs): return self - def _fit(self, dataset, computation_dir, **kwargs): + def _fit(self, ds, computation_dir, **kwargs): """Template pattern of a fit method Parameters ---------- - dataset : `Dataset` + ds : `Dataset` The learning dataset. computation_dir : str Path or URI where the Khiops computation results will be stored. 
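# The two target checks above rely on the attribute split introduced in
# khiops/utils/dataset.py further down: for in-memory datasets,
# `ds.target_column_dtype` carries the numpy/pandas dtype of `y`, and that is
# what these checks inspect; file-based datasets have no dtype, so the checks
# fall back to `ds.target_column_type` ("Categorical" or "Numerical").
# A minimal sketch of that contract, assuming a toy dataframe and target
# (`X` and `y` below are illustrative, not part of the patch):
#
#     import pandas as pd
#     from khiops.utils.dataset import Dataset
#
#     X = pd.DataFrame({"f1": [1.0, 2.0, 3.0]})
#     y = pd.Series([0, 1, 1], name="class")
#     ds = Dataset(X, y=y)
#     ds.target_column_dtype  # dtype('int64'), read by the in-memory checks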
@@ -348,25 +353,25 @@ def _fit(self, dataset, computation_dir, **kwargs): The called methods are reimplemented in concrete sub-classes """ # Check model parameters - self._fit_check_params(dataset, **kwargs) + self._fit_check_params(ds, **kwargs) # Check the dataset - self._fit_check_dataset(dataset) + self._fit_check_dataset(ds) # Train the model - self._fit_train_model(dataset, computation_dir, **kwargs) - self.n_features_in_ = dataset.main_table.n_features() + self._fit_train_model(ds, computation_dir, **kwargs) + self.n_features_in_ = ds.main_table.n_features() # If the main attributes are of the proper type finish the fitting # Otherwise it means there was an abort (early return) of the previous steps if isinstance(self.model_, kh.DictionaryDomain) and isinstance( self.model_report_, kh.KhiopsJSONObject ): - self._fit_training_post_process(dataset) + self._fit_training_post_process(ds) self.is_fitted_ = True - self.is_multitable_model_ = dataset.is_multitable() + self.is_multitable_model_ = ds.is_multitable() - def _fit_check_params(self, dataset, **_): + def _fit_check_params(self, ds, **_): """Check the model parameters including those data dependent (in kwargs)""" if ( self.key is not None @@ -375,7 +380,7 @@ def _fit_check_params(self, dataset, **_): ): raise TypeError(type_error_message("key", self.key, str, "list-like")) - if not dataset.is_in_memory() and self.output_dir is None: + if not ds.is_in_memory() and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") def _fit_check_dataset(self, ds): @@ -1456,7 +1461,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): ds.create_khiops_dictionary_domain(), ds.main_table.name, main_table_path, - ds.main_table.get_khiops_variable_name(ds.main_table.target_column_id), + get_khiops_variable_name(ds.target_column_id), output_dir, ] @@ -1499,9 +1504,7 @@ def _fit_training_post_process(self, ds): super()._fit_training_post_process(ds) # Set the target variable name - self.model_target_variable_name_ = ds.main_table.get_khiops_variable_name( - ds.main_table.target_column_id - ) + self.model_target_variable_name_ = get_khiops_variable_name(ds.target_column_id) # Verify it has at least one dictionary and a root dictionary in multi-table if len(self.model_.dictionaries) == 1: @@ -1778,10 +1781,10 @@ def __init__( self._predicted_target_meta_data_tag = "Prediction" def _is_real_target_dtype_integer(self): - assert self._original_target_type is not None, "Original target type not set" - return pd.api.types.is_integer_dtype(self._original_target_type) or ( - isinstance(self._original_target_type, pd.CategoricalDtype) - and pd.api.types.is_integer_dtype(self._original_target_type.categories) + assert self._original_target_dtype is not None, "Original target type not set" + return pd.api.types.is_integer_dtype(self._original_target_dtype) or ( + isinstance(self._original_target_dtype, pd.CategoricalDtype) + and pd.api.types.is_integer_dtype(self._original_target_dtype.categories) ) def _sorted_prob_variable_names(self): @@ -1843,14 +1846,14 @@ def _fit_check_dataset(self, ds): # Check that the target is for classification in in_memory_tables if ds.is_in_memory(): - current_type_of_target = type_of_target(ds.main_table.target_column) + current_type_of_target = type_of_target(ds.target_column) if current_type_of_target not in ["binary", "multiclass"]: raise ValueError( f"Unknown label type: '{current_type_of_target}' " "for classification. Maybe you passed a floating point target?" 
) # Check if the target has more than 1 class - if ds.is_in_memory() and len(np.unique(ds.main_table.target_column)) == 1: + if ds.is_in_memory() and len(np.unique(ds.target_column)) == 1: raise ValueError( f"{self.__class__.__name__} can't train when only one class is present." ) @@ -1863,7 +1866,10 @@ def _fit_training_post_process(self, ds): super()._fit_training_post_process(ds) # Save the target datatype - self._original_target_type = ds.target_column_type + if ds.is_in_memory(): + self._original_target_dtype = ds.target_column_dtype + else: + self._original_target_dtype = np.dtype("object") # Save class values in the order of deployment self.classes_ = [] @@ -1929,21 +1935,21 @@ def predict(self, X): y_pred = y_pred.to_numpy(copy=False).ravel() # If integer and string just transform - if pd.api.types.is_integer_dtype(self._original_target_type): - y_pred = y_pred.astype(self._original_target_type) - elif pd.api.types.is_string_dtype(self._original_target_type): + if pd.api.types.is_integer_dtype(self._original_target_dtype): + y_pred = y_pred.astype(self._original_target_dtype) + elif pd.api.types.is_string_dtype(self._original_target_dtype): y_pred = y_pred.astype(str, copy=False) # If category first coerce the type to the categories' type else: - assert pd.api.types.is_categorical_dtype(self._original_target_type), ( + assert pd.api.types.is_categorical_dtype(self._original_target_dtype), ( "_original_target_dtype is not categorical" - f", it is '{self._original_target_type}'" + f", it is '{self._original_target_dtype}'" ) if pd.api.types.is_integer_dtype( - self._original_target_type.categories.dtype + self._original_target_dtype.categories.dtype ): y_pred = y_pred.astype( - self._original_target_type.categories.dtype, copy=False + self._original_target_dtype.categories.dtype, copy=False ) else: y_pred = y_pred.astype(str, copy=False) diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index 1e022244..6a1f4205 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -6,7 +6,6 @@ ###################################################################################### """Classes for handling diverse data tables""" import csv -import functools import io import json import os @@ -81,6 +80,10 @@ def check_table_entry(table_name, table_spec): str, ) ) + check_table_key(table_name, key) + + +def check_table_key(table_name, key): if key is not None and not is_list_like(key) and not isinstance(key, str): raise TypeError( type_error_message(f"'{table_name}' table's key", key, str, Sequence) @@ -299,6 +302,16 @@ def get_khiops_type(numpy_type): return khiops_type +def get_khiops_variable_name(column_id): + """Return the khiops variable name associated to a column id""" + if isinstance(column_id, str): + variable_name = column_id + else: + assert isinstance(column_id, np.int64) + variable_name = f"Var{column_id}" + return variable_name + + def read_internal_data_table(file_path_or_stream): """Reads into a DataFrame a data table file with the internal format settings @@ -390,27 +403,27 @@ def __init__(self, X, y=None, categorical_target=True, key=None): self.main_table = None self.secondary_tables = None self.relations = None + self.categorical_target = categorical_target + self.target_column = None + self.target_column_id = None + self.target_column_type = None + self.target_column_dtype = None # Only for in_memory datasets self.sep = None self.header = None # Initialization from different types of input "X" # A single pandas dataframe if isinstance(X, pd.DataFrame): - 
self._init_tables_from_dataframe(
-                X, y, categorical_target=categorical_target
-            )
-        # A single numpy array (or compatible object)
-        elif hasattr(X, "__array__"):
-            self._init_tables_from_numpy_array(
-                X,
-                y,
-                categorical_target=categorical_target,
-            )
+            self.main_table = PandasTable("main_table", X)
+            self.secondary_tables = []
         # A sparse matrix
         elif isinstance(X, spmatrix):
-            self._init_tables_from_sparse_matrix(
-                X, y, categorical_target=categorical_target
-            )
+            self.main_table = SparseTable("main_table", X)
+            self.secondary_tables = []
+        # A single numpy array (or compatible object)
+        elif hasattr(X, "__array__"):
+            self.main_table = NumpyTable("main_table", X)
+            self.secondary_tables = []
         # A tuple spec
         elif isinstance(X, tuple):
             warnings.warn(
@@ -422,8 +435,17 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
                 ),
                 stacklevel=3,
             )
-            self._init_tables_from_tuple(X, y, categorical_target=categorical_target)
-        # A sequence
+            # Check the input tuple
+            self._check_input_tuple(X)
+
+            # Obtain path and separator
+            path, sep = X
+
+            # Initialization
+            self.main_table = FileTable("main_table", path=path, sep=sep)
+            self.secondary_tables = []
+
+        # A dataset sequence spec
         # We try first for compatible python arrays then the deprecated sequences spec
         elif is_list_like(X):
             # Try to transform to a numerical array with sklearn's check_array
@@ -432,9 +454,8 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
             # this branch's code
             try:
                 X_checked = check_array(X, ensure_2d=True, force_all_finite=False)
-                self._init_tables_from_numpy_array(
-                    X_checked, y, categorical_target=categorical_target
-                )
+                self.main_table = NumpyTable("main_table", X_checked)
+                self.secondary_tables = []
             except ValueError:
                 warnings.warn(
                     deprecation_message(
@@ -445,16 +466,21 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
                     ),
                     stacklevel=3,
                 )
-                self._init_tables_from_sequence(X, y, key=key)
-        # A dict specification
+                self._init_tables_from_sequence(X, key=key)
+        # A dataset dict spec
         elif is_dict_like(X):
-            self._init_tables_from_mapping(X, y, categorical_target=categorical_target)
+            self._init_tables_from_mapping(X)
         # Fail if X is not recognized
         else:
             raise TypeError(
                 type_error_message("X", X, "array-like", tuple, Sequence, Mapping)
             )
 
+        # Initialization of the target column if any
+        if y is not None:
+            self._init_target_column(y)
+
+        # Post-conditions
         assert self.main_table is not None, "'main_table' is 'None' after init"
         assert isinstance(
             self.secondary_tables, list
@@ -462,116 +488,48 @@ def __init__(self, X, y=None, categorical_target=True, key=None):
         assert not self.is_multitable() or len(
             self.secondary_tables
         ), "'secondary_tables' is empty in a multi-table dataset"
+        assert (
+            y is None or self.target_column is not None
+        ), "'target_column' is None but y wasn't"
 
-    def _init_tables_from_dataframe(self, X, y=None, categorical_target=True):
-        """Initializes the dataset from a 'X' of type pandas.DataFrame"""
-        assert isinstance(X, pd.DataFrame), "'X' must be a pandas.DataFrame"
-        if y is not None and not hasattr(y, "__array__"):
-            raise TypeError(type_error_message("y", y, "array-like"))
-        self.main_table = PandasTable(
-            "main_table", X, target_column=y, categorical_target=categorical_target
-        )
-        self.secondary_tables = []
-
-    def _init_tables_from_sparse_matrix(self, X, y=None, categorical_target=True):
-        """Initializes the dataset from a 'X' of type scipy.sparse.spmatrix"""
-        assert isinstance(X, spmatrix), "'X' must be a scipy.sparse.spmatrix"
-        if y 
is not None and not hasattr(y, "__array__"): - raise TypeError(type_error_message("y", y, "array-like")) - - self.main_table = SparseTable( - "main_table", X, target_column=y, categorical_target=categorical_target - ) - self.secondary_tables = [] - - def _init_tables_from_numpy_array(self, X, y=None, categorical_target=True): - assert hasattr( - X, "__array__" - ), "'X' must be a numpy.ndarray or implement __array__" - - if y is not None: - y_checked = column_or_1d(y, warn=True) - else: - y_checked = None - self.main_table = NumpyTable( - "main_table", - X, - target_column=y_checked, - categorical_target=categorical_target, - ) - self.secondary_tables = [] - - def _init_tables_from_tuple(self, X, y=None, categorical_target=True): - """Initializes the spec from a 'X' of type tuple""" - assert isinstance(X, tuple), "'X' must be a tuple" - - # Check the input tuple - self._check_input_tuple(X, y) - - # Obtain path and separator - path, sep = X - - # Initialization - self.main_table = FileTable( - "main_table", - categorical_target=categorical_target, - target_column_id=y, - path=path, - sep=sep, - ) - self.secondary_tables = [] - - def _check_input_tuple(self, X, y=None): + def _check_input_tuple(self, X): if len(X) != 2: raise ValueError(f"'X' tuple input must have length 2 not {len(X)}") if not isinstance(X[0], str): raise TypeError(type_error_message("X[0]", X[0], str)) if not isinstance(X[1], str): raise TypeError(type_error_message("X[1]", X[1], str)) - if y is not None and not isinstance(y, str): - raise TypeError(type_error_message("y", y, str)) - def _init_tables_from_sequence(self, X, y=None, categorical_target=True, key=None): + def _init_tables_from_sequence(self, X, key=None): """Initializes the spec from a list-like 'X'""" assert is_list_like(X), "'X' must be a list-like" # Check the input sequence - self._check_input_sequence(X, y, key=key) + self._check_input_sequence(X, key=key) # Initialize the tables if isinstance(X[0], pd.DataFrame): - self.main_table = PandasTable( - "main_table", - X[0], - target_column=y, - categorical_target=categorical_target, - key=key, - ) + self.main_table = PandasTable("main_table", X[0], key=key) self.secondary_tables = [] for index, dataframe in enumerate(X[1:], start=1): self.secondary_tables.append( PandasTable(f"secondary_table_{index:02d}", dataframe, key=key) ) else: - self.main_table = FileTable( - "main_table", - X[0], - target_column_id=y, - categorical_target=categorical_target, - key=key, - ) + self.main_table = FileTable("main_table", X[0], key=key) self.secondary_tables = [] for index, table_path in enumerate(X[1:], start=1): self.secondary_tables.append( FileTable(f"secondary_table_{index:02d}", table_path, key=key) ) + # Create a list of relations main_table_name = self.main_table.name self.relations = [ (main_table_name, table.name, False) for table in self.secondary_tables ] - def _check_input_sequence(self, X, y=None, key=None): + def _check_input_sequence(self, X, key=None): # Check the first table if len(X) == 0: raise ValueError("'X' must be a non-empty sequence") @@ -583,35 +541,19 @@ def _check_input_sequence(self, X, y=None, key=None): for i, secondary_X in enumerate(X[1:], start=1): if not isinstance(secondary_X, main_table_type): raise TypeError( - type_error_message(f"X[{i}]", X[i], main_table_type) + type_error_message(f"Table at index {i}", X[i], main_table_type) + " as the first table in X" ) - # Check the type of y - if y is not None: - if isinstance(X[0], str) and not isinstance(y, str): - raise 
TypeError(type_error_message("y", y, str)) - elif isinstance(X[0], pd.DataFrame) and not isinstance(y, pd.Series): - raise TypeError(type_error_message("y", y, pd.Series)) - - # Check the type of key - if not is_list_like(key) and not isinstance(key, str): - raise TypeError(type_error_message("key", key, "list-like", str)) - if is_list_like(key): - for column_index, column_name in enumerate(key): - if not isinstance(column_name, str): - raise TypeError( - type_error_message( - f"key[{column_index}]", key[column_index], str - ) - ) + # Check the key for the main_table (it is the same for the others) + check_table_key("main_table", key) - def _init_tables_from_mapping(self, X, y=None, categorical_target=True): + def _init_tables_from_mapping(self, X): """Initializes the table spec from a dict-like 'X'""" assert is_dict_like(X), "'X' must be dict-like" # Check the input mapping - self._check_input_mapping(X, y) + check_dataset_spec(X) # Initialize tables objects if len(X["tables"]) == 1: @@ -636,8 +578,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): self.main_table = FileTable( main_table_name, main_table_source, - target_column_id=y, - categorical_target=categorical_target, key=main_table_key, sep=self.sep, header=self.header, @@ -662,8 +602,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): main_table_name, main_table_source, key=main_table_key, - target_column=y, - categorical_target=categorical_target, ) self.secondary_tables = [] for table_name, (table_source, table_key) in X["tables"].items(): @@ -677,8 +615,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): main_table_name, main_table_source, key=main_table_key, - target_column=y, - categorical_target=categorical_target, ) self.secondary_tables = [] # Initialize a numpyarray dataset (monotable) @@ -686,8 +622,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): self.main_table = NumpyTable( main_table_name, main_table_source, - target_column=y, - categorical_target=categorical_target, ) if len(X["tables"]) > 1: raise ValueError( @@ -712,40 +646,86 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): ) self.relations = relations - def _check_input_mapping(self, X, y=None): - # Check the dataset spec for X - check_dataset_spec(X) + def _init_target_column(self, y): + assert self.main_table is not None + assert self.secondary_tables is not None + # Check y's type + # For in memory target columns: + # - column_or_1d checks *and transforms* to a numpy.array if successful + # - warn=True in column_or_1d is necessary to pass sklearn checks + if isinstance(y, str): + y_checked = y + else: + y_checked = column_or_1d(y, warn=True) - # Check the target coherence with X's tables - if y is not None: - if len(X["tables"]) == 1: - main_table_source, _ = list(X["tables"].values())[0] + # Check the target type coherence with those of X's tables + if isinstance(self.main_table, PandasTable) and isinstance(y_checked, str): + raise TypeError( + type_error_message("y", y, "array-like") + + " (X's tables are of type pandas.DataFrame)" + ) + if isinstance(self.main_table, (SparseTable, NumpyTable)) and isinstance( + y_checked, str + ): + raise TypeError( + type_error_message("y", y, "array-like") + + " (X's tables are of type numpy.ndarray" + + " or scipy.sparse.spmatrix)" + ) + if isinstance(self.main_table.data_source, str) and not isinstance( + y_checked, str + ): + raise TypeError( + type_error_message("y", y, str) + + " 
(X's tables are of type str [file paths])" + ) + + # Initialize the members related to the target + # Case when y is a memory array + if hasattr(y_checked, "__array__"): + self.target_column = y_checked + self.target_column_dtype = self.target_column.dtype + + # Initialize the id of the target column + if isinstance(y, pd.Series) and y.name is not None: + self.target_column_id = y.name + elif isinstance(y, pd.DataFrame): + self.target_column_id = y.columns[0] else: - main_table_source, _ = X["tables"][X["main_table"]] - if ( - isinstance(main_table_source, pd.DataFrame) - and not isinstance(y, pd.Series) - and not isinstance(y, pd.DataFrame) - ): - raise TypeError( - type_error_message("y", y, pd.Series, pd.DataFrame) - + " (X's tables are of type pandas.DataFrame)" - ) - if ( - isinstance(main_table_source, spmatrix) - or hasattr(main_table_source, "__array__") - ) and not hasattr(y, "__array__"): - raise TypeError( - type_error_message("y", y, "array-like") - + " (X's tables are of type numpy.ndarray" - + " or scipy.sparse.spmatrix)" + if pd.api.types.is_integer_dtype(self.main_table.column_ids): + self.target_column_id = self.main_table.column_ids[-1] + 1 + else: + assert pd.api.types.is_string_dtype(self.main_table.column_ids) + self.target_column_id = "UnknownTargetColumn" + + # Fail if there is a column in the main_table with the target column's name + if self.target_column_id in self.main_table.column_ids: + raise ValueError( + f"Target column name '{self.target_column_id}' " + f"is already present in the main table. " + f"Column names: {list(self.main_table.column_ids)}" ) - if isinstance(main_table_source, str) and not isinstance(y, str): - raise TypeError( - type_error_message("y", y, str) - + " (X's tables are of type str [file paths])" + # Case when y is column id: Set both the column and the id to it + else: + assert isinstance(y, str), type_error_message("y", y, str) + self.target_column = y + self.target_column_id = y + + # Check the target column exists in the main table + if self.target_column_id not in self.main_table.column_ids: + raise ValueError( + f"Target column '{self.target_column}' " + f"not present in columns '{self.main_table.column_ids}'" ) + # Force the target column type from the parameters + if self.categorical_target: + self.main_table.khiops_types[self.target_column] = "Categorical" + self.target_column_type = "Categorical" + else: + self.main_table.khiops_types[self.target_column] = "Numerical" + self.target_column_type = "Numerical" + def is_in_memory(self): """Tests whether the dataset is in memory @@ -818,15 +798,25 @@ def create_khiops_dictionary_domain(self): # Create root dictionary and add it to the domain dictionary_domain = kh.DictionaryDomain() - root_dictionary = self.main_table.create_khiops_dictionary() - dictionary_domain.add_dictionary(root_dictionary) + main_dictionary = self.main_table.create_khiops_dictionary() + dictionary_domain.add_dictionary(main_dictionary) + + # For in-memory datasets: Add the target variable if available + if self.is_in_memory() and self.target_column is not None: + variable = kh.Variable() + variable.name = get_khiops_variable_name(self.target_column_id) + if self.categorical_target: + variable.type = "Categorical" + else: + variable.type = "Numerical" + main_dictionary.add_variable(variable) # Create the dictionaries for each secondary table and the table variables in # root dictionary that point to each secondary table # This is performed using a breadth-first-search over the graph of relations # Note: In 
general 'name' and 'object_type' fields of Variable can be different
         if self.secondary_tables:
-            root_dictionary.root = True
+            main_dictionary.root = True
             table_names = [table.name for table in self.secondary_tables]
             tables_to_visit = [self.main_table.name]
             while tables_to_visit:
@@ -851,17 +841,18 @@
                     table_variable.name = table.name
                     table_variable.object_type = table.name
                     parent_table_dictionary.add_variable(table_variable)
+
         return dictionary_domain
 
-    def create_table_files_for_khiops(self, target_dir, sort=True):
+    def create_table_files_for_khiops(self, out_dir, sort=True):
         """Prepares the tables of the dataset to be used by Khiops
 
         If this is a multi-table dataset it will create sorted copies of the tables.
 
         Parameters
         ----------
-        target_dir : str
-            The directory where the sorted tables will be created
+        out_dir : str
+            The directory where the sorted tables will be created.
 
         Returns
         -------
@@ -878,22 +869,27 @@
         sort_main_table = sort and (
             self.is_multitable() or self.main_table.key is not None
         )
-        main_table_path = self.main_table.create_table_file_for_khiops(
-            target_dir, sort=sort_main_table
-        )
+        if self.is_in_memory():
+            main_table_path = self.main_table.create_table_file_for_khiops(
+                out_dir,
+                sort=sort_main_table,
+                target_column=self.target_column,
+                target_column_id=self.target_column_id,
+            )
+        else:
+            main_table_path = self.main_table.create_table_file_for_khiops(
+                out_dir,
+                sort=sort_main_table,
+            )
 
         # Create a copy of each secondary table
         secondary_table_paths = {}
         for table in self.secondary_tables:
             secondary_table_paths[table.name] = table.create_table_file_for_khiops(
-                target_dir, sort=sort
+                out_dir, sort=sort
             )
-        return main_table_path, secondary_table_paths
 
-    @property
-    def target_column_type(self):
-        """The target column's type"""
-        return self.main_table.target_column_type
+        return main_table_path, secondary_table_paths
 
     def __repr__(self):
         return str(self.create_khiops_dictionary_domain())
@@ -902,7 +898,7 @@
 class DatasetTable(ABC):
     """A generic dataset table"""
 
-    def __init__(self, name, categorical_target=True, key=None):
+    def __init__(self, name, key=None):
         # Check input
         if not isinstance(name, str):
             raise TypeError(type_error_message("name", name, str))
@@ -924,12 +920,10 @@ def __init__(self, name, categorical_target=True, key=None):
         # Initialization (must be completed by concrete sub-classes)
         self.name = name
         self.data_source = None
-        self.categorical_target = categorical_target
         if is_list_like(key) or key is None:
             self.key = key
         else:
             self.key = [key]
-        self.target_column_id = None
         self.column_ids = None
         self.khiops_types = None
         self.n_samples = None
@@ -977,43 +971,25 @@ def create_khiops_dictionary(self):
             dictionary.key = list(self.key)
 
         # For each column add a Khiops variable to the dictionary
-        for column_id in self._get_all_column_ids():
+        for column_id in self.column_ids:
             variable = kh.Variable()
-
-            # Set the variable name for string and integer column indexes
-            if isinstance(column_id, str):
-                variable.name = str(column_id)
-            else:
-                assert isinstance(column_id, (np.int64, int))
-                variable.name = f"Var{column_id}"
+            variable.name = get_khiops_variable_name(column_id)
 
             # Set the type of the column/variable
            # Case of a column in the key : Set to categorical
             if self.key is not None and column_id in self.key:
                 variable.type = "Categorical"
-            # Case of the target column: Set to specified type
-            elif column_id == self.target_column_id:
-                assert self.target_column_id is not None
-                if self.categorical_target:
-                    variable.type = "Categorical"
-                else:
-                    variable.type = "Numerical"
             # The rest of columns: Obtain the type from dtypes
             else:
                 variable.type = self.khiops_types[column_id]
             dictionary.add_variable(variable)
         return dictionary
 
-    @abstractmethod
-    def _get_all_column_ids(self):
-        """Returns the column ids including the target"""
-
 
 class PandasTable(DatasetTable):
-    """Table encapsulating the features dataframe X and the target labels y
+    """Table encapsulating a features dataframe X
 
-    X is of type pandas.DataFrame.
-    y is of type pandas.Series or pandas.DataFrame.
+    X is of type pandas.DataFrame.
 
     Parameters
     ----------
@@ -1023,45 +999,17 @@ class PandasTable(DatasetTable):
         The data frame to be encapsulated.
     key : list-like of str, optional
         The names of the columns composing the key
-    target_column : :external:term:`array-like`, optional
-        The array containing the target column.
-    categorical_target : bool, default ``True``.
-        ``True`` if the target column is categorical.
     """
 
-    def __init__(
-        self, name, dataframe, key=None, target_column=None, categorical_target=True
-    ):
+    def __init__(self, name, dataframe, key=None):
         # Call the parent method
-        super().__init__(name, categorical_target=categorical_target, key=key)
+        super().__init__(name, key=key)
 
         # Check inputs specific to this sub-class
         if not isinstance(dataframe, pd.DataFrame):
             raise TypeError(type_error_message("dataframe", dataframe, pd.DataFrame))
         if dataframe.shape[0] == 0:
             raise ValueError("'dataframe' is empty")
-        if target_column is not None:
-            if not hasattr(target_column, "__array__"):
-                raise TypeError(
-                    type_error_message("target_column", target_column, "array-like")
-                )
-            if isinstance(target_column, pd.Series):
-                if (
-                    target_column.name is not None
-                    and target_column.name in dataframe.columns
-                ):
-                    raise ValueError(
-                        f"Target series name '{target_column.name}' "
-                        f"is already present in dataframe : {list(dataframe.columns)}"
-                    )
-            elif isinstance(target_column, pd.DataFrame):
-                number_of_target_columns = len(target_column.columns)
-                if number_of_target_columns != 1:
-                    raise ValueError(
-                        "Target dataframe should contain exactly one column. "
-                        f"It contains {number_of_target_columns}."
-                    )
-                target_column = target_column.iloc[:, 0]
 
         # Initialize the attributes
         self.data_source = dataframe
@@ -1091,21 +1039,6 @@ def __init__(
             for column_id in self.column_ids
         }
 
-        # Initialize target column (if any)
-        self.target_column = target_column
-        if self.target_column is not None:
-            if (
-                isinstance(self.target_column, pd.Series)
-                and self.target_column.name is not None
-            ):
-                self.target_column_id = target_column.name
-            else:
-                if pd.api.types.is_integer_dtype(self.column_ids):
-                    self.target_column_id = self.column_ids[-1] + 1
-                else:
-                    assert pd.api.types.is_string_dtype(self.column_ids)
-                    self.target_column_id = "UnknownTargetColumn"
-
         # Check key integrity
         self.check_key()
@@ -1118,35 +1051,31 @@ def __repr__(self):
-            f"dtypes={dtypes_str}; target={self.target_column_id}>"
+            f"dtypes={dtypes_str}>"
         )
 
-    def _get_all_column_ids(self):
-        if self.target_column is not None:
-            all_column_ids = list(self.column_ids) + [self.target_column_id]
-        else:
-            all_column_ids = list(self.column_ids)
-        return all_column_ids
-
-    def get_khiops_variable_name(self, column_id):
-        """Return the khiops variable name associated to a column id"""
-        assert column_id == self.target_column_id or column_id in self.column_ids
-        if isinstance(column_id, str):
-            variable_name = column_id
-        else:
-            assert isinstance(column_id, np.int64)
-            variable_name = f"Var{column_id}"
-        return variable_name
-
-    def create_table_file_for_khiops(self, output_dir, sort=True):
+    def create_table_file_for_khiops(
+        self, output_dir, sort=True, target_column=None, target_column_id=None
+    ):
         assert not sort or self.key is not None, "Cannot sort table without a key"
         assert not sort or is_list_like(
             self.key
         ), "Cannot sort table with a key that is not list-like"
         assert not sort or len(self.key) > 0, "Cannot sort table with an empty key"
+        assert target_column is not None or target_column_id is None
+        assert target_column_id is not None or target_column is None
 
         # Create the output table resource object
         output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt")
 
         # Write the output dataframe
         output_dataframe = self._create_dataframe_copy()
+        output_names = {
+            column_id: get_khiops_variable_name(column_id)
+            for column_id in self.column_ids
+        }
+        output_dataframe.rename(columns=output_names, inplace=True)
+        if target_column is not None:
+            output_dataframe[get_khiops_variable_name(target_column_id)] = (
+                target_column.copy()
+            )
 
         # Sort by key if requested (as string)
         if sort:
@@ -1168,42 +1097,7 @@ def create_table_file_for_khiops(self, output_dir, sort=True):
         return output_table_path
 
     def _create_dataframe_copy(self):
-        """Creates an in memory copy of the dataframe with the target column"""
-        # Create a copy of the dataframe and add a copy of the target column (if any)
-        if self.target_column is not None:
-            if (
-                isinstance(self.target_column, pd.Series)
-                and self.target_column.name is not None
-            ):
-                output_target_column = self.target_column.reset_index(drop=True)
-            else:
-                output_target_column = pd.Series(
-                    self.target_column, name=self.target_column_id
-                )
-            output_dataframe = pd.concat(
-                [self.data_source.reset_index(drop=True), output_target_column],
-                axis=1,
-            )
-        else:
-            output_dataframe = self.data_source.copy()
-
-        # Rename the columns
-        output_dataframe_column_names = {}
-        for column_id in self._get_all_column_ids():
-            output_dataframe_column_names[column_id] = self.get_khiops_variable_name(
-                column_id
-            )
-        output_dataframe.rename(
-            output_dataframe_column_names, axis="columns", inplace=True
-        )
-
-        return output_dataframe
-
-    @property
-    def target_column_type(self):
-        target_column_type = None
-        if self.target_column is not None:
-            target_column_type = self.target_column.dtype
-        return target_column_type
+        """Creates an in-memory copy of the dataframe"""
+        return self.data_source.copy()
 
 
 class NumpyTable(DatasetTable):
@@ -1217,38 +1111,19 @@ class NumpyTable(DatasetTable):
         The array to be encapsulated.
     key : :external:term:`array-like` of int, optional
         The names of the columns composing the key
-    target_column : :external:term:`array-like` of shape (n_samples,) , optional
-        The series representing the target column.
-    categorical_target : bool, default ``True``.
-        ``True`` if the target column is categorical.
     """
 
-    def __init__(
-        self, name, array, key=None, target_column=None, categorical_target=True
-    ):
+    def __init__(self, name, array, key=None):
         # Call the parent method
-        super().__init__(name, key=key, categorical_target=categorical_target)
+        super().__init__(name, key=key)
 
         # Check the array's types and shape
         if not hasattr(array, "__array__"):
             raise TypeError(type_error_message("array", array, np.ndarray))
 
-        # Check (and potentially transform with a copy) the array's data
-        checked_array = check_array(array, ensure_2d=True, force_all_finite=False)
-
-        # Check the target's types and shape
-        if target_column is not None:
-            checked_target_column = column_or_1d(target_column, warn=True)
-
         # Initialize the members
-        self.data_source = checked_array
-        self.column_ids = list(range(self.data_source.shape[1]))
-        self.target_column_id = self.data_source.shape[1]
-        if target_column is not None:
-            self.target_column = checked_target_column
-        else:
-            self.target_column = None
-        self.categorical_target = categorical_target
+        self.data_source = check_array(array, ensure_2d=True, force_all_finite=False)
+        self.column_ids = column_or_1d(range(self.data_source.shape[1]))
         self.khiops_types = {
             column_id: get_khiops_type(self.data_source.dtype)
             for column_id in self.column_ids
@@ -1262,23 +1137,9 @@ def __repr__(self):
-            f"dtype={dtype_str}; target={self.target_column_id}>"
+            f"dtype={dtype_str}>"
         )
 
-    def _get_all_column_ids(self):
-        n_columns = len(self.column_ids)
-        if self.target_column is not None:
-            n_columns += 1
-        return list(range(n_columns))
-
-    def get_khiops_variable_name(self, column_id):
-        """Return the khiops variable name associated to a column id"""
-        assert column_id == self.target_column_id or column_id in self.column_ids
-        if isinstance(column_id, str):
-            variable_name = column_id
-        else:
-            assert isinstance(column_id, (np.int64, int))
-            variable_name = f"Var{column_id}"
-        return variable_name
-
-    def create_table_file_for_khiops(self, output_dir, sort=True):
+    def create_table_file_for_khiops(
+        self, output_dir, sort=True, target_column=None, target_column_id=None
+    ):
         assert not sort or self.key is not None, "Cannot sort table without a key"
         assert not sort or is_list_like(
             self.key
@@ -1290,9 +1151,13 @@
 
         # Write the output dataframe
         output_dataframe = pd.DataFrame(self.data_source.copy())
-        output_dataframe.columns = [f"Var{column_id}" for column_id in self.column_ids]
-        if self.target_column is not None:
-            output_dataframe[f"Var{self.target_column_id}"] = self.target_column
+        output_dataframe.columns = [
+            get_khiops_variable_name(column_id) for column_id in self.column_ids
+        ]
+        if target_column is not None:
+            output_dataframe[get_khiops_variable_name(target_column_id)] = (
+                target_column.copy()
+            )
 
         # Sort by key if requested (as string)
         if sort:
@@ -1313,13 +1178,6 @@ def create_table_file_for_khiops(self, output_dir, 
sort=True): return output_table_path - @property - def target_column_type(self): - target_column_type = None - if self.target_column is not None: - target_column_type = self.target_column.dtype - return target_column_type - class SparseTable(DatasetTable): """Table encapsulating feature matrix X and target array y @@ -1335,18 +1193,12 @@ class SparseTable(DatasetTable): The sparse matrix to be encapsulated. key : list-like of str, optional The names of the columns composing the key - target_column : :external:term:`array-like`, optional - The array containing the target column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. """ - def __init__( - self, name, matrix, key=None, target_column=None, categorical_target=True - ): + def __init__(self, name, matrix, key=None): assert key is None, "'key' must be unset for sparse matrix tables" # Call the parent method - super().__init__(name, key=key, categorical_target=categorical_target) + super().__init__(name, key=key) # Check the sparse matrix types if not isinstance(matrix, spmatrix): @@ -1358,21 +1210,11 @@ def __init__( type_error_message("'matrix' dtype", matrix.dtype, "numeric") ) - # Check the target's types - if target_column is not None and not hasattr(target_column, "__array__"): - raise TypeError( - type_error_message("target_column", target_column, "array-like") - ) - # Initialize the members self.data_source = matrix - self.column_ids = list(range(self.data_source.shape[1])) - self.target_column_id = self.data_source.shape[1] - self.target_column = target_column - self.categorical_target = categorical_target + self.column_ids = column_or_1d(range(matrix.shape[1])) self.khiops_types = { - column_id: get_khiops_type(self.data_source.dtype) - for column_id in self.column_ids + column_id: get_khiops_type(matrix.dtype) for column_id in self.column_ids } self.n_samples = self.data_source.shape[0] @@ -1380,7 +1222,7 @@ def __repr__(self): dtype_str = str(self.data_source.dtype) return ( f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " - f"dtype={dtype_str}; target={self.target_column_id}>" + f"dtype={dtype_str}>" ) def create_khiops_dictionary(self): @@ -1404,34 +1246,14 @@ def create_khiops_dictionary(self): # For each variable, add metadata, named `VarKey` variable_names = [variable.name for variable in dictionary.variables] - target_column_variable_name = self.get_khiops_variable_name( - self.target_column_id - ) for i, variable_name in enumerate(variable_names, 1): - if variable_name != target_column_variable_name: - variable = dictionary.remove_variable(variable_name) - variable.meta_data.add_value("VarKey", i) - variable_block.add_variable(variable) + variable = dictionary.remove_variable(variable_name) + variable.meta_data.add_value("VarKey", i) + variable_block.add_variable(variable) dictionary.add_variable_block(variable_block) return dictionary - def _get_all_column_ids(self): - n_columns = len(self.column_ids) - if self.target_column is not None: - n_columns += 1 - return list(range(n_columns)) - - def get_khiops_variable_name(self, column_id): - """Return the khiops variable name associated to a column id""" - assert column_id == self.target_column_id or column_id in self.column_ids - if isinstance(column_id, str): - variable_name = column_id - else: - assert isinstance(column_id, (np.int64, int)) - variable_name = f"Var{column_id}" - return variable_name - def _flatten(self, iterable): if isinstance(iterable, Iterable): for iterand in iterable: @@ -1440,14 +1262,11 
@@ def _flatten(self, iterable): else: yield iterand - def _write_sparse_block(self, row_index, stream, target=None): - assert row_index in range( - self.data_source.shape[0] - ), "'row_index' must be coherent with the shape of the sparse matrix" - if target is not None: - assert target in self.target_column, "'target' must be in the target column" - stream.write(f"{target}\t") + def _write_sparse_block(self, row_index, stream, target_value=None): + + # Access the sparse row row = self.data_source.getrow(row_index) + # Empty row in the sparse matrix: use the first variable as missing data # TODO: remove this part once Khiops bug # https://github.com/KhiopsML/khiops/issues/235 is solved @@ -1474,26 +1293,35 @@ def _write_sparse_block(self, row_index, stream, target=None): ] for variable_index, variable_value in zip(sorted_indices, sorted_data): stream.write(f"{variable_index + 1}:{variable_value} ") - stream.write("\n") - def create_table_file_for_khiops(self, output_dir, sort=True): + # Write the target value at the end of the record if available + if target_value is not None: + stream.write(f"\t{target_value}\n") + else: + stream.write("\n") + + def create_table_file_for_khiops( + self, output_dir, sort=True, target_column=None, target_column_id=None + ): + assert target_column is not None or target_column_id is None + assert target_column_id is not None or target_column is None + # Create the output table resource object output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") # Write the sparse matrix to an internal table file with io.StringIO() as output_sparse_matrix_stream: - if self.target_column is not None: - target_column_name = self.get_khiops_variable_name( - self.target_column_id - ) + if target_column is not None: output_sparse_matrix_stream.write( - f"{target_column_name}\tSparseVariables\n" + f"SparseVariables\t{get_khiops_variable_name(target_column_id)}\n" ) - for target, row_index in zip( - self.target_column, range(self.data_source.shape[0]) + for target_value, row_index in zip( + target_column, range(self.data_source.shape[0]) ): self._write_sparse_block( - row_index, output_sparse_matrix_stream, target=target + row_index, + output_sparse_matrix_stream, + target_value=target_value, ) else: output_sparse_matrix_stream.write("SparseVariables\n") @@ -1506,13 +1334,6 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path - @property - def target_column_type(self): - target_column_type = None - if self.target_column is not None: - target_column_type = self.target_column.dtype - return target_column_type - class FileTable(DatasetTable): """A table representing a delimited text file @@ -1529,24 +1350,18 @@ class FileTable(DatasetTable): Indicates if the table key : list-like of str, optional The names of the columns composing the key - target_column_id : str, optional - Name of the target variable column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. 
""" def __init__( self, name, path, - target_column_id=None, - categorical_target=True, key=None, sep="\t", header=True, ): # Initialize parameters - super().__init__(name=name, categorical_target=categorical_target, key=key) + super().__init__(name=name, key=key) # Check the parameters specific to this sub-class if not isinstance(path, str): @@ -1558,7 +1373,6 @@ def __init__( self.data_source = path self.sep = sep self.header = header - self.target_column_id = target_column_id # Build a dictionary file from the input data table # Note: We use export_dictionary_as_json instead of read_dictionary_file @@ -1574,7 +1388,7 @@ def __init__( header_line=header, ) kh.export_dictionary_as_json(tmp_kdic_path, tmp_kdicj_path) - with open(tmp_kdicj_path) as tmp_kdicj: + with open(tmp_kdicj_path, encoding="utf8") as tmp_kdicj: json_domain = json.load(tmp_kdicj) finally: os.remove(tmp_kdic_path) @@ -1593,33 +1407,9 @@ def __init__( self.column_ids = [var["name"] for var in variables] self.khiops_types = {var["name"]: var["type"] for var in variables} - # Check the target column exists - if ( - self.target_column_id is not None - and target_column_id not in self.column_ids - ): - raise ValueError( - f"Target column '{target_column_id}'" - f"not present in columns '{self.column_ids}'" - ) - - # Force the target column type from the parameters - if self.target_column_id is not None: - if categorical_target: - self.khiops_types[target_column_id] = "Categorical" - else: - self.khiops_types[target_column_id] = "Numerical" - # Check key integrity self.check_key() - def _get_all_column_ids(self): - return list(self.column_ids) - - def get_khiops_variable_name(self, column_id): - assert column_id in self._get_all_column_ids() - return column_id - def create_table_file_for_khiops(self, output_dir, sort=True): assert not sort or self.key is not None, "key is 'None'" @@ -1662,12 +1452,3 @@ def create_table_file_for_khiops(self, output_dir, sort=True): fs.write(output_table_file_path, fs.read(self.data_source)) return output_table_file_path - - @property - def target_column_type(self): - target_column_type = None - if self.target_column_id is not None: - target_column_type = ( - "Categorical" if self.categorical_target else "Numerical" - ) - return target_column_type diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index e4ebfe06..e65c5577 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -485,7 +485,7 @@ def test_dataset_is_correctly_built(self): self.assertEqual(dataset.main_table.name, "A") self.assertEqual(len(dataset.secondary_tables), 4) dataset_secondary_table_names = set( - [secondary_table.name for secondary_table in dataset.secondary_tables] + secondary_table.name for secondary_table in dataset.secondary_tables ) self.assertEqual(dataset_secondary_table_names, {"B", "C", "D", "E"}) self.assertEqual(len(dataset.relations), 4) @@ -521,8 +521,8 @@ def test_out_file_from_dataframe_monotable(self): # Check that the dataframes are equal assert_frame_equal( - out_table, ref_table.sort_values(by="User_ID").reset_index(drop=True), + out_table, ) def test_out_file_from_numpy_array_monotable(self): @@ -557,12 +557,14 @@ def _create_test_sparse_matrix_with_target(self): return sparse_matrix, target_array def _load_khiops_sparse_file(self, stream): - # skip header + # Skip header next(stream) + + # Read the sparse file target_vector = [] feature_matrix = [] for line in stream: - target, features = line.split(b"\t") + features, target_value = line.split(b"\t") 
feature_row = np.zeros(100) for feature in features.strip().split(b" "): feature_index, feature_value = feature.split(b":") @@ -573,7 +575,7 @@ def _load_khiops_sparse_file(self, stream): feature_value = 0.0 feature_row[int(feature_index) - 1] = feature_value feature_matrix.append(feature_row) - target_vector.append(float(target)) + target_vector.append(float(target_value)) target_array = np.array(target_vector) sparse_matrix = sp.csr_matrix(feature_matrix) return sparse_matrix, target_array diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index c7fab7b9..01fd81dc 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -13,7 +13,6 @@ import numpy as np import pandas as pd -from khiops.core.exceptions import KhiopsRuntimeError from khiops.core.internals.common import type_error_message from khiops.utils.dataset import Dataset, FileTable, PandasTable @@ -449,14 +448,44 @@ def test_y_type_must_be_str_or_array_like_1d(self): dataframe.to_csv(table_path, sep="\t", index=False) tuple_spec = (table_path, "\t") bad_y = dataframe["class"] - expected_msg = type_error_message("y", bad_y, str) + expected_msg = ( + type_error_message("y", bad_y, str) + + " (X's tables are of type str [file paths])" + ) self.assert_dataset_fails(tuple_spec, bad_y, TypeError, expected_msg) # Test when X is a dataframe: expects array-like - bad_y = AnotherType() - expected_msg = type_error_message("y", bad_y, "array-like") + bad_y = "TargetColumn" + expected_msg = ( + type_error_message("y", bad_y, "array-like") + + " (X's tables are of type pandas.DataFrame)" + ) self.assert_dataset_fails(dataframe, bad_y, TypeError, expected_msg) + def test_df_dataset_fails_if_target_column_is_already_in_the_features(self): + """Test in-memory table failing when the target is already in the features""" + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) + features_table = spec["tables"]["Reviews"][0] + bad_y = features_table["Recommended IND"] + with self.assertRaises(ValueError) as context: + Dataset(spec, bad_y) + output_error_msg = str(context.exception) + expected_msg_prefix = ( + "Target column name 'Recommended IND' is already present in the main table." 
+ ) + self.assertIn(expected_msg_prefix, output_error_msg) + + def test_file_dataset_fails_if_table_does_not_contain_the_target_column(self): + """Test FileTable failing if the table does not contain the target column""" + table_path = os.path.join(self.output_dir, "table.csv") + table = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + table.to_csv(table_path, sep="\t", index=False) + with self.assertRaises(ValueError) as context: + Dataset({"tables": {"main_table": (table_path, None)}}, y="TargetColumn") + output_error_msg = str(context.exception) + expected_msg_prefix = "Target column 'TargetColumn' not present in" + self.assertIn(expected_msg_prefix, output_error_msg) + ##################################### # Tests for dictionary dataset spec # ##################################### @@ -636,11 +665,11 @@ def test_dict_spec_format_tuple_1st_element_must_be_a_single_character(self): self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_y_type_must_be_series_or_df_when_x_is_df_spec(self): - """Test Dataset raising TypeError if X a is df-dict-spec and y isn't a Series""" + """Test Dataset raising TypeError if X a is ds-spec and y isn't array-like""" spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) - bad_y = AnotherType() + bad_y = "TargetColumnName" expected_msg = ( - type_error_message("y", bad_y, pd.Series, pd.DataFrame) + type_error_message("y", bad_y, "array-like") + " (X's tables are of type pandas.DataFrame)" ) self.assert_dataset_fails(spec, bad_y, TypeError, expected_msg) @@ -650,7 +679,7 @@ def test_dict_spec_y_must_be_str_when_x_is_file_spec(self): spec, _ = self.create_fixture_dataset_spec( output_dir=self.output_dir, data_type="file" ) - bad_y = AnotherType() + bad_y = np.array([1, 2, 3]) expected_msg = ( type_error_message("y", bad_y, str) + " (X's tables are of type str [file paths])" @@ -659,13 +688,12 @@ def test_dict_spec_y_must_be_str_when_x_is_file_spec(self): def test_dict_spec_table_name_must_be_str(self): """Test Dataset raising TypeError when a table name is not a str""" - spec, y = self.create_fixture_dataset_spec(multitable=False, schema=None) + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) features_table = spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: PandasTable( AnotherType(), features_table, - target_column=y, ) output_error_msg = str(context.exception) expected_msg = type_error_message("name", AnotherType(), str) @@ -684,41 +712,30 @@ def test_dict_spec_key_type_must_be_str_or_list_like(self): """Test Dataset raising TypeError when a key is not of the proper type""" bad_key = AnotherType() expected_error_msg = type_error_message("key", bad_key, str, int, "list-like") - dataset_spec, label = self.create_fixture_dataset_spec( + dataset_spec, _ = self.create_fixture_dataset_spec( multitable=False, schema=None ) features_table = dataset_spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=label, - categorical_target=True, - key=bad_key, - ) + PandasTable("reviews", features_table, key=bad_key) output_error_msg = str(context.exception) self.assertEqual(output_error_msg, expected_error_msg) def test_dict_spec_key_column_type_must_be_str_or_int(self): """Test Dataset raising TypeError when a key column is not of the proper type""" - bad_key = {"not-a-str-or-int": []} + bad_key = [AnotherType()] expected_error_msg = ( - type_error_message("key[0]", bad_key, str, int) + " at table 
'reviews'" + type_error_message("key[0]", AnotherType(), str, int) + + " at table 'reviews'" ) - dataset_spec, label = self.create_fixture_dataset_spec( + dataset_spec, _ = self.create_fixture_dataset_spec( multitable=False, schema=None ) features_table = dataset_spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=label, - categorical_target=True, - key=[bad_key], - ) + PandasTable("reviews", features_table, key=bad_key) output_error_msg = str(context.exception) - self.assertEqual(output_error_msg, expected_error_msg) + self.assertEqual(expected_error_msg, output_error_msg) def test_dict_spec_relations_must_be_list_like(self): """Test Dataset raising TypeError when dict spec "relations" is a dict-like""" @@ -854,59 +871,18 @@ def test_pandas_table_input_type_must_be_dataframe(self): def test_pandas_table_input_table_must_not_be_empty(self): """Test PandasTable raising ValueError if the input dataframe is empty""" with self.assertRaises(ValueError) as context: - PandasTable( - "reviews", - pd.DataFrame(), - target_column="class", - ) + PandasTable("reviews", pd.DataFrame()) output_error_msg = str(context.exception) expected_msg = "'dataframe' is empty" self.assertEqual(output_error_msg, expected_msg) - def test_pandas_table_target_column_must_be_series(self): - """Test PandasTable raising TypeError if the input target col. isn't a Series""" - dataset_spec, _ = self.create_fixture_dataset_spec( - multitable=False, schema=None - ) - features_table = dataset_spec["tables"]["Reviews"][0] - with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=AnotherType(), - ) - output_error_msg = str(context.exception) - expected_msg = type_error_message("target_column", AnotherType(), "array-like") - self.assertEqual(output_error_msg, expected_msg) - - def test_pandas_table_fails_if_target_column_is_already_in_the_features(self): - """Test in-memory table failing when the target is already in the features""" - dataset_spec, _ = self.create_fixture_dataset_spec( - multitable=False, schema=None - ) - features_table = dataset_spec["tables"]["Reviews"][0] - y = features_table["Recommended IND"] - with self.assertRaises(ValueError) as context: - PandasTable( - "reviews", - features_table, - target_column=y, - ) - output_error_msg = str(context.exception) - expected_msg = ( - "Target series name 'Recommended IND' is already present in" - " dataframe : ['User_ID', 'Age', 'Clothing ID', 'Date', 'New'," - " 'Title', 'Recommended IND', 'Positive Feedback average']" - ) - self.assertEqual(output_error_msg, expected_msg) - def test_pandas_table_column_ids_must_all_be_int_or_str(self): """Test that in-memory dataset all columns ids must be int or str""" - spec, y = self.create_fixture_dataset_spec(multitable=False, schema=None) + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) features_table = spec["tables"]["Reviews"][0] features_table.rename(columns={"User_ID": 1}, inplace=True) with self.assertRaises(TypeError) as context: - PandasTable("reviews", features_table, target_column=y) + PandasTable("reviews", features_table) output_error_msg = str(context.exception) expected_msg = ( "Dataframe column ids must be either all integers or all " @@ -917,21 +893,10 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self): def test_file_table_fails_with_non_existent_table_file(self): """Test FileTable failing when it is created with a non-existent file""" with 
self.assertRaises(ValueError) as context: - FileTable("reviews", "Review.csv", target_column_id="class") + FileTable("reviews", "Review.csv") output_error_msg = str(context.exception) expected_msg = "Non-existent data table file: Review.csv" - self.assertEqual(output_error_msg, expected_msg) - - def test_file_table_fails_if_table_does_not_contain_the_target_column(self): - """Test FileTable failing if the table does not contain the target column""" - table_path = os.path.join(self.output_dir, "table.csv") - table = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - table.to_csv(table_path, sep="\t", index=False) - with self.assertRaises(ValueError) as context: - table = FileTable("table", table_path, target_column_id="class") - output_error_msg = str(context.exception) - expected_msg_prefix = "Target column" - self.assertIn(expected_msg_prefix, output_error_msg) + self.assertEqual(expected_msg, output_error_msg) def test_file_table_internal_file_creation_fails_on_an_existing_path(self): """Test FileTable failing to create an internal file to a existing path""" @@ -941,12 +906,7 @@ def test_file_table_internal_file_creation_fails_on_an_existing_path(self): old_file_path = spec["tables"]["Reviews"][0] new_file_path = old_file_path.replace("Reviews.csv", "copy_Reviews.txt") os.rename(old_file_path, new_file_path) - file_table = FileTable( - "Reviews", - new_file_path, - target_column_id="class", - key="User_ID", - ) + file_table = FileTable("Reviews", new_file_path, key="User_ID") with self.assertRaises(ValueError) as context: file_table.create_table_file_for_khiops(self.output_dir, sort=False) output_error_msg = str(context.exception) @@ -1003,6 +963,7 @@ def test_sequence_spec_must_be_str_or_df(self): # Test that the second element is not str bad_spec = ["table_1", AnotherType()] expected_msg = ( - type_error_message("X[1]", bad_spec[1], str) + " as the first table in X" + type_error_message("Table at index 1", bad_spec[1], str) + + " as the first table in X" ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index ece61467..e7d0f669 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -2260,20 +2260,37 @@ def test_sklearn_check_estimator(self): KhiopsEncoder(n_trees=0, transform_type_numerical="0-1_normalization"), ] - # Execute sklearn's estimator test battery - for khiops_estimator in khiops_estimators: - for estimator, check in check_estimator( - khiops_estimator, generate_only=True - ): - # Skip some checks for KhiopsEncoder as they yield "empty" - # deployed tables; they need to be implemented manually - check_name = check.func.__name__ - if check_name in [ - "check_fit_score_takes_y", - "check_fit_idempotent", - ] and isinstance(estimator, KhiopsEncoder): - continue - with self.subTest( - sklearn_check_name=check_name, sklearn_check_kwargs=check.keywords + # Ignore the "No informative variables" warnings + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"[\S\n\t\v ]+no informative variables" + ) + warnings.filterwarnings( + "ignore", message=r"[\S\n\t\v ]+No informative input variable" + ) + + # Execute sklearn's estimator test battery + print("") + for khiops_estimator in khiops_estimators: + for estimator, check in check_estimator( + khiops_estimator, generate_only=True ): - check(estimator) + # Skip some checks for KhiopsEncoder as they yield "empty" + # deployed tables; they need to be implemented manually + check_name = check.func.__name__ + if check_name in [ + 
"check_fit_score_takes_y", + "check_fit_idempotent", + ] and isinstance(estimator, KhiopsEncoder): + continue + print( + f">>> Executing {check_name} on " + f"{estimator.__class__.__name__}... ", + end="", + ) + with self.subTest( + sklearn_check_name=check_name, + sklearn_check_kwargs=check.keywords, + ): + check(estimator) + print("Done")