From 78f91a2c1a2243847d998d49bd181e834e0018b9 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Fri, 29 Mar 2024 18:20:43 +0100 Subject: [PATCH 01/12] Refactor dataset dictionary spec methods - Move the dataset spec check methods out of the Dataset class - Simplify the messages of the aformentioned check errors - In particular eliminate all referenoces to `X` or `y` - Add a few new tests to `tests/test_dataset_errors.py` - Uniformize the pattern of the `tests/test_dataset_errors.py` tests --- khiops/sklearn/tables.py | 517 ++++++++++++++++++----------------- tests/test_dataset_errors.py | 283 +++++++++++-------- 2 files changed, 431 insertions(+), 369 deletions(-) diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index b4709671..0b347968 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -13,7 +13,7 @@ import numpy as np import pandas as pd -import scipy.sparse as sp +from scipy import sparse as sp from sklearn.utils import check_array from sklearn.utils.validation import column_or_1d @@ -33,6 +33,247 @@ # pylint: disable=invalid-name +def check_dataset_spec(ds_spec): + # Check the "tables" field + if "tables" not in ds_spec: + raise ValueError("'tables' entry missing from dataset dict spec") + if not is_dict_like(ds_spec["tables"]): + raise TypeError( + type_error_message("'tables' entry", ds_spec["tables"], Mapping) + ) + if len(ds_spec["tables"]) == 0: + raise ValueError("'tables' dictionary cannot be empty") + for table_name, table_entry in ds_spec["tables"].items(): + check_table_entry(table_name, table_entry) + + # Multi-table specific table checks + if len(ds_spec["tables"]) > 1: + check_multitable_spec(ds_spec) + + # Check the 'format' field + if "format" in ds_spec: + check_format_entry(ds_spec["format"]) + + +def check_table_entry(table_name, table_spec): + if not isinstance(table_spec, tuple): + raise TypeError( + type_error_message(f"'{table_name}' table entry", table_spec, tuple) + ) + if len(table_spec) != 2: + raise ValueError( + f"'{table_name}' table entry must have size 2, not {len(table_spec)}" + ) + source, key = table_spec + if not isinstance(source, (pd.DataFrame, sp.spmatrix, str)) and not hasattr( + source, "__array__" + ): + raise TypeError( + type_error_message( + f"'{table_name}' table's source", + source, + "array-like", + "scipy.sparse.spmatrix", + str, + ) + ) + _check_table_key(table_name, key) + + +def _check_table_key(table_name, key): + if key is not None: + if not is_list_like(key) and not isinstance(key, str): + raise TypeError( + type_error_message(f"'{table_name}' table's key", key, str, Sequence) + ) + if len(key) == 0: + raise ValueError(f"'{table_name}' table's key is empty") + for column_name in key: + if not isinstance(column_name, str): + raise TypeError( + type_error_message( + f"'{table_name}' table's key column name", + column_name, + str, + ) + ) + + +def check_multitable_spec(ds_spec): + assert len(ds_spec) > 1 + # Check the main table + if "main_table" not in ds_spec: + raise ValueError( + "'main_table' entry must be specified for multi-table datasets" + ) + if not isinstance(ds_spec["main_table"], str): + raise TypeError( + type_error_message("'main_table' entry", ds_spec["main_table"], str) + ) + if ds_spec["main_table"] not in ds_spec["tables"]: + raise ValueError( + f"A table entry with the main table's name ('{ds_spec['main_table']}') " + f"must be present in the 'tables' dictionary" + ) + + # Check that all tables have non-None keys + for 
table_name, (_, table_key) in ds_spec["tables"].items(): + if table_key is None: + table_type = "main" if ds_spec["main_table"] == table_name else "secondary" + raise ValueError( + f"key of {table_type} table '{table_name}' is 'None': " + "table keys must be specified in multi-table datasets" + ) + + # If the 'relations' entry exists check it + if "relations" in ds_spec: + relations_spec = ds_spec["relations"] + # Otherwise build a star schema relations spec and check it + else: + relations_spec = [ + (ds_spec["main_table"], table) + for table in ds_spec["tables"].keys() + if table != ds_spec["main_table"] + ] + check_relations_entry(ds_spec["main_table"], ds_spec["tables"], relations_spec) + + +def check_relations_entry(main_table_name, tables_spec, relations_spec): + # Check the types and size of the relation entries + if not is_list_like(relations_spec): + raise TypeError( + type_error_message("'relations' entry", relations_spec, "list-like") + ) + for i, relation in enumerate(relations_spec, 1): + # Check that the relation is a 2 or 3 tuple + if not isinstance(relation, tuple): + raise TypeError(type_error_message("Relation", relation, tuple)) + if len(relation) not in (2, 3): + raise ValueError(f"A relation must be of size 2 or 3, not {len(relation)}") + + # Check the types of the tuple contents + parent_table, child_table = relation[:2] + if not isinstance(parent_table, str): + raise TypeError( + type_error_message(f"Relation #{i}'s parent table", parent_table, str) + ) + if not isinstance(child_table, str): + raise TypeError( + type_error_message(f"Relation #{i}'s child table", child_table, str) + ) + if len(relation) == 3 and not isinstance(relation[2], bool): + raise TypeError( + type_error_message( + f"Relation #{i} ({parent_table}, {child_table}) 1-1 flag", + relation[2], + bool, + ) + ) + + # Check structure and coherence with the rest of the spec + parents_and_children = [relation[:2] for relation in relations_spec] + for i, relation in enumerate(relations_spec, 1): + parent_table, child_table = relation[:2] + if parent_table == child_table: + raise ValueError( + f"Relation #{i}'s tables are equal: ({parent_table}, {child_table}). " + "They must be different." + ) + for table in (parent_table, child_table): + if not table in tables_spec.keys(): + raise ValueError( + f"Relation #{i} ({parent_table}, {child_table}) contains " + f"non-existent table '{table}'. All relation tables must exist " + "in the 'tables' entry." + ) + if parents_and_children.count(relation[:2]) > 1: + raise ValueError( + f"Relation #{i} ({parent_table}, {child_table}) occurs " + f"{parents_and_children.count(relation[:2])} times. " + f"Each relation must be unique." 
+ ) + + # Check hierachical keys + check_hierarchical_keys( + i, + parent_table, + tables_spec[parent_table][1], + child_table, + tables_spec[child_table][1], + ) + + # Check there are no cycles + check_no_cycles(relations_spec, main_table_name) + + +def check_hierarchical_keys( + relation_id, parent_table, parent_table_key, child_table, child_table_key +): + """Check that the parent table's key is contained in the child table's key""" + table_key_error = False + if isinstance(parent_table_key, str) and isinstance(child_table_key, str): + table_key_error = child_table_key != parent_table_key + elif isinstance(parent_table_key, str) and is_list_like(child_table_key): + table_key_error = parent_table_key not in child_table_key + elif is_list_like(parent_table_key) and is_list_like(child_table_key): + table_key_error = not set(parent_table_key).issubset(set(child_table_key)) + elif is_list_like(parent_table_key) and isinstance(child_table_key, str): + table_key_error = True + + if table_key_error: + if isinstance(child_table_key, str): + child_table_key_msg = f"[{child_table_key}]" + else: + child_table_key_msg = f"[{', '.join(child_table_key)}]" + if isinstance(parent_table_key, str): + parent_table_key_msg = f"[{parent_table_key}]" + else: + parent_table_key_msg = f"[{', '.join(parent_table_key)}]" + raise ValueError( + f"Relation #{relation_id} child table '{child_table}' " + f"key ({child_table_key_msg}) does not contain that of parent table " + f"'{parent_table}' ({parent_table_key_msg})." + ) + + +def check_no_cycles(relations_spec, main_table_name): + """Check that there are no cycles in the 'relations' entry""" + tables_to_visit = [main_table_name] + tables_visited = set() + while tables_to_visit: + current_table = tables_to_visit.pop(0) + tables_visited.add(current_table) + for relation in relations_spec: + parent_table, child_table = relation[:2] + if parent_table == current_table: + tables_to_visit.append(child_table) + if tables_visited.intersection(tables_to_visit): + raise ValueError( + "'relations' entry contains a cycle that includes " + f"the relation ({parent_table}, {child_table})." 
+ ) + + +def check_format_entry(format_spec): + if not isinstance(format_spec, tuple): + raise TypeError(type_error_message("'format' entry", format_spec, tuple)) + if len(format_spec) != 2: + raise ValueError( + f"'format' entry must be a tuple of size 2, not {len(format_spec)}" + ) + sep, header = format_spec + if not isinstance(sep, str): + raise TypeError( + type_error_message("'format' tuple's 1st element (separator)", sep, str) + ) + if not isinstance(header, bool): + raise TypeError( + type_error_message("'format' tuple's 2nd element (header)", header, bool) + ) + if len(sep) != 1: + raise ValueError(f"'format' separator must be a single char, got '{sep}'") + + def get_khiops_type(numpy_type): """Translates a numpy type to a Khiops dictionary type @@ -389,7 +630,7 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): # Check the input mapping self._check_input_mapping(X, y) - # Initialize tables + # Initialize tables objects if len(X["tables"]) == 1: main_table_name = list(X["tables"])[0] main_table_source, main_table_key = list(X["tables"].values())[0] @@ -399,7 +640,7 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): main_table_name = X["main_table"] main_table_source, main_table_key = X["tables"][main_table_name] - # Case of paths + # Initialize a file dataset if isinstance(main_table_source, str): warnings.warn( deprecation_message( @@ -410,11 +651,14 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): ), stacklevel=4, ) + # Obtain the file format parameters if "format" in X: self.sep, self.header = X["format"] else: self.sep = "\t" self.header = True + + # Initialize the tables self.main_table = FileTable( main_table_name, main_table_source, @@ -438,7 +682,7 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): header=self.header, ) ) - # Case of dataframes + # Initialize a Pandas dataset elif isinstance(main_table_source, pd.DataFrame): self.main_table = PandasTable( main_table_name, @@ -453,7 +697,7 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): self.secondary_tables.append( PandasTable(table_name, table_source, key=table_key) ) - # Case of sparse matrices + # Initialize a sparse dataset (monotable) elif isinstance(main_table_source, sp.spmatrix): self.main_table = SparseTable( main_table_name, @@ -463,7 +707,7 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): categorical_target=categorical_target, ) self.secondary_tables = [] - # Case of numpyarray + # Initialize a numpyarray dataset (monotable) else: self.main_table = NumpyTable( main_table_name, @@ -474,260 +718,29 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): if len(X["tables"]) > 1: raise ValueError( "Multi-table schemas are only allowed " - "with pandas dataframe source tables." + "with pandas dataframe source tables" ) self.secondary_tables = [] + # If the relations are not specified intialize to a star schema if "relations" not in X: - # the schema is by default 'star' - # create a list of relations [(main_table, secondary_table, False), ...] 
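# Illustrative sketch (assumed table names and data) of a dict spec accepted by the
# new check_dataset_spec function, assuming it is importable from
# khiops.sklearn.tables as defined in this patch:
#
#   import pandas as pd
#   from khiops.sklearn.tables import check_dataset_spec
#
#   clients_df = pd.DataFrame({"User_ID": ["u1", "u2"], "Age": [30, 40]})
#   logs_df = pd.DataFrame({"User_ID": ["u1", "u1", "u2"], "Event": ["a", "b", "a"]})
#
#   spec = {
#       "main_table": "clients",
#       "tables": {
#           "clients": (clients_df, "User_ID"),  # (source, key)
#           "logs": (logs_df, "User_ID"),
#       },
#       # Optional: if omitted, a star schema around "main_table" is assumed
#       "relations": [("clients", "logs", False)],  # (parent, child, 1-1 flag)
#       # Optional "format": (separator, header) applies to file-based sources
#   }
#   check_dataset_spec(spec)  # raises TypeError/ValueError on an invalid spec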
self.relations = [ (self.main_table.name, table.name, False) for table in self.secondary_tables ] + # Otherwise initialize the relations in the spec else: - # the schema could be 'star' or 'snowflake' - # unify the size of all relation tuples - # by adding 'False' to non-entities - # check user-specified relations - self._check_relations(X) relations = [] for relation in X["relations"]: parent, child = relation[:2] relations.append( - ( - parent, - child, - relation[2] if len(relation) == 3 else False, - ) + (parent, child, relation[2] if len(relation) == 3 else False) ) self.relations = relations - def _check_cycle_exists(self, relations, main_table_name): - """Check existence of a cycle into 'relations'""" - tables_to_visit = [main_table_name] - tables_visited = set() - while tables_to_visit: - current_table = tables_to_visit.pop(0) - tables_visited.add(current_table) - for relation in relations: - parent_table, child_table = relation[:2] - if parent_table == current_table: - tables_to_visit.append(child_table) - if tables_visited.intersection(tables_to_visit): - raise ValueError( - f"Relations at X['relations'] contain a cycle which" - f" includes the relation '{relation}'" - ) - - def _check_relation_keys(self, X, left_table_name, right_table_name): - """Check coherence of keys""" - _, left_table_key = X["tables"][left_table_name] - _, right_table_key = X["tables"][right_table_name] - table_key_error = False - if isinstance(left_table_key, str) and isinstance(right_table_key, str): - table_key_error = right_table_key != left_table_key - elif isinstance(left_table_key, str) and is_list_like(right_table_key): - table_key_error = left_table_key not in right_table_key - elif is_list_like(left_table_key) and is_list_like(right_table_key): - table_key_error = not set(left_table_key).issubset(set(right_table_key)) - elif is_list_like(left_table_key) and isinstance(right_table_key, str): - table_key_error = True - - if table_key_error: - if isinstance(right_table_key, str): - right_table_key_msg = f"[{right_table_key}]" - else: - right_table_key_msg = f"[{', '.join(right_table_key)}]" - if isinstance(left_table_key, str): - left_table_key_msg = f"[{left_table_key}]" - else: - left_table_key_msg = f"[{', '.join(left_table_key)}]" - raise ValueError( - f"key for table '{right_table_name}' " - f"{right_table_key_msg} is incompatible with " - f"that of table " - f"'{left_table_name}' {left_table_key_msg}" - ) - - def _check_relations(self, X): - """Check relations""" - main_table_name = X["main_table"] - relations = X["relations"] - parents_and_children = [relation[:2] for relation in relations] - for relation in relations: - parent_table, child_table = relation[:2] - for table in (parent_table, child_table): - if not isinstance(table, str): - raise TypeError( - type_error_message("Table of a relation", table, str) - ) - if parent_table == child_table: - raise ValueError( - f"Tables in relation '({parent_table}, {child_table})' " - f"are the same. They must be different." - ) - if parents_and_children.count(relation[:2]) > 1: - raise ValueError( - f"Relation '({parent_table}, {child_table})' occurs " - f"'{parents_and_children.count(relation[:2])}' times. " - f"Each relation must be unique." - ) - if not parent_table in X["tables"].keys(): - raise ValueError( - f"X['tables'] does not contain a table named '{parent_table}'. 
" - f"All tables in X['relations'] must be declared in X['tables']" - ) - if not child_table in X["tables"].keys(): - raise ValueError( - f"X['tables'] does not contain a table named '{child_table}'. " - f"All tables in X['relations'] must be declared in X['tables']." - ) - if len(relation) == 3: - is_one_to_one_relation = relation[2] - if not isinstance(is_one_to_one_relation, bool): - raise TypeError( - type_error_message( - f"1-1 flag for relation " - f"({parent_table}, {child_table})", - is_one_to_one_relation, - bool, - ) - ) - self._check_relation_keys(X, parent_table, child_table) - self._check_cycle_exists(relations, main_table_name) - def _check_input_mapping(self, X, y=None): - # Check the "tables" field (basic) - if "tables" not in X: - raise ValueError("Mandatory key 'tables' missing from dict 'X'") - if not is_dict_like(X["tables"]): - raise TypeError(type_error_message("X['tables']", X["tables"], Mapping)) - if len(X["tables"]) == 0: - raise ValueError("X['tables'] cannot be empty") - - # Check coherence of each table specification - for table_name, table_input in X["tables"].items(): - if not isinstance(table_input, tuple): - raise TypeError( - type_error_message( - f"Table input at X['tables']['{table_name}']", - table_input, - tuple, - ) - ) - if len(table_input) != 2: - raise ValueError( - f"Table input tuple at X['tables']['{table_name}'] " - f"must have size 2 not {len(table_input)}" - ) - table_source, table_key = table_input - if not isinstance( - table_source, (pd.DataFrame, sp.spmatrix, str) - ) and not hasattr(table_source, "__array__"): - raise TypeError( - type_error_message( - f"Table source at X['tables']['{table_name}']", - table_source, - "array-like or scipy.sparse.spmatrix", - str, - ) - ) - if ( - table_key is not None - and not is_list_like(table_key) - and not isinstance(table_key, str) - ): - raise TypeError( - type_error_message( - f"Table key at X['tables']['{table_name}']", - table_key, - str, - Sequence, - ) - ) - - if table_key is not None: - for column_name in table_key: - if not isinstance(column_name, str): - raise TypeError( - type_error_message( - "Column name of table key " - f"at X['tables']['{table_name}']", - column_name, - str, - ) - ) - - # Multi-table specific table checks - if len(X["tables"]) > 1: - # Check the "main_table" field - if "main_table" not in X: - raise ValueError( - "'main_table' must be specified for multi-table datasets" - ) - if not isinstance(X["main_table"], str): - raise TypeError( - type_error_message("X['main_table']", X["main_table"], str) - ) - if X["main_table"] not in X["tables"]: - raise ValueError( - f"X['main_table'] ({X['main_table']}) " - f"must be present in X['tables']" - ) - main_table_source, main_table_key = X["tables"][X["main_table"]] - if main_table_key is None: - raise ValueError("key of the root table is 'None'") - if len(main_table_key) == 0: - raise ValueError( - "key of the root table must be non-empty for multi-table datasets" - ) - - # Check that all secondary tables have non-None keys - for table_name, (_, table_key) in X["tables"].items(): - if table_name != X["main_table"] and table_key is None: - raise ValueError( - f"key of the secondary table '{table_name}' is 'None':" - " table keys must be specified in multitable datasets" - ) - - if "relations" in X: - # check the 'relations' field - if not is_list_like(X["relations"]): - raise TypeError( - type_error_message( - "Relations at X['tables']['relations']", - X["relations"], - "list-like", - ) - ) - else: - for relation in 
X["relations"]: - if not isinstance(relation, tuple): - raise TypeError( - type_error_message("Relation", relation, tuple) - ) - if len(relation) not in (2, 3): - raise ValueError( - f"A relation must be of size 2 or 3, " - f"not {len(relation)}" - ) - - # Check the 'format' field - if "format" in X: - if not isinstance(X["format"], tuple): - raise TypeError(type_error_message("X['format']", X["format"], tuple)) - if not isinstance(X["format"][0], str): - raise TypeError( - type_error_message("X['format'] 1st element", X["format"][0], str) - ) - if not isinstance(X["format"][1], bool): - raise TypeError( - type_error_message("X['format'] 2nd element", X["format"][1], bool) - ) - sep, _ = X["format"][0], X["format"][1] - if len(sep) != 1: - raise ValueError(f"Separator must be a single character. Value: {sep}") + # Check the dataset spec for X + check_dataset_spec(X) # Check the target coherence with X's tables if y is not None: @@ -787,25 +800,25 @@ def copy(self): Referenced dataframes in tables are copied as references """ - dataset_spec = {} - dataset_spec["main_table"] = self.main_table.name - dataset_spec["tables"] = {} + ds_spec = {} + ds_spec["main_table"] = self.main_table.name + ds_spec["tables"] = {} if self.is_in_memory(): - dataset_spec["tables"][self.main_table.name] = ( + ds_spec["tables"][self.main_table.name] = ( self.main_table.dataframe, self.main_table.key, ) for table in self.secondary_tables: - dataset_spec["tables"][table.name] = (table.dataframe, table.key) + ds_spec["tables"][table.name] = (table.dataframe, table.key) else: - dataset_spec["tables"][self.main_table.name] = ( + ds_spec["tables"][self.main_table.name] = ( self.main_table.path, self.main_table.key, ) for table in self.secondary_tables: - dataset_spec["tables"][table.name] = (table.path, table.key) - dataset_spec["format"] = (self.sep, self.header) - return Dataset(dataset_spec) + ds_spec["tables"][table.name] = (table.path, table.key) + ds_spec["format"] = (self.sep, self.header) + return Dataset(ds_spec) def create_khiops_dictionary_domain(self): """Creates a Khiops dictionary domain representing this dataset diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index 3cd09f40..97b8442d 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -456,111 +456,29 @@ def test_y_type_must_be_str_or_array_like_1d(self): expected_msg = type_error_message("y", bad_y, "array-like") self.assert_dataset_fails(dataframe, bad_y, TypeError, expected_msg) - ######################### - # Tests for X dict spec # - ######################### - - def test_dict_spec_relations_must_be_list_like(self): - """Test Dataset raising TypeError when dict spec "relations" is a dict-like""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"] = AnotherType() - expected_msg = type_error_message( - "Relations at X['tables']['relations']", - bad_spec["relations"], - "list-like", - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_relations_must_be_tuple(self): - """Test Dataset raising TypeError when a relation is not a tuple""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = AnotherType() - expected_msg = type_error_message("Relation", bad_spec["relations"][0], "tuple") - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_relations_must_be_of_size_2_or_3(self): - """Test Dataset raising ValueError when a relation is not of size 2 or 3""" - bad_spec, y = 
self.create_fixture_dataset_spec() - for size in [0, 1, 4, 5]: - bad_spec["relations"][0] = tuple((f"Table{i}" for i in range(size))) - expected_msg = f"A relation must be of size 2 or 3, not {size}" - with self.subTest(tuple_size=size): - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) - - def test_dict_spec_table_relation_must_be_str(self): - """Test Dataset raising TypeError when a relation table is not a str""" - # Test the error in the left table - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = (AnotherType(), "BTable") - expected_msg = type_error_message( - "Table of a relation", bad_spec["relations"][0][0], str - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - # Test the error in the right table - bad_spec["relations"][0] = ("ATable", AnotherType()) - expected_msg = type_error_message( - "Table of a relation", bad_spec["relations"][0][1], str - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_entiy_flag_relation_must_be_bool(self): - """Test Dataset raising TypeError when the entity flag is not boolean""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = ("B", "D", AnotherType()) - expected_msg = type_error_message( - "1-1 flag for relation (B, D)", bad_spec["relations"][0][2], bool - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_relation_tables_must_not_be_the_same(self): - """Test Dataset raising TypeError when tables of a relation are the same""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = ("Table", "Table") - expected_msg = ( - "Tables in relation '(Table, Table)' are the same. " - "They must be different." - ) - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) - - def test_dict_spec_relation_table_must_be_in_table_list(self): - """Test Dataset raising ValueError when a rel. table is not in the table list""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = ("NonExistentTable", "D") - expected_msg = ( - "X['tables'] does not contain a table named 'NonExistentTable'. " - "All tables in X['relations'] must be declared in X['tables']" - ) - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) - - def test_dict_spec_relation_must_appear_once(self): - """Test Dataset raising ValueError if a relation appears more than once""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"].append(("B", "D")) - expected_msg = ( - "Relation '(B, D)' occurs '2' times. Each relation must be unique." 
- ) - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + ##################################### + # Tests for dictionary dataset spec # + ##################################### def test_dict_spec_key_tables_must_be_present(self): """Test Dataset raising ValueError if the 'tables' key is missing""" bad_spec, y = self.create_fixture_dataset_spec() del bad_spec["tables"] - expected_msg = "Mandatory key 'tables' missing from dict 'X'" + expected_msg = "'tables' entry missing from dataset dict spec" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_key_tables_must_be_mapping(self): """Test Dataset raising TypeError if the 'tables' key is not a mapping""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"] = AnotherType() - expected_msg = type_error_message("X['tables']", bad_spec["tables"], Mapping) + expected_msg = type_error_message("'tables' entry", bad_spec["tables"], Mapping) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) def test_dict_spec_table_list_cannot_be_empty(self): """Test Dataset raising ValueError if the 'tables' key is empty""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"] = {} - expected_msg = "X['tables'] cannot be empty" + expected_msg = "'tables' dictionary cannot be empty" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_table_input_type_must_be_a_tuple(self): @@ -568,7 +486,7 @@ def test_dict_spec_table_input_type_must_be_a_tuple(self): bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"]["D"] = list(bad_spec["tables"]["D"]) expected_msg = type_error_message( - "Table input at X['tables']['D']", bad_spec["tables"]["D"], tuple + "'D' table entry", bad_spec["tables"]["D"], tuple ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) @@ -576,7 +494,7 @@ def test_dict_spec_table_input_tuple_must_have_size_2(self): """Test Dataset raising ValueError when a table entry is a tuple of size != 2""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"]["D"] = (*bad_spec["tables"]["D"], "AnotherT", "YetAnotherT") - expected_msg = "Table input tuple at X['tables']['D'] must have size 2 not 4" + expected_msg = "'D' table entry must have size 2, not 4" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_source_table_type_must_be_adequate(self): @@ -584,9 +502,10 @@ def test_dict_spec_source_table_type_must_be_adequate(self): bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"]["D"] = (AnotherType(), bad_spec["tables"]["D"][-1]) expected_msg = type_error_message( - "Table source at X['tables']['D']", + "'D' table's source", bad_spec["tables"]["D"][0], - "array-like or scipy.sparse.spmatrix", + "array-like", + "scipy.sparse.spmatrix", str, ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) @@ -596,7 +515,7 @@ def test_dict_spec_table_key_must_be_str_or_sequence(self): bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"]["D"] = (bad_spec["tables"]["D"][0], AnotherType()) expected_msg = type_error_message( - "Table key at X['tables']['D']", + "'D' table's key", bad_spec["tables"]["D"][1], str, Sequence, @@ -610,15 +529,15 @@ def test_dict_spec_table_key_column_type_must_be_str(self): bad_key = ["User_ID", AnotherType(), "VAR_2"] bad_spec["tables"]["D"] = (dataframe, bad_key) expected_msg = type_error_message( - "Column name of table key at X['tables']['D']", bad_key[1], str + "'D' table's key column name", bad_key[1], str ) 
self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) def test_dict_spec_main_table_must_be_specified_for_multitable_datasets(self): - """Test Dataset raising ValueError if 'main_table' is not a key in a MT spec""" + """Test Dataset raising ValueError if 'main_table' is not a key in an MT spec""" bad_spec, y = self.create_fixture_dataset_spec() del bad_spec["main_table"] - expected_msg = "'main_table' must be specified for multi-table datasets" + expected_msg = "'main_table' entry must be specified for multi-table datasets" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_main_table_must_be_str(self): @@ -626,7 +545,7 @@ def test_dict_spec_main_table_must_be_str(self): bad_spec, y = self.create_fixture_dataset_spec() bad_spec["main_table"] = 1 expected_msg = type_error_message( - "X['main_table']", bad_spec["main_table"], str + "'main_table' entry", bad_spec["main_table"], str ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) @@ -634,35 +553,39 @@ def test_dict_spec_main_table_not_declared_in_tables(self): """Test Dataset raising ValueError if the main table is not in the table list""" bad_spec, y = self.create_fixture_dataset_spec() del bad_spec["tables"][bad_spec["main_table"]] - expected_msg = "X['main_table'] (A) must be present in X['tables']" + expected_msg = ( + "A table entry with the main table's name ('A') " + "must be present in the 'tables' dictionary" + ) self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) - def test_dic_spec_main_table_key_must_be_specified(self): - """Test Dataset raising ValueError if a MT spec doesn't have a main table key""" + def test_dict_spec_main_table_key_must_be_specified(self): + """Test Dataset raise ValueError if an MT spec doesn't have a main table key""" bad_spec, y = self.create_fixture_dataset_spec() dataframe, _ = bad_spec["tables"][bad_spec["main_table"]] bad_spec["tables"][bad_spec["main_table"]] = (dataframe, None) - expected_msg = "key of the root table is 'None'" + expected_msg = ( + "key of main table 'A' is 'None': " + "table keys must be specified in multi-table datasets" + ) self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) - def test_dict_spec_main_table_key_must_be_non_empty_for_multitable_datasets(self): - """Test Dataset raising ValueError if a MT spec have an empty main table key""" + def test_dict_spec_table_key_must_be_non_empty_for_multitable_datasets(self): + """Test Dataset raising ValueError if an MT spec have an empty table key""" bad_spec, y = self.create_fixture_dataset_spec() dataframe, _ = bad_spec["tables"][bad_spec["main_table"]] bad_spec["tables"][bad_spec["main_table"]] = (dataframe, []) - expected_msg = ( - "key of the root table must be non-empty for multi-table datasets" - ) + expected_msg = f"'{bad_spec['main_table']}' table's key is empty" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_secondary_table_key_must_be_specified(self): - """Test Dataset raising ValueError if a MT spec doesn't have a sec. table key""" + """Test Dataset raise ValueError if an MT spec doesn't have a sec. 
table key""" bad_spec, y = self.create_fixture_dataset_spec() dataframe, _ = bad_spec["tables"]["D"] bad_spec["tables"]["D"] = (dataframe, None) expected_msg = ( - "key of the secondary table 'D' is 'None': " - "table keys must be specified in multitable datasets" + "key of secondary table 'D' is 'None': " + "table keys must be specified in multi-table datasets" ) self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) @@ -670,15 +593,22 @@ def test_dict_spec_format_must_be_tuple(self): """Test Dataset raising a TypeError if the format field is not a tuple""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["format"] = AnotherType() - expected_msg = type_error_message("X['format']", bad_spec["format"], tuple) + expected_msg = type_error_message("'format' entry", bad_spec["format"], tuple) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) + def test_dict_spec_format_must_have_size_2(self): + """Test Dataset raising a ValueError if its 'format' entry is not of size 2""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["format"] = (",", True, AnotherType(), AnotherType(), AnotherType()) + expected_msg = "'format' entry must be a tuple of size 2, not 5" + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + def test_dict_spec_format_tuple_1st_element_must_be_str(self): """Test Dataset raising a TypeError if any of the format fields are not str""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["format"] = (AnotherType(), True) expected_msg = type_error_message( - "X['format'] 1st element", bad_spec["format"][0], str + "'format' tuple's 1st element (separator)", bad_spec["format"][0], str ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) @@ -690,7 +620,7 @@ def test_dict_spec_format_tuple_2nd_element_must_be_bool(self): ) bad_spec["format"] = (",", AnotherType()) expected_msg = type_error_message( - "X['format'] 2nd element", bad_spec["format"][1], bool + "'format' tuple's 2nd element (header)", bad_spec["format"][1], bool ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) @@ -701,7 +631,7 @@ def test_dict_spec_format_tuple_1st_element_must_be_a_single_character(self): data_type="file", ) bad_spec["format"] = (";;", True) - expected_msg = "Separator must be a single character. 
Value: ;;" + expected_msg = "'format' separator must be a single char, got ';;'" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_y_type_must_be_series_or_df_when_x_is_df_spec(self): @@ -754,7 +684,7 @@ def test_dict_spec_table_name_must_be_str(self): expected_msg = type_error_message("name", AnotherType(), str) self.assertEqual(output_error_msg, expected_msg) - def test_dict_spec_table_name_is_empty_string(self): + def test_dict_spec_table_nameis_empty_string(self): """Test Dataset raising ValueError when a table name is empty""" spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) with self.assertRaises(ValueError) as context: @@ -803,6 +733,125 @@ def test_dict_spec_key_column_type_must_be_str_or_int(self): output_error_msg = str(context.exception) self.assertEqual(output_error_msg, expected_error_msg) + def test_dict_spec_relations_must_be_list_like(self): + """Test Dataset raising TypeError when dict spec "relations" is a dict-like""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"] = AnotherType() + expected_msg = type_error_message( + "'relations' entry", + bad_spec["relations"], + "list-like", + ) + self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) + + def test_dict_spec_relations_must_be_tuple(self): + """Test Dataset raising TypeError when a relation is not a tuple""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"][0] = AnotherType() + expected_msg = type_error_message("Relation", bad_spec["relations"][0], "tuple") + self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) + + def test_dict_spec_relations_must_be_of_size_2_or_3(self): + """Test Dataset raising ValueError when a relation is not of size 2 or 3""" + bad_spec, y = self.create_fixture_dataset_spec() + for size in [0, 1, 4, 5]: + bad_spec["relations"][0] = tuple((f"Table{i}" for i in range(size))) + expected_msg = f"A relation must be of size 2 or 3, not {size}" + with self.subTest(tuple_size=size): + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_table_relation_must_be_str(self): + """Test Dataset raising TypeError when a relation table is not a str""" + # Test the error in the left table + bad_spec, y = self.create_fixture_dataset_spec() + first_relation = bad_spec["relations"][0] + bad_spec["relations"][0] = (AnotherType(), "D") + expected_msg = type_error_message( + "Relation #1's parent table", bad_spec["relations"][0][0], str + ) + self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) + + # Test the error in the right table + bad_spec["relations"][0] = first_relation + bad_spec["relations"][1] = ("A", AnotherType()) + expected_msg = type_error_message( + "Relation #2's child table", bad_spec["relations"][1][1], str + ) + self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) + + def test_dict_spec_entiy_flag_relation_must_be_bool(self): + """Test Dataset raising TypeError when the entity flag is not boolean""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"][0] = ("B", "D", AnotherType()) + expected_msg = type_error_message( + "Relation #1 (B, D) 1-1 flag", bad_spec["relations"][0][2], bool + ) + self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) + + def test_dict_spec_relation_tables_must_not_be_the_same(self): + """Test Dataset raising ValueError when tables of a relation are equal""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"][0] = ("Table", "Table") + 
expected_msg = ( + "Relation #1's tables are equal: (Table, Table). They must be different." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_table_must_be_in_table_list(self): + """Test Dataset raising ValueError when a rel. table is not in the table list""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"][0] = ("NonExistentTable", "D") + expected_msg = ( + "Relation #1 (NonExistentTable, D) contains " + "non-existent table 'NonExistentTable'. " + "All relation tables must exist in the 'tables' entry." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_must_appear_once(self): + """Test Dataset raising ValueError if a relation appears more than once""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"].append(("B", "D")) + expected_msg = ( + "Relation #1 (B, D) occurs 2 times. Each relation must be unique." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_non_hierarchical_key(self): + """Test Dataset raising ValueError on non-hierarchical table keys""" + ref_spec, y = self.create_fixture_dataset_spec() + bad_spec = { + "main_table": "B", + "tables": { + "A": ref_spec["tables"]["A"], + "B": ref_spec["tables"]["B"], + "C": ref_spec["tables"]["C"], + }, + "relations": [("A", "C"), ("B", "A")], + } + expected_msg = ( + "Relation #2 child table 'A' key ([User_ID]) " + "does not contain that of parent table 'B' ([User_ID, VAR_1])." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_cycle(self): + """Test Dataset raising ValueError when there is a relation cycle""" + ref_spec, y = self.create_fixture_dataset_spec() + bad_spec = { + "main_table": "A", + "tables": { + "A": ref_spec["tables"]["A"], + "B": ref_spec["tables"]["B"], + "C": ref_spec["tables"]["C"], + }, + "relations": [("A", "C"), ("A", "B"), ("C", "A")], + } + expected_msg = ( + "'relations' entry contains a cycle that includes " "the relation (C, A)." 
+ ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + ############################ # Tests for DatasetTable's # ############################ @@ -917,9 +966,9 @@ def test_file_table_internal_file_creation_fails_on_an_existing_path(self): expected_msg_prefix = "Cannot overwrite this table's path" self.assertIn(expected_msg_prefix, output_error_msg) - #################################################### - # Tests for X tuple and sequence spec (deprecated) # - #################################################### + ########################################################## + # Tests for tuple and sequence dataset spec (deprecated) # + ########################################################## def test_tuple_spec_must_have_length_2(self): """Test that `.Dataset` raises `ValueError` when the tuple is not of size 2""" From 40f0d2aa9d823db62527a00aa775bee72375ddf1 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 15 Apr 2024 17:24:41 +0200 Subject: [PATCH 02/12] Undeprecate file-based datasets --- khiops/sklearn/estimators.py | 2 +- khiops/sklearn/tables.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index d863e532..e4980857 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -525,7 +525,7 @@ def _transform_deploy_model( def _transform_check_dataset(self, dataset): """Checks the dataset before deploying a model on them""" - if not dataset.is_in_memory() and self.output_dir is None: + if not dataset.is_in_memory() or self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") def _transform_deployment_post_process( diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 0b347968..062ba39f 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -642,15 +642,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): # Initialize a file dataset if isinstance(main_table_source, str): - warnings.warn( - deprecation_message( - "File-path dataset input", - "11.0.0", - "dataframe-based dataset or khiops.core API", - quote=False, - ), - stacklevel=4, - ) # Obtain the file format parameters if "format" in X: self.sep, self.header = X["format"] From dbb4a5a8a20c569b4cd183ec3d9124d3279ebcc6 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:53:34 +0200 Subject: [PATCH 03/12] Use Khiops to detect type for file datasets Also: - Simplification of the dictionary dataset tests - Add exceptions to dictionary dataset fixtures - Use a fixed seed for the generated data --- khiops/sklearn/estimators.py | 48 ++- khiops/sklearn/tables.py | 139 +++++-- tests/test_dataset_class.py | 759 ++++++++++++----------------------- tests/test_dataset_errors.py | 39 +- tests/test_sklearn.py | 8 +- 5 files changed, 408 insertions(+), 585 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index e4980857..dd320c05 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -122,34 +122,44 @@ def _check_dictionary_compatibility( ) -def _check_categorical_target_type(dataset): - assert ( - dataset.main_table.target_column_id is not None - ), "Target column not specified in dataset." 
- if not ( - isinstance(dataset.target_column_type, pd.CategoricalDtype) - or pd.api.types.is_string_dtype(dataset.target_column_type) - or pd.api.types.is_integer_dtype(dataset.target_column_type) - or pd.api.types.is_float_dtype(dataset.target_column_type) +def _check_categorical_target_type(ds): + if ds.target_column_type is None: + raise ValueError("Target vector is not specified.") + + if ds.is_in_memory() and not ( + isinstance(ds.target_column_type, pd.CategoricalDtype) + or pd.api.types.is_string_dtype(ds.target_column_type) + or pd.api.types.is_integer_dtype(ds.target_column_type) + or pd.api.types.is_float_dtype(ds.target_column_type) ): raise ValueError( - f"'y' has invalid type '{dataset.target_column_type}'. " + f"'y' has invalid type '{ds.target_column_type}'. " "Only string, integer, float and categorical types " "are accepted for the target." ) + elif not ds.is_in_memory() and ds.target_column_type != "Categorical": + raise ValueError( + f"Target column has invalid type '{ds.target_column_type}'. " + "Only Categorical types are accepted for file datasets." + ) -def _check_numerical_target_type(dataset): - assert ( - dataset.main_table.target_column_id is not None - ), "Target column not specified in dataset." - if not pd.api.types.is_numeric_dtype(dataset.target_column_type): +def _check_numerical_target_type(ds): + if ds.target_column_type is None: + raise ValueError("Target vector is not specified.") + if ds.is_in_memory(): + if not pd.api.types.is_numeric_dtype(ds.target_column_type): + raise ValueError( + f"Unknown label type '{ds.target_column_type}'. " + "Expected a numerical type." + ) + if ds.main_table.target_column is not None: + assert_all_finite(ds.main_table.target_column) + elif not ds.is_in_memory() and ds.target_column_type != "Numerical": raise ValueError( - f"Unknown label type '{dataset.target_column_type}'. " - "Expected a numerical type." + f"Target column has invalid type '{ds.target_column_type}'. " + "Only Numerical types are accepted for file datasets." ) - if dataset.is_in_memory() and dataset.main_table.target_column is not None: - assert_all_finite(dataset.main_table.target_column) def _cleanup_dir(target_dir): diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py index 062ba39f..2136e13e 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/sklearn/tables.py @@ -7,6 +7,8 @@ """Classes for handling diverse data tables""" import csv import io +import json +import os import warnings from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence @@ -20,6 +22,7 @@ import khiops.core as kh import khiops.core.internals.filesystems as fs from khiops.core.dictionary import VariableBlock +from khiops.core.exceptions import KhiopsRuntimeError from khiops.core.internals.common import ( deprecation_message, is_dict_like, @@ -125,6 +128,17 @@ def check_multitable_spec(ds_spec): "table keys must be specified in multi-table datasets" ) + # Check that all the tables have the same type as the main + main_table_type = type(ds_spec["tables"][ds_spec["main_table"]][0]) + for table_name, (table_source, _) in ds_spec["tables"].items(): + if table_name != ds_spec["main_table"]: + if not isinstance(table_source, main_table_type): + raise ValueError( + f"Secondary table '{table_name}' has type " + f"'{type(table_source).__name__}' which is different from the " + f"main table's type '{main_table_type.__name__}'." 
+ ) + # If the 'relations' entry exists check it if "relations" in ds_spec: relations_spec = ds_spec["relations"] @@ -290,14 +304,16 @@ def get_khiops_type(numpy_type): lower_numpy_type = str(numpy_type).lower() # timedelta64 and datetime64 types - if "time" in lower_numpy_type: - return "Timestamp" + if "datetime64" in lower_numpy_type or "timedelta64" in lower_numpy_type: + khiops_type = "Timestamp" # float, int, uint types elif "int" in lower_numpy_type or "float" in lower_numpy_type: - return "Numerical" + khiops_type = "Numerical" # bool_ and object, character, bytes_, str_, void, record and other types else: - return "Categorical" + khiops_type = "Categorical" + + return khiops_type def read_internal_data_table(file_path_or_stream): @@ -898,14 +914,7 @@ def create_table_files_for_khiops(self, target_dir, sort=True): @property def target_column_type(self): """The target column's type""" - if self.main_table.target_column_id is None: - raise ValueError("Target column is not set") - if self.is_in_memory(): - return self.main_table.target_column.dtype - else: - return self.main_table.table_sample_df.dtypes[ - self.main_table.target_column_id - ] + return self.main_table.target_column_type def __repr__(self): return str(self.create_khiops_dictionary_domain()) @@ -1209,6 +1218,13 @@ def _create_dataframe_copy(self): return output_dataframe + @property + def target_column_type(self): + target_column_type = None + if self.target_column is not None: + target_column_type = self.target_column.dtype + return target_column_type + class NumpyTable(DatasetTable): """Table encapsulating (X,y) pair with types (ndarray, ndarray) @@ -1317,6 +1333,13 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path + @property + def target_column_type(self): + target_column_type = None + if self.target_column is not None: + target_column_type = self.target_column.dtype + return target_column_type + class SparseTable(DatasetTable): """Table encapsulating feature matrix X and target array y @@ -1494,6 +1517,13 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path + @property + def target_column_type(self): + target_column_type = None + if self.target_column is not None: + target_column_type = self.target_column.dtype + return target_column_type + class FileTable(DatasetTable): """A table representing a delimited text file @@ -1529,39 +1559,67 @@ def __init__( # Initialize parameters super().__init__(name=name, categorical_target=categorical_target, key=key) - # Check inputs specific to this sub-class + # Check the parameters specific to this sub-class if not isinstance(path, str): raise TypeError(type_error_message("path", path, str)) if not fs.exists(path): raise ValueError(f"Non-existent data table file: {path}") # Initialize members specific to this sub-class - self.path = path + self.data_source = path self.sep = sep self.header = header self.target_column_id = target_column_id - # Obtain the columns and their types from a sample of the data table - # We build the sample by reading the first 100 rows / 4MB of the file - table_file_head_contents = fs.read(self.path, size=4096 * 1024 - 1) - with io.BytesIO(table_file_head_contents) as table_file_head_contents_stream: - self.table_sample_df = pd.read_csv( - table_file_head_contents_stream, - nrows=100, - sep=self.sep, - header=0 if self.header else None, + # Build a dictionary file from the input data table + # Note: We use export_dictionary_as_json instead of read_dictionary_file + # because 
it makes fail the sklearn mocked tests (this is technical debt) + try: + tmp_kdic_path = kh.get_runner().create_temp_file("file_table_", ".kdic") + tmp_kdicj_path = kh.get_runner().create_temp_file("file_table_", ".kdicj") + kh.build_dictionary_from_data_table( + self.data_source, + self.name, + tmp_kdic_path, + field_separator=self.sep, + header_line=header, ) + kh.export_dictionary_as_json(tmp_kdic_path, tmp_kdicj_path) + with open(tmp_kdicj_path, encoding="utf8") as tmp_kdicj: + json_domain = json.load(tmp_kdicj) + finally: + os.remove(tmp_kdic_path) + os.remove(tmp_kdicj_path) + + # Alert the user if the parsing failed + if len(json_domain["dictionaries"]) == 0: + raise KhiopsRuntimeError( + f"Failed to build a dictionary " + f"from data table file: {self.data_source}" + ) + + # Set the column names and types + assert json_domain["dictionaries"][0]["name"] == self.name + variables = json_domain["dictionaries"][0]["variables"] + self.column_ids = [var["name"] for var in variables] + self.khiops_types = {var["name"]: var["type"] for var in variables} - # Raise error if there is no data in the table - if self.table_sample_df.shape[0] == 0: - raise ValueError(f"Empty data table file: {self.path}") + # Check the target column exists + if ( + self.target_column_id is not None + and target_column_id not in self.column_ids + ): + raise ValueError( + f"Target column '{target_column_id}'" + f"not present in columns '{self.column_ids}'" + ) - # Save the columns and their types - self.column_ids = self.table_sample_df.columns.values - self.khiops_types = { - column_id: get_khiops_type(data_type) - for column_id, data_type in self.table_sample_df.dtypes.items() - } + # Force the target column type from the parameters + if self.target_column_id is not None: + if categorical_target: + self.khiops_types[target_column_id] = "Categorical" + else: + self.khiops_types[target_column_id] = "Numerical" # Check key integrity self.check_key() @@ -1587,8 +1645,8 @@ def create_table_file_for_khiops(self, output_dir, sort=True): ) # Fail if they have the same path - if output_table_file_path == self.path: - raise ValueError(f"Cannot overwrite this table's path: {self.path}") + if output_table_file_path == self.data_source: + raise ValueError(f"Cannot overwrite this table's path: {self.data_source}") # Create a sorted copy if requested if sort: @@ -1601,7 +1659,7 @@ def create_table_file_for_khiops(self, output_dir, sort=True): kh.sort_data_table( sort_dictionary_domain, self.name, - self.path, + self.data_source, output_table_file_path, self.key, field_separator=self.sep, @@ -1612,6 +1670,15 @@ def create_table_file_for_khiops(self, output_dir, sort=True): # Otherwise copy the contents to the output file else: - fs.write(output_table_file_path, fs.read(self.path)) + fs.write(output_table_file_path, fs.read(self.data_source)) return output_table_file_path + + @property + def target_column_type(self): + target_column_type = None + if self.target_column_id is not None: + target_column_type = ( + "Categorical" if self.categorical_target else "Numerical" + ) + return target_column_type diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 3a0d417f..b1dba2d4 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -8,6 +8,7 @@ import os import shutil import unittest +import warnings import numpy as np import pandas as pd @@ -19,7 +20,7 @@ from khiops.sklearn.tables import Dataset -class KhiopsConsistensyOfFilesAndDictionariesWithInputDataTests(unittest.TestCase): +class 
DatasetInputOutputConsistency(unittest.TestCase): """Test consistency of the created files with the input data The following tests allow to verify that: @@ -85,7 +86,7 @@ def create_monotable_dataframe(self): "2019-03-29", "2019-03-30", "2019-03-31", - ] + ], ), "New": [ True, @@ -170,6 +171,7 @@ def create_multitable_star_data_files(self, main_table_path, secondary_table_pat secondary_table.to_csv(secondary_table_path, sep="\t", index=False) def create_multitable_snowflake_dataframes(self): + np.random.seed(31416) main_table_data = { "User_ID": [ "60B2Xk_3Fw", @@ -255,55 +257,53 @@ def create_multitable_snowflake_data_files( tertiary_table.to_csv(tertiary_table_path, sep="\t", index=False) quaternary_table.to_csv(quaternary_table_path, sep="\t", index=False) - def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema): + def create_fixture_ds_spec(self, output_dir, data_type, multitable, schema): if not multitable: if data_type == "df": - reference_table = self.create_monotable_dataframe() - features = reference_table.drop(["class"], axis=1) - dataset_spec = { + ref_table = self.create_monotable_dataframe() + features = ref_table.drop(["class"], axis=1) + ds_spec = { "main_table": "Reviews", "tables": {"Reviews": (features, "User_ID")}, } - label = reference_table["class"] + label = ref_table["class"] else: assert data_type == "file" - reference_table_path = os.path.join(output_dir, "Reviews.csv") - self.create_monotable_data_file(reference_table_path) - dataset_spec = { + ref_table_path = os.path.join(output_dir, "Reviews.csv") + self.create_monotable_data_file(ref_table_path) + ds_spec = { "main_table": "Reviews", - "tables": {"Reviews": (reference_table_path, "User_ID")}, + "tables": {"Reviews": (ref_table_path, "User_ID")}, "format": ("\t", True), } label = "class" elif schema == "star": if data_type == "df": ( - reference_main_table, - reference_secondary_table, + ref_main_table, + ref_secondary_table, ) = self.create_multitable_star_dataframes() - features_reference_main_table = reference_main_table.drop( - "class", axis=1 - ) - dataset_spec = { + features_ref_main_table = ref_main_table.drop("class", axis=1) + ds_spec = { "main_table": "id_class", "tables": { - "id_class": (features_reference_main_table, "User_ID"), - "logs": (reference_secondary_table, "User_ID"), + "id_class": (features_ref_main_table, "User_ID"), + "logs": (ref_secondary_table, "User_ID"), }, } - label = reference_main_table["class"] + label = ref_main_table["class"] else: assert data_type == "file" - reference_main_table_path = os.path.join(output_dir, "id_class.csv") - reference_secondary_table_path = os.path.join(output_dir, "logs.csv") + ref_main_table_path = os.path.join(output_dir, "id_class.csv") + ref_secondary_table_path = os.path.join(output_dir, "logs.csv") self.create_multitable_star_data_files( - reference_main_table_path, reference_secondary_table_path + ref_main_table_path, ref_secondary_table_path ) - dataset_spec = { + ds_spec = { "main_table": "id_class", "tables": { - "id_class": (reference_main_table_path, "User_ID"), - "logs": (reference_secondary_table_path, "User_ID"), + "id_class": (ref_main_table_path, "User_ID"), + "logs": (ref_secondary_table_path, "User_ID"), }, "format": ("\t", True), } @@ -312,30 +312,28 @@ def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema) assert schema == "snowflake" if data_type == "df": ( - reference_main_table, - reference_secondary_table_1, - reference_secondary_table_2, - reference_tertiary_table, - 
reference_quaternary_table, + ref_main_table, + ref_secondary_table_1, + ref_secondary_table_2, + ref_tertiary_table, + ref_quaternary_table, ) = self.create_multitable_snowflake_dataframes() - features_reference_main_table = reference_main_table.drop( - "class", axis=1 - ) - dataset_spec = { + features_ref_main_table = ref_main_table.drop("class", axis=1) + ds_spec = { "main_table": "A", "tables": { "D": ( - reference_tertiary_table, + ref_tertiary_table, ["User_ID", "VAR_1", "VAR_2"], ), - "B": (reference_secondary_table_1, ["User_ID", "VAR_1"]), + "B": (ref_secondary_table_1, ["User_ID", "VAR_1"]), "E": ( - reference_quaternary_table, + ref_quaternary_table, ["User_ID", "VAR_1", "VAR_2", "VAR_3"], ), - "C": (reference_secondary_table_2, ["User_ID"]), - "A": (features_reference_main_table, "User_ID"), + "C": (ref_secondary_table_2, ["User_ID"]), + "A": (features_ref_main_table, "User_ID"), }, "relations": [ ("B", "D", False), @@ -344,40 +342,40 @@ def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema) ("A", "B", False), ], } - label = reference_main_table["class"] + label = ref_main_table["class"] else: assert data_type == "file" - reference_main_table_path = os.path.join(output_dir, "A.csv") - reference_secondary_table_path_1 = os.path.join(output_dir, "B.csv") - reference_secondary_table_path_2 = os.path.join(output_dir, "C.csv") - reference_tertiary_table_path = os.path.join(output_dir, "D.csv") - reference_quaternary_table_path = os.path.join(output_dir, "E.csv") + ref_main_table_path = os.path.join(output_dir, "A.csv") + ref_secondary_table_path_1 = os.path.join(output_dir, "B.csv") + ref_secondary_table_path_2 = os.path.join(output_dir, "C.csv") + ref_tertiary_table_path = os.path.join(output_dir, "D.csv") + ref_quaternary_table_path = os.path.join(output_dir, "E.csv") self.create_multitable_snowflake_data_files( - reference_main_table_path, - reference_secondary_table_path_1, - reference_secondary_table_path_2, - reference_tertiary_table_path, - reference_quaternary_table_path, + ref_main_table_path, + ref_secondary_table_path_1, + ref_secondary_table_path_2, + ref_tertiary_table_path, + ref_quaternary_table_path, ) - dataset_spec = { + ds_spec = { "main_table": "A", "tables": { "B": ( - reference_secondary_table_path_1, + ref_secondary_table_path_1, ["User_ID", "VAR_1"], ), "E": ( - reference_quaternary_table_path, + ref_quaternary_table_path, ["User_ID", "VAR_1", "VAR_2", "VAR_3"], ), "C": ( - reference_secondary_table_path_2, + ref_secondary_table_path_2, ["User_ID"], ), - "A": (reference_main_table_path, "User_ID"), + "A": (ref_main_table_path, "User_ID"), "D": ( - reference_tertiary_table_path, + ref_tertiary_table_path, ["User_ID", "VAR_1", "VAR_2"], ), }, @@ -391,12 +389,12 @@ def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema) } label = "class" - return dataset_spec, label + return ds_spec, label - def get_reference_dictionaries(self, multitable, schema=None): - reference_dictionaries = [] + def get_ref_var_types(self, multitable, data_type="df", schema=None): + ref_var_types = {} if not multitable: - reference_dictionary = { + ref_var_types["Reviews"] = { "User_ID": "Categorical", "Age": "Numerical", "Clothing ID": "Numerical", @@ -407,32 +405,42 @@ def get_reference_dictionaries(self, multitable, schema=None): "Positive Feedback average": "Numerical", "class": "Categorical", } - reference_dictionaries.extend([reference_dictionary]) + # Special type changes for file datasets: + # - "Date" field from "Timestamp" to 
"Date", the type Khiops detects + # - "Recommended IND" field from "Numerical" to "Categorical" because + # Khiops doesn't parse it well + if data_type == "file": + ref_var_types["Reviews"]["Date"] = "Date" + ref_var_types["Reviews"]["Recommended IND"] = "Categorical" + warnings.warn("Changed field `Recommended IND` to avoid a Khiops bug") elif schema == "star": - reference_main_dictionary = { + ref_var_types["id_class"] = { "User_ID": "Categorical", "class": "Categorical", "logs": "Table", } - reference_secondary_dictionary = { + ref_var_types["logs"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Numerical", "VAR_3": "Numerical", "VAR_4": "Numerical", } - reference_dictionaries.extend( - [reference_main_dictionary, reference_secondary_dictionary] - ) + # Special change for the file type: + # - logs.VAR_3 is binary and detected as "Categorical" by Khiops + if data_type == "file": + ref_var_types["logs"]["VAR_3"] = "Categorical" else: - assert schema == "snowflake" - reference_main_dictionary = { + assert ( + schema == "snowflake" + ), f"'schema' should be 'snowflake' not '{schema}'" + ref_var_types["A"] = { "User_ID": "Categorical", "class": "Categorical", "B": "Table", "C": "Entity", } - reference_secondary_dictionary_1 = { + ref_var_types["B"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Numerical", @@ -440,55 +448,52 @@ def get_reference_dictionaries(self, multitable, schema=None): "VAR_4": "Numerical", "D": "Table", } - reference_secondary_dictionary_2 = { + ref_var_types["C"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Numerical", "VAR_3": "Numerical", "VAR_4": "Numerical", } - reference_tertiary_dictionary = { + ref_var_types["D"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Categorical", "VAR_3": "Numerical", "E": "Table", } - reference_quaternary_dictionary = { + ref_var_types["E"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Categorical", "VAR_3": "Categorical", "VAR_4": "Categorical", } - reference_dictionaries.extend( - [ - reference_main_dictionary, - reference_secondary_dictionary_1, - reference_secondary_dictionary_2, - reference_tertiary_dictionary, - reference_quaternary_dictionary, - ] - ) + # Special change for the file type: + # - B.VAR_3 is binary and detected as "Categorical" by Khiops + # - C.VAR_3 is binary and detected as "Categorical" by Khiops + if data_type == "file": + ref_var_types["B"]["VAR_3"] = "Categorical" + ref_var_types["C"]["VAR_3"] = "Categorical" - return reference_dictionaries + return ref_var_types def test_dataset_is_correctly_built(self): """Test that the dataset structure is consistent with the input spec""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=True, schema="snowflake" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) self.assertEqual(dataset.main_table.name, "A") self.assertEqual(len(dataset.secondary_tables), 4) - dataset_secondary_table_names = set( + dataset_secondary_table_names = { secondary_table.name for secondary_table in dataset.secondary_tables - ) + } self.assertEqual(dataset_secondary_table_names, {"B", "C", "D", "E"}) self.assertEqual(len(dataset.relations), 4) - spec_relations = dataset_spec["relations"] + spec_relations = ds_spec["relations"] for relation, spec_relation in zip(dataset.relations, spec_relations): self.assertEqual(relation[:2], spec_relation[:2]) if len(spec_relation) == 3: @@ 
-496,34 +501,34 @@ def test_dataset_is_correctly_built(self): else: self.assertFalse(relation[2]) - def test_created_file_from_dataframe_monotable(self): + def test_out_file_from_dataframe_monotable(self): """Test consistency of the created data file with the input dataframe - This test verifies that the content of the input dataframe is equal to that of the csv file created by khiops.sklearn. """ # Create a monotable dataset object from fixture data - spec, y = self.create_fixture_dataset_spec( + spec, y = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=False, schema=None ) dataset = Dataset(spec, y=y) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - created_table = pd.read_csv(created_table_path, sep="\t") + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + out_table = pd.read_csv(out_table_path, sep="\t") # Cast "Date" columns to datetime as we don't automatically recognize dates - created_table["Date"] = created_table["Date"].astype("datetime64[ns]") - reference_table = spec["tables"]["Reviews"][0] - reference_table["class"] = y + out_table["Date"] = out_table["Date"].astype("datetime64[ns]") + ref_table = spec["tables"]["Reviews"][0] + ref_table["class"] = y # Check that the dataframes are equal assert_frame_equal( - created_table, - reference_table.sort_values(by="User_ID").reset_index(drop=True), + out_table, + ref_table.sort_values(by="User_ID").reset_index(drop=True), ) - def test_created_file_from_numpy_array_monotable(self): + def test_out_file_from_numpy_array_monotable(self): """Test consistency of the created data file with the input numpy array""" # Create a monotable dataset from a numpy array iris = datasets.load_iris() @@ -531,14 +536,12 @@ def test_created_file_from_numpy_array_monotable(self): dataset = Dataset(spec, y=iris.target, categorical_target=True) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - created_table = np.loadtxt( - created_table_path, delimiter="\t", skiprows=1, ndmin=2 - ) + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + out_table = np.loadtxt(out_table_path, delimiter="\t", skiprows=1, ndmin=2) # Check that the arrays are equal assert_equal( - created_table, + out_table, np.concatenate( (iris.data, iris.target.reshape(len(iris.target), 1)), axis=1 ), @@ -580,7 +583,7 @@ def _load_khiops_sparse_file(self, stream): sparse_matrix = sp.csr_matrix(feature_matrix) return sparse_matrix, target_array - def test_created_file_from_sparse_matrix_monotable(self): + def test_out_file_from_sparse_matrix_monotable(self): """Test consistency of the created data file with the input sparse matrix""" # Load input sparse matrix and target array @@ -594,10 +597,10 @@ def test_created_file_from_sparse_matrix_monotable(self): X=input_sparse_matrix, y=input_target, categorical_target=True ) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - with open(created_table_path, "rb") as created_table_stream: + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + with open(out_table_path, "rb") as out_table_stream: sparse_matrix, target_array = self._load_khiops_sparse_file( - created_table_stream + out_table_stream ) # Check that the arrays are equal @@ -614,7 +617,7 @@ def test_created_file_from_sparse_matrix_monotable(self): ), ) - def 
test_created_file_from_sparse_matrix_monotable_specification(self): + def test_out_file_from_sparse_matrix_monotable_specification(self): """Test consistency of the created data file with the input sparse matrix""" # Load input sparse matrix and target array @@ -628,10 +631,10 @@ def test_created_file_from_sparse_matrix_monotable_specification(self): dataset = Dataset(spec, y=input_target, categorical_target=True) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - with open(created_table_path, "rb") as created_table_stream: + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + with open(out_table_path, "rb") as out_table_stream: sparse_matrix, target_array = self._load_khiops_sparse_file( - created_table_stream + out_table_stream ) # Check that the arrays are equal @@ -648,31 +651,31 @@ def test_created_file_from_sparse_matrix_monotable_specification(self): ), ) - def test_created_file_from_data_file_monotable(self): + def test_out_file_from_data_file_monotable(self): """Test consistency of the created data file with the input data file - This test verifies that the content of the input data file is equal to that of the csv file created by khiops.sklearn. """ # Create the test dataset - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=self.output_dir, data_type="file", multitable=False, schema=None ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - created_table = pd.read_csv(created_table_path, sep="\t") + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + out_table = pd.read_csv(out_table_path, sep="\t") - reference_table_path = dataset_spec["tables"]["Reviews"][0] - reference_table = pd.read_csv(reference_table_path, sep="\t") + ref_table_path = ds_spec["tables"]["Reviews"][0] + ref_table = pd.read_csv(ref_table_path, sep="\t") # Check that the dataframes are equal assert_frame_equal( - created_table, - reference_table.sort_values(by="User_ID").reset_index(drop=True), + ref_table.sort_values(by="User_ID").reset_index(drop=True), + out_table, ) - def test_created_files_from_dataframes_multitable_star(self): + def test_out_files_from_dataframes_multitable_star(self): """Test consistency of the created data files with the input dataframes - This test verifies that the content of the input dataframes, defined through a @@ -680,10 +683,10 @@ def test_created_files_from_dataframes_multitable_star(self): schema of the dataset is "star". 
""" # Create the test dataset - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=True, schema="star" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) # Create the Khiops intermediary files ( @@ -693,487 +696,237 @@ def test_created_files_from_dataframes_multitable_star(self): # Load the intermediary files secondary_table_path = secondary_table_paths["logs"] - created_main_table = pd.read_csv(main_table_path, sep="\t") - created_secondary_table = pd.read_csv(secondary_table_path, sep="\t") + out_main_table = pd.read_csv(main_table_path, sep="\t") + out_secondary_table = pd.read_csv(secondary_table_path, sep="\t") - reference_main_table = dataset_spec["tables"]["id_class"][0] - reference_main_table["class"] = label - reference_secondary_table = dataset_spec["tables"]["logs"][0] + ref_main_table = ds_spec["tables"]["id_class"][0] + ref_main_table["class"] = label + ref_secondary_table = ds_spec["tables"]["logs"][0] # Clean created test data assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) assert_frame_equal( - created_secondary_table.sort_values( - by=created_secondary_table.columns.tolist(), ascending=True + ref_secondary_table.sort_values( + by=ref_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_secondary_table.sort_values( - by=reference_secondary_table.columns.tolist(), ascending=True + out_secondary_table.sort_values( + by=out_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_files_from_data_files_multitable_star(self): + def test_out_files_from_data_files_multitable_star(self): """Test consistency of the created data files with the input data files - This test verifies that the content of the input data files, defined through a dictionary, is equal to that of the csv files created by khiops.sklearn. The schema of the dataset is "star". 
""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=self.output_dir, data_type="file", multitable=True, schema="star" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) main_table_path, dico_secondary_table = dataset.create_table_files_for_khiops( self.output_dir ) secondary_table_path = dico_secondary_table["logs"] - created_main_table = pd.read_csv(main_table_path, sep="\t") - created_secondary_table = pd.read_csv(secondary_table_path, sep="\t") - - reference_table_path = dataset_spec["tables"]["id_class"][0] - reference_main_table = pd.read_csv(reference_table_path, sep="\t") - reference_secondary_table_path = dataset_spec["tables"]["logs"][0] - reference_secondary_table = pd.read_csv( - reference_secondary_table_path, sep="\t" - ) + out_main_table = pd.read_csv(main_table_path, sep="\t") + out_secondary_table = pd.read_csv(secondary_table_path, sep="\t") + + ref_table_path = ds_spec["tables"]["id_class"][0] + ref_main_table = pd.read_csv(ref_table_path, sep="\t") + ref_secondary_table_path = ds_spec["tables"]["logs"][0] + ref_secondary_table = pd.read_csv(ref_secondary_table_path, sep="\t") # assertions assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) assert_frame_equal( - created_secondary_table.sort_values( - by=created_secondary_table.columns.tolist(), ascending=True + ref_secondary_table.sort_values( + by=ref_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_secondary_table.sort_values( - by=reference_secondary_table.columns.tolist(), ascending=True + out_secondary_table.sort_values( + by=out_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_files_from_dataframes_multitable_snowflake(self): + def test_out_files_from_dataframes_multitable_snowflake(self): """Test consistency of the created data files with the input dataframes - This test verifies that the content of the input dataframes, defined through a dictionary, is equal to that of the csv files created by khiops.sklearn. The schema of the dataset is "snowflake". 
""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=True, schema="snowflake" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) ( main_table_path, additional_table_paths, ) = dataset.create_table_files_for_khiops(self.output_dir) - created_main_table = pd.read_csv(main_table_path, sep="\t") - reference_main_table = dataset_spec["tables"]["A"][0] - reference_main_table["class"] = label + out_main_table = pd.read_csv(main_table_path, sep="\t") + ref_main_table = ds_spec["tables"]["A"][0] + ref_main_table["class"] = label # assertions assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) additional_table_names = list(additional_table_paths.keys()) for name in additional_table_names: additional_table_path = additional_table_paths[name] - created_additional_table = pd.read_csv(additional_table_path, sep="\t") - reference_additional_table = dataset_spec["tables"][name][0] + out_additional_table = pd.read_csv(additional_table_path, sep="\t") + ref_additional_table = ds_spec["tables"][name][0] assert_frame_equal( - created_additional_table.sort_values( - by=created_additional_table.columns.tolist(), ascending=True + ref_additional_table.sort_values( + by=ref_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_additional_table.sort_values( - by=reference_additional_table.columns.tolist(), ascending=True + out_additional_table.sort_values( + by=out_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_files_from_data_files_multitable_snowflake(self): + def test_out_files_from_data_files_multitable_snowflake(self): """Test consistency of the created s with the input data files - This test verifies that the content of the input data files, defined through a dictionary, is equal to that of the csv files created by khiops.sklearn. The schema of the dataset is "snowflake". 
""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=self.output_dir, data_type="file", multitable=True, schema="snowflake", ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) main_table_path, additional_table_paths = dataset.create_table_files_for_khiops( self.output_dir ) - created_main_table = pd.read_csv(main_table_path, sep="\t") - reference_main_table_path = dataset_spec["tables"]["A"][0] - reference_main_table = pd.read_csv(reference_main_table_path, sep="\t") + out_main_table = pd.read_csv(main_table_path, sep="\t") + ref_main_table_path = ds_spec["tables"]["A"][0] + ref_main_table = pd.read_csv(ref_main_table_path, sep="\t") # assertions assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) additional_table_names = list(additional_table_paths.keys()) for name in additional_table_names: additional_table_path = additional_table_paths[name] - created_additional_table = pd.read_csv(additional_table_path, sep="\t") - reference_additional_table_path = dataset_spec["tables"][name][0] - reference_additional_table = pd.read_csv( - reference_additional_table_path, sep="\t" - ) + out_additional_table = pd.read_csv(additional_table_path, sep="\t") + ref_additional_table_path = ds_spec["tables"][name][0] + ref_additional_table = pd.read_csv(ref_additional_table_path, sep="\t") assert_frame_equal( - created_additional_table.sort_values( - by=created_additional_table.columns.tolist(), ascending=True + out_additional_table.sort_values( + by=out_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_additional_table.sort_values( - by=reference_additional_table.columns.tolist(), ascending=True + ref_additional_table.sort_values( + by=ref_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_dictionary_from_dataframe_monotable(self): - """Test consistency of the created dictionary with the input dataframe - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input monotable dataset. Data is here provided through a dataframe. - """ - - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=None, data_type="df", multitable=False, schema=None - ) - - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_dictionary = created_dictionary_domain.dictionaries[0] - created_dictionary_variable_types = { - var.name: var.type for var in created_dictionary.variables - } - reference_dictionary_variable_types = self.get_reference_dictionaries( - multitable=False - )[0] - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 1) - self.assertEqual(created_dictionary.name, "Reviews") - self.assertEqual(created_dictionary.root, False) - self.assertEqual(len(created_dictionary.key), 1) - self.assertEqual( - created_dictionary_variable_types, reference_dictionary_variable_types - ) - - def test_created_dictionary_from_data_file_monotable(self): - """Test consistency of the created dictionary with the input data file - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input monotable dataset. 
Data is here provided through a data file. - """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=self.output_dir, data_type="file", multitable=False, schema=None - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_dictionary = created_dictionary_domain.dictionaries[0] - created_dictionary_variable_types = { - var.name: var.type for var in created_dictionary.variables - } - reference_dictionary_variable_types = self.get_reference_dictionaries( - multitable=False - )[0] - reference_dictionary_variable_types["Date"] = "Categorical" - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 1) - self.assertEqual(created_dictionary.name, "Reviews") - self.assertEqual(created_dictionary.root, False) - self.assertEqual(len(created_dictionary.key), 1) - self.assertEqual( - created_dictionary_variable_types, reference_dictionary_variable_types - ) - - def test_created_dictionary_from_dataframes_multitable_star(self): - """Test consistency of the created dictionaries with the input dataframes - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input multitable dataset. Data is here provided through dataframes - and its schema is "star". - """ - - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=None, data_type="df", multitable=True, schema="star" - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_main_dictionary = created_dictionary_domain.dictionaries[0] - created_secondary_dictionary = created_dictionary_domain.dictionaries[1] - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 2) - self.assertEqual(created_main_dictionary.name, "id_class") - self.assertEqual(created_secondary_dictionary.name, "logs") - self.assertEqual(created_main_dictionary.root, True) - self.assertEqual(created_secondary_dictionary.root, False) - self.assertEqual(created_main_dictionary.key[0], "User_ID") - - created_main_dictionary_variable_types = { - var.name: var.type for var in created_main_dictionary.variables - } - created_secondary_dictionary_variable_types = { - var.name: var.type for var in created_secondary_dictionary.variables - } - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="star" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types = reference_dictionaries[1] - - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) - self.assertEqual( - created_secondary_dictionary_variable_types, - reference_secondary_dictionary_variable_types, - ) - - def test_created_dictionary_from_data_files_multitable_star(self): - """Test consistency of the created dictionaries with the input data files - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input multitable dataset. Data is here provided through data files - and its schema is "star". 
- """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=self.output_dir, data_type="file", multitable=True, schema="star" - ) - - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_main_dictionary = created_dictionary_domain.dictionaries[0] - created_secondary_dictionary = created_dictionary_domain.dictionaries[1] - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 2) - self.assertEqual(created_main_dictionary.name, "id_class") - self.assertEqual(created_secondary_dictionary.name, "logs") - self.assertEqual(created_main_dictionary.root, True) - self.assertEqual(created_secondary_dictionary.root, False) - self.assertEqual(created_main_dictionary.key[0], "User_ID") - - created_main_dictionary_variable_types = { - var.name: var.type for var in created_main_dictionary.variables - } - created_secondary_dictionary_variable_types = { - var.name: var.type for var in created_secondary_dictionary.variables - } - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="star" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types = reference_dictionaries[1] - - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) - self.assertEqual( - created_secondary_dictionary_variable_types, - reference_secondary_dictionary_variable_types, - ) - - def test_created_dictionary_from_dataframes_multitable_snowflake(self): - """Test consistency of the created dictionaries with the input dataframes - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input multitable dataset. Data is here provided through dataframes - and its schema is "snowflake". 
- """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=None, data_type="df", multitable=True, schema="snowflake" - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - table_names = dataset_spec["tables"].keys() - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 5) - for name in table_names: - created_dictionary = created_dictionary_domain.get_dictionary(name) - self.assertEqual(created_dictionary.name, name) - if name == "A": - self.assertEqual(created_dictionary.root, True) - self.assertEqual( - created_dictionary.key[0], dataset_spec["tables"][name][1] - ) - else: - self.assertEqual(created_dictionary.root, False) - self.assertEqual( - created_dictionary.key, dataset_spec["tables"][name][1] - ) - - created_main_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("A").variables - } - created_secondary_dictionary_variable_types_1 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("B").variables - } - created_secondary_dictionary_variable_types_2 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("C").variables - } - created_tertiary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("D").variables - } - created_quaternary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("E").variables - } - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="snowflake" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types_1 = reference_dictionaries[1] - reference_secondary_dictionary_variable_types_2 = reference_dictionaries[2] - reference_tertiary_dictionary_variable_types = reference_dictionaries[3] - reference_quaternary_dictionary_variable_types = reference_dictionaries[4] - - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) - self.assertEqual( - created_secondary_dictionary_variable_types_1, - reference_secondary_dictionary_variable_types_1, - ) - self.assertEqual( - created_secondary_dictionary_variable_types_2, - reference_secondary_dictionary_variable_types_2, - ) - self.assertEqual( - created_tertiary_dictionary_variable_types, - reference_tertiary_dictionary_variable_types, - ) - self.assertEqual( - created_quaternary_dictionary_variable_types, - reference_quaternary_dictionary_variable_types, - ) - - def test_created_dictionary_from_data_files_multitable_snowflake(self): - """Test consistency of the created dictionaries with the input data files - - - This test verifies that the dictionary file created by khiops.sklearn - contains information that is consistent with the input multitable dataset. - Data is here provided through data files and its schema is "snowflake". 
- """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=self.output_dir, - data_type="file", - multitable=True, - schema="snowflake", - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - table_names = dataset_spec["tables"].keys() - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 5) - for name in table_names: - created_dictionary = created_dictionary_domain.get_dictionary(name) - self.assertEqual(created_dictionary.name, name) - - if name == "A": - self.assertEqual(created_dictionary.root, True) - self.assertEqual( - created_dictionary.key[0], dataset_spec["tables"][name][1] - ) - - else: - self.assertEqual(created_dictionary.root, False) - self.assertEqual( - created_dictionary.key, dataset_spec["tables"][name][1] + def test_create_khiops_domain(self): + """Test consistency of the dataset method create_khiops_domain""" + fixtures = [ + { + "output_dir": None, + "data_type": "df", + "multitable": False, + "schema": None, + }, + { + "output_dir": self.output_dir, + "data_type": "file", + "multitable": False, + "schema": None, + }, + { + "output_dir": None, + "data_type": "df", + "multitable": True, + "schema": "star", + }, + { + "output_dir": self.output_dir, + "data_type": "file", + "multitable": True, + "schema": "star", + }, + { + "output_dir": None, + "data_type": "df", + "multitable": True, + "schema": "snowflake", + }, + { + "output_dir": self.output_dir, + "data_type": "file", + "multitable": True, + "schema": "snowflake", + }, + ] + + for fixture in fixtures: + with self.subTest(**fixture): + ds = Dataset(*self.create_fixture_ds_spec(**fixture)) + ref_var_types = self.get_ref_var_types( + multitable=fixture["multitable"], + data_type=fixture["data_type"], + schema=fixture["schema"], ) + self._test_domain_coherence(ds, ref_var_types) - created_main_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("A").variables - } - - created_secondary_dictionary_variable_types_1 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("B").variables - } - - created_secondary_dictionary_variable_types_2 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("C").variables - } + def _test_domain_coherence(self, ds, ref_var_types): + # Create the dictionary domain associated to the fixture dataset + out_domain = ds.create_khiops_dictionary_domain() - created_tertiary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("D").variables - } + # Check that the domain has the same number of tables as the dataset + self.assertEqual(len(out_domain.dictionaries), 1 + len(ds.secondary_tables)) - created_quaternary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("E").variables + # Check that the domain has the same table names as the reference + ref_table_names = { + table.name for table in [ds.main_table] + ds.secondary_tables } + out_table_names = {dictionary.name for dictionary in out_domain.dictionaries} + self.assertEqual(ref_table_names, out_table_names) - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="snowflake" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types_1 = reference_dictionaries[1] - reference_secondary_dictionary_variable_types_2 = reference_dictionaries[2] - 
reference_tertiary_dictionary_variable_types = reference_dictionaries[3] - reference_quaternary_dictionary_variable_types = reference_dictionaries[4] - - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) + # Check that the output domain has a root table iff the dataset is multitable self.assertEqual( - created_secondary_dictionary_variable_types_1, - reference_secondary_dictionary_variable_types_1, - ) - self.assertEqual( - created_secondary_dictionary_variable_types_2, - reference_secondary_dictionary_variable_types_2, - ) - self.assertEqual( - created_tertiary_dictionary_variable_types, - reference_tertiary_dictionary_variable_types, - ) - self.assertEqual( - created_quaternary_dictionary_variable_types, - reference_quaternary_dictionary_variable_types, - ) + ds.is_multitable, out_domain.get_dictionary(ds.main_table.name).root + ) + + # Check that: + # - the table keys are the same as the dataset + # - the domain has the same variable names as the reference + for table in [ds.main_table] + ds.secondary_tables: + with self.subTest(table=table.name): + self.assertEqual(table.key, out_domain.get_dictionary(table.name).key) + out_dictionary_var_types = { + var.name: var.type + for var in out_domain.get_dictionary(table.name).variables + } + self.assertEqual(ref_var_types[table.name], out_dictionary_var_types) diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index 97b8442d..49a815da 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -589,6 +589,19 @@ def test_dict_spec_secondary_table_key_must_be_specified(self): ) self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + def test_dict_spec_secondary_tables_must_have_the_same_type_as_the_main_table(self): + """Test Dataset raising ValueError if main and sec. table's types don't match""" + bad_spec, _ = self.create_fixture_dataset_spec() + alt_spec, _ = self.create_fixture_dataset_spec( + data_type="file", output_dir=self.output_dir + ) + bad_spec["tables"]["D"] = alt_spec["tables"]["D"] + expected_msg = ( + "Secondary table 'D' has type 'str' which is different " + "from the main table's type 'DataFrame'." 
+ ) + self.assert_dataset_fails(bad_spec, None, ValueError, expected_msg) + def test_dict_spec_format_must_be_tuple(self): """Test Dataset raising a TypeError if the format field is not a tuple""" bad_spec, y = self.create_fixture_dataset_spec() @@ -656,20 +669,6 @@ def test_dict_spec_y_must_be_str_when_x_is_file_spec(self): ) self.assert_dataset_fails(spec, bad_y, TypeError, expected_msg) - def test_dict_spec_target_column_must_be_specified_to_be_accessed(self): - """Test Dataset raising ValueError when accessing a non specified target col""" - # Disable pointless statement because it is necessary for the test - # pylint: disable=pointless-statement - spec, _ = self.create_fixture_dataset_spec( - output_dir=self.output_dir, data_type="file", multitable=False, schema=None - ) - dataset = Dataset(spec, None) - with self.assertRaises(ValueError) as context: - dataset.target_column_type - output_error_msg = str(context.exception) - expected_error_msg = "Target column is not set" - self.assertEqual(output_error_msg, expected_error_msg) - def test_dict_spec_table_name_must_be_str(self): """Test Dataset raising TypeError when a table name is not a str""" spec, y = self.create_fixture_dataset_spec(multitable=False, schema=None) @@ -935,15 +934,15 @@ def test_file_table_fails_with_non_existent_table_file(self): expected_msg = "Non-existent data table file: Review.csv" self.assertEqual(output_error_msg, expected_msg) - def test_file_table_fails_with_empty_table_file(self): - """Test FileTable failing if it is created with an empty table""" - table_path = os.path.join(self.output_dir, "empty_table.csv") - table = pd.DataFrame(columns=["a", "b"]) + def test_file_table_fails_if_table_does_not_contain_the_target_column(self): + """Test FileTable failing if the table does not contain the target column""" + table_path = os.path.join(self.output_dir, "table.csv") + table = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) table.to_csv(table_path, sep="\t", index=False) with self.assertRaises(ValueError) as context: - FileTable("empty_table", table_path, target_column_id="class") + table = FileTable("table", table_path, target_column_id="class") output_error_msg = str(context.exception) - expected_msg_prefix = "Empty data table file" + expected_msg_prefix = "Target column" self.assertIn(expected_msg_prefix, output_error_msg) def test_file_table_internal_file_creation_fails_on_an_existing_path(self): diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 0024f1b3..ece61467 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -68,12 +68,6 @@ def assertEqualAdditionalDataTableNames( @classmethod def setUpClass(cls): """Prepare datasets for tests""" - # Disable file-path warnings - warnings.filterwarnings( - "ignore", - message="File-path dataset input is deprecated and will be removed", - ) - # Grab output_dir for subsequent deletion cls.output_dir = os.path.join( "resources", "tmp", "test_sklearn_parameter_transfer" @@ -1372,7 +1366,7 @@ def _check_dictionary_domain( expected_additional_data_table_names=(), ): """Check assertions on dictionary domains""" - self.assertIsInstance(dictionary_domain, kh.dictionary.DictionaryDomain) + self.assertIsInstance(dictionary_domain, kh.DictionaryDomain) if expected_n_dictionaries is not None: self.assertEqual( len(dictionary_domain.dictionaries), expected_n_dictionaries From 5f4adb52fe1d13b1a1d7c40b389f9f4875af1fa0 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:10:35 +0200 Subject: 
[PATCH 04/12] Add file name parameter to estimator _transform method --- khiops/sklearn/estimators.py | 9 ++++++++- .../{transformed.txt => predict.txt} | 0 .../{transformed.txt => transform.txt} | 0 .../{transformed.txt => predict.txt} | 0 .../{transformed.txt => predict.txt} | 0 .../{transformed.txt => transform.txt} | 0 .../{transformed.txt => transform.txt} | 0 .../{transformed.txt => predict.txt} | 0 tests/test_remote_access.py | 5 +++-- tests/test_sklearn.py | 17 ++++++++++++++--- 10 files changed, 25 insertions(+), 6 deletions(-) rename tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/{transformed.txt => predict.txt} (100%) rename tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/{transformed.txt => transform.txt} (100%) rename tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/{transformed.txt => predict.txt} (100%) rename tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/{transformed.txt => predict.txt} (100%) rename tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/{transformed.txt => transform.txt} (100%) rename tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/{transformed.txt => transform.txt} (100%) rename tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/{transformed.txt => predict.txt} (100%) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index dd320c05..19c6f498 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -403,6 +403,7 @@ def _transform( computation_dir, _transform_create_deployment_model_fun, drop_key, + transformed_file_name, ): """Generic template method to implement transform, predict and predict_proba""" # Check if the model is fitted @@ -426,6 +427,7 @@ def _transform( deployment_dictionary_domain, self.model_main_dictionary_name_, computation_dir, + transformed_file_name, ) # Post-process to return the correct output type @@ -446,6 +448,7 @@ def _transform_deploy_model( model_dictionary_domain, model_dictionary_name, computation_dir, + transformed_file_name, ): """Deploys a generic Khiops transformation model @@ -505,7 +508,7 @@ def _transform_deploy_model( # Set output path files output_dir = self._get_output_dir(computation_dir) log_file_path = fs.get_child_path(output_dir, "khiops.log") - output_data_table_path = fs.get_child_path(output_dir, "transformed.txt") + output_data_table_path = fs.get_child_path(output_dir, transformed_file_name) # Set the format parameters depending on the type of dataset if deployment_dataset.is_in_memory(): @@ -1191,6 +1194,7 @@ def predict(self, X): computation_dir, self._transform_prepare_deployment_model_for_predict, False, + "predict.txt", ) # Cleanup and restore the runner's temporary dir finally: @@ -1704,6 +1708,7 @@ def predict(self, X): computation_dir, self._transform_prepare_deployment_model_for_predict, True, + "predict.txt", ) # Cleanup and restore the runner's temporary dir finally: @@ -2124,6 +2129,7 @@ def predict_proba(self, X): computation_dir, self._transform_prepare_deployment_model_for_predict_proba, True, + "predict_proba.txt", ) # Cleanup and restore the runner's temporary dir finally: @@ -2733,6 +2739,7 @@ def transform(self, X): computation_dir, self.model_.copy, True, + "transform.txt", ) # Cleanup and restore the runner's temporary dir finally: diff --git a/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/transformed.txt 
b/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/predict.txt diff --git a/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transformed.txt b/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transform.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transform.txt diff --git a/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/transformed.txt b/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/predict.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/predict.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transform.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transform.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transform.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transform.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/predict.txt diff --git a/tests/test_remote_access.py b/tests/test_remote_access.py index f3bfbc78..f43b15b4 100644 --- a/tests/test_remote_access.py +++ b/tests/test_remote_access.py @@ -132,7 +132,8 @@ def test_khiops_classifier_with_remote_access(self): iris_df = pd.read_csv(iris_data_file, sep="\t") iris_df.pop("Class") classifier.predict(iris_df) - self.assertTrue(fs.exists(fs.get_child_path(output_dir, "transformed.txt"))) + predict_path = fs.get_child_path(output_dir, "predict.txt") + self.assertTrue(fs.exists(predict_path), msg=f"Path: {predict_path}") # Cleanup for filename in fs.list_dir(output_dir): @@ -190,7 +191,7 @@ def test_train_predictor_fail_and_log_with_remote_access(self): log_file_path=log_file_path, ) # Check 
and remove log file - self.assertTrue(fs.exists(log_file_path)) + self.assertTrue(fs.exists(log_file_path), f"Path: {log_file_path}") fs.remove(log_file_path) diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index ece61467..8bafaf34 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -1619,7 +1619,7 @@ def _retrieve_data( ): return self.datasets[schema_type][source_type][estimation_process] - def _define_resources(self, dataset, estimator_type): + def _define_resources(self, dataset, estimator_type, estimator_method): # Set the resources directory for the arguments head_dir = os.path.join( KhiopsTestHelper.get_resources_dir(), "sklearn", "results" @@ -1649,7 +1649,18 @@ def _define_resources(self, dataset, estimator_type): report_path = os.path.join(ref_reports_dir, report_name) model_kdic_path = os.path.join(ref_models_dir, f"{kdic_name}.kdic") model_kdicj_path = os.path.join(ref_models_dir, f"{kdic_name}.kdicj") - prediction_table_path = os.path.join(ref_predictions_dir, "transformed.txt") + if estimator_type in (KhiopsCoclustering, KhiopsEncoder): + prediction_table_path = os.path.join(ref_predictions_dir, "transform.txt") + else: + if estimator_method == "predict": + prediction_table_path = os.path.join(ref_predictions_dir, "predict.txt") + elif estimator_method == "predict_proba": + prediction_table_path = os.path.join( + ref_predictions_dir, "predict_proba.txt" + ) + else: + assert estimator_method == "fit", f"Real: {estimator_method}" + prediction_table_path = "" # Buld the resources resources = { @@ -1774,7 +1785,7 @@ def _test_template( X_test_data = data["test"] dataset = self.dataset_of_schema_type[schema_type] - resources = self._define_resources(dataset, estimator_type) + resources = self._define_resources(dataset, estimator_type, estimator_method) estimator_type_key = ( KhiopsPredictor From 8cbea7972ff5bfc0f26b553ecf7438a91f01608f Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Tue, 16 Apr 2024 19:10:47 +0200 Subject: [PATCH 05/12] Implement multi-table sort helper function --- doc/internal/index.rst | 2 +- khiops/sklearn/estimators.py | 10 +- khiops/utils/__init__.py | 0 .../{sklearn/tables.py => utils/dataset.py} | 125 +++++++------ khiops/utils/helpers.py | 90 ++++++++++ tests/test_dataset_class.py | 4 +- tests/test_dataset_errors.py | 4 +- tests/test_helper_functions.py | 165 ++++++++++++++++++ 8 files changed, 335 insertions(+), 65 deletions(-) create mode 100644 khiops/utils/__init__.py rename khiops/{sklearn/tables.py => utils/dataset.py} (95%) create mode 100644 khiops/utils/helpers.py diff --git a/doc/internal/index.rst b/doc/internal/index.rst index 743d9615..63a5a779 100644 --- a/doc/internal/index.rst +++ b/doc/internal/index.rst @@ -7,7 +7,7 @@ for completeness. 
:nosignatures: :toctree: generated - khiops.sklearn.tables + khiops.utils.dataset khiops.core.internals.common khiops.core.internals.filesystems khiops.core.internals.io diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 19c6f498..3425045b 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -50,7 +50,7 @@ is_list_like, type_error_message, ) -from khiops.sklearn.tables import Dataset, read_internal_data_table +from khiops.utils.dataset import Dataset, read_internal_data_table # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: @@ -555,7 +555,7 @@ def _transform_deployment_post_process( # - Because transformed data table file is sorted by key # - Drop the key columns if specified if deployment_dataset.is_multitable(): - key_df = deployment_dataset.main_table.dataframe[ + key_df = deployment_dataset.main_table.data_source[ deployment_dataset.main_table.key ] output_table_df_or_path = key_df.merge( @@ -1247,7 +1247,7 @@ def _transform_create_deployment_dataset(self, dataset, computation_dir): # Extract the keys from the main table keys_table_dataframe = pd.DataFrame( { - self.model_id_column: dataset.main_table.dataframe[ + self.model_id_column: dataset.main_table.data_source[ self.model_id_column ].unique() } @@ -1259,7 +1259,7 @@ def _transform_create_deployment_dataset(self, dataset, computation_dir): self.model_id_column, ) deploy_dataset_spec["tables"][dataset.main_table.name] = ( - dataset.main_table.dataframe, + dataset.main_table.data_source, self.model_id_column, ) else: @@ -1291,7 +1291,7 @@ def _transform_create_deployment_dataset(self, dataset, computation_dir): self.model_id_column, ) deploy_dataset_spec["tables"][dataset.main_table.name] = ( - dataset.main_table.path, + dataset.main_table.data_source, self.model_id_column, ) deploy_dataset_spec["format"] = (dataset.sep, dataset.header) diff --git a/khiops/utils/__init__.py b/khiops/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/khiops/sklearn/tables.py b/khiops/utils/dataset.py similarity index 95% rename from khiops/sklearn/tables.py rename to khiops/utils/dataset.py index 2136e13e..3245f1c7 100644 --- a/khiops/sklearn/tables.py +++ b/khiops/utils/dataset.py @@ -6,6 +6,7 @@ ###################################################################################### """Classes for handling diverse data tables""" import csv +import functools import io import json import os @@ -32,7 +33,7 @@ # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: -# pylint --disable=all --enable=invalid-names tables.py +# pylint --disable=all --enable=invalid-names dataset.py # pylint: disable=invalid-name @@ -289,7 +290,7 @@ def check_format_entry(format_spec): def get_khiops_type(numpy_type): - """Translates a numpy type to a Khiops dictionary type + """Translates a numpy dtype to a Khiops dictionary type Parameters ---------- @@ -399,7 +400,7 @@ class Dataset: ``False`` it is considered as numeric. Ignored if ``y`` is ``None``. key : str The name of the key column for all tables. - **Deprecated:** Will be removed in pyKhiops 11. + **Deprecated:** Will be removed in khiops-python 11. 
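A minimal sketch of the dictionary form that ``X`` may take, assuming made-up table names (``clients``, ``calls``) and columns; only the spec structure (``main_table``, ``tables`` with ``(source, key)`` pairs) mirrors this module:

    import pandas as pd
    from khiops.utils.dataset import Dataset

    # Hypothetical tables, for illustration only
    clients_df = pd.DataFrame({"id": [1, 2], "age": [30, 40], "class": ["a", "b"]})
    calls_df = pd.DataFrame({"id": [1, 1, 2], "duration": [5, 7, 2]})

    X = {
        "main_table": "clients",
        "tables": {
            "clients": (clients_df.drop("class", axis=1), "id"),
            "calls": (calls_df, "id"),
        },
    }
    ds = Dataset(X, y=clients_df["class"], categorical_target=True)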
""" def __init__(self, X, y=None, categorical_target=True, key=None): @@ -481,7 +482,7 @@ def __init__(self, X, y=None, categorical_target=True, key=None): self._init_tables_from_sequence(X, y, key=key) # A dict specification elif is_dict_like(X): - self._init_tables_from_mapping(X, y) + self._init_tables_from_mapping(X, y, categorical_target=categorical_target) # Fail if X is not recognized else: raise TypeError( @@ -780,17 +781,27 @@ def _check_input_mapping(self, X, y=None): ) def is_in_memory(self): - """Tests whether the dataset is in memory + """Tests whether the dataset is in-memory - A dataset is in memory if it is constituted either of only pandas.DataFrame + A dataset is in-memory if it is constituted either of only pandas.DataFrame tables, numpy.ndarray, or scipy.sparse.spmatrix tables. + """ + + return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable)) + + def table_type(self): + """Returns the table type of the dataset tables Returns ------- - bool - `True` if the dataset is constituted of pandas.DataFrame tables. + type + The type of the tables in the dataset. Possible values: + - `PandasTable` + - `NumpyTable` + - `SparseTable` + - `FileTable` """ - return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable)) + return type(self.main_table) def is_multitable(self): """Tests whether the dataset is a multi-table one @@ -802,30 +813,32 @@ def is_multitable(self): """ return self.secondary_tables is not None and len(self.secondary_tables) > 0 - def copy(self): - """Creates a copy of the dataset - - Referenced dataframes in tables are copied as references - """ + def to_spec(self): + """Returns a dictionary specification of this dataset""" ds_spec = {} ds_spec["main_table"] = self.main_table.name ds_spec["tables"] = {} - if self.is_in_memory(): - ds_spec["tables"][self.main_table.name] = ( - self.main_table.dataframe, - self.main_table.key, - ) - for table in self.secondary_tables: - ds_spec["tables"][table.name] = (table.dataframe, table.key) - else: - ds_spec["tables"][self.main_table.name] = ( - self.main_table.path, - self.main_table.key, - ) - for table in self.secondary_tables: - ds_spec["tables"][table.name] = (table.path, table.key) + ds_spec["tables"][self.main_table.name] = ( + self.main_table.data_source, + self.main_table.key, + ) + for table in self.secondary_tables: + ds_spec["tables"][table.name] = (table.data_source, table.key) + if self.relations: + ds_spec["relations"] = [] + ds_spec["relations"].extend(self.relations) + if self.table_type() == FileTable: ds_spec["format"] = (self.sep, self.header) - return Dataset(ds_spec) + + return ds_spec + + def copy(self): + """Creates a copy of the dataset + + Referenced pandas.DataFrame's, numpy.nparray's and scipy.sparse.spmatrix's in + tables are copied as references. 
+ """ + return Dataset(self.to_spec()) def create_khiops_dictionary_domain(self): """Creates a Khiops dictionary domain representing this dataset @@ -944,6 +957,7 @@ def __init__(self, name, categorical_target=True, key=None): # Initialization (must be completed by concrete sub-classes) self.name = name + self.data_source = None self.categorical_target = categorical_target if is_list_like(key) or key is None: self.key = key @@ -1084,11 +1098,11 @@ def __init__( target_column = target_column.iloc[:, 0] # Initialize the attributes - self.dataframe = dataframe - self.n_samples = len(self.dataframe) + self.data_source = dataframe + self.n_samples = len(self.data_source) # Initialize feature columns and verify their types - self.column_ids = self.dataframe.columns.values + self.column_ids = self.data_source.columns.values if not np.issubdtype(self.column_ids.dtype, np.integer): if np.issubdtype(self.column_ids.dtype, object): for i, column_id in enumerate(self.column_ids): @@ -1107,7 +1121,7 @@ def __init__( # Initialize Khiops types self.khiops_types = { - column_id: get_khiops_type(self.dataframe.dtypes[column_id]) + column_id: get_khiops_type(self.data_source.dtypes[column_id]) for column_id in self.column_ids } @@ -1131,7 +1145,7 @@ def __init__( def __repr__(self): dtypes_str = ( - str(self.dataframe.dtypes).replace("\n", ", ")[:-16].replace(" ", ":") + str(self.data_source.dtypes).replace("\n", ", ")[:-16].replace(" ", ":") ) return ( f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " @@ -1187,7 +1201,7 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path def _create_dataframe_copy(self): - """Creates an in memory copy of the dataframe with the target column""" + """Creates an in-memory copy of the dataframe with the target column""" # Create a copy of the dataframe and add a copy of the target column (if any) if self.target_column is not None: if ( @@ -1200,11 +1214,11 @@ def _create_dataframe_copy(self): self.target_column, name=self.target_column_id ) output_dataframe = pd.concat( - [self.dataframe.reset_index(drop=True), output_target_column], + [self.data_source.reset_index(drop=True), output_target_column], axis=1, ) else: - output_dataframe = self.dataframe.copy() + output_dataframe = self.data_source.copy() # Rename the columns output_dataframe_column_names = {} @@ -1261,22 +1275,22 @@ def __init__( checked_target_column = column_or_1d(target_column, warn=True) # Initialize the members - self.array = checked_array - self.column_ids = list(range(self.array.shape[1])) - self.target_column_id = self.array.shape[1] + self.data_source = checked_array + self.column_ids = list(range(self.data_source.shape[1])) + self.target_column_id = self.data_source.shape[1] if target_column is not None: self.target_column = checked_target_column else: self.target_column = None self.categorical_target = categorical_target self.khiops_types = { - column_id: get_khiops_type(self.array.dtype) + column_id: get_khiops_type(self.data_source.dtype) for column_id in self.column_ids } - self.n_samples = len(self.array) + self.n_samples = len(self.data_source) def __repr__(self): - dtype_str = str(self.array.dtype) + dtype_str = str(self.data_source.dtype) return ( f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " f"dtype={dtype_str}; target={self.target_column_id}>" @@ -1309,7 +1323,8 @@ def create_table_file_for_khiops(self, output_dir, sort=True): output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") # Write the output dataframe - 
output_dataframe = pd.DataFrame(self.array.copy()) + # Note: This is not optimized for memory. + output_dataframe = pd.DataFrame(self.data_source.copy()) output_dataframe.columns = [f"Var{column_id}" for column_id in self.column_ids] if self.target_column is not None: output_dataframe[f"Var{self.target_column_id}"] = self.target_column @@ -1385,19 +1400,19 @@ def __init__( ) # Initialize the members - self.matrix = matrix - self.column_ids = list(range(self.matrix.shape[1])) - self.target_column_id = self.matrix.shape[1] + self.data_source = matrix + self.column_ids = list(range(self.data_source.shape[1])) + self.target_column_id = self.data_source.shape[1] self.target_column = target_column self.categorical_target = categorical_target self.khiops_types = { - column_id: get_khiops_type(self.matrix.dtype) + column_id: get_khiops_type(self.data_source.dtype) for column_id in self.column_ids } - self.n_samples = self.matrix.shape[0] + self.n_samples = self.data_source.shape[0] def __repr__(self): - dtype_str = str(self.matrix.dtype) + dtype_str = str(self.data_source.dtype) return ( f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " f"dtype={dtype_str}; target={self.target_column_id}>" @@ -1462,12 +1477,12 @@ def _flatten(self, iterable): def _write_sparse_block(self, row_index, stream, target=None): assert row_index in range( - self.matrix.shape[0] + self.data_source.shape[0] ), "'row_index' must be coherent with the shape of the sparse matrix" if target is not None: assert target in self.target_column, "'target' must be in the target column" stream.write(f"{target}\t") - row = self.matrix.getrow(row_index) + row = self.data_source.getrow(row_index) # Variable indices are not always sorted in `row.indices` # Khiops needs variable indices to be sorted sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") @@ -1476,7 +1491,7 @@ def _write_sparse_block(self, row_index, stream, target=None): # is not homogeneous with other sparse matrices: it stores # opaque Python lists as elements # Thus: - # - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then + # - if isinstance(self.data_source, sp.lil_matrix) and Python 3.8, then # row.data is np.array([list([...])]) # - else, row.data is np.array([...]) # TODO: remove this flattening once Python 3.8 support is dropped @@ -1501,14 +1516,14 @@ def create_table_file_for_khiops(self, output_dir, sort=True): f"{target_column_name}\tSparseVariables\n" ) for target, row_index in zip( - self.target_column, range(self.matrix.shape[0]) + self.target_column, range(self.data_source.shape[0]) ): self._write_sparse_block( row_index, output_sparse_matrix_stream, target=target ) else: output_sparse_matrix_stream.write("SparseVariables\n") - for row_index in range(self.matrix.shape[0]): + for row_index in range(self.data_source.shape[0]): self._write_sparse_block(row_index, output_sparse_matrix_stream) fs.write( output_table_path, diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py new file mode 100644 index 00000000..f052b376 --- /dev/null +++ b/khiops/utils/helpers.py @@ -0,0 +1,90 @@ +"""General helper functions""" + +import os + +from khiops import core as kh +from khiops.core.internals.common import is_dict_like, type_error_message +from khiops.utils.dataset import Dataset, FileTable, PandasTable + + +def sort_dataset(ds_spec, output_dir=None): + """Sorts a dataset by its table key columns + + + The dataset may be multi-table or not. If it is monotable the key of the only table + must be specified. 
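A hedged usage sketch for the dataframe case (table names, columns and values are illustrative; the 'relations' entry is omitted, in which case a star schema around the main table is assumed):

    import pandas as pd
    from khiops.utils.helpers import sort_dataset

    clients = pd.DataFrame({"id": [2, 1], "name": ["Bo", "Al"]})
    calls = pd.DataFrame({"id": [2, 1, 1], "call_id": [9, 3, 1], "duration": [7, 38, 29]})
    ds_spec = {
        "main_table": "clients",
        "tables": {
            "clients": (clients, ["id"]),
            "calls": (calls, ["id", "call_id"]),
        },
    }
    sorted_spec = sort_dataset(ds_spec)  # each dataframe is sorted by its key columns

For file-based specs the same call additionally needs output_dir, since the sorted copies are written there.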
+ + Parameters + ---------- + ds_spec: dict + The dataset dictionary specification. The tables must be either + `pandas.DataFrame` or file path references. + output_dir: str, optional + _Only for file datasets:_ The output directory for the sorted files. + + + Notes + ----- + + The sorting algorithm is mergesort, which ensures sort stability. The sorting engine + for dataframes is Pandas and for file-based datasets is Khiops. + + """ + # Check the types + if not is_dict_like(ds_spec): + raise TypeError(type_error_message("ds_spec", ds_spec, "dict-like")) + + # Build the dataset + ds = Dataset(ds_spec) + + # Check special arguments in function of the dataset + if ds.table_type() == FileTable and output_dir is None: + raise ValueError("'output_dir' must be specified for file based datasets") + + # Make a copy of the dataset (note: data sources are just reference) + out_ds = ds.copy() + + # Replace each datasource with the sorted table + for table in [out_ds.main_table] + out_ds.secondary_tables: + if isinstance(table, PandasTable): + table.data_source = _sort_df_table(table) + else: + assert isinstance(table, FileTable) + table.data_source = _sort_file_table(table, ds.sep, ds.header, output_dir) + + return out_ds.to_spec() + + +def _sort_df_table(table): + assert isinstance(table, PandasTable), type_error_message( + "table", table, PandasTable + ) + out_data_source = table.data_source.sort_values( + by=table.key, + key=lambda array: array.astype("str"), + inplace=False, + kind="mergesort", + ) + + return out_data_source + + +def _sort_file_table(table, sep, header, output_dir): + assert isinstance(table, FileTable), type_error_message("table", table, FileTable) + + domain = kh.DictionaryDomain() + dictionary = table.create_khiops_dictionary() + domain.add_dictionary(dictionary) + out_data_source = os.path.join(output_dir, f"{dictionary.name}.txt") + kh.sort_data_table( + domain, + dictionary.name, + table.data_source, + out_data_source, + field_separator=sep, + header_line=header, + output_field_separator=sep, + output_header_line=header, + ) + + return out_data_source diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index b1dba2d4..e8707dd4 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -17,10 +17,10 @@ from pandas.testing import assert_frame_equal from sklearn import datasets -from khiops.sklearn.tables import Dataset +from khiops.utils.dataset import Dataset -class DatasetInputOutputConsistency(unittest.TestCase): +class DatasetInputOutputConsistencyTests(unittest.TestCase): """Test consistency of the created files with the input data The following tests allow to verify that: diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index 49a815da..314cff70 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -14,12 +14,12 @@ import pandas as pd from khiops.core.internals.common import type_error_message -from khiops.sklearn.tables import Dataset, FileTable, PandasTable +from khiops.utils.dataset import Dataset, FileTable, PandasTable # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: -# pylint --disable=all --enable=invalid-names estimators.py +# pylint --disable=all --enable=invalid-names test_dataset_errors.py # pylint: disable=invalid-name class AnotherType(object): """A placeholder class that is not of any basic type to test TypeError's""" diff --git a/tests/test_helper_functions.py b/tests/test_helper_functions.py index 
86d43aa6..b69d8331 100644 --- a/tests/test_helper_functions.py +++ b/tests/test_helper_functions.py @@ -5,10 +5,16 @@ # see the "LICENSE.md" file for more details. # ###################################################################################### """Tests for checking the output types of predictors""" +import contextlib +import io +import tempfile import unittest +import pandas as pd + from khiops.core.dictionary import DictionaryDomain from khiops.core.helpers import build_multi_table_dictionary_domain +from khiops.utils.helpers import sort_dataset class KhiopsHelperFunctions(unittest.TestCase): @@ -91,3 +97,162 @@ def test_build_multi_table_dictionary_domain(self): for test_var, ref_var in zip(test_dict.variables, ref_dict.variables): self.assertEqual(test_var.name, ref_var.name) self.assertEqual(test_var.type, ref_var.type) + + def test_sort_dataset_dataframe(self): + """Tests that the sort_dataset function works for dataframe datasets""" + # Create the fixture dataset + clients_df = pd.read_csv(io.StringIO(UNSORTED_TEST_CLIENTS_CSV)) + calls_df = pd.read_csv(io.StringIO(UNSORTED_TEST_CALLS_CSV)) + ds_spec = { + "main_table": "clients", + "tables": { + "clients": (clients_df, ["id"]), + "calls": (calls_df, ["id", "call_id"]), + }, + "relations": [("clients", "calls", False)], + } + + # Call the sort_dataset function + sorted_ds_spec = sort_dataset(ds_spec) + ref_sorted_table_dfs = { + "clients": pd.read_csv(io.StringIO(TEST_CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(TEST_CALLS_CSV)), + } + + # Check that the structure of the sorted dataset + self._assert_sorted_dataset_keeps_structure(ds_spec, sorted_ds_spec) + + # Check that the table specs are the equivalent and the tables are sorted + for table_name in ds_spec["tables"]: + # Check that the dataframes are equal (ignoring the index) + self._assert_frame_equal( + ref_sorted_table_dfs[table_name].reset_index(drop=True), + sorted_ds_spec["tables"][table_name][0].reset_index(drop=True), + ) + + def test_sort_dataset_file(self): + """Tests that the sort_dataset function works for file datasets""" + # Create a execution context with temporary files and directories + with contextlib.ExitStack() as exit_stack: + # Create temporary files and a temporary directory + clients_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + calls_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + tmp_dir = exit_stack.enter_context(tempfile.TemporaryDirectory()) + + # Create the fixture dataset + clients_csv_file.write(bytes(UNSORTED_TEST_CLIENTS_CSV, encoding="utf8")) + calls_csv_file.write(bytes(UNSORTED_TEST_CALLS_CSV, encoding="utf8")) + clients_csv_file.flush() + calls_csv_file.flush() + ds_spec = { + "main_table": "clients", + "tables": { + "clients": (clients_csv_file.name, ["id"]), + "calls": (calls_csv_file.name, ["id", "call_id"]), + }, + "relations": [("clients", "calls", False)], + "format": (",", True), + } + + # Call the sort_dataset function + sorted_ds_spec = sort_dataset(ds_spec, output_dir=tmp_dir) + + # Check that the structure of the sorted dataset + self._assert_sorted_dataset_keeps_structure(ds_spec, sorted_ds_spec) + + # Check that the table specs are the equivalent and the tables are sorted + ref_sorted_tables = {"clients": TEST_CLIENTS_CSV, "calls": TEST_CALLS_CSV} + for table_name, _ in ds_spec["tables"].items(): + # Read the contents of the sorted table to a list of strings + sorted_table_spec = sorted_ds_spec["tables"][table_name] + sorted_table_file = exit_stack.enter_context( + 
open(sorted_table_spec[0], encoding="ascii") + ) + sorted_table = sorted_table_file.readlines() + + # Transform the reference table string to a list of strings + ref_sorted_table = ref_sorted_tables[table_name].splitlines( + keepends=True + ) + + # Check that the sorted table is equal to the reference + self.assertEqual(ref_sorted_table, sorted_table) + + def _assert_sorted_dataset_keeps_structure(self, ds_spec, sorted_ds_spec): + """Asserts that the sorted dataset keeps the structure of the input dataset + + It does not check the contents of the tables. + """ + # Check that the spec dictionary is the same excluding the tables + self.assertIn("main_table", sorted_ds_spec) + self.assertIn("tables", sorted_ds_spec) + self.assertIn("relations", sorted_ds_spec) + self.assertEqual(ds_spec["main_table"], sorted_ds_spec["main_table"]) + self.assertEqual(ds_spec["relations"], sorted_ds_spec["relations"]) + self.assertEqual(ds_spec["tables"].keys(), sorted_ds_spec["tables"].keys()) + + # Check that the table keys are equal + for table_name, table_spec in ds_spec["tables"].items(): + self.assertEqual(table_spec[1], sorted_ds_spec["tables"][table_name][1]) + + def _assert_frame_equal(self, ref_df, out_df): + """Wrapper for the assert_frame_equal pandas function + + In case of failure of assert_frame_equal we capture the AssertionError thrown by + it and make a unittest call to fail. This reports the error found by + assert_frame_equal while avoiding a double thrown exception. + """ + failure_error = None + try: + pd.testing.assert_frame_equal(ref_df, out_df) + except AssertionError as error: + failure_error = error + if failure_error is not None: + self.fail(failure_error) + + +# pylint: disable=line-too-long +# fmt: off +TEST_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date +1,Hakeem Wilkinson,1-352-535-7028,at.pede@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024" +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025" +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025" +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023" +""".lstrip() + +TEST_CALLS_CSV = """ +id,call_id,duration +1,1,38 +1,20,29 +10,2,7 +13,25,329 +13,3,1 +13,30,8 +4,14,48 +4,2,543 +7,4,339 +""".lstrip() + +UNSORTED_TEST_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date +1,Hakeem Wilkinson,1-352-535-7028,at.pede@outlook.org,247-2921 Elit. 
Rd.,2,3:02 PM,"May 1, 2024" +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023" +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025" +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025" +""".lstrip() + +UNSORTED_TEST_CALLS_CSV = """ +id,call_id,duration +1,1,38 +10,2,7 +13,25,329 +4,2,543 +13,30,8 +13,3,1 +4,14,48 +1,20,29 +7,4,339 +""".lstrip() From 4eb8cc60b07aa17e17ae97e4e71300ecdc21944c Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:18:13 +0200 Subject: [PATCH 06/12] Abbreviate dataset variables to ds --- khiops/sklearn/estimators.py | 318 +++++++++++++++++------------------ khiops/utils/dataset.py | 1 - 2 files changed, 152 insertions(+), 167 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 3425045b..6cab5644 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -50,7 +50,7 @@ is_list_like, type_error_message, ) -from khiops.utils.dataset import Dataset, read_internal_data_table +from khiops.utils.dataset import Dataset, FileTable, read_internal_data_table # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: @@ -86,15 +86,15 @@ def _extract_basic_dictionary(dictionary): def _check_dictionary_compatibility( model_dictionary, - dataset_dictionary, + ds_dictionary, estimator_class_name, ): # Prefix for all error messages - error_msg_prefix = f"X contains incompatible table '{dataset_dictionary.name}'" + error_msg_prefix = f"X contains incompatible table '{ds_dictionary.name}'" # Save variable arrays and their size model_variables = model_dictionary.variables - dataset_variables = dataset_dictionary.variables + dataset_variables = ds_dictionary.variables # Error if different number of variables if len(model_variables) != len(dataset_variables): @@ -333,12 +333,12 @@ def fit(self, X, y=None, **kwargs): return self - def _fit(self, dataset, computation_dir, **kwargs): + def _fit(self, ds, computation_dir, **kwargs): """Template pattern of a fit method Parameters ---------- - dataset : `Dataset` + ds : `Dataset` The learning dataset. computation_dir : str Path or URI where the Khiops computation results will be stored. 
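For orientation, a minimal sketch of how this _fit template is reached from the public estimator API; the estimator choice, parameters and data below are illustrative, and running it requires a local Khiops installation:

    import pandas as pd
    from khiops.sklearn import KhiopsClassifier

    X = pd.DataFrame({"age": [25, 42, 37, 58], "income": [1200.0, 3100.0, 2500.0, 4000.0]})
    y = pd.Series(["low", "high", "high", "high"])

    clf = KhiopsClassifier(n_trees=0)
    clf.fit(X, y)            # dispatches to the _fit template shown in this patch
    y_pred = clf.predict(X)

Internally, fit builds a Dataset from X and y and then runs the _fit_check_params, _fit_check_dataset, _fit_train_model and _fit_training_post_process hooks in sequence.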
@@ -346,25 +346,25 @@ def _fit(self, dataset, computation_dir, **kwargs): The called methods are reimplemented in concrete sub-classes """ # Check model parameters - self._fit_check_params(dataset, **kwargs) + self._fit_check_params(ds, **kwargs) # Check the dataset - self._fit_check_dataset(dataset) + self._fit_check_dataset(ds) # Train the model - self._fit_train_model(dataset, computation_dir, **kwargs) - self.n_features_in_ = dataset.main_table.n_features() + self._fit_train_model(ds, computation_dir, **kwargs) + self.n_features_in_ = ds.main_table.n_features() # If the main attributes are of the proper type finish the fitting # Otherwise it means there was an abort (early return) of the previous steps if isinstance(self.model_, kh.DictionaryDomain) and isinstance( self.model_report_, kh.KhiopsJSONObject ): - self._fit_training_post_process(dataset) + self._fit_training_post_process(ds) self.is_fitted_ = True - self.is_multitable_model_ = dataset.is_multitable() + self.is_multitable_model_ = ds.is_multitable() - def _fit_check_params(self, dataset, **_): + def _fit_check_params(self, ds, **_): """Check the model parameters including those data dependent (in kwargs)""" if ( self.key is not None @@ -373,33 +373,30 @@ def _fit_check_params(self, dataset, **_): ): raise TypeError(type_error_message("key", self.key, str, "list-like")) - if not dataset.is_in_memory() and self.output_dir is None: + if not ds.is_in_memory() and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") - def _fit_check_dataset(self, dataset): + def _fit_check_dataset(self, ds): """Checks the pre-conditions of the tables to build the model""" - if ( - dataset.main_table.n_samples is not None - and dataset.main_table.n_samples <= 1 - ): + if ds.main_table.n_samples is not None and ds.main_table.n_samples <= 1: raise ValueError( "Table contains one sample or less. It must contain at least 2." ) @abstractmethod - def _fit_train_model(self, dataset, computation_dir, **kwargs): + def _fit_train_model(self, ds, computation_dir, **kwargs): """Builds the model with one or more calls to khiops.core.api It must return the path of the ``.kdic`` Khiops model file and the JSON report. 
""" @abstractmethod - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): """Loads the model's data from Khiops files into the object""" def _transform( self, - dataset, + ds, computation_dir, _transform_create_deployment_model_fun, drop_key, @@ -410,20 +407,18 @@ def _transform( check_is_fitted(self) # Check if the dataset is consistent with the model - self._transform_check_dataset(dataset) + self._transform_check_dataset(ds) # Create a deployment dataset # Note: The input dataset is not necessarily ready to be deployed - deployment_dataset = self._transform_create_deployment_dataset( - dataset, computation_dir - ) + deployment_ds = self._transform_create_deployment_dataset(ds, computation_dir) # Create a deployment dictionary deployment_dictionary_domain = _transform_create_deployment_model_fun() # Deploy the model output_table_path = self._transform_deploy_model( - deployment_dataset, + deployment_ds, deployment_dictionary_domain, self.model_main_dictionary_name_, computation_dir, @@ -432,19 +427,19 @@ def _transform( # Post-process to return the correct output type return self._transform_deployment_post_process( - deployment_dataset, output_table_path, drop_key + deployment_ds, output_table_path, drop_key ) - def _transform_create_deployment_dataset(self, dataset, _): + def _transform_create_deployment_dataset(self, ds, _): """Creates if necessary a new dataset to execute the model deployment The default behavior is to return the same dataset. """ - return dataset + return ds def _transform_deploy_model( self, - deployment_dataset, + deployment_ds, model_dictionary_domain, model_dictionary_name, computation_dir, @@ -482,7 +477,7 @@ def _transform_deploy_model( ( main_table_path, secondary_table_paths, - ) = deployment_dataset.create_table_files_for_khiops( + ) = deployment_ds.create_table_files_for_khiops( computation_dir, sort=self.auto_sort ) @@ -511,12 +506,12 @@ def _transform_deploy_model( output_data_table_path = fs.get_child_path(output_dir, transformed_file_name) # Set the format parameters depending on the type of dataset - if deployment_dataset.is_in_memory(): + if deployment_ds.is_in_memory(): field_separator = "\t" header_line = True else: - field_separator = deployment_dataset.main_table.sep - header_line = deployment_dataset.main_table.header + field_separator = deployment_ds.main_table.sep + header_line = deployment_ds.main_table.header # Call to core function deploy_model kh.deploy_model( @@ -536,16 +531,16 @@ def _transform_deploy_model( return output_data_table_path - def _transform_check_dataset(self, dataset): + def _transform_check_dataset(self, ds): """Checks the dataset before deploying a model on them""" - if not dataset.is_in_memory() or self.output_dir is None: + if ds.table_type == FileTable and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") def _transform_deployment_post_process( - self, deployment_dataset, output_table_path, drop_key + self, deployment_ds, output_table_path, drop_key ): # Return a dataframe for dataframe based datasets - if deployment_dataset.is_in_memory(): + if deployment_ds.is_in_memory(): # Read the transformed table with the internal table settings with io.BytesIO(fs.read(output_table_path)) as output_table_stream: output_table_df = read_internal_data_table(output_table_stream) @@ -554,16 +549,16 @@ def _transform_deployment_post_process( # - Reorder the table to the original table order # - Because transformed data table file is sorted by key # - 
Drop the key columns if specified - if deployment_dataset.is_multitable(): - key_df = deployment_dataset.main_table.data_source[ - deployment_dataset.main_table.key + if deployment_ds.is_multitable(): + key_df = deployment_ds.main_table.data_source[ + deployment_ds.main_table.key ] output_table_df_or_path = key_df.merge( - output_table_df, on=deployment_dataset.main_table.key + output_table_df, on=deployment_ds.main_table.key ) if drop_key: output_table_df_or_path.drop( - deployment_dataset.main_table.key, axis=1, inplace=True + deployment_ds.main_table.key, axis=1, inplace=True ) # On mono-table: Return the read dataframe as-is else: @@ -752,7 +747,7 @@ def fit(self, X, y=None, **kwargs): """ return super().fit(X, y=y, **kwargs) - def _fit_check_params(self, dataset, **kwargs): + def _fit_check_params(self, ds, **kwargs): # Check that at least one of the build methods parameters is set if not ( self.build_name_var or self.build_distance_vars or self.build_frequency_vars @@ -775,7 +770,7 @@ def _fit_check_params(self, dataset, **kwargs): raise TypeError( type_error_message(f"columns[{i}]", column_id, str) ) - if column_id not in dataset.main_table.column_ids: + if column_id not in ds.main_table.column_ids: raise ValueError(f"columns[{i}] ('{column_id}') not found in X") # Check that 'id_column': @@ -787,7 +782,7 @@ def _fit_check_params(self, dataset, **kwargs): raise ValueError("'id_column' is a mandatory parameter") if not isinstance(id_column, str): raise TypeError(type_error_message("key_columns", id_column, str)) - if id_column not in dataset.main_table.column_ids: + if id_column not in ds.main_table.column_ids: raise ValueError(f"id column '{id_column}' not found in X") # Deprecate the 'max_part_numbers' parameter @@ -802,11 +797,11 @@ def _fit_check_params(self, dataset, **kwargs): ) ) - def _fit_train_model(self, dataset, computation_dir, **kwargs): - assert not dataset.is_multitable(), "Coclustering not available in multitable" + def _fit_train_model(self, ds, computation_dir, **kwargs): + assert not ds.is_multitable(), "Coclustering not available in multitable" # Prepare the table files and dictionary for Khiops - main_table_path, _ = dataset.create_table_files_for_khiops( + main_table_path, _ = ds.create_table_files_for_khiops( computation_dir, sort=self.auto_sort ) @@ -820,12 +815,12 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): elif self.variables is not None: variables = self.variables else: - variables = list(dataset.main_table.column_ids) + variables = list(ds.main_table.column_ids) # Train the coclustering model coclustering_file_path = kh.train_coclustering( - dataset.create_khiops_dictionary_domain(), - dataset.main_table.name, + ds.create_khiops_dictionary_domain(), + ds.main_table.name, main_table_path, variables, output_dir, @@ -867,18 +862,16 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): # Create a multi-table dictionary from the schema of the table # The root table contains the key of the table and points to the main table - tmp_domain = dataset.create_khiops_dictionary_domain() - main_table_dictionary = tmp_domain.get_dictionary(dataset.main_table.name) + tmp_domain = ds.create_khiops_dictionary_domain() + main_table_dictionary = tmp_domain.get_dictionary(ds.main_table.name) if not main_table_dictionary.key: main_table_dictionary.key = [self.model_id_column] - main_table_dictionary.name = ( - f"{self._khiops_model_prefix}{dataset.main_table.name}" - ) + main_table_dictionary.name = 
f"{self._khiops_model_prefix}{ds.main_table.name}" self.model_main_dictionary_name_ = ( - f"{self._khiops_model_prefix}Keys_{dataset.main_table.name}" + f"{self._khiops_model_prefix}Keys_{ds.main_table.name}" ) self.model_secondary_table_variable_name = ( - f"{self._khiops_model_prefix}{dataset.main_table.name}" + f"{self._khiops_model_prefix}{ds.main_table.name}" ) self._create_coclustering_model_domain( tmp_domain, coclustering_file_path, output_dir @@ -901,7 +894,7 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): self.model_report_ = simplified_cc.model_report_ self.model_report_raw_ = self.model_report_.json_data - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): assert ( len(self.model_.dictionaries) == 2 ), "'model_' does not have exactly 2 dictionaries" @@ -1059,16 +1052,16 @@ def _simplify( ) # Get dataset dictionary from model; it should not be root - dataset_dictionary = self.model_.get_dictionary( + ds_dictionary = self.model_.get_dictionary( self.model_secondary_table_variable_name ) assert ( - not dataset_dictionary.root + not ds_dictionary.root ), "Dataset dictionary in the coclustering model should not be root" - if not dataset_dictionary.key: - dataset_dictionary.key = self.model_id_column + if not ds_dictionary.key: + ds_dictionary.key = self.model_id_column domain = DictionaryDomain() - domain.add_dictionary(dataset_dictionary) + domain.add_dictionary(ds_dictionary) simplified_coclustering_file_path = fs.get_child_path( output_dir, "Coclustering.khcj" ) @@ -1185,12 +1178,12 @@ def predict(self, X): kh.get_runner().root_temp_dir = computation_dir # Create the input dataset - dataset = Dataset(X) + ds = Dataset(X) # Call the template transform method try: y_pred = super()._transform( - dataset, + ds, computation_dir, self._transform_prepare_deployment_model_for_predict, False, @@ -1202,25 +1195,25 @@ def predict(self, X): kh.get_runner().root_temp_dir = initial_runner_temp_dir # Transform to numpy.array for in-memory inputs - if dataset.is_in_memory(): + if ds.is_in_memory(): y_pred = y_pred.to_numpy() return y_pred - def _transform_check_dataset(self, dataset): + def _transform_check_dataset(self, ds): """Checks the tables before deploying a model on them""" assert ( len(self.model_.dictionaries) == 2 ), "'model' does not have exactly 2 dictionaries" # Call the parent method - super()._transform_check_dataset(dataset) + super()._transform_check_dataset(ds) # Coclustering models are special: # - They are mono-table only # - They are deployed with a multitable model whose main table contain # the keys of the input table and the secondary table is the input table - if dataset.is_multitable(): + if ds.is_multitable(): raise ValueError("Coclustering models not available in multi-table mode") # The "model dictionary domain" in the coclustering case it is just composed @@ -1231,23 +1224,23 @@ def _transform_check_dataset(self, dataset): if dictionary.name != self.model_main_dictionary_name_: _check_dictionary_compatibility( dictionary, - dataset.main_table.create_khiops_dictionary(), + ds.main_table.create_khiops_dictionary(), self.__class__.__name__, ) - def _transform_create_deployment_dataset(self, dataset, computation_dir): - assert not dataset.is_multitable(), "'dataset' is multitable" + def _transform_create_deployment_dataset(self, ds, computation_dir): + assert not ds.is_multitable(), "'dataset' is multitable" # Build the multitable deployment dataset - keys_table_name = f"keys_{dataset.main_table.name}" + 
keys_table_name = f"keys_{ds.main_table.name}" deploy_dataset_spec = {} deploy_dataset_spec["main_table"] = keys_table_name deploy_dataset_spec["tables"] = {} - if dataset.is_in_memory(): + if ds.is_in_memory(): # Extract the keys from the main table keys_table_dataframe = pd.DataFrame( { - self.model_id_column: dataset.main_table.data_source[ + self.model_id_column: ds.main_table.data_source[ self.model_id_column ].unique() } @@ -1258,20 +1251,20 @@ def _transform_create_deployment_dataset(self, dataset, computation_dir): keys_table_dataframe, self.model_id_column, ) - deploy_dataset_spec["tables"][dataset.main_table.name] = ( - dataset.main_table.data_source, + deploy_dataset_spec["tables"][ds.main_table.name] = ( + ds.main_table.data_source, self.model_id_column, ) else: # Create the table to extract the keys (sorted) - keyed_dataset = dataset.copy() + keyed_dataset = ds.copy() keyed_dataset.main_table.key = [self.model_id_column] main_table_path = keyed_dataset.main_table.create_table_file_for_khiops( computation_dir, sort=self.auto_sort ) # Create a table storing the main table keys - keys_table_name = f"keys_{dataset.main_table.name}" + keys_table_name = f"keys_{ds.main_table.name}" keys_table_file_path = fs.get_child_path( computation_dir, f"raw_{keys_table_name}.txt" ) @@ -1280,21 +1273,21 @@ def _transform_create_deployment_dataset(self, dataset, computation_dir): keyed_dataset.main_table.name, main_table_path, keys_table_file_path, - header_line=dataset.header, - field_separator=dataset.sep, - output_header_line=dataset.header, - output_field_separator=dataset.sep, + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, trace=self.verbose, ) deploy_dataset_spec["tables"][keys_table_name] = ( keys_table_file_path, self.model_id_column, ) - deploy_dataset_spec["tables"][dataset.main_table.name] = ( - dataset.main_table.data_source, + deploy_dataset_spec["tables"][ds.main_table.name] = ( + ds.main_table.data_source, self.model_id_column, ) - deploy_dataset_spec["format"] = (dataset.sep, dataset.header) + deploy_dataset_spec["format"] = (ds.sep, ds.header) return Dataset(deploy_dataset_spec) @@ -1302,11 +1295,11 @@ def _transform_prepare_deployment_model_for_predict(self): return self.model_ def _transform_deployment_post_process( - self, deployment_dataset, output_table_path, drop_key + self, deployment_ds, output_table_path, drop_key ): - assert deployment_dataset.is_multitable() + assert deployment_ds.is_multitable() return super()._transform_deployment_post_process( - deployment_dataset, output_table_path, drop_key + deployment_ds, output_table_path, drop_key ) def fit_predict(self, X, y=None, **kwargs): @@ -1354,12 +1347,12 @@ def __init__( def _more_tags(self): return {"require_y": True} - def _fit_check_dataset(self, dataset): - super()._fit_check_dataset(dataset) - self._check_target_type(dataset) + def _fit_check_dataset(self, ds): + super()._fit_check_dataset(ds) + self._check_target_type(ds) @abstractmethod - def _check_target_type(self, dataset): + def _check_target_type(self, ds): """Checks that the target type has the correct type for the estimator""" def fit(self, X, y=None, **kwargs): @@ -1400,9 +1393,9 @@ def fit(self, X, y=None, **kwargs): super().fit(X, y=y, **kwargs) return self - def _fit_check_params(self, dataset, **kwargs): + def _fit_check_params(self, ds, **kwargs): # Call parent method - super()._fit_check_params(dataset, **kwargs) + super()._fit_check_params(ds, **kwargs) # Check supervised 
estimator parameters if not isinstance(self.n_features, int): @@ -1418,10 +1411,10 @@ def _fit_check_params(self, dataset, **kwargs): if self.n_pairs < 0: raise ValueError("'n_pairs' must be positive") - def _fit_train_model(self, dataset, computation_dir, **kwargs): + def _fit_train_model(self, ds, computation_dir, **kwargs): # Train the model with Khiops train_args, train_kwargs = self._fit_prepare_training_function_inputs( - dataset, computation_dir + ds, computation_dir ) report_file_path, model_kdic_file_path = self._fit_core_training_function( *train_args, **train_kwargs @@ -1443,33 +1436,29 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): def _fit_core_training_function(self, *args, **kwargs): """A wrapper to the khiops.core training function for the estimator""" - def _fit_prepare_training_function_inputs(self, dataset, computation_dir): + def _fit_prepare_training_function_inputs(self, ds, computation_dir): # Set output path files output_dir = self._get_output_dir(computation_dir) log_file_path = fs.get_child_path(output_dir, "khiops.log") - main_table_path, secondary_table_paths = dataset.create_table_files_for_khiops( + main_table_path, secondary_table_paths = ds.create_table_files_for_khiops( computation_dir, sort=self.auto_sort ) # Build the 'additional_data_tables' argument - dataset_domain = dataset.create_khiops_dictionary_domain() - secondary_data_paths = dataset_domain.extract_data_paths( - dataset.main_table.name - ) + ds_domain = ds.create_khiops_dictionary_domain() + secondary_data_paths = ds_domain.extract_data_paths(ds.main_table.name) additional_data_tables = {} for data_path in secondary_data_paths: - dictionary = dataset_domain.get_dictionary_at_data_path(data_path) + dictionary = ds_domain.get_dictionary_at_data_path(data_path) additional_data_tables[data_path] = secondary_table_paths[dictionary.name] # Build the mandatory arguments args = [ - dataset.create_khiops_dictionary_domain(), - dataset.main_table.name, + ds.create_khiops_dictionary_domain(), + ds.main_table.name, main_table_path, - dataset.main_table.get_khiops_variable_name( - dataset.main_table.target_column_id - ), + ds.main_table.get_khiops_variable_name(ds.main_table.target_column_id), output_dir, ] @@ -1487,12 +1476,12 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir): # Set the format parameters depending on the type of dataset kwargs["detect_format"] = False - if dataset.is_in_memory(): + if ds.is_in_memory(): kwargs["field_separator"] = "\t" kwargs["header_line"] = True else: - kwargs["field_separator"] = dataset.main_table.sep - kwargs["header_line"] = dataset.main_table.header + kwargs["field_separator"] = ds.main_table.sep + kwargs["header_line"] = ds.main_table.header # Rename parameters to be compatible with khiops.core kwargs["max_constructed_variables"] = kwargs.pop("n_features") @@ -1509,13 +1498,13 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir): return args, kwargs - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call parent method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Set the target variable name - self.model_target_variable_name_ = dataset.main_table.get_khiops_variable_name( - dataset.main_table.target_column_id + self.model_target_variable_name_ = ds.main_table.get_khiops_variable_name( + ds.main_table.target_column_id ) # Verify it has at least one dictionary and a root dictionary in multi-table @@ -1530,7 
+1519,7 @@ def _fit_training_post_process(self, dataset): initial_dictionary_name = dictionary.name.replace( self._khiops_model_prefix, "", 1 ) - if initial_dictionary_name == dataset.main_table.name: + if initial_dictionary_name == ds.main_table.name: self.model_main_dictionary_name_ = dictionary.name if self.model_main_dictionary_name_ is None: raise ValueError("No model dictionary after Khiops call") @@ -1598,29 +1587,29 @@ def _fit_training_post_process(self, dataset): self.feature_evaluated_importances_ = np.array([x[1] for x in combined]) self.n_features_evaluated_ = len(combined) - def _transform_check_dataset(self, dataset): - assert isinstance(dataset, Dataset), "'dataset' is not 'Dataset'" + def _transform_check_dataset(self, ds): + assert isinstance(ds, Dataset), "'ds' is not 'Dataset'" # Call the parent method - super()._transform_check_dataset(dataset) + super()._transform_check_dataset(ds) # Check the coherence between thi input table and the model - if self.is_multitable_model_ and not dataset.is_multitable(): + if self.is_multitable_model_ and not ds.is_multitable(): raise ValueError( "You are trying to apply on single-table inputs a model which has " "been trained on multi-table data." ) - if not self.is_multitable_model_ and dataset.is_multitable(): + if not self.is_multitable_model_ and ds.is_multitable(): raise ValueError( "You are trying to apply on multi-table inputs a model which has " "been trained on single-table data." ) # Error if different number of dictionaries - dataset_domain = dataset.create_khiops_dictionary_domain() - if len(self.model_.dictionaries) != len(dataset_domain.dictionaries): + ds_domain = ds.create_khiops_dictionary_domain() + if len(self.model_.dictionaries) != len(ds_domain.dictionaries): raise ValueError( - f"X has {len(dataset_domain.dictionaries)} table(s), " + f"X has {len(ds_domain.dictionaries)} table(s), " f"but {self.__class__.__name__} is expecting " f"{len(self.model_.dictionaries)}" ) @@ -1629,13 +1618,13 @@ def _transform_check_dataset(self, dataset): # Note: Name checking is omitted for the main table _check_dictionary_compatibility( _extract_basic_dictionary(self._get_main_dictionary()), - dataset.main_table.create_khiops_dictionary(), + ds.main_table.create_khiops_dictionary(), self.__class__.__name__, ) # Multi-table model: Check name and dictionary coherence of secondary tables dataset_secondary_tables_by_name = { - table.name: table for table in dataset.secondary_tables + table.name: table for table in ds.secondary_tables } for dictionary in self.model_.dictionaries: assert dictionary.name.startswith(self._khiops_model_prefix), ( @@ -1700,11 +1689,11 @@ def predict(self, X): try: # Create the input dataset - dataset = Dataset(X, key=self.key) + ds = Dataset(X, key=self.key) # Call the template transform method y_pred = super()._transform( - dataset, + ds, computation_dir, self._transform_prepare_deployment_model_for_predict, True, @@ -1955,26 +1944,23 @@ def fit(self, X, y, **kwargs): kwargs["categorical_target"] = True return super().fit(X, y, **kwargs) - def _check_target_type(self, dataset): - _check_categorical_target_type(dataset) + def _check_target_type(self, ds): + _check_categorical_target_type(ds) - def _fit_check_dataset(self, dataset): + def _fit_check_dataset(self, ds): # Call the parent method - super()._fit_check_dataset(dataset) + super()._fit_check_dataset(ds) # Check that the target is for classification in in_memory_tables - if dataset.is_in_memory(): - current_type_of_target = 
type_of_target(dataset.main_table.target_column) + if ds.is_in_memory(): + current_type_of_target = type_of_target(ds.main_table.target_column) if current_type_of_target not in ["binary", "multiclass"]: raise ValueError( f"Unknown label type: '{current_type_of_target}' " "for classification. Maybe you passed a floating point target?" ) # Check if the target has more than 1 class - if ( - dataset.is_in_memory() - and len(np.unique(dataset.main_table.target_column)) == 1 - ): + if ds.is_in_memory() and len(np.unique(ds.main_table.target_column)) == 1: raise ValueError( f"{self.__class__.__name__} can't train when only one class is present." ) @@ -1982,12 +1968,12 @@ def _fit_check_dataset(self, dataset): def _fit_core_training_function(self, *args, **kwargs): return kh.train_predictor(*args, **kwargs) - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call the parent's method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Save the target datatype - self._original_target_type = dataset.target_column_type + self._original_target_type = ds.target_column_type # Save class values in the order of deployment self.classes_ = [] @@ -2123,9 +2109,9 @@ def predict_proba(self, X): # Call the generic transfrom method try: - dataset = Dataset(X, key=self.key) + ds = Dataset(X, key=self.key) y_probas = self._transform( - dataset, + ds, computation_dir, self._transform_prepare_deployment_model_for_predict_proba, True, @@ -2139,7 +2125,7 @@ def predict_proba(self, X): # For in-memory datasets: # - Reorder the columns to that of self.classes_ # - Transform to np.ndarray - if dataset.is_in_memory(): + if ds.is_in_memory(): assert isinstance( y_probas, (pd.DataFrame, np.ndarray) ), "y_probas is not a Pandas DataFrame nor Numpy array" @@ -2328,9 +2314,9 @@ def fit(self, X, y=None, **kwargs): def _fit_core_training_function(self, *args, **kwargs): return kh.train_predictor(*args, **kwargs) - def _fit_train_model(self, dataset, computation_dir, **kwargs): + def _fit_train_model(self, ds, computation_dir, **kwargs): # Call the parent method - super()._fit_train_model(dataset, computation_dir, **kwargs) + super()._fit_train_model(ds, computation_dir, **kwargs) # Warn when there are no informative variables if self.model_report_.preparation_report.informative_variable_number == 0: @@ -2339,9 +2325,9 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): "The fitted model is the mean regressor." 
) - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call parent method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Remove variables depending on the target variables_to_eliminate = [] @@ -2365,8 +2351,8 @@ def _fit_training_post_process(self, dataset): self.feature_used_importances_ = feature_used_importances_ self.n_features_used_ = len(self.feature_used_names_) - def _check_target_type(self, dataset): - _check_numerical_target_type(dataset) + def _check_target_type(self, ds): + _check_numerical_target_type(ds) # Deactivate useless super delegation because the method have different docstring # pylint: disable=useless-super-delegation @@ -2583,9 +2569,9 @@ def _numerical_transform_method(self): ) return _transform_types_numerical[self.transform_type_numerical] - def _fit_check_params(self, dataset, **kwargs): + def _fit_check_params(self, ds, **kwargs): # Call parent method - super()._fit_check_params(dataset, **kwargs) + super()._fit_check_params(ds, **kwargs) # Check 'transform_type_categorical' parameter if not isinstance(self.transform_type_categorical, str): @@ -2617,11 +2603,11 @@ def _fit_check_params(self, dataset, **kwargs): "cannot be both None with n_trees == 0." ) - def _check_target_type(self, dataset): + def _check_target_type(self, ds): if self.categorical_target: - _check_categorical_target_type(dataset) + _check_categorical_target_type(ds) else: - _check_numerical_target_type(dataset) + _check_numerical_target_type(ds) def _fit_core_training_function(self, *args, **kwargs): return kh.train_recoder(*args, **kwargs) @@ -2661,10 +2647,10 @@ def fit(self, X, y=None, **kwargs): # pylint: enable=useless-super-delegation - def _fit_prepare_training_function_inputs(self, dataset, computation_dir): + def _fit_prepare_training_function_inputs(self, ds, computation_dir): # Call the parent method args, kwargs = super()._fit_prepare_training_function_inputs( - dataset, computation_dir + ds, computation_dir ) # Rename encoder parameters, delete unused ones @@ -2680,9 +2666,9 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir): return args, kwargs - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call parent method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Eliminate the target variable from the main dictionary self._get_main_dictionary() @@ -2733,9 +2719,9 @@ def transform(self, X): # Create and transform the dataset try: - dataset = Dataset(X, key=self.key) + ds = Dataset(X, key=self.key) X_transformed = super()._transform( - dataset, + ds, computation_dir, self.model_.copy, True, @@ -2745,7 +2731,7 @@ def transform(self, X): finally: self._cleanup_computation_dir(computation_dir) kh.get_runner().root_temp_dir = initial_runner_temp_dir - if dataset.is_in_memory(): + if ds.is_in_memory(): return X_transformed.to_numpy(copy=False) return X_transformed diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index 3245f1c7..6c758920 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -6,7 +6,6 @@ ###################################################################################### """Classes for handling diverse data tables""" import csv -import functools import io import json import os From c0b68b6f9868464e6cd4c566c409d8e8574b3c82 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Wed, 12 Jun 2024 09:49:56 
+0200 Subject: [PATCH 07/12] Move target_column from tables to dataset --- khiops/sklearn/estimators.py | 67 ++-- khiops/utils/dataset.py | 695 ++++++++++++----------------------- tests/test_dataset_class.py | 10 +- tests/test_dataset_errors.py | 146 +++----- tests/test_sklearn.py | 49 ++- 5 files changed, 373 insertions(+), 594 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 6cab5644..0c7b9006 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -50,7 +50,12 @@ is_list_like, type_error_message, ) -from khiops.utils.dataset import Dataset, FileTable, read_internal_data_table +from khiops.utils.dataset import ( + Dataset, + FileTable, + get_khiops_variable_name, + read_internal_data_table, +) # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: @@ -123,14 +128,14 @@ def _check_dictionary_compatibility( def _check_categorical_target_type(ds): - if ds.target_column_type is None: + if ds.target_column is None: raise ValueError("Target vector is not specified.") if ds.is_in_memory() and not ( - isinstance(ds.target_column_type, pd.CategoricalDtype) - or pd.api.types.is_string_dtype(ds.target_column_type) - or pd.api.types.is_integer_dtype(ds.target_column_type) - or pd.api.types.is_float_dtype(ds.target_column_type) + isinstance(ds.target_column_dtype, pd.CategoricalDtype) + or pd.api.types.is_string_dtype(ds.target_column_dtype) + or pd.api.types.is_integer_dtype(ds.target_column_dtype) + or pd.api.types.is_float_dtype(ds.target_column_dtype) ): raise ValueError( f"'y' has invalid type '{ds.target_column_type}'. " @@ -145,16 +150,16 @@ def _check_categorical_target_type(ds): def _check_numerical_target_type(ds): - if ds.target_column_type is None: + if ds.target_column is None: raise ValueError("Target vector is not specified.") if ds.is_in_memory(): - if not pd.api.types.is_numeric_dtype(ds.target_column_type): + if not pd.api.types.is_numeric_dtype(ds.target_column_dtype): raise ValueError( f"Unknown label type '{ds.target_column_type}'. " "Expected a numerical type." ) - if ds.main_table.target_column is not None: - assert_all_finite(ds.main_table.target_column) + if ds.target_column is not None: + assert_all_finite(ds.target_column) elif not ds.is_in_memory() and ds.target_column_type != "Numerical": raise ValueError( f"Target column has invalid type '{ds.target_column_type}'. 
" @@ -1458,7 +1463,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): ds.create_khiops_dictionary_domain(), ds.main_table.name, main_table_path, - ds.main_table.get_khiops_variable_name(ds.main_table.target_column_id), + get_khiops_variable_name(ds.target_column_id), output_dir, ] @@ -1503,9 +1508,7 @@ def _fit_training_post_process(self, ds): super()._fit_training_post_process(ds) # Set the target variable name - self.model_target_variable_name_ = ds.main_table.get_khiops_variable_name( - ds.main_table.target_column_id - ) + self.model_target_variable_name_ = get_khiops_variable_name(ds.target_column_id) # Verify it has at least one dictionary and a root dictionary in multi-table if len(self.model_.dictionaries) == 1: @@ -1888,10 +1891,10 @@ def __init__( self._predicted_target_meta_data_tag = "Prediction" def _is_real_target_dtype_integer(self): - assert self._original_target_type is not None, "Original target type not set" - return pd.api.types.is_integer_dtype(self._original_target_type) or ( - isinstance(self._original_target_type, pd.CategoricalDtype) - and pd.api.types.is_integer_dtype(self._original_target_type.categories) + assert self._original_target_dtype is not None, "Original target type not set" + return pd.api.types.is_integer_dtype(self._original_target_dtype) or ( + isinstance(self._original_target_dtype, pd.CategoricalDtype) + and pd.api.types.is_integer_dtype(self._original_target_dtype.categories) ) def _sorted_prob_variable_names(self): @@ -1953,14 +1956,14 @@ def _fit_check_dataset(self, ds): # Check that the target is for classification in in_memory_tables if ds.is_in_memory(): - current_type_of_target = type_of_target(ds.main_table.target_column) + current_type_of_target = type_of_target(ds.target_column) if current_type_of_target not in ["binary", "multiclass"]: raise ValueError( f"Unknown label type: '{current_type_of_target}' " "for classification. Maybe you passed a floating point target?" ) # Check if the target has more than 1 class - if ds.is_in_memory() and len(np.unique(ds.main_table.target_column)) == 1: + if ds.is_in_memory() and len(np.unique(ds.target_column)) == 1: raise ValueError( f"{self.__class__.__name__} can't train when only one class is present." 
) @@ -1973,7 +1976,10 @@ def _fit_training_post_process(self, ds): super()._fit_training_post_process(ds) # Save the target datatype - self._original_target_type = ds.target_column_type + if ds.is_in_memory(): + self._original_target_dtype = ds.target_column_dtype + else: + self._original_target_dtype = np.dtype("object") # Save class values in the order of deployment self.classes_ = [] @@ -2052,18 +2058,25 @@ def predict(self, X): y_pred = y_pred.to_numpy(copy=False).ravel() # If integer and string just transform - if pd.api.types.is_integer_dtype(self._original_target_type): - y_pred = y_pred.astype(self._original_target_type) - elif pd.api.types.is_string_dtype(self._original_target_type): + if pd.api.types.is_integer_dtype(self._original_target_dtype): + y_pred = y_pred.astype(self._original_target_dtype) + # If str transform to str + # Note: If the original type is None then it was learned with a file dataset + elif self._original_target_dtype is None or pd.api.types.is_string_dtype( + self._original_target_dtype + ): y_pred = y_pred.astype(str, copy=False) # If category first coerce the type to the categories' type else: - assert pd.api.types.is_categorical_dtype(self._original_target_type) + assert pd.api.types.is_categorical_dtype(self._original_target_dtype), ( + "_original_target_dtype is not categorical" + f", it is '{self._original_target_dtype}'" + ) if pd.api.types.is_integer_dtype( - self._original_target_type.categories.dtype + self._original_target_dtype.categories.dtype ): y_pred = y_pred.astype( - self._original_target_type.categories.dtype, copy=False + self._original_target_dtype.categories.dtype, copy=False ) else: y_pred = y_pred.astype(str, copy=False) diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index 6c758920..51f6a683 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -80,10 +80,10 @@ def check_table_entry(table_name, table_spec): str, ) ) - _check_table_key(table_name, key) + check_table_key(table_name, key) -def _check_table_key(table_name, key): +def check_table_key(table_name, key): if key is not None: if not is_list_like(key) and not isinstance(key, str): raise TypeError( @@ -316,6 +316,16 @@ def get_khiops_type(numpy_type): return khiops_type +def get_khiops_variable_name(column_id): + """Return the khiops variable name associated to a column id""" + if isinstance(column_id, str): + variable_name = column_id + else: + assert isinstance(column_id, np.int64) + variable_name = f"Var{column_id}" + return variable_name + + def read_internal_data_table(file_path_or_stream): """Reads into a DataFrame a data table file with the internal format settings @@ -407,27 +417,27 @@ def __init__(self, X, y=None, categorical_target=True, key=None): self.main_table = None self.secondary_tables = None self.relations = None + self.categorical_target = categorical_target + self.target_column = None + self.target_column_id = None + self.target_column_type = None + self.target_column_dtype = None # Only for in_memory datasets self.sep = None self.header = None # Initialization from different types of input "X" # A single pandas dataframe if isinstance(X, pd.DataFrame): - self._init_tables_from_dataframe( - X, y, categorical_target=categorical_target - ) + self.main_table = PandasTable("main_table", X) + self.secondary_tables = [] # A single numpy array (or compatible object) elif hasattr(X, "__array__"): - self._init_tables_from_numpy_array( - X, - y, - categorical_target=categorical_target, - ) + self.main_table = NumpyTable("main_table", X) + 
self.secondary_tables = [] # A scipy.sparse.spmatrix elif isinstance(X, sp.spmatrix): - self._init_tables_from_sparse_matrix( - X, y, categorical_target=categorical_target - ) + self.main_table = SparseTable("main_table", X) + self.secondary_tables = [] # Special rejection for scipy.sparse.sparray (to pass the sklearn tests) # Note: We don't use scipy.sparse.sparray because it is not implemented in scipy # 1.10 which is the latest supporting py3.8 @@ -455,8 +465,17 @@ def __init__(self, X, y=None, categorical_target=True, key=None): ), stacklevel=3, ) - self._init_tables_from_tuple(X, y, categorical_target=categorical_target) - # A sequence + # Check the input tuple + self._check_input_tuple(X) + + # Obtain path and separator + path, sep = X + + # Initialization + self.main_table = FileTable("main_table", path=path, sep=sep) + self.secondary_tables = [] + + # A dataset sequence spec # We try first for compatible python arrays then the deprecated sequences spec elif is_list_like(X): # Try to transform to a numerical array with sklearn's check_array @@ -465,9 +484,8 @@ def __init__(self, X, y=None, categorical_target=True, key=None): # this branch's code try: X_checked = check_array(X, ensure_2d=True, force_all_finite=False) - self._init_tables_from_numpy_array( - X_checked, y, categorical_target=categorical_target - ) + self.main_table = NumpyTable("main_table", X_checked) + self.secondary_tables = [] except ValueError: warnings.warn( deprecation_message( @@ -478,16 +496,21 @@ def __init__(self, X, y=None, categorical_target=True, key=None): ), stacklevel=3, ) - self._init_tables_from_sequence(X, y, key=key) - # A dict specification + self._init_tables_from_sequence(X, key=key) + # A a dataset dict spec elif is_dict_like(X): - self._init_tables_from_mapping(X, y, categorical_target=categorical_target) + self._init_tables_from_mapping(X) # Fail if X is not recognized else: raise TypeError( type_error_message("X", X, "array-like", tuple, Sequence, Mapping) ) + # Initialization of the target column if any + if y is not None: + self._init_target_column(y) + + # Post-conditions assert self.main_table is not None, "'main_table' is 'None' after init" assert isinstance( self.secondary_tables, list @@ -495,116 +518,48 @@ def __init__(self, X, y=None, categorical_target=True, key=None): assert not self.is_multitable() or len( self.secondary_tables ), "'secondary_tables' is empty in a multi-table dataset" + assert ( + y is None or self.target_column is not None + ), "'y' is set but 'target_column' is None" - def _init_tables_from_dataframe(self, X, y=None, categorical_target=True): - """Initializes the dataset from a 'X' of type pandas.DataFrame""" - assert isinstance(X, pd.DataFrame), "'X' must be a pandas.DataFrame" - if y is not None and not hasattr(y, "__array__"): - raise TypeError(type_error_message("y", y, "array-like")) - self.main_table = PandasTable( - "main_table", X, target_column=y, categorical_target=categorical_target - ) - self.secondary_tables = [] - - def _init_tables_from_sparse_matrix(self, X, y=None, categorical_target=True): - """Initializes the dataset from a 'X' of type scipy.sparse.spmatrix""" - assert isinstance(X, sp.spmatrix), "'X' must be a scipy.sparse.spmatrix" - if y is not None and not hasattr(y, "__array__"): - raise TypeError(type_error_message("y", y, "array-like")) - - self.main_table = SparseTable( - "main_table", X, target_column=y, categorical_target=categorical_target - ) - self.secondary_tables = [] - - def _init_tables_from_numpy_array(self, X, y=None, 
categorical_target=True): - assert hasattr( - X, "__array__" - ), "'X' must be a numpy.ndarray or implement __array__" - - if y is not None: - y_checked = column_or_1d(y, warn=True) - else: - y_checked = None - self.main_table = NumpyTable( - "main_table", - X, - target_column=y_checked, - categorical_target=categorical_target, - ) - self.secondary_tables = [] - - def _init_tables_from_tuple(self, X, y=None, categorical_target=True): - """Initializes the spec from a 'X' of type tuple""" - assert isinstance(X, tuple), "'X' must be a tuple" - - # Check the input tuple - self._check_input_tuple(X, y) - - # Obtain path and separator - path, sep = X - - # Initialization - self.main_table = FileTable( - "main_table", - categorical_target=categorical_target, - target_column_id=y, - path=path, - sep=sep, - ) - self.secondary_tables = [] - - def _check_input_tuple(self, X, y=None): + def _check_input_tuple(self, X): if len(X) != 2: raise ValueError(f"'X' tuple input must have length 2 not {len(X)}") if not isinstance(X[0], str): raise TypeError(type_error_message("X[0]", X[0], str)) if not isinstance(X[1], str): raise TypeError(type_error_message("X[1]", X[1], str)) - if y is not None and not isinstance(y, str): - raise TypeError(type_error_message("y", y, str)) - def _init_tables_from_sequence(self, X, y=None, categorical_target=True, key=None): + def _init_tables_from_sequence(self, X, key=None): """Initializes the spec from a list-like 'X'""" assert is_list_like(X), "'X' must be a list-like" # Check the input sequence - self._check_input_sequence(X, y, key=key) + self._check_input_sequence(X, key=key) # Initialize the tables if isinstance(X[0], pd.DataFrame): - self.main_table = PandasTable( - "main_table", - X[0], - target_column=y, - categorical_target=categorical_target, - key=key, - ) + self.main_table = PandasTable("main_table", X[0], key=key) self.secondary_tables = [] for index, dataframe in enumerate(X[1:], start=1): self.secondary_tables.append( PandasTable(f"secondary_table_{index:02d}", dataframe, key=key) ) else: - self.main_table = FileTable( - "main_table", - X[0], - target_column_id=y, - categorical_target=categorical_target, - key=key, - ) + self.main_table = FileTable("main_table", X[0], key=key) self.secondary_tables = [] for index, table_path in enumerate(X[1:], start=1): self.secondary_tables.append( FileTable(f"secondary_table_{index:02d}", table_path, key=key) ) + # Create a list of relations main_table_name = self.main_table.name self.relations = [ (main_table_name, table.name, False) for table in self.secondary_tables ] - def _check_input_sequence(self, X, y=None, key=None): + def _check_input_sequence(self, X, key=None): # Check the first table if len(X) == 0: raise ValueError("'X' must be a non-empty sequence") @@ -616,35 +571,19 @@ def _check_input_sequence(self, X, y=None, key=None): for i, secondary_X in enumerate(X[1:], start=1): if not isinstance(secondary_X, main_table_type): raise TypeError( - type_error_message(f"X[{i}]", X[i], main_table_type) + type_error_message(f"Table at index {i}", X[i], main_table_type) + " as the first table in X" ) - # Check the type of y - if y is not None: - if isinstance(X[0], str) and not isinstance(y, str): - raise TypeError(type_error_message("y", y, str)) - elif isinstance(X[0], pd.DataFrame) and not isinstance(y, pd.Series): - raise TypeError(type_error_message("y", y, pd.Series)) + # Check the key for the main_table (it is the same for the others) + check_table_key("main_table", key) - # Check the type of key - if not 
is_list_like(key) and not isinstance(key, str): - raise TypeError(type_error_message("key", key, "list-like", str)) - if is_list_like(key): - for column_index, column_name in enumerate(key): - if not isinstance(column_name, str): - raise TypeError( - type_error_message( - f"key[{column_index}]", key[column_index], str - ) - ) - - def _init_tables_from_mapping(self, X, y=None, categorical_target=True): + def _init_tables_from_mapping(self, X): """Initializes the table spec from a dict-like 'X'""" assert is_dict_like(X), "'X' must be dict-like" # Check the input mapping - self._check_input_mapping(X, y) + check_dataset_spec(X) # Initialize tables objects if len(X["tables"]) == 1: @@ -669,8 +608,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): self.main_table = FileTable( main_table_name, main_table_source, - target_column_id=y, - categorical_target=categorical_target, key=main_table_key, sep=self.sep, header=self.header, @@ -695,8 +632,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): main_table_name, main_table_source, key=main_table_key, - target_column=y, - categorical_target=categorical_target, ) self.secondary_tables = [] for table_name, (table_source, table_key) in X["tables"].items(): @@ -710,8 +645,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): main_table_name, main_table_source, key=main_table_key, - target_column=y, - categorical_target=categorical_target, ) self.secondary_tables = [] # Initialize a numpyarray dataset (monotable) @@ -719,8 +652,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): self.main_table = NumpyTable( main_table_name, main_table_source, - target_column=y, - categorical_target=categorical_target, ) if len(X["tables"]) > 1: raise ValueError( @@ -745,40 +676,94 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True): ) self.relations = relations - def _check_input_mapping(self, X, y=None): - # Check the dataset spec for X - check_dataset_spec(X) + def _init_target_column(self, y): + assert self.main_table is not None + assert self.secondary_tables is not None + # Check y's type + # For in memory target columns: + # - column_or_1d checks *and transforms* to a numpy.array if successful + # - warn=True in column_or_1d is necessary to pass sklearn checks + if isinstance(y, str): + y_checked = y + else: + y_checked = column_or_1d(y, warn=True) - # Check the target coherence with X's tables - if y is not None: - if len(X["tables"]) == 1: - main_table_source, _ = list(X["tables"].values())[0] + # Check the target type coherence with those of X's tables + if isinstance( + self.main_table, (PandasTable, SparseTable, NumpyTable) + ) and isinstance(y_checked, str): + if isinstance(self.main_table, PandasTable): + type_message = "pandas.DataFrame" + elif isinstance(self.main_table, SparseTable): + type_message = "scipy.sparse.spmatrix" else: - main_table_source, _ = X["tables"][X["main_table"]] - if ( - isinstance(main_table_source, pd.DataFrame) - and not isinstance(y, pd.Series) - and not isinstance(y, pd.DataFrame) - ): - raise TypeError( - type_error_message("y", y, pd.Series, pd.DataFrame) - + " (X's tables are of type pandas.DataFrame)" - ) - if ( - isinstance(main_table_source, sp.spmatrix) - or hasattr(main_table_source, "__array__") - ) and not hasattr(y, "__array__"): - raise TypeError( - type_error_message("y", y, "array-like") - + " (X's tables are of type numpy.ndarray" - + " or scipy.sparse.spmatrix)" + type_message = 
"numpy.ndarray" + raise TypeError( + type_error_message("y", y, "array-like") + + f" (X's tables are of type {type_message})" + ) + if isinstance(self.main_table, (SparseTable, NumpyTable)) and isinstance( + y_checked, str + ): + raise TypeError( + type_error_message("y", y, "array-like") + + " (X's tables are of type numpy.ndarray" + + " or scipy.sparse.spmatrix)" + ) + if isinstance(self.main_table.data_source, str) and not isinstance( + y_checked, str + ): + raise TypeError( + type_error_message("y", y, str) + + " (X's tables are of type str [file paths])" + ) + + # Initialize the members related to the target + # Case when y is a memory array + if hasattr(y_checked, "__array__"): + self.target_column = y_checked + self.target_column_dtype = self.target_column.dtype + + # Initialize the id of the target column + if isinstance(y, pd.Series) and y.name is not None: + self.target_column_id = y.name + elif isinstance(y, pd.DataFrame): + self.target_column_id = y.columns[0] + else: + if pd.api.types.is_integer_dtype(self.main_table.column_ids): + self.target_column_id = self.main_table.column_ids[-1] + 1 + else: + assert pd.api.types.is_string_dtype(self.main_table.column_ids) + self.target_column_id = "UnknownTargetColumn" + + # Fail if there is a column in the main_table with the target column's name + if self.target_column_id in self.main_table.column_ids: + raise ValueError( + f"Target column name '{self.target_column_id}' " + f"is already present in the main table. " + f"Column names: {list(self.main_table.column_ids)}" ) - if isinstance(main_table_source, str) and not isinstance(y, str): - raise TypeError( - type_error_message("y", y, str) - + " (X's tables are of type str [file paths])" + # Case when y is column id: Set both the column and the id to it + else: + assert isinstance(y, str), type_error_message("y", y, str) + self.target_column = y + self.target_column_id = y + + # Check the target column exists in the main table + if self.target_column_id not in self.main_table.column_ids: + raise ValueError( + f"Target column '{self.target_column}' not present in main table. 
" + f"Column names: {list(self.main_table.column_ids)}'" ) + # Force the target column type from the parameters + if self.categorical_target: + self.main_table.khiops_types[self.target_column] = "Categorical" + self.target_column_type = "Categorical" + else: + self.main_table.khiops_types[self.target_column] = "Numerical" + self.target_column_type = "Numerical" + def is_in_memory(self): """Tests whether the dataset is in-memory @@ -851,15 +836,25 @@ def create_khiops_dictionary_domain(self): # Create root dictionary and add it to the domain dictionary_domain = kh.DictionaryDomain() - root_dictionary = self.main_table.create_khiops_dictionary() - dictionary_domain.add_dictionary(root_dictionary) + main_dictionary = self.main_table.create_khiops_dictionary() + dictionary_domain.add_dictionary(main_dictionary) + + # For in-memory datasets: Add the target variable if available + if self.is_in_memory() and self.target_column is not None: + variable = kh.Variable() + variable.name = get_khiops_variable_name(self.target_column_id) + if self.categorical_target: + variable.type = "Categorical" + else: + variable.type = "Numerical" + main_dictionary.add_variable(variable) # Create the dictionaries for each secondary table and the table variables in # root dictionary that point to each secondary table # This is performed using a breadth-first-search over the graph of relations # Note: In general 'name' and 'object_type' fields of Variable can be different if self.secondary_tables: - root_dictionary.root = True + main_dictionary.root = True table_names = [table.name for table in self.secondary_tables] tables_to_visit = [self.main_table.name] while tables_to_visit: @@ -884,17 +879,18 @@ def create_khiops_dictionary_domain(self): table_variable.name = table.name table_variable.object_type = table.name parent_table_dictionary.add_variable(table_variable) + return dictionary_domain - def create_table_files_for_khiops(self, target_dir, sort=True): + def create_table_files_for_khiops(self, out_dir, sort=True): """Prepares the tables of the dataset to be used by Khiops If this is a multi-table dataset it will create sorted copies the tables. Parameters ---------- - target_dir : str - The directory where the sorted tables will be created + out_dir : str + The directory where the sorted tables will be created. 
Returns ------- @@ -911,22 +907,27 @@ def create_table_files_for_khiops(self, target_dir, sort=True): sort_main_table = sort and ( self.is_multitable() or self.main_table.key is not None ) - main_table_path = self.main_table.create_table_file_for_khiops( - target_dir, sort=sort_main_table - ) + if self.is_in_memory(): + main_table_path = self.main_table.create_table_file_for_khiops( + out_dir, + sort=sort_main_table, + target_column=self.target_column, + target_column_id=self.target_column_id, + ) + else: + main_table_path = self.main_table.create_table_file_for_khiops( + out_dir, + sort=sort_main_table, + ) # Create a copy of each secondary table secondary_table_paths = {} for table in self.secondary_tables: secondary_table_paths[table.name] = table.create_table_file_for_khiops( - target_dir, sort=sort + out_dir, sort=sort ) - return main_table_path, secondary_table_paths - @property - def target_column_type(self): - """The target column's type""" - return self.main_table.target_column_type + return main_table_path, secondary_table_paths def __repr__(self): return str(self.create_khiops_dictionary_domain()) @@ -935,7 +936,7 @@ def __repr__(self): class DatasetTable(ABC): """A generic dataset table""" - def __init__(self, name, categorical_target=True, key=None): + def __init__(self, name, key=None): # Check input if not isinstance(name, str): raise TypeError(type_error_message("name", name, str)) @@ -957,12 +958,10 @@ def __init__(self, name, categorical_target=True, key=None): # Initialization (must be completed by concrete sub-classes) self.name = name self.data_source = None - self.categorical_target = categorical_target if is_list_like(key) or key is None: self.key = key else: self.key = [key] - self.target_column_id = None self.column_ids = None self.khiops_types = None self.n_samples = None @@ -1010,43 +1009,25 @@ def create_khiops_dictionary(self): dictionary.key = list(self.key) # For each column add a Khiops variable to the dictionary - for column_id in self._get_all_column_ids(): + for column_id in self.column_ids: variable = kh.Variable() - - # Set the variable name for string and integer column indexes - if isinstance(column_id, str): - variable.name = str(column_id) - else: - assert isinstance(column_id, (np.int64, int)) - variable.name = f"Var{column_id}" + variable.name = get_khiops_variable_name(column_id) # Set the type of the column/variable # Case of a column in the key : Set to categorical if self.key is not None and column_id in self.key: variable.type = "Categorical" - # Case of the target column: Set to specified type - elif column_id == self.target_column_id: - assert self.target_column_id is not None - if self.categorical_target: - variable.type = "Categorical" - else: - variable.type = "Numerical" # The rest of columns: Obtain the type from dtypes else: variable.type = self.khiops_types[column_id] dictionary.add_variable(variable) return dictionary - @abstractmethod - def _get_all_column_ids(self): - """Returns the column ids including the target""" - class PandasTable(DatasetTable): """Table encapsulating the features dataframe X and the target labels y - X is of type pandas.DataFrame. - y is of type pandas.Series or pandas.DataFrame. + X is of type pandas.DataFrame. y is array-like. Parameters ---------- @@ -1056,45 +1037,17 @@ class PandasTable(DatasetTable): The data frame to be encapsulated. 
key : list-like of str, optional The names of the columns composing the key - target_column : :external:term:`array-like`, optional - The array containing the target column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. """ - def __init__( - self, name, dataframe, key=None, target_column=None, categorical_target=True - ): + def __init__(self, name, dataframe, key=None): # Call the parent method - super().__init__(name, categorical_target=categorical_target, key=key) + super().__init__(name, key=key) # Check inputs specific to this sub-class if not isinstance(dataframe, pd.DataFrame): raise TypeError(type_error_message("dataframe", dataframe, pd.DataFrame)) if dataframe.shape[0] == 0: raise ValueError("'dataframe' is empty") - if target_column is not None: - if not hasattr(target_column, "__array__"): - raise TypeError( - type_error_message("target_column", target_column, "array-like") - ) - if isinstance(target_column, pd.Series): - if ( - target_column.name is not None - and target_column.name in dataframe.columns - ): - raise ValueError( - f"Target series name '{target_column.name}' " - f"is already present in dataframe : {list(dataframe.columns)}" - ) - elif isinstance(target_column, pd.DataFrame): - number_of_target_columns = len(target_column.columns) - if number_of_target_columns != 1: - raise ValueError( - "Target dataframe should contain exactly one column. " - f"It contains {number_of_target_columns}." - ) - target_column = target_column.iloc[:, 0] # Initialize the attributes self.data_source = dataframe @@ -1124,21 +1077,6 @@ def __init__( for column_id in self.column_ids } - # Initialize target column (if any) - self.target_column = target_column - if self.target_column is not None: - if ( - isinstance(self.target_column, pd.Series) - and self.target_column.name is not None - ): - self.target_column_id = target_column.name - else: - if pd.api.types.is_integer_dtype(self.column_ids): - self.target_column_id = self.column_ids[-1] + 1 - else: - assert pd.api.types.is_string_dtype(self.column_ids) - self.target_column_id = "UnknownTargetColumn" - # Check key integrity self.check_key() @@ -1151,35 +1089,31 @@ def __repr__(self): f"dtypes={dtypes_str}; target={self.target_column_id}>" ) - def _get_all_column_ids(self): - if self.target_column is not None: - all_column_ids = list(self.column_ids) + [self.target_column_id] - else: - all_column_ids = list(self.column_ids) - return all_column_ids - - def get_khiops_variable_name(self, column_id): - """Return the khiops variable name associated to a column id""" - assert column_id == self.target_column_id or column_id in self.column_ids - if isinstance(column_id, str): - variable_name = column_id - else: - assert isinstance(column_id, np.int64) - variable_name = f"Var{column_id}" - return variable_name - - def create_table_file_for_khiops(self, output_dir, sort=True): + def create_table_file_for_khiops( + self, output_dir, sort=True, target_column=None, target_column_id=None + ): assert not sort or self.key is not None, "Cannot sort table without a key" assert not sort or is_list_like( self.key ), "Cannot sort table with a key is that is not list-like" assert not sort or len(self.key) > 0, "Cannot sort table with an empty key" + assert target_column is not None or target_column_id is None + assert target_column_id is not None or target_column is None # Create the output table resource object output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") # Write the output dataframe 
output_dataframe = self._create_dataframe_copy() + output_names = { + column_id: get_khiops_variable_name(column_id) + for column_id in self.column_ids + } + output_dataframe.rename(columns=output_names, inplace=True) + if target_column is not None: + output_dataframe[get_khiops_variable_name(target_column_id)] = ( + target_column.copy() + ) # Sort by key if requested (as string) if sort: @@ -1200,43 +1134,8 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path def _create_dataframe_copy(self): - """Creates an in-memory copy of the dataframe with the target column""" - # Create a copy of the dataframe and add a copy of the target column (if any) - if self.target_column is not None: - if ( - isinstance(self.target_column, pd.Series) - and self.target_column.name is not None - ): - output_target_column = self.target_column.reset_index(drop=True) - else: - output_target_column = pd.Series( - self.target_column, name=self.target_column_id - ) - output_dataframe = pd.concat( - [self.data_source.reset_index(drop=True), output_target_column], - axis=1, - ) - else: - output_dataframe = self.data_source.copy() - - # Rename the columns - output_dataframe_column_names = {} - for column_id in self._get_all_column_ids(): - output_dataframe_column_names[column_id] = self.get_khiops_variable_name( - column_id - ) - output_dataframe.rename( - output_dataframe_column_names, axis="columns", inplace=True - ) - - return output_dataframe - - @property - def target_column_type(self): - target_column_type = None - if self.target_column is not None: - target_column_type = self.target_column.dtype - return target_column_type + """Creates an in memory copy of the dataframe""" + return self.data_source.copy() class NumpyTable(DatasetTable): @@ -1250,38 +1149,19 @@ class NumpyTable(DatasetTable): The data frame to be encapsulated. key : :external:term`array-like` of int, optional The names of the columns composing the key - target_column : :external:term:`array-like` of shape (n_samples,) , optional - The series representing the target column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. 
""" - def __init__( - self, name, array, key=None, target_column=None, categorical_target=True - ): + def __init__(self, name, array, key=None): # Call the parent method - super().__init__(name, key=key, categorical_target=categorical_target) + super().__init__(name, key=key) # Check the array's types and shape if not hasattr(array, "__array__"): raise TypeError(type_error_message("array", array, np.ndarray)) - # Check (and potentially transform with a copy) the array's data - checked_array = check_array(array, ensure_2d=True, force_all_finite=False) - - # Check the target's types and shape - if target_column is not None: - checked_target_column = column_or_1d(target_column, warn=True) - # Initialize the members - self.data_source = checked_array - self.column_ids = list(range(self.data_source.shape[1])) - self.target_column_id = self.data_source.shape[1] - if target_column is not None: - self.target_column = checked_target_column - else: - self.target_column = None - self.categorical_target = categorical_target + self.data_source = check_array(array, ensure_2d=True, force_all_finite=False) + self.column_ids = column_or_1d(range(self.data_source.shape[1])) self.khiops_types = { column_id: get_khiops_type(self.data_source.dtype) for column_id in self.column_ids @@ -1295,23 +1175,9 @@ def __repr__(self): f"dtype={dtype_str}; target={self.target_column_id}>" ) - def _get_all_column_ids(self): - n_columns = len(self.column_ids) - if self.target_column is not None: - n_columns += 1 - return list(range(n_columns)) - - def get_khiops_variable_name(self, column_id): - """Return the khiops variable name associated to a column id""" - assert column_id == self.target_column_id or column_id in self.column_ids - if isinstance(column_id, str): - variable_name = column_id - else: - assert isinstance(column_id, (np.int64, int)) - variable_name = f"Var{column_id}" - return variable_name - - def create_table_file_for_khiops(self, output_dir, sort=True): + def create_table_file_for_khiops( + self, output_dir, sort=True, target_column=None, target_column_id=None + ): assert not sort or self.key is not None, "Cannot sort table without a key" assert not sort or is_list_like( self.key @@ -1324,9 +1190,13 @@ def create_table_file_for_khiops(self, output_dir, sort=True): # Write the output dataframe # Note: This is not optimized for memory. output_dataframe = pd.DataFrame(self.data_source.copy()) - output_dataframe.columns = [f"Var{column_id}" for column_id in self.column_ids] - if self.target_column is not None: - output_dataframe[f"Var{self.target_column_id}"] = self.target_column + output_dataframe.columns = [ + get_khiops_variable_name(column_id) for column_id in self.column_ids + ] + if target_column is not None: + output_dataframe[get_khiops_variable_name(target_column_id)] = ( + target_column.copy() + ) # Sort by key if requested (as string) if sort: @@ -1347,13 +1217,6 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path - @property - def target_column_type(self): - target_column_type = None - if self.target_column is not None: - target_column_type = self.target_column.dtype - return target_column_type - class SparseTable(DatasetTable): """Table encapsulating feature matrix X and target array y @@ -1369,18 +1232,12 @@ class SparseTable(DatasetTable): The sparse matrix to be encapsulated. key : list-like of str, optional The names of the columns composing the key - target_column : :external:term:`array-like`, optional - The array containing the target column. 
- categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. """ - def __init__( - self, name, matrix, key=None, target_column=None, categorical_target=True - ): + def __init__(self, name, matrix, key=None): assert key is None, "'key' must be unset for sparse matrix tables" # Call the parent method - super().__init__(name, key=key, categorical_target=categorical_target) + super().__init__(name, key=key) # Check the sparse matrix types if not isinstance(matrix, sp.spmatrix): @@ -1392,21 +1249,11 @@ def __init__( type_error_message("'matrix' dtype", matrix.dtype, "numeric") ) - # Check the target's types - if target_column is not None and not hasattr(target_column, "__array__"): - raise TypeError( - type_error_message("target_column", target_column, "array-like") - ) - # Initialize the members self.data_source = matrix - self.column_ids = list(range(self.data_source.shape[1])) - self.target_column_id = self.data_source.shape[1] - self.target_column = target_column - self.categorical_target = categorical_target + self.column_ids = column_or_1d(range(matrix.shape[1])) self.khiops_types = { - column_id: get_khiops_type(self.data_source.dtype) - for column_id in self.column_ids + column_id: get_khiops_type(matrix.dtype) for column_id in self.column_ids } self.n_samples = self.data_source.shape[0] @@ -1414,7 +1261,7 @@ def __repr__(self): dtype_str = str(self.data_source.dtype) return ( f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " - f"dtype={dtype_str}; target={self.target_column_id}>" + f"dtype={dtype_str}>" ) def create_khiops_dictionary(self): @@ -1438,34 +1285,14 @@ def create_khiops_dictionary(self): # For each variable, add metadata, named `VarKey` variable_names = [variable.name for variable in dictionary.variables] - target_column_variable_name = self.get_khiops_variable_name( - self.target_column_id - ) for i, variable_name in enumerate(variable_names, 1): - if variable_name != target_column_variable_name: - variable = dictionary.remove_variable(variable_name) - variable.meta_data.add_value("VarKey", i) - variable_block.add_variable(variable) + variable = dictionary.remove_variable(variable_name) + variable.meta_data.add_value("VarKey", i) + variable_block.add_variable(variable) dictionary.add_variable_block(variable_block) return dictionary - def _get_all_column_ids(self): - n_columns = len(self.column_ids) - if self.target_column is not None: - n_columns += 1 - return list(range(n_columns)) - - def get_khiops_variable_name(self, column_id): - """Return the khiops variable name associated to a column id""" - assert column_id == self.target_column_id or column_id in self.column_ids - if isinstance(column_id, str): - variable_name = column_id - else: - assert isinstance(column_id, (np.int64, int)) - variable_name = f"Var{column_id}" - return variable_name - def _flatten(self, iterable): if isinstance(iterable, Iterable): for iterand in iterable: @@ -1474,13 +1301,9 @@ def _flatten(self, iterable): else: yield iterand - def _write_sparse_block(self, row_index, stream, target=None): - assert row_index in range( - self.data_source.shape[0] - ), "'row_index' must be coherent with the shape of the sparse matrix" - if target is not None: - assert target in self.target_column, "'target' must be in the target column" - stream.write(f"{target}\t") + def _write_sparse_block(self, row_index, stream, target_value=None): + + # Access the sparse row row = self.data_source.getrow(row_index) # Variable indices are not always sorted in `row.indices` # 
Khiops needs variable indices to be sorted @@ -1499,26 +1322,35 @@ def _write_sparse_block(self, row_index, stream, target=None): ] for variable_index, variable_value in zip(sorted_indices, sorted_data): stream.write(f"{variable_index + 1}:{variable_value} ") - stream.write("\n") - def create_table_file_for_khiops(self, output_dir, sort=True): + # Write the target value at the end of the record if available + if target_value is not None: + stream.write(f"\t{target_value}\n") + else: + stream.write("\n") + + def create_table_file_for_khiops( + self, output_dir, sort=True, target_column=None, target_column_id=None + ): + assert target_column is not None or target_column_id is None + assert target_column_id is not None or target_column is None + # Create the output table resource object output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") # Write the sparse matrix to an internal table file with io.StringIO() as output_sparse_matrix_stream: - if self.target_column is not None: - target_column_name = self.get_khiops_variable_name( - self.target_column_id - ) + if target_column is not None: output_sparse_matrix_stream.write( - f"{target_column_name}\tSparseVariables\n" + f"SparseVariables\t{get_khiops_variable_name(target_column_id)}\n" ) - for target, row_index in zip( - self.target_column, range(self.data_source.shape[0]) + for target_value, row_index in zip( + target_column, range(self.data_source.shape[0]) ): self._write_sparse_block( - row_index, output_sparse_matrix_stream, target=target + row_index, + output_sparse_matrix_stream, + target_value=target_value, ) else: output_sparse_matrix_stream.write("SparseVariables\n") @@ -1531,13 +1363,6 @@ def create_table_file_for_khiops(self, output_dir, sort=True): return output_table_path - @property - def target_column_type(self): - target_column_type = None - if self.target_column is not None: - target_column_type = self.target_column.dtype - return target_column_type - class FileTable(DatasetTable): """A table representing a delimited text file @@ -1554,24 +1379,18 @@ class FileTable(DatasetTable): Indicates if the table key : list-like of str, optional The names of the columns composing the key - target_column_id : str, optional - Name of the target variable column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. 
""" def __init__( self, name, path, - target_column_id=None, - categorical_target=True, key=None, sep="\t", header=True, ): # Initialize parameters - super().__init__(name=name, categorical_target=categorical_target, key=key) + super().__init__(name=name, key=key) # Check the parameters specific to this sub-class if not isinstance(path, str): @@ -1583,7 +1402,6 @@ def __init__( self.data_source = path self.sep = sep self.header = header - self.target_column_id = target_column_id # Build a dictionary file from the input data table # Note: We use export_dictionary_as_json instead of read_dictionary_file @@ -1618,33 +1436,9 @@ def __init__( self.column_ids = [var["name"] for var in variables] self.khiops_types = {var["name"]: var["type"] for var in variables} - # Check the target column exists - if ( - self.target_column_id is not None - and target_column_id not in self.column_ids - ): - raise ValueError( - f"Target column '{target_column_id}'" - f"not present in columns '{self.column_ids}'" - ) - - # Force the target column type from the parameters - if self.target_column_id is not None: - if categorical_target: - self.khiops_types[target_column_id] = "Categorical" - else: - self.khiops_types[target_column_id] = "Numerical" - # Check key integrity self.check_key() - def _get_all_column_ids(self): - return list(self.column_ids) - - def get_khiops_variable_name(self, column_id): - assert column_id in self._get_all_column_ids() - return column_id - def create_table_file_for_khiops(self, output_dir, sort=True): assert not sort or self.key is not None, "key is 'None'" @@ -1687,12 +1481,3 @@ def create_table_file_for_khiops(self, output_dir, sort=True): fs.write(output_table_file_path, fs.read(self.data_source)) return output_table_file_path - - @property - def target_column_type(self): - target_column_type = None - if self.target_column_id is not None: - target_column_type = ( - "Categorical" if self.categorical_target else "Numerical" - ) - return target_column_type diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index e8707dd4..9662e224 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -524,8 +524,8 @@ def test_out_file_from_dataframe_monotable(self): # Check that the dataframes are equal assert_frame_equal( - out_table, ref_table.sort_values(by="User_ID").reset_index(drop=True), + out_table, ) def test_out_file_from_numpy_array_monotable(self): @@ -560,12 +560,14 @@ def _create_test_sparse_matrix_with_target(self): return sparse_matrix, target_array def _load_khiops_sparse_file(self, stream): - # skip header + # Skip header next(stream) + + # Read the sparse file target_vector = [] feature_matrix = [] for line in stream: - target, features = line.split(b"\t") + features, target_value = line.split(b"\t") feature_row = np.zeros(100) for feature in features.strip().split(b" "): indexed_feature = feature.split(b":") @@ -578,7 +580,7 @@ def _load_khiops_sparse_file(self, stream): feature_index, feature_value = indexed_feature feature_row[int(feature_index) - 1] = float(feature_value) feature_matrix.append(feature_row) - target_vector.append(float(target)) + target_vector.append(float(target_value)) target_array = np.array(target_vector) sparse_matrix = sp.csr_matrix(feature_matrix) return sparse_matrix, target_array diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index 314cff70..457d6529 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -448,14 +448,44 @@ def 
test_y_type_must_be_str_or_array_like_1d(self): dataframe.to_csv(table_path, sep="\t", index=False) tuple_spec = (table_path, "\t") bad_y = dataframe["class"] - expected_msg = type_error_message("y", bad_y, str) + expected_msg = ( + type_error_message("y", bad_y, str) + + " (X's tables are of type str [file paths])" + ) self.assert_dataset_fails(tuple_spec, bad_y, TypeError, expected_msg) # Test when X is a dataframe: expects array-like - bad_y = AnotherType() - expected_msg = type_error_message("y", bad_y, "array-like") + bad_y = "TargetColumn" + expected_msg = ( + type_error_message("y", bad_y, "array-like") + + " (X's tables are of type pandas.DataFrame)" + ) self.assert_dataset_fails(dataframe, bad_y, TypeError, expected_msg) + def test_df_dataset_fails_if_target_column_is_already_in_the_features(self): + """Test in-memory table failing when the target is already in the features""" + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) + features_table = spec["tables"]["Reviews"][0] + bad_y = features_table["Recommended IND"] + with self.assertRaises(ValueError) as context: + Dataset(spec, bad_y) + output_error_msg = str(context.exception) + expected_msg_prefix = ( + "Target column name 'Recommended IND' is already present in the main table." + ) + self.assertIn(expected_msg_prefix, output_error_msg) + + def test_file_dataset_fails_if_table_does_not_contain_the_target_column(self): + """Test FileTable failing if the table does not contain the target column""" + table_path = os.path.join(self.output_dir, "table.csv") + table = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + table.to_csv(table_path, sep="\t", index=False) + with self.assertRaises(ValueError) as context: + Dataset({"tables": {"main_table": (table_path, None)}}, y="TargetColumn") + output_error_msg = str(context.exception) + expected_msg_prefix = "Target column 'TargetColumn' not present in" + self.assertIn(expected_msg_prefix, output_error_msg) + ##################################### # Tests for dictionary dataset spec # ##################################### @@ -648,11 +678,11 @@ def test_dict_spec_format_tuple_1st_element_must_be_a_single_character(self): self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_y_type_must_be_series_or_df_when_x_is_df_spec(self): - """Test Dataset raising TypeError if X a is df-dict-spec and y isn't a Series""" + """Test Dataset raising TypeError if X a is ds-spec and y isn't array-like""" spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) - bad_y = AnotherType() + bad_y = "TargetColumnName" expected_msg = ( - type_error_message("y", bad_y, pd.Series, pd.DataFrame) + type_error_message("y", bad_y, "array-like") + " (X's tables are of type pandas.DataFrame)" ) self.assert_dataset_fails(spec, bad_y, TypeError, expected_msg) @@ -662,7 +692,7 @@ def test_dict_spec_y_must_be_str_when_x_is_file_spec(self): spec, _ = self.create_fixture_dataset_spec( output_dir=self.output_dir, data_type="file" ) - bad_y = AnotherType() + bad_y = np.array([1, 2, 3]) expected_msg = ( type_error_message("y", bad_y, str) + " (X's tables are of type str [file paths])" @@ -671,13 +701,12 @@ def test_dict_spec_y_must_be_str_when_x_is_file_spec(self): def test_dict_spec_table_name_must_be_str(self): """Test Dataset raising TypeError when a table name is not a str""" - spec, y = self.create_fixture_dataset_spec(multitable=False, schema=None) + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) features_table = 
spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: PandasTable( AnotherType(), features_table, - target_column=y, ) output_error_msg = str(context.exception) expected_msg = type_error_message("name", AnotherType(), str) @@ -696,41 +725,30 @@ def test_dict_spec_key_type_must_be_str_or_list_like(self): """Test Dataset raising TypeError when a key is not of the proper type""" bad_key = AnotherType() expected_error_msg = type_error_message("key", bad_key, str, int, "list-like") - dataset_spec, label = self.create_fixture_dataset_spec( + dataset_spec, _ = self.create_fixture_dataset_spec( multitable=False, schema=None ) features_table = dataset_spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=label, - categorical_target=True, - key=bad_key, - ) + PandasTable("reviews", features_table, key=bad_key) output_error_msg = str(context.exception) self.assertEqual(output_error_msg, expected_error_msg) def test_dict_spec_key_column_type_must_be_str_or_int(self): """Test Dataset raising TypeError when a key column is not of the proper type""" - bad_key = {"not-a-str-or-int": []} + bad_key = [AnotherType()] expected_error_msg = ( - type_error_message("key[0]", bad_key, str, int) + " at table 'reviews'" + type_error_message("key[0]", AnotherType(), str, int) + + " at table 'reviews'" ) - dataset_spec, label = self.create_fixture_dataset_spec( + dataset_spec, _ = self.create_fixture_dataset_spec( multitable=False, schema=None ) features_table = dataset_spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=label, - categorical_target=True, - key=[bad_key], - ) + PandasTable("reviews", features_table, key=bad_key) output_error_msg = str(context.exception) - self.assertEqual(output_error_msg, expected_error_msg) + self.assertEqual(expected_error_msg, output_error_msg) def test_dict_spec_relations_must_be_list_like(self): """Test Dataset raising TypeError when dict spec "relations" is a dict-like""" @@ -866,59 +884,18 @@ def test_pandas_table_input_type_must_be_dataframe(self): def test_pandas_table_input_table_must_not_be_empty(self): """Test PandasTable raising ValueError if the input dataframe is empty""" with self.assertRaises(ValueError) as context: - PandasTable( - "reviews", - pd.DataFrame(), - target_column="class", - ) + PandasTable("reviews", pd.DataFrame()) output_error_msg = str(context.exception) expected_msg = "'dataframe' is empty" self.assertEqual(output_error_msg, expected_msg) - def test_pandas_table_target_column_must_be_series(self): - """Test PandasTable raising TypeError if the input target col. 
isn't a Series""" - dataset_spec, _ = self.create_fixture_dataset_spec( - multitable=False, schema=None - ) - features_table = dataset_spec["tables"]["Reviews"][0] - with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=AnotherType(), - ) - output_error_msg = str(context.exception) - expected_msg = type_error_message("target_column", AnotherType(), "array-like") - self.assertEqual(output_error_msg, expected_msg) - - def test_pandas_table_fails_if_target_column_is_already_in_the_features(self): - """Test in-memory table failing when the target is already in the features""" - dataset_spec, _ = self.create_fixture_dataset_spec( - multitable=False, schema=None - ) - features_table = dataset_spec["tables"]["Reviews"][0] - y = features_table["Recommended IND"] - with self.assertRaises(ValueError) as context: - PandasTable( - "reviews", - features_table, - target_column=y, - ) - output_error_msg = str(context.exception) - expected_msg = ( - "Target series name 'Recommended IND' is already present in" - " dataframe : ['User_ID', 'Age', 'Clothing ID', 'Date', 'New'," - " 'Title', 'Recommended IND', 'Positive Feedback average']" - ) - self.assertEqual(output_error_msg, expected_msg) - def test_pandas_table_column_ids_must_all_be_int_or_str(self): """Test that in-memory dataset all columns ids must be int or str""" - spec, y = self.create_fixture_dataset_spec(multitable=False, schema=None) + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) features_table = spec["tables"]["Reviews"][0] features_table.rename(columns={"User_ID": 1}, inplace=True) with self.assertRaises(TypeError) as context: - PandasTable("reviews", features_table, target_column=y) + PandasTable("reviews", features_table) output_error_msg = str(context.exception) expected_msg = ( "Dataframe column ids must be either all integers or all " @@ -929,21 +906,10 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self): def test_file_table_fails_with_non_existent_table_file(self): """Test FileTable failing when it is created with a non-existent file""" with self.assertRaises(ValueError) as context: - FileTable("reviews", "Review.csv", target_column_id="class") + FileTable("reviews", "Review.csv") output_error_msg = str(context.exception) expected_msg = "Non-existent data table file: Review.csv" - self.assertEqual(output_error_msg, expected_msg) - - def test_file_table_fails_if_table_does_not_contain_the_target_column(self): - """Test FileTable failing if the table does not contain the target column""" - table_path = os.path.join(self.output_dir, "table.csv") - table = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - table.to_csv(table_path, sep="\t", index=False) - with self.assertRaises(ValueError) as context: - table = FileTable("table", table_path, target_column_id="class") - output_error_msg = str(context.exception) - expected_msg_prefix = "Target column" - self.assertIn(expected_msg_prefix, output_error_msg) + self.assertEqual(expected_msg, output_error_msg) def test_file_table_internal_file_creation_fails_on_an_existing_path(self): """Test FileTable failing to create an internal file to a existing path""" @@ -953,12 +919,7 @@ def test_file_table_internal_file_creation_fails_on_an_existing_path(self): old_file_path = spec["tables"]["Reviews"][0] new_file_path = old_file_path.replace("Reviews.csv", "copy_Reviews.txt") os.rename(old_file_path, new_file_path) - file_table = FileTable( - "Reviews", - new_file_path, - target_column_id="class", - key="User_ID", - ) 
+ file_table = FileTable("Reviews", new_file_path, key="User_ID") with self.assertRaises(ValueError) as context: file_table.create_table_file_for_khiops(self.output_dir, sort=False) output_error_msg = str(context.exception) @@ -1015,6 +976,7 @@ def test_sequence_spec_must_be_str_or_df(self): # Test that the second element is not str bad_spec = ["table_1", AnotherType()] expected_msg = ( - type_error_message("X[1]", bad_spec[1], str) + " as the first table in X" + type_error_message("Table at index 1", bad_spec[1], str) + + " as the first table in X" ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 8bafaf34..f9ce3872 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -2271,20 +2271,37 @@ def test_sklearn_check_estimator(self): KhiopsEncoder(n_trees=0, transform_type_numerical="0-1_normalization"), ] - # Execute sklearn's estimator test battery - for khiops_estimator in khiops_estimators: - for estimator, check in check_estimator( - khiops_estimator, generate_only=True - ): - # Skip some checks for KhiopsEncoder as they yield "empty" - # deployed tables; they need to be implemented manually - check_name = check.func.__name__ - if check_name in [ - "check_fit_score_takes_y", - "check_fit_idempotent", - ] and isinstance(estimator, KhiopsEncoder): - continue - with self.subTest( - sklearn_check_name=check_name, sklearn_check_kwargs=check.keywords + # Ignore the "No informative variables" warnings + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"[\S\n\t\v ]+no informative variables" + ) + warnings.filterwarnings( + "ignore", message=r"[\S\n\t\v ]+No informative input variable" + ) + + # Execute sklearn's estimator test battery + print("") + for khiops_estimator in khiops_estimators: + for estimator, check in check_estimator( + khiops_estimator, generate_only=True ): - check(estimator) + # Skip some checks for KhiopsEncoder as they yield "empty" + # deployed tables; they need to be implemented manually + check_name = check.func.__name__ + if check_name in [ + "check_fit_score_takes_y", + "check_fit_idempotent", + ] and isinstance(estimator, KhiopsEncoder): + continue + print( + f">>> Executing {check_name} on " + f"{estimator.__class__.__name__}... ", + end="", + ) + with self.subTest( + sklearn_check_name=check_name, + sklearn_check_kwargs=check.keywords, + ): + check(estimator) + print("Done") From fd6a90940d13152f6a522789ed6afc4091119f61 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Fri, 14 Jun 2024 14:34:54 +0200 Subject: [PATCH 08/12] Ease conditions for input in predict* methods Before this commit input tables needed to have the same number of columns, names and types as the model dictionary. The columns needed also to be in the same order. Now the conditions are the following for the predict* and transform methods: - Columns must have the same names regardless the order of the input table. - An additional flexibility with supervised models: the target column may be present in the input table. - The types must be the same for the input but the following case is allowed: - If a given column has Numerical type as input but the model is Categorical, then it is coerced to categorical with a warning. 
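
As an illustration of the relaxed behaviour, a minimal sketch (toy data; the
column names "a", "b" and "target" are made up, only KhiopsClassifier, fit and
predict are the actual API):

    import pandas as pd

    from khiops.sklearn import KhiopsClassifier

    X_train = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["u", "v", "u", "v"]})
    y_train = pd.Series([0, 1, 0, 1], name="target")
    clf = KhiopsClassifier().fit(X_train, y_train)

    # Columns given in a different order and the target column left in the
    # table: both are now accepted. A numerical input column matching a
    # categorical model variable would be coerced with a warning instead of
    # raising an error.
    X_test = pd.DataFrame({"b": ["v", "u"], "target": [1, 0], "a": [4, 1]})
    y_pred = clf.predict(X_test)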
--- khiops/sklearn/estimators.py | 125 ++++++++++++++++++++++++----------- khiops/utils/dataset.py | 27 +++++++- 2 files changed, 111 insertions(+), 41 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 0c7b9006..3d899c3c 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -93,38 +93,55 @@ def _check_dictionary_compatibility( model_dictionary, ds_dictionary, estimator_class_name, + target_variable_name=None, ): # Prefix for all error messages - error_msg_prefix = f"X contains incompatible table '{ds_dictionary.name}'" - - # Save variable arrays and their size - model_variables = model_dictionary.variables - dataset_variables = ds_dictionary.variables + error_msg_prefix = ( + f"Model {estimator_class_name} incompatible with " + f"table '{ds_dictionary.name}'" + ) + + # Put the variable names in sets + model_variable_names = {var.name for var in model_dictionary.variables} + ds_variable_names = {var.name for var in ds_dictionary.variables} + + # The only feature that may be missing of the dataset is the target + model_var_names_not_in_ds = model_variable_names - ds_variable_names + if len(model_var_names_not_in_ds) > 0: + if target_variable_name is None: + effective_model_var_names_not_in_ds = model_var_names_not_in_ds + else: + effective_model_var_names_not_in_ds = model_var_names_not_in_ds - { + target_variable_name + } + if len(effective_model_var_names_not_in_ds) > 0: + raise ValueError( + f"{error_msg_prefix}: Missing features: " + f"{effective_model_var_names_not_in_ds}." + ) - # Error if different number of variables - if len(model_variables) != len(dataset_variables): + # Raise an error if there are extra features in the input + ds_var_names_not_in_model = ds_variable_names - model_variable_names + if len(ds_var_names_not_in_model) > 0: raise ValueError( - f"{error_msg_prefix}: It has " - f"{len(dataset_variables)} feature(s) but {estimator_class_name} " - f"is expecting {len(model_variables)}. Reshape your data." + f"{error_msg_prefix}: Features not in model: {ds_var_names_not_in_model}." ) - # Check variables: Must have same name and type - for var_index, (model_variable, dataset_variable) in enumerate( - zip(model_variables, dataset_variables) - ): - if model_variable.name != dataset_variable.name: - raise ValueError( - f"{error_msg_prefix}: Feature #{var_index} should be named " - f"'{model_variable.name}' " - f"instead of '{dataset_variable.name}'" - ) - if model_variable.type != dataset_variable.type: - raise ValueError( - f"{error_msg_prefix}: Feature #{var_index} should convertible to " - f"'{model_variable.type}' " - f"instead of '{dataset_variable.type}'" - ) + # Check the type + for ds_var in ds_dictionary.variables: + model_var = model_dictionary.get_variable(ds_var.name) + if ds_var.type != model_var.type: + if model_var.type == "Categorical": + warnings.warn( + f"X contains variable '{ds_var.name}' which was deemed " + "numerical. It will be coerced to categorical." 
+ ) + else: + raise ValueError( + f"{error_msg_prefix}: Khiops type for variable " + f"'{ds_var.name}' should be '{model_var.type}' " + f"not '{ds_var.type}'" + ) def _check_categorical_target_type(ds): @@ -419,7 +436,7 @@ def _transform( deployment_ds = self._transform_create_deployment_dataset(ds, computation_dir) # Create a deployment dictionary - deployment_dictionary_domain = _transform_create_deployment_model_fun() + deployment_dictionary_domain = _transform_create_deployment_model_fun(ds) # Deploy the model output_table_path = self._transform_deploy_model( @@ -1296,8 +1313,8 @@ def _transform_create_deployment_dataset(self, ds, computation_dir): return Dataset(deploy_dataset_spec) - def _transform_prepare_deployment_model_for_predict(self): - return self.model_ + def _transform_prepare_deployment_model_for_predict(self, _): + return self.model_.copy() def _transform_deployment_post_process( self, deployment_ds, output_table_path, drop_key @@ -1527,12 +1544,6 @@ def _fit_training_post_process(self, ds): if self.model_main_dictionary_name_ is None: raise ValueError("No model dictionary after Khiops call") - # Remove the target variable in the model dictionary - model_main_dictionary = self.model_.get_dictionary( - self.model_main_dictionary_name_ - ) - model_main_dictionary.remove_variable(self.model_target_variable_name_) - # Extract, from the preparation reports, the number of evaluated features, # their names and their levels univariate_preparation_report = self.model_report_.preparation_report @@ -1623,6 +1634,7 @@ def _transform_check_dataset(self, ds): _extract_basic_dictionary(self._get_main_dictionary()), ds.main_table.create_khiops_dictionary(), self.__class__.__name__, + target_variable_name=self.model_target_variable_name_, ) # Multi-table model: Check name and dictionary coherence of secondary tables @@ -1714,10 +1726,13 @@ def predict(self, X): assert isinstance(y_pred, (str, pd.DataFrame)), "Expected str or DataFrame" return y_pred - def _transform_prepare_deployment_model_for_predict(self): + def _transform_prepare_deployment_model_for_predict(self, ds): assert ( self._predicted_target_meta_data_tag is not None ), "Predicted target metadata tag is not set" + assert hasattr( + self, "model_main_dictionary_name_" + ), "Model main dictionary name has not been set" # Create a copy of the model dictionary using only the predicted target # Also activate the key to reorder the output in the multitable case @@ -1730,6 +1745,12 @@ def _transform_prepare_deployment_model_for_predict(self): variable.used = True else: variable.used = False + + # Remove the target variable if it is not present in the input dataset + # Note: We use `list` to avoid a warning of numpy about the `in` operator + if self.model_target_variable_name_ not in list(ds.main_table.column_ids): + model_dictionary.remove_variable(self.model_target_variable_name_) + return model_copy def get_feature_used_statistics(self, modeling_report): @@ -2068,7 +2089,7 @@ def predict(self, X): y_pred = y_pred.astype(str, copy=False) # If category first coerce the type to the categories' type else: - assert pd.api.types.is_categorical_dtype(self._original_target_dtype), ( + assert isinstance(self._original_target_dtype, pd.CategoricalDtype), ( "_original_target_dtype is not categorical" f", it is '{self._original_target_dtype}'" ) @@ -2149,7 +2170,11 @@ def predict_proba(self, X): assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray" return y_probas - def 
_transform_prepare_deployment_model_for_predict_proba(self): + def _transform_prepare_deployment_model_for_predict_proba(self, ds): + assert hasattr( + self, "model_target_variable_name_" + ), "Target variable name has not been set" + # Create a copy of the model dictionary with only the probabilities used # We also activate the key to reorder the output in the multitable case model_copy = self.model_.copy() @@ -2163,6 +2188,11 @@ def _transform_prepare_deployment_model_for_predict_proba(self): else: variable.used = False + # Remove the target variable if it is not present in the input dataset + # Note: We use `list` to avoid a warning of numpy about the `in` operator + if self.model_target_variable_name_ not in list(ds.main_table.column_ids): + model_dictionary.remove_variable(self.model_target_variable_name_) + return model_copy @@ -2689,7 +2719,7 @@ def _fit_training_post_process(self, ds): # Save the encoded feature names self.feature_names_out_ = [] for variable in self._get_main_dictionary().variables: - if variable.used: + if variable.used and variable.name != ds.target_column_id: self.feature_names_out_.append(variable.name) # Activate the key columns in multitable @@ -2736,7 +2766,7 @@ def transform(self, X): X_transformed = super()._transform( ds, computation_dir, - self.model_.copy, + self._transform_prepare_deployment_model, True, "transform.txt", ) @@ -2748,6 +2778,21 @@ def transform(self, X): return X_transformed.to_numpy(copy=False) return X_transformed + def _transform_prepare_deployment_model(self, ds): + assert hasattr( + self, "model_target_variable_name_" + ), "Target variable name has not been set" + + # Create a copy of the model dictionary domain with the target variable + # if it is not present in the input dataset + # Note: We use `list` to avoid a warning of numpy about the `in` operator + model_copy = self.model_.copy() + model_dictionary = model_copy.get_dictionary(self.model_main_dictionary_name_) + if self.model_target_variable_name_ not in list(ds.main_table.column_ids): + model_dictionary.remove_variable(self.model_target_variable_name_) + + return model_copy + def fit_transform(self, X, y=None, **kwargs): """Fit and transforms its inputs diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index 51f6a683..810bcea8 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -510,6 +510,11 @@ def __init__(self, X, y=None, categorical_target=True, key=None): if y is not None: self._init_target_column(y) + # Index the tables by name + self._tables_by_name = { + table.name: table for table in [self.main_table] + self.secondary_tables + } + # Post-conditions assert self.main_table is not None, "'main_table' is 'None' after init" assert isinstance( @@ -824,6 +829,26 @@ def copy(self): """ return Dataset(self.to_spec()) + def get_table(self, table_name): + """Returns a table by its name + + Parameters + ---------- + table_name: str + The name of the table to be retrieved. + + Returns + ------- + `DatasetTable` + The table object for the specified name. + + Raises + ------ + `KeyError` + If there is no table with the specified name. 
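+
+        A hypothetical usage sketch (the table name and dataframe are made up)::
+
+            df = pd.DataFrame({"id": [1, 2], "age": [30, 40]})
+            ds = Dataset({"tables": {"clients": (df, "id")}})
+            clients_table = ds.get_table("clients")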
+ """ + return self._tables_by_name[table_name] + def create_khiops_dictionary_domain(self): """Creates a Khiops dictionary domain representing this dataset @@ -1086,7 +1111,7 @@ def __repr__(self): ) return ( f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " - f"dtypes={dtypes_str}; target={self.target_column_id}>" + f"dtypes={dtypes_str}>" ) def create_table_file_for_khiops( From 6ddb43dedceb676e1ed8fc5fcf8961990d626153 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 17 Jun 2024 17:31:32 +0200 Subject: [PATCH 09/12] Implement train/test split dataset function --- khiops/utils/dataset.py | 8 +- khiops/utils/helpers.py | 206 +++++++++++++++- tests/test_helper_functions.py | 423 ++++++++++++++++++++++++++++++--- 3 files changed, 596 insertions(+), 41 deletions(-) diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index 810bcea8..b5465a4a 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -907,7 +907,7 @@ def create_khiops_dictionary_domain(self): return dictionary_domain - def create_table_files_for_khiops(self, out_dir, sort=True): + def create_table_files_for_khiops(self, output_dir, sort=True): """Prepares the tables of the dataset to be used by Khiops If this is a multi-table dataset it will create sorted copies the tables. @@ -934,14 +934,14 @@ def create_table_files_for_khiops(self, out_dir, sort=True): ) if self.is_in_memory(): main_table_path = self.main_table.create_table_file_for_khiops( - out_dir, + output_dir, sort=sort_main_table, target_column=self.target_column, target_column_id=self.target_column_id, ) else: main_table_path = self.main_table.create_table_file_for_khiops( - out_dir, + output_dir, sort=sort_main_table, ) @@ -949,7 +949,7 @@ def create_table_files_for_khiops(self, out_dir, sort=True): secondary_table_paths = {} for table in self.secondary_tables: secondary_table_paths[table.name] = table.create_table_file_for_khiops( - out_dir, sort=sort + output_dir, sort=sort ) return main_table_path, secondary_table_paths diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py index f052b376..01687f5e 100644 --- a/khiops/utils/helpers.py +++ b/khiops/utils/helpers.py @@ -1,7 +1,10 @@ """General helper functions""" +import itertools import os +from sklearn.model_selection import train_test_split + from khiops import core as kh from khiops.core.internals.common import is_dict_like, type_error_message from khiops.utils.dataset import Dataset, FileTable, PandasTable @@ -71,7 +74,6 @@ def _sort_df_table(table): def _sort_file_table(table, sep, header, output_dir): assert isinstance(table, FileTable), type_error_message("table", table, FileTable) - domain = kh.DictionaryDomain() dictionary = table.create_khiops_dictionary() domain.add_dictionary(dictionary) @@ -88,3 +90,205 @@ def _sort_file_table(table, sep, header, output_dir): ) return out_data_source + + +def train_test_split_dataset( + ds_spec, target_column=None, test_size=0.25, output_dir=None, **kwargs +): + # Check the types + if not is_dict_like(ds_spec): + raise TypeError(type_error_message("ds_spec", ds_spec, "dict-like")) + + # Build the dataset for the feature table + ds = Dataset(ds_spec) + + # Check the parameter coherence + if not ds.is_in_memory(): + if target_column is not None: + raise ValueError("'target_column' cannot be used with file path datasets") + if output_dir is None: + raise ValueError("'output_dir' must be specified for file path datasets") + if not isinstance(output_dir, str): + raise 
TypeError(type_error_message("output_dir", output_dir, str)) + + # Perform the split for each type of dataset + if ds.is_in_memory(): + # Obtain the keys for the other test_train_split function + sklearn_split_params = {} + for param in ("train_size", "random_state", "shuffle", "stratify"): + if param in kwargs: + sklearn_split_params[param] = kwargs[param] + + if target_column is None: + train_ds, test_ds = _train_test_split_in_memory_dataset( + ds, + target_column, + test_size=test_size, + split_params=sklearn_split_params, + ) + train_target_column = None + test_target_column = None + else: + train_ds, test_ds, train_target_column, test_target_column = ( + _train_test_split_in_memory_dataset( + ds, + target_column, + test_size=test_size, + split_params=sklearn_split_params, + ) + ) + else: + train_ds, test_ds = _train_test_split_file_dataset(ds, test_size, output_dir) + train_target_column = None + test_target_column = None + + # Create the return tuple + # Note: We use `itertools.chain` to avoid pylint false positive about + # unbalanced-tuple-unpacking. This warning appears when calling the function so + # users would be warned. To remove when the following issue is fixed: + # https://github.com/pylint-dev/pylint/issues/5671 + if target_column is None: + split = itertools.chain((train_ds.to_spec(), test_ds.to_spec())) + else: + split = itertools.chain( + ( + train_ds.to_spec(), + test_ds.to_spec(), + train_target_column, + test_target_column, + ) + ) + + return split + + +def _train_test_split_in_memory_dataset( + ds, target_column, test_size, split_params=None +): + # Create shallow copies of the feature dataset + train_ds = ds.copy() + test_ds = ds.copy() + + # Split the main table and the target (if any) + if target_column is None: + train_ds.main_table.data_source, test_ds.main_table.data_source = ( + train_test_split( + ds.main_table.data_source, test_size=test_size, **split_params + ) + ) + train_target_column = None + test_target_column = None + else: + ( + train_ds.main_table.data_source, + test_ds.main_table.data_source, + train_target_column, + test_target_column, + ) = train_test_split( + ds.main_table.data_source, + target_column, + test_size=test_size, + **split_params, + ) + + # Split the secondary tables tables + # Note: The tables are traversed in BFS + todo_relations = [ + relation for relation in ds.relations if relation[0] == ds.main_table.name + ] + while todo_relations: + current_parent_table_name, current_child_table_name, _ = todo_relations.pop(0) + for relation in ds.relations: + parent_table_name, _, _ = relation + if parent_table_name == current_child_table_name: + todo_relations.append(relation) + + for new_ds in (train_ds, test_ds): + origin_child_table = ds.get_table(current_child_table_name) + new_child_table = new_ds.get_table(current_child_table_name) + new_parent_table = new_ds.get_table(current_parent_table_name) + new_parent_key_cols_df = new_parent_table.data_source[new_parent_table.key] + new_child_table.data_source = new_parent_key_cols_df.merge( + origin_child_table.data_source, on=new_parent_table.key + ) + + # Build the return value + # Note: We use `itertools.chain` to avoid pylint false positive about + # unbalanced-tuple-unpacking. This warning appears when calling the function so + # users would be warned. 
To remove when the following issue is fixed: + # https://github.com/pylint-dev/pylint/issues/5671 + if target_column is None: + return_tuple = itertools.chain((train_ds, test_ds)) + else: + return_tuple = itertools.chain( + (train_ds, test_ds, train_target_column, test_target_column) + ) + + return return_tuple + + +def _train_test_split_file_dataset(ds, test_size, output_dir): + domain = ds.create_khiops_dictionary_domain() + secondary_data_paths = domain.extract_data_paths(ds.main_table.name) + additional_data_tables = {} + output_additional_data_tables = { + "train": {}, + "test": {}, + } + # Initialize the split datasets as copies of the original one + split_dss = { + "train": ds.copy(), + "test": ds.copy(), + } + for split, split_ds in split_dss.items(): + split_ds.main_table.data_source = os.path.join( + output_dir, split, f"{split_ds.main_table.name}.txt" + ) + + for data_path in secondary_data_paths: + dictionary = domain.get_dictionary_at_data_path(data_path) + table = ds.get_table(dictionary.name) + additional_data_tables[data_path] = table.data_source + for ( + split, + split_output_additional_data_tables, + ) in output_additional_data_tables.items(): + data_table_path = os.path.join(output_dir, split, f"{table.name}.txt") + split_output_additional_data_tables[data_path] = data_table_path + split_dss[split].get_table(table.name).data_source = data_table_path + + # Construct the split with Khiops by deploying a idempotent model with selection + kh.deploy_model( + domain, + ds.main_table.name, + ds.main_table.data_source, + split_dss["train"].main_table.data_source, + additional_data_tables=additional_data_tables, + output_additional_data_tables=output_additional_data_tables["train"], + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, + sample_percentage=100.0 * (1 - test_size), + sampling_mode="Include sample", + ) + kh.deploy_model( + domain, + ds.main_table.name, + ds.main_table.data_source, + split_dss["test"].main_table.data_source, + additional_data_tables=additional_data_tables, + output_additional_data_tables=output_additional_data_tables["test"], + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, + sample_percentage=100.0 * (1 - test_size), + sampling_mode="Exclude sample", + ) + + # Note: We use `itertools.chain` to avoid pylint false positive about + # unbalanced-tuple-unpacking. This warning appears when calling the function so + # users would be warned. 
To remove when the following issue is fixed: + # https://github.com/pylint-dev/pylint/issues/5671 + return itertools.chain((split_dss["train"], split_dss["test"])) diff --git a/tests/test_helper_functions.py b/tests/test_helper_functions.py index b69d8331..233d720f 100644 --- a/tests/test_helper_functions.py +++ b/tests/test_helper_functions.py @@ -14,7 +14,7 @@ from khiops.core.dictionary import DictionaryDomain from khiops.core.helpers import build_multi_table_dictionary_domain -from khiops.utils.helpers import sort_dataset +from khiops.utils.helpers import sort_dataset, train_test_split_dataset class KhiopsHelperFunctions(unittest.TestCase): @@ -101,26 +101,29 @@ def test_build_multi_table_dictionary_domain(self): def test_sort_dataset_dataframe(self): """Tests that the sort_dataset function works for dataframe datasets""" # Create the fixture dataset - clients_df = pd.read_csv(io.StringIO(UNSORTED_TEST_CLIENTS_CSV)) - calls_df = pd.read_csv(io.StringIO(UNSORTED_TEST_CALLS_CSV)) + clients_df = pd.read_csv(io.StringIO(UNSORTED_CLIENTS_CSV)) + calls_df = pd.read_csv(io.StringIO(UNSORTED_CALLS_CSV)) + connections_df = pd.read_csv(io.StringIO(UNSORTED_CONNECTIONS_CSV)) ds_spec = { "main_table": "clients", "tables": { "clients": (clients_df, ["id"]), "calls": (calls_df, ["id", "call_id"]), + "connections": (connections_df, ["id", "call_id"]), }, - "relations": [("clients", "calls", False)], + "relations": [("clients", "calls", False), ("calls", "connections", False)], } # Call the sort_dataset function sorted_ds_spec = sort_dataset(ds_spec) ref_sorted_table_dfs = { - "clients": pd.read_csv(io.StringIO(TEST_CLIENTS_CSV)), - "calls": pd.read_csv(io.StringIO(TEST_CALLS_CSV)), + "clients": pd.read_csv(io.StringIO(CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(CALLS_CSV)), + "connections": pd.read_csv(io.StringIO(CONNECTIONS_CSV)), } # Check that the structure of the sorted dataset - self._assert_sorted_dataset_keeps_structure(ds_spec, sorted_ds_spec) + self._assert_dataset_keeps_structure(ds_spec, sorted_ds_spec) # Check that the table specs are the equivalent and the tables are sorted for table_name in ds_spec["tables"]: @@ -132,25 +135,36 @@ def test_sort_dataset_dataframe(self): def test_sort_dataset_file(self): """Tests that the sort_dataset function works for file datasets""" - # Create a execution context with temporary files and directories + # Create a execution context for temporary files and directories with contextlib.ExitStack() as exit_stack: # Create temporary files and a temporary directory clients_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) calls_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + connections_csv_file = exit_stack.enter_context( + tempfile.NamedTemporaryFile() + ) tmp_dir = exit_stack.enter_context(tempfile.TemporaryDirectory()) # Create the fixture dataset - clients_csv_file.write(bytes(UNSORTED_TEST_CLIENTS_CSV, encoding="utf8")) - calls_csv_file.write(bytes(UNSORTED_TEST_CALLS_CSV, encoding="utf8")) + clients_csv_file.write(bytes(UNSORTED_CLIENTS_CSV, encoding="ascii")) + calls_csv_file.write(bytes(UNSORTED_CALLS_CSV, encoding="ascii")) + connections_csv_file.write( + bytes(UNSORTED_CONNECTIONS_CSV, encoding="ascii") + ) clients_csv_file.flush() calls_csv_file.flush() + connections_csv_file.flush() ds_spec = { "main_table": "clients", "tables": { "clients": (clients_csv_file.name, ["id"]), "calls": (calls_csv_file.name, ["id", "call_id"]), + "connections": (connections_csv_file.name, ["id", "call_id"]), }, - 
"relations": [("clients", "calls", False)], + "relations": [ + ("clients", "calls", False), + ("calls", "connections", False), + ], "format": (",", True), } @@ -158,10 +172,14 @@ def test_sort_dataset_file(self): sorted_ds_spec = sort_dataset(ds_spec, output_dir=tmp_dir) # Check that the structure of the sorted dataset - self._assert_sorted_dataset_keeps_structure(ds_spec, sorted_ds_spec) + self._assert_dataset_keeps_structure(ds_spec, sorted_ds_spec) # Check that the table specs are the equivalent and the tables are sorted - ref_sorted_tables = {"clients": TEST_CLIENTS_CSV, "calls": TEST_CALLS_CSV} + ref_sorted_tables = { + "clients": CLIENTS_CSV, + "calls": CALLS_CSV, + "connections": CONNECTIONS_CSV, + } for table_name, _ in ds_spec["tables"].items(): # Read the contents of the sorted table to a list of strings sorted_table_spec = sorted_ds_spec["tables"][table_name] @@ -178,22 +196,157 @@ def test_sort_dataset_file(self): # Check that the sorted table is equal to the reference self.assertEqual(ref_sorted_table, sorted_table) - def _assert_sorted_dataset_keeps_structure(self, ds_spec, sorted_ds_spec): - """Asserts that the sorted dataset keeps the structure of the input dataset + def test_traint_test_split_dataset_dataframe(self): + """Tests that the train_test_split_dataset function works for df datasets""" + # Create the fixture dataset + clients_df = pd.read_csv(io.StringIO(CLIENTS_CSV)) + calls_df = pd.read_csv(io.StringIO(CALLS_CSV)) + connections_df = pd.read_csv(io.StringIO(CONNECTIONS_CSV)) + ds_spec = { + "main_table": "clients", + "tables": { + "clients": (clients_df.drop("class", axis=1), ["id"]), + "calls": (calls_df, ["id", "call_id"]), + "connections": (connections_df, ["id", "call_id"]), + }, + "relations": [("clients", "calls", False), ("calls", "connections", False)], + } + y = clients_df["class"] + + # Execute the train/test split function + ds_spec_train, ds_spec_test, y_train, y_test = train_test_split_dataset( + ds_spec, y, test_size=0.5, random_state=31614 + ) + + # Check that the target are the same as the reference + ref_y_train = pd.read_csv(io.StringIO(TRAIN_DF_TARGET_CSV))["class"] + ref_y_test = pd.read_csv(io.StringIO(TEST_DF_TARGET_CSV))["class"] + self._assert_series_equal(ref_y_train, y_train.reset_index()["class"]) + self._assert_series_equal(ref_y_test, y_test.reset_index()["class"]) + + # Check that the dataset spec structure is the same + self._assert_dataset_keeps_structure(ds_spec_train, ds_spec) + self._assert_dataset_keeps_structure(ds_spec_test, ds_spec) + + # Check that the table contents match those of the references + split_ds_specs = { + "train": ds_spec_train, + "test": ds_spec_test, + } + ref_table_dfs = { + "train": { + "clients": pd.read_csv(io.StringIO(TRAIN_DF_CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(TRAIN_DF_CALLS_CSV)), + "connections": pd.read_csv(io.StringIO(TRAIN_DF_CONNECTIONS_CSV)), + }, + "test": { + "clients": pd.read_csv(io.StringIO(TEST_DF_CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(TEST_DF_CALLS_CSV)), + "connections": pd.read_csv(io.StringIO(TEST_DF_CONNECTIONS_CSV)), + }, + } + for split, ref_tables in ref_table_dfs.items(): + for table_name in ds_spec["tables"]: + with self.subTest(split=split, table_name=table_name): + self._assert_frame_equal( + split_ds_specs[split]["tables"][table_name][0].reset_index( + drop=True + ), + ref_tables[table_name].reset_index(drop=True), + ) + + def test_train_test_split_dataset_file(self): + """Tests that the train_test_split_dataset function works for file datasets""" + # 
Create a execution context for temporary files and directories + with contextlib.ExitStack() as exit_stack: + # Create temporary files and a temporary directory + clients_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + calls_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + connections_csv_file = exit_stack.enter_context( + tempfile.NamedTemporaryFile() + ) + tmp_dir = exit_stack.enter_context(tempfile.TemporaryDirectory()) + + # Create the fixture dataset + clients_csv_file.write(bytes(CLIENTS_CSV, encoding="ascii")) + calls_csv_file.write(bytes(CALLS_CSV, encoding="ascii")) + connections_csv_file.write(bytes(CONNECTIONS_CSV, encoding="ascii")) + clients_csv_file.flush() + calls_csv_file.flush() + connections_csv_file.flush() + ds_spec = { + "main_table": "clients", + "tables": { + "clients": (clients_csv_file.name, ["id"]), + "calls": (calls_csv_file.name, ["id", "call_id"]), + "connections": (connections_csv_file.name, ["id", "call_id"]), + }, + "relations": [ + ("clients", "calls", False), + ("calls", "connections", False), + ], + "format": (",", True), + } + + # Call the train_test_split_dataset function + train_ds_spec, test_ds_spec = train_test_split_dataset( + ds_spec, test_size=0.5, output_dir=tmp_dir + ) + split_ds_specs = {"train": train_ds_spec, "test": test_ds_spec} + + # Check that the structure of the splitted datasets + self._assert_dataset_keeps_structure(ds_spec, train_ds_spec) + self._assert_dataset_keeps_structure(ds_spec, test_ds_spec) + + # Check that the table specs are the equivalent and the tables are sorted + ref_split_tables = { + "train": { + "clients": TRAIN_FILE_CLIENTS_CSV, + "calls": TRAIN_FILE_CALLS_CSV, + "connections": TRAIN_FILE_CONNECTIONS_CSV, + }, + "test": { + "clients": TEST_FILE_CLIENTS_CSV, + "calls": TEST_FILE_CALLS_CSV, + "connections": TEST_FILE_CONNECTIONS_CSV, + }, + } + for split, split_ds_spec in split_ds_specs.items(): + for table_name, _ in ds_spec["tables"].items(): + # Read the contents of the splitted table to a list of strings + split_table_spec = split_ds_spec["tables"][table_name] + split_table_file = exit_stack.enter_context( + open(split_table_spec[0], encoding="ascii") + ) + split_table = split_table_file.readlines() + + # Transform the reference table string to a list of strings + ref_split_table = ref_split_tables[split][table_name].splitlines( + keepends=True + ) + + # Check that the sorted table is equal to the reference + self.assertEqual(split_table, ref_split_table) + + def _assert_dataset_keeps_structure(self, ds_spec, ref_ds_spec): + """Asserts that the input dataset has the same structure as the reference It does not check the contents of the tables. 
""" # Check that the spec dictionary is the same excluding the tables - self.assertIn("main_table", sorted_ds_spec) - self.assertIn("tables", sorted_ds_spec) - self.assertIn("relations", sorted_ds_spec) - self.assertEqual(ds_spec["main_table"], sorted_ds_spec["main_table"]) - self.assertEqual(ds_spec["relations"], sorted_ds_spec["relations"]) - self.assertEqual(ds_spec["tables"].keys(), sorted_ds_spec["tables"].keys()) + self.assertIn("main_table", ref_ds_spec) + self.assertIn("tables", ref_ds_spec) + self.assertIn("relations", ref_ds_spec) + self.assertEqual(ds_spec["main_table"], ref_ds_spec["main_table"]) + self.assertEqual(ds_spec["relations"], ref_ds_spec["relations"]) + self.assertEqual(ds_spec["tables"].keys(), ref_ds_spec["tables"].keys()) + if "format" in ref_ds_spec: + self.assertIn("format", ds_spec) + self.assertEqual(ds_spec["format"], ref_ds_spec["format"]) # Check that the table keys are equal for table_name, table_spec in ds_spec["tables"].items(): - self.assertEqual(table_spec[1], sorted_ds_spec["tables"][table_name][1]) + self.assertEqual(table_spec[1], ref_ds_spec["tables"][table_name][1]) def _assert_frame_equal(self, ref_df, out_df): """Wrapper for the assert_frame_equal pandas function @@ -210,19 +363,37 @@ def _assert_frame_equal(self, ref_df, out_df): if failure_error is not None: self.fail(failure_error) + def _assert_series_equal(self, ref_series, out_series): + """Wrapper for the assert_frame_equal pandas function + + In case of failure of assert_frame_equal we capture the AssertionError thrown by + it and make a unittest call to fail. This reports the error found by + assert_frame_equal while avoiding a double thrown exception. + """ + failure_error = None + try: + pd.testing.assert_series_equal(ref_series, out_series) + except AssertionError as error: + failure_error = error + if failure_error is not None: + self.fail(failure_error) + # pylint: disable=line-too-long # fmt: off -TEST_CLIENTS_CSV = """ -id,name,phone,email,address,numberrange,time,date -1,Hakeem Wilkinson,1-352-535-7028,at.pede@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024" -10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025" -13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" -4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025" -7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023" + +# Test data + +CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. 
Rd.,2,3:02 PM,"May 1, 2024",1 +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0 +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 """.lstrip() -TEST_CALLS_CSV = """ +CALLS_CSV = """ id,call_id,duration 1,1,38 1,20,29 @@ -235,24 +406,204 @@ def _assert_frame_equal(self, ref_df, out_df): 7,4,339 """.lstrip() -UNSORTED_TEST_CLIENTS_CSV = """ +CONNECTIONS_CSV = """ +id,call_id,connection_ip +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +10,2,170.05.79.41 +10,2,118.45.57.51 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +7,4,16.66.64.13 +7,4,15.13.69.18 +""".lstrip() + +UNSORTED_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0 +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0 +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +""".lstrip() + +UNSORTED_CALLS_CSV = """ +id,call_id,duration +1,1,38 +10,2,7 +13,25,329 +4,2,543 +13,30,8 +13,3,1 +4,14,48 +1,20,29 +7,4,339 +""".lstrip() + +UNSORTED_CONNECTIONS_CSV = """ +id,call_id,connection_ip +13,25,193.23.02.67 +1,1,277.1.56.30 +4,14,16.56.66.16 +13,25,146.74.18.88 +13,25,118.41.87.47 +1,1,147.43.67.35 +4,14,19.30.36.57 +1,20,199.44.70.12 +10,2,170.05.79.41 +1,20,169.51.97.96 +10,2,118.45.57.51 +13,25,161.51.79.60 +13,3,115.45.02.58 +4,14,15.16.40.67 +1,1,164.27.26.50 +7,4,16.66.64.13 +13,30,12.115.90.93 +7,4,15.13.69.18 +4,2,10.189.71.73 +4,2,10.6.76.93 +""".lstrip() + +TRAIN_DF_CLIENTS_CSV = """ id,name,phone,email,address,numberrange,time,date -1,Hakeem Wilkinson,1-352-535-7028,at.pede@outlook.org,247-2921 Elit. 
Rd.,2,3:02 PM,"May 1, 2024" -13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" 7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023" +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" +""".lstrip() + +TRAIN_DF_CALLS_CSV = """ +id,call_id,duration +7,4,339 +13,25,329 +13,3,1 +13,30,8 +""".lstrip() + +TRAIN_DF_TARGET_CSV = """ +class +1 +0 +""".lstrip() + +TRAIN_DF_CONNECTIONS_CSV = """ +id,call_id,connection_ip +7,4,16.66.64.13 +7,4,15.13.69.18 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +""".lstrip() + + +TEST_DF_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date 4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025" 10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025" +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024" """.lstrip() -UNSORTED_TEST_CALLS_CSV = """ +TEST_DF_TARGET_CSV = """ +class +1 +0 +1 +""".lstrip() + + +TEST_DF_CALLS_CSV = """ id,call_id,duration +4,14,48 +4,2,543 +10,2,7 1,1,38 +1,20,29 +""".lstrip() + +TEST_DF_CONNECTIONS_CSV = """ +id,call_id,connection_ip +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +10,2,170.05.79.41 +10,2,118.45.57.51 +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +""".lstrip() + +TRAIN_FILE_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0 +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +""".lstrip() + +TRAIN_FILE_CALLS_CSV = """ +id,call_id,duration 10,2,7 13,25,329 -4,2,543 -13,30,8 13,3,1 +13,30,8 4,14,48 +4,2,543 +""".lstrip() + +TRAIN_FILE_CONNECTIONS_CSV = """ +id,call_id,connection_ip +10,2,170.05.79.41 +10,2,118.45.57.51 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +""".lstrip() + + +TEST_FILE_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. 
Rd.,2,3:02 PM,"May 1, 2024",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 +""".lstrip() + +TEST_FILE_CALLS_CSV = """ +id,call_id,duration +1,1,38 1,20,29 7,4,339 """.lstrip() + +TEST_FILE_CONNECTIONS_CSV = """ +id,call_id,connection_ip +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +7,4,16.66.64.13 +7,4,15.13.69.18 +""".lstrip() From 2d56546176b2ef40c86d80bf02f7d38a95aea3a0 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Tue, 18 Jun 2024 15:51:39 +0200 Subject: [PATCH 10/12] Update documentation --- .pre-commit-config.yaml | 3 + doc/convert_samples.py | 2 +- doc/core/index.rst | 12 +- doc/create-doc | 11 +- doc/internal/index.rst | 24 +- doc/samples/samples.rst | 28 ++ doc/samples/samples_sklearn.rst | 442 ++++++++++--------------- doc/sklearn/index.rst | 4 +- doc/tools/index.rst | 3 +- khiops/core/internals/runner.py | 17 +- khiops/samples/samples.ipynb | 41 +++ khiops/samples/samples.py | 30 ++ khiops/samples/samples_sklearn.ipynb | 472 +++++++++++---------------- khiops/samples/samples_sklearn.py | 469 +++++++++++--------------- khiops/sklearn/estimators.py | 111 ++++--- khiops/utils/dataset.py | 47 ++- khiops/utils/helpers.py | 47 ++- 17 files changed, 831 insertions(+), 932 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62e0fee2..ff2a227c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,13 +33,16 @@ repos: rev: 0.29.0 hooks: - id: check-github-workflows + name: gh-workflows args: [--verbose] - id: check-github-actions + name: gh-actions args: [--verbose] - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: 3.0.0 hooks: - id: shellcheck + name: shellcheck - repo: local hooks: - id: samples-generation diff --git a/doc/convert_samples.py b/doc/convert_samples.py index f1741cf3..7feb6743 100644 --- a/doc/convert_samples.py +++ b/doc/convert_samples.py @@ -67,7 +67,7 @@ def create_rest_page_header(script_name): subtitle += ":py:mod:`khiops.core` module." else: title = "Samples sklearn" - subtitle += ":py:mod:`khiops.sklearn` module." + subtitle += ":py:mod:`khiops.sklearn ` module." 
return ( ":orphan:\n" "\n" diff --git a/doc/core/index.rst b/doc/core/index.rst index 14987e01..74c33bee 100644 --- a/doc/core/index.rst +++ b/doc/core/index.rst @@ -20,9 +20,9 @@ Main Modules :recursive: :nosignatures: - khiops.core.api - khiops.core.dictionary - khiops.core.analysis_results - khiops.core.coclustering_results - khiops.core.exceptions - khiops.core.helpers + api + dictionary + analysis_results + coclustering_results + exceptions + helpers diff --git a/doc/create-doc b/doc/create-doc index 17b2d363..1de0c1ac 100755 --- a/doc/create-doc +++ b/doc/create-doc @@ -90,21 +90,18 @@ fi # Create the coursework materials echo "Creating ZIP files" -(cd "$KHIOPS_TUTORIAL_REPO_DIR" && cp -r data helper_functions.py "../$tutorials_dir") cd "$tutorials_dir" mkdir -p exercises touch exercises/.dummy # Create a dummy so the "exercises" directory is created on unzip -zip "core_tutorials_solutions.zip" Core*.ipynb helper_functions.py data/*/* exercises/.dummy -zip "sklearn_tutorials_solutions.zip" Sklearn*.ipynb helper_functions.py data/*/* exercises/.dummy +zip "core_tutorials_solutions.zip" Core*.ipynb data/*/* exercises/.dummy +zip "sklearn_tutorials_solutions.zip" Sklearn*.ipynb data/*/* exercises/.dummy cd "$KHIOPS_TUTORIAL_REPO_DIR" python create-coursework.py cd coursework mkdir -p exercises touch exercises/.dummy # Create a dummy so the "exercises" directory is created on unzip -zip "../../$tutorials_dir/core_tutorials.zip" \ - Core*.ipynb helper_functions.py data/*/* exercises/.dummy -zip "../../$tutorials_dir/sklearn_tutorials.zip" \ - Sklearn*.ipynb helper_functions.py data/*/* exercises/.dummy +zip "../../$tutorials_dir/core_tutorials.zip" Core*.ipynb data/*/* exercises/.dummy +zip "../../$tutorials_dir/sklearn_tutorials.zip" Sklearn*.ipynb data/*/* exercises/.dummy cd "../.." # Create the documentation with Sphinx diff --git a/doc/internal/index.rst b/doc/internal/index.rst index 63a5a779..db8301e0 100644 --- a/doc/internal/index.rst +++ b/doc/internal/index.rst @@ -3,17 +3,23 @@ Internals These are internal modules with no "data science" functionality. Their documentation is available for completeness. +.. currentmodule:: khiops.utils .. autosummary:: :nosignatures: :toctree: generated - khiops.utils.dataset - khiops.core.internals.common - khiops.core.internals.filesystems - khiops.core.internals.io - khiops.core.internals.runner - khiops.core.internals.scenario - khiops.core.internals.task - khiops.core.internals.types - khiops.core.internals.version + dataset +.. currentmodule:: khiops.core.internals +.. autosummary:: + :nosignatures: + :toctree: generated + + common + filesystems + io + runner + scenario + task + types + version diff --git a/doc/samples/samples.rst b/doc/samples/samples.rst index 4a64ee4a..5f35064b 100644 --- a/doc/samples/samples.rst +++ b/doc/samples/samples.rst @@ -1185,6 +1185,34 @@ Samples output_data_table_path, sort_variables=["AccidentId", "VehicleId"], ) +.. autofunction:: sort_data_tables_mt +.. 
code-block:: python + + # Imports + import os + from khiops.utils.helpers import sort_dataset + + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "sort_data_tables_mt") + + # Build the dataset spec + ds_spec = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_table_path, "AccidentId"), + "Vehicles": (vehicles_table_path, ["AccidentId", "VehicleId"]), + "Users": (users_table_path, ["AccidentId", "VehicleId"]), + "Places": (places_table_path, "AccidentId"), + }, + } + + # Sort the dataset + sort_dataset(ds_spec, output_dir=results_dir) .. autofunction:: extract_keys_from_data_table .. code-block:: python diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 80df3fe4..b2e7fc87 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -5,7 +5,7 @@ Samples sklearn =============== -The code snippets on this page demonstrate the basic use of the :py:mod:`khiops.sklearn` module. +The code snippets on this page demonstrate the basic use of the :py:mod:`khiops.sklearn ` module. Script and Jupyter notebook --------------------------- @@ -152,55 +152,32 @@ Samples import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics - from sklearn.model_selection import train_test_split - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset into pandas dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_train = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main, "AccidentId"), - "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), - }, - } - X_test = { + # Create the dataset spec 
and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_test_main, "AccidentId"), - "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split_dataset( + X, y, test_size=0.3, random_state=1 + ) # Train the classifier (by default it analyzes 100 multi-table features) khc = KhiopsClassifier() @@ -224,6 +201,73 @@ Samples test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") +.. autofunction:: khiops_classifier_multitable_star_file +.. code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics + + # Create output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file") + os.makedirs(results_dir, exist_ok=True) + + # Create the dataset spec + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + X = { + "main_table": "Accidents", + "tables": { + "Accidents": ( + os.path.join(accidents_data_dir, "Accidents.txt"), + "AccidentId", + ), + "Vehicles": ( + os.path.join(accidents_data_dir, "Vehicles.txt"), + ["AccidentId", "VehicleId"], + ), + }, + "format": ("\t", True), + } + + # Split the dataset into train and test + X_train, X_test = train_test_split_dataset( + X, output_dir=os.path.join(results_dir, "split"), test_size=0.3 + ) + + # Create the classifier and fit it + khc = KhiopsClassifier(output_dir=results_dir) + khc.fit(X_train, y="Gravity") + + # Predict the class in addition to the class probabilities on the test dataset + y_test_pred_path = khc.predict(X_test) + y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") + print("Predicted classes (first 10):") + print(y_test_pred["PredictedGravity"].head(10)) + print("---") + + y_test_probas_path = khc.predict_proba(X_test) + y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") + proba_columns = [col for col in y_test_probas if col.startswith("Prob")] + print("Predicted class probabilities (first 10):") + print(y_test_probas[proba_columns].head(10)) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + # Note: For roc_auc_score we have to use the "greatest" label which is "NonLethal" + y_test = pd.read_csv( + X_test["tables"]["Accidents"][0], + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ) + test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) + test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityNonLethal"]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_multitable_snowflake .. 
code-block:: python @@ -232,29 +276,31 @@ Samples import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics # Load the dataset tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) places_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Places.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" ) - # Build the multitable input X - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column + + # Create the dataset spec + # Note: We discard the "Gravity" column from the "Users" table to avoid a target + # leak. This is because the column was used to build the target. X = { "main_table": "Accidents", "tables": { @@ -270,16 +316,22 @@ Samples ], } - # Load the target variable from the AccidentsSummary dataset + # Load the target variable "Gravity" from the "AccidentsSummary" dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series + + # Split into train and test datasets + X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) # Train the classifier (by default it creates 1000 multi-table features) khc = KhiopsClassifier(n_trees=0) - khc.fit(X, y) + khc.fit(X_train, y_train) # Show the feature importance info print(f"Features evaluated: {khc.n_features_evaluated_}") @@ -290,23 +342,23 @@ Samples print("---") # Predict the class on the test dataset - y_pred = khc.predict(X) + y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") - print(y_pred[:10]) + print(y_test_pred[:10]) print("---") - # Predict the class probability on the train dataset - y_probas = khc.predict_proba(X) + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) print(f"Class order: {khc.classes_}") print("Predicted class probabilities (first 10):") - print(y_probas[:10]) + print(y_test_probas[:10]) print("---") - # Evaluate accuracy and auc metrics on the train dataset - train_accuracy = metrics.accuracy_score(y_pred, y) - train_auc = metrics.roc_auc_score(y, y_probas[:, 1]) - print(f"Train accuracy = {train_accuracy}") - print(f"Train auc = {train_auc}") + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test_pred, y_test) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_sparse .. 
code-block:: python @@ -365,20 +417,9 @@ Samples # Imports import os - import pandas as pd import pickle - from khiops import core as kh from khiops.sklearn import KhiopsClassifier - - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop(["Class"], axis=1) - y = iris_df["Class"] - khc = KhiopsClassifier() - khc.fit(X, y) + from sklearn.datasets import load_iris # Create/clean the output directory results_dir = os.path.join("kh_samples", "khiops_classifier_pickle") @@ -388,9 +429,14 @@ Samples else: os.makedirs(results_dir, exist_ok=True) + # Train the model with the Iris dataset + X, y = load_iris(return_X_y=True) + khc = KhiopsClassifier() + khc.fit(X, y) + # Pickle its content to a file - with open(khc_pickle_path, "wb") as khc_pickle_write_file: - pickle.dump(khc, khc_pickle_write_file) + with open(khc_pickle_path, "wb") as khc_pickle_output_file: + pickle.dump(khc, khc_pickle_output_file) # Unpickle it with open(khc_pickle_path, "rb") as khc_pickle_file: @@ -413,22 +459,14 @@ Samples from sklearn import metrics from sklearn.model_selection import train_test_split - # Load the dataset into a pandas dataframe + # Load the "Adult" dataset and set the target to the "age" column adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("age", axis=1) + y = adult_df["age"] # Split the whole dataframe into train and test (40%-60% for speed) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.6, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("age" column) - X_train = adult_train_df.drop("age", axis=1) - X_test = adult_test_df.drop("age", axis=1) - y_train = adult_train_df["age"] - y_test = adult_test_df["age"] + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1) # Create the regressor object khr = KhiopsRegressor() @@ -459,21 +497,14 @@ Samples .. 
code-block:: python # Imports - import os - import pandas as pd - from khiops import core as kh from khiops.sklearn import KhiopsEncoder + from sklearn.datasets import load_iris - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop("Class", axis=1) - y = iris_df["Class"] + # Load the dataset + X, y = load_iris(return_X_y=True) # Create the encoder object - khe = KhiopsEncoder() + khe = KhiopsEncoder(transform_type_numerical="part_label") khe.fit(X, y) # Transform the training dataset @@ -481,7 +512,7 @@ Samples # Print both the original and transformed features print("Original:") - print(X.head(10)) + print(X[:10]) print("---") print("Encoded feature names:") print(khe.feature_names_out_) @@ -497,42 +528,34 @@ Samples from khiops import core as kh from khiops.sklearn import KhiopsEncoder - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset tables into dataframe + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { + # Build the multi-table spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] - # Create the KhiopsEncoder with 10 additional multitable features and fit it + # Create the KhiopsEncoder with 5 multitable features and fit it khe = KhiopsEncoder(n_features=10) - khe.fit(X_dataset, y) + khe.fit(X, y) # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) print("Encoded data:") - print(khe.transform(X_dataset)[:10]) + print(khe.transform(X)[:10]) .. autofunction:: khiops_encoder_multitable_snowflake .. 
code-block:: python @@ -543,33 +566,38 @@ Samples from khiops.sklearn import KhiopsEncoder # Load the tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + places_df = pd.read_csv( + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" + ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) - # Build the multitable input X + # Build the multi-table spec # Note: We discard the "Gravity" field from the "Users" table as it was used to # build the target column X = { "main_table": "Accidents", "tables": { "Accidents": (accidents_df, "AccidentId"), + "Places": (places_df, "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), }, "relations": [ ("Accidents", "Vehicles"), + ("Accidents", "Places", True), ("Vehicles", "Users"), ], } @@ -577,9 +605,12 @@ Samples # Load the target variable from the AccidentsSummary dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder(n_features=10) @@ -612,22 +643,14 @@ Samples from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder - # Load the dataset into a pandas dataframe + # Load the dataset into dataframes adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] - # Split the whole dataframe into train and test (70%-30%) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.3, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("class" column) - X_train = adult_train_df.drop("class", axis=1) - X_test = adult_test_df.drop("class", axis=1) - y_train = adult_train_df["class"] - y_test = adult_test_df["class"] + # Split the dataset into train and test (70%-30%) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # Create the pipeline and fit it. 
Steps: # - The khiops supervised column encoder, generates a full-categorical table @@ -638,8 +661,6 @@ Samples ( "onehot_enc", ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)), - # For sklearn < 1.2, use - # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)), ), ("hgb_clf", HistGradientBoostingClassifier()), ] @@ -674,13 +695,13 @@ Samples from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") - splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_dna_df = pd.read_csv( + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) - X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) + X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1) # Create the KhiopsCoclustering instance khcc = KhiopsCoclustering() @@ -704,9 +725,9 @@ Samples from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) @@ -738,32 +759,24 @@ Samples from sklearn.model_selection import train_test_split # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + X = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() + X_train_ids = X_train["AccidentId"].to_frame() + X_test_ids = X_test["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") @@ -771,16 +784,16 @@ Samples khc = KhiopsClassifier(key="AccidentId") # Train the classifier - khc.fit([X_train_main, 
X_train_secondary], y_train) + khc.fit([X_train, X_train_secondary], y_train) # Predict the class on the test dataset - y_test_pred = khc.predict([X_test_main, X_test_secondary]) + y_test_pred = khc.predict([X_test, X_test_secondary]) print("Predicted classes (first 10):") print(y_test_pred[:10]) print("---") # Predict the class probability on the test dataset - y_test_probas = khc.predict_proba([X_test_main, X_test_secondary]) + y_test_probas = khc.predict_proba([X_test, X_test_secondary]) print("Predicted class probabilities (first 10):") print(y_test_probas[:10]) print("---") @@ -790,102 +803,3 @@ Samples test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") -.. autofunction:: khiops_classifier_multitable_star_file -.. code-block:: python - - # Imports - import os - import pandas as pd - from khiops import core as kh - from khiops.sklearn import KhiopsClassifier - from sklearn import metrics - from sklearn.model_selection import train_test_split - - # Create output directory - results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_file") - if not os.path.exists("kh_samples"): - os.mkdir("kh_samples") - os.mkdir(results_dir) - else: - if not os.path.exists(results_dir): - os.mkdir(results_dir) - - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - X_train_main, X_test_main = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Write the train and test dataset sets to disk - # For the test file we remove the target column from the main table - X_train_main_path = os.path.join(results_dir, "X_train_main.txt") - X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False) - X_train_secondary_path = os.path.join(results_dir, "X_train_secondary.txt") - X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False) - X_test_main_path = os.path.join(results_dir, "X_test_main.txt") - y_test = X_test_main.sort_values("AccidentId")["Gravity"] - X_test_main.drop(columns="Gravity").to_csv( - X_test_main_path, sep="\t", header=True, index=False - ) - X_test_secondary_path = os.path.join(results_dir, "X_test_secondary.txt") - X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False) - - # Define the dictionary of train - X_train_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main_path, "AccidentId"), - "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - X_test_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_test_main_path, "AccidentId"), - "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - - # Create the classifier and 
fit it - khc = KhiopsClassifier(output_dir=results_dir) - khc.fit(X_train_dataset, y="Gravity") - - # Predict the class in addition to the class probabilities on the test dataset - y_test_pred_path = khc.predict(X_test_dataset) - y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") - print("Predicted classes (first 10):") - print(y_test_pred["PredictedGravity"].head(10)) - print("---") - - y_test_probas_path = khc.predict_proba(X_test_dataset) - y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") - proba_columns = [col for col in y_test_probas if col.startswith("Prob")] - print("Predicted class probabilities (first 10):") - print(y_test_probas[proba_columns].head(10)) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) - test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") diff --git a/doc/sklearn/index.rst b/doc/sklearn/index.rst index e5f05932..27e3dcf7 100644 --- a/doc/sklearn/index.rst +++ b/doc/sklearn/index.rst @@ -8,13 +8,13 @@ khiops.sklearn from khiops.sklearn import KhiopsClassifier clf = KhiopsClassifier() -.. currentmodule:: khiops +.. currentmodule:: khiops.sklearn .. autosummary:: :toctree: generated :recursive: :nosignatures: - khiops.sklearn.estimators + estimators Related Docs ------------ diff --git a/doc/tools/index.rst b/doc/tools/index.rst index 4dbdf1f3..162cc3be 100644 --- a/doc/tools/index.rst +++ b/doc/tools/index.rst @@ -7,4 +7,5 @@ These are auxiliary tools for the Khiops Python library. :toctree: generated :nosignatures: - khiops.tools + utils.helpers + tools diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index e15b8f37..d29adfa2 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -72,7 +72,7 @@ def get_dir_status(a_dir): return status -def check_samples_dir(samples_dir): +def _check_samples_dir(samples_dir): # Warn if there are problems with the samples_dir samples_dir_status = get_dir_status(samples_dir) download_msg = ( @@ -295,20 +295,15 @@ def _check_executable(bin_path): def get_linux_distribution_name(): - """Detect Linux distribution name + """Detect the Linux distribution name - Parses the `NAME` variable defined in the `/etc/os-release` or - `/usr/lib/os-release` files and converts it to lowercase. + Parses the ``NAME`` variable defined in the ``/etc/os-release`` or + ``/usr/lib/os-release`` files and converts it to lowercase. 
Returns ------- str Name of the Linux distribution, converted to lowecase - - Raises - ------ - OSError - If neither `/etc/os-release` nor `/usr/lib/os-release` are found """ def get_linux_distribution_from_os_release_file(os_release_file_path): @@ -1530,13 +1525,13 @@ def _tool_path(self, tool_name): def _set_samples_dir(self, samples_dir): """Checks and sets the samples directory""" - check_samples_dir(samples_dir) + _check_samples_dir(samples_dir) super()._set_samples_dir(samples_dir) def _get_samples_dir(self): # Check the samples dir once (the check emmits only warnings) if not self._samples_dir_checked: - check_samples_dir(self._samples_dir) + _check_samples_dir(self._samples_dir) self._samples_dir_checked = True return self._samples_dir diff --git a/khiops/samples/samples.ipynb b/khiops/samples/samples.ipynb index 53b2df7b..bccde760 100644 --- a/khiops/samples/samples.ipynb +++ b/khiops/samples/samples.ipynb @@ -1563,6 +1563,47 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `sort_data_tables_mt()`\n\n", + "Sorts with the dedicated helper a multi-table dataset by the default keys\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "from khiops.utils.helpers import sort_dataset\n", + "\n", + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "users_table_path = os.path.join(accidents_dir, \"Users.txt\")\n", + "places_table_path = os.path.join(accidents_dir, \"Places.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"sort_data_tables_mt\")\n", + "\n", + "# Build the dataset spec\n", + "ds_spec = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (accidents_table_path, \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_table_path, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Users\": (users_table_path, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Places\": (places_table_path, \"AccidentId\"),\n", + " },\n", + "}\n", + "\n", + "# Sort the dataset\n", + "sort_dataset(ds_spec, output_dir=results_dir)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/khiops/samples/samples.py b/khiops/samples/samples.py index 0ca90133..da8b6288 100644 --- a/khiops/samples/samples.py +++ b/khiops/samples/samples.py @@ -1329,6 +1329,35 @@ def sort_data_table_expert(): ) +def sort_data_tables_mt(): + """Sorts with the dedicated helper a multi-table dataset by the default keys""" + # Imports + import os + from khiops.utils.helpers import sort_dataset + + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "sort_data_tables_mt") + + # Build the dataset spec + ds_spec = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_table_path, "AccidentId"), + "Vehicles": (vehicles_table_path, ["AccidentId", "VehicleId"]), + "Users": (users_table_path, ["AccidentId", "VehicleId"]), + "Places": (places_table_path, "AccidentId"), + }, + } + + # Sort the dataset + sort_dataset(ds_spec, 
output_dir=results_dir) + + def extract_keys_from_data_table(): """Extracts the keys from a database @@ -1662,6 +1691,7 @@ def build_deployed_dictionary(): deploy_regressor_for_metrics, sort_data_table, sort_data_table_expert, + sort_data_tables_mt, extract_keys_from_data_table, train_coclustering, simplify_coclustering, diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 2aa029fb..3c94880a 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -164,55 +164,32 @@ "import pandas as pd\n", "from khiops import core as kh\n", "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", "from sklearn import metrics\n", - "from sklearn.model_selection import train_test_split\n", "\n", - "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "# Load the dataset into pandas dataframes\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", - "# Split the root dataframe into train and test\n", - "accidents_train_df, accidents_test_df = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Obtain the main X feature table and the y target vector (\"Class\" column)\n", - "y_train = accidents_train_df[\"Gravity\"]\n", - "y_test = accidents_test_df[\"Gravity\"]\n", - "X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n", - "X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n", - "\n", - "# Load the secondary table of the dataset into a pandas dataframe\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", - "\n", - "# Split the secondary dataframe with the keys of the splitted root dataframe\n", - "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", - "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "\n", - "# Create the dataset multitable specification for the train/test split\n", - "# We specify each table with a name and a tuple (dataframe, key_columns)\n", - "X_train = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_train_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_train_secondary, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - "}\n", - "X_test = {\n", + "# Create the dataset spec and the target\n", + "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (X_test_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_test_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " },\n", "}\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split_dataset(\n", + " X, y, test_size=0.3, random_state=1\n", + ")\n", "\n", "# Train 
the classifier (by default it analyzes 100 multi-table features)\n", "khc = KhiopsClassifier()\n", @@ -238,12 +215,92 @@ "print(f\"Test auc = {test_auc}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_star_file()`\n\n", + "Trains a `.KhiopsClassifier` with a file dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", + "from sklearn import metrics\n", + "\n", + "# Create output directory\n", + "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_star_file\")\n", + "os.makedirs(results_dir, exist_ok=True)\n", + "\n", + "# Create the dataset spec\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "X = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " \"AccidentId\",\n", + " ),\n", + " \"Vehicles\": (\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", + " [\"AccidentId\", \"VehicleId\"],\n", + " ),\n", + " },\n", + " \"format\": (\"\\t\", True),\n", + "}\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test = train_test_split_dataset(\n", + " X, output_dir=os.path.join(results_dir, \"split\"), test_size=0.3\n", + ")\n", + "\n", + "# Create the classifier and fit it\n", + "khc = KhiopsClassifier(output_dir=results_dir)\n", + "khc.fit(X_train, y=\"Gravity\")\n", + "\n", + "# Predict the class in addition to the class probabilities on the test dataset\n", + "y_test_pred_path = khc.predict(X_test)\n", + "y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[\"PredictedGravity\"].head(10))\n", + "print(\"---\")\n", + "\n", + "y_test_probas_path = khc.predict_proba(X_test)\n", + "y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", + "proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[proba_columns].head(10))\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "# Note: For roc_auc_score we have to use the \"greatest\" label which is \"NonLethal\"\n", + "y_test = pd.read_csv(\n", + " X_test[\"tables\"][\"Accidents\"][0],\n", + " usecols=[\"Gravity\"],\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityNonLethal\"])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `khiops_classifier_multitable_snowflake()`\n\n", - "Trains a `.KhiopsClassifier` on a snowflake multi-table dataset\n\n .. note::\n For simplicity we train from the whole dataset. 
To assess the performance one\n usually splits the dataset into train and test subsets.\n\n \n" + "Trains a `.KhiopsClassifier` on a snowflake multi-table dataset\n" ] }, { @@ -257,29 +314,31 @@ "import pandas as pd\n", "from khiops import core as kh\n", "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", "from sklearn import metrics\n", "\n", "# Load the dataset tables into dataframes\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", "users_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", ")\n", "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"),\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", "places_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", ")\n", - "# Build the multitable input X\n", - "# Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", - "# build the target column\n", + "\n", + "# Create the dataset spec\n", + "# Note: We discard the \"Gravity\" column from the \"Users\" table to avoid a target\n", + "# leak. 
This is because the column was used to build the target.\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", @@ -295,16 +354,22 @@ " ],\n", "}\n", "\n", - "# Load the target variable from the AccidentsSummary dataset\n", + "# Load the target variable \"Gravity\" from the \"AccidentsSummary\" dataset\n", "y = pd.read_csv(\n", " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", + " usecols=[\"Gravity\"],\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", - ")[\"Gravity\"]\n", + ").squeeze(\n", + " \"columns\"\n", + ") # squeeze to ensure pandas.Series\n", + "\n", + "# Split into train and test datasets\n", + "X_train, X_test, y_train, y_test = train_test_split_dataset(X, y)\n", "\n", "# Train the classifier (by default it creates 1000 multi-table features)\n", "khc = KhiopsClassifier(n_trees=0)\n", - "khc.fit(X, y)\n", + "khc.fit(X_train, y_train)\n", "\n", "# Show the feature importance info\n", "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", @@ -315,23 +380,23 @@ "print(\"---\")\n", "\n", "# Predict the class on the test dataset\n", - "y_pred = khc.predict(X)\n", + "y_test_pred = khc.predict(X_test)\n", "print(\"Predicted classes (first 10):\")\n", - "print(y_pred[:10])\n", + "print(y_test_pred[:10])\n", "print(\"---\")\n", "\n", - "# Predict the class probability on the train dataset\n", - "y_probas = khc.predict_proba(X)\n", + "# Predict the class probability on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", "print(f\"Class order: {khc.classes_}\")\n", "print(\"Predicted class probabilities (first 10):\")\n", - "print(y_probas[:10])\n", + "print(y_test_probas[:10])\n", "print(\"---\")\n", "\n", - "# Evaluate accuracy and auc metrics on the train dataset\n", - "train_accuracy = metrics.accuracy_score(y_pred, y)\n", - "train_auc = metrics.roc_auc_score(y, y_probas[:, 1])\n", - "print(f\"Train accuracy = {train_accuracy}\")\n", - "print(f\"Train auc = {train_auc}\")" + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test_pred, y_test)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" ] }, { @@ -416,20 +481,9 @@ "source": [ "# Imports\n", "import os\n", - "import pandas as pd\n", "import pickle\n", - "from khiops import core as kh\n", "from khiops.sklearn import KhiopsClassifier\n", - "\n", - "# Load the dataset into a pandas dataframe\n", - "iris_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - "iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", - "\n", - "# Train the model with the whole dataset\n", - "X = iris_df.drop([\"Class\"], axis=1)\n", - "y = iris_df[\"Class\"]\n", - "khc = KhiopsClassifier()\n", - "khc.fit(X, y)\n", + "from sklearn.datasets import load_iris\n", "\n", "# Create/clean the output directory\n", "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_pickle\")\n", @@ -439,9 +493,14 @@ "else:\n", " os.makedirs(results_dir, exist_ok=True)\n", "\n", + "# Train the model with the Iris dataset\n", + "X, y = load_iris(return_X_y=True)\n", + "khc = KhiopsClassifier()\n", + "khc.fit(X, y)\n", + "\n", "# Pickle its content to a file\n", - "with open(khc_pickle_path, \"wb\") as khc_pickle_write_file:\n", - " pickle.dump(khc, khc_pickle_write_file)\n", + "with open(khc_pickle_path, \"wb\") as khc_pickle_output_file:\n", + " pickle.dump(khc, khc_pickle_output_file)\n", "\n", "# Unpickle it\n", 
"with open(khc_pickle_path, \"rb\") as khc_pickle_file:\n", @@ -477,22 +536,14 @@ "from sklearn import metrics\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# Load the dataset into a pandas dataframe\n", + "# Load the \"Adult\" dataset and set the target to the \"age\" column\n", "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "X = adult_df.drop(\"age\", axis=1)\n", + "y = adult_df[\"age\"]\n", "\n", "# Split the whole dataframe into train and test (40%-60% for speed)\n", - "adult_train_df, adult_test_df = train_test_split(\n", - " adult_df, test_size=0.6, random_state=1\n", - ")\n", - "\n", - "# Split the dataset into:\n", - "# - the X feature table\n", - "# - the y target vector (\"age\" column)\n", - "X_train = adult_train_df.drop(\"age\", axis=1)\n", - "X_test = adult_test_df.drop(\"age\", axis=1)\n", - "y_train = adult_train_df[\"age\"]\n", - "y_test = adult_test_df[\"age\"]\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)\n", "\n", "# Create the regressor object\n", "khr = KhiopsRegressor()\n", @@ -536,21 +587,14 @@ "outputs": [], "source": [ "# Imports\n", - "import os\n", - "import pandas as pd\n", - "from khiops import core as kh\n", "from khiops.sklearn import KhiopsEncoder\n", + "from sklearn.datasets import load_iris\n", "\n", - "# Load the dataset into a pandas dataframe\n", - "iris_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - "iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", - "\n", - "# Train the model with the whole dataset\n", - "X = iris_df.drop(\"Class\", axis=1)\n", - "y = iris_df[\"Class\"]\n", + "# Load the dataset\n", + "X, y = load_iris(return_X_y=True)\n", "\n", "# Create the encoder object\n", - "khe = KhiopsEncoder()\n", + "khe = KhiopsEncoder(transform_type_numerical=\"part_label\")\n", "khe.fit(X, y)\n", "\n", "# Transform the training dataset\n", @@ -558,7 +602,7 @@ "\n", "# Print both the original and transformed features\n", "print(\"Original:\")\n", - "print(X.head(10))\n", + "print(X[:10])\n", "print(\"---\")\n", "print(\"Encoded feature names:\")\n", "print(khe.feature_names_out_)\n", @@ -587,42 +631,34 @@ "from khiops import core as kh\n", "from khiops.sklearn import KhiopsEncoder\n", "\n", - "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "# Load the dataset tables into dataframe\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", - "# Obtain the root X feature table and the y target vector (\"Class\" column)\n", - "X_main = accidents_df.drop(\"Gravity\", axis=1)\n", - "y = accidents_df[\"Gravity\"]\n", - "\n", - "# Load the secondary table of the dataset into a pandas dataframe\n", - "X_secondary = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", - "\n", - "# Create the dataset multitable specification for the train/test split\n", - "# We specify each table with a name and a tuple (dataframe, key_columns)\n", - "X_dataset = {\n", + "# Build the multi-table spec and 
the target\n", + "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (X_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " },\n", "}\n", + "y = accidents_df[\"Gravity\"]\n", "\n", - "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", + "# Create the KhiopsEncoder with 5 multitable features and fit it\n", "khe = KhiopsEncoder(n_features=10)\n", - "khe.fit(X_dataset, y)\n", + "khe.fit(X, y)\n", "\n", "# Transform the train dataset\n", "print(\"Encoded feature names:\")\n", "print(khe.feature_names_out_)\n", "print(\"Encoded data:\")\n", - "print(khe.transform(X_dataset)[:10])" + "print(khe.transform(X)[:10])" ] }, { @@ -630,7 +666,7 @@ "metadata": {}, "source": [ "### `khiops_encoder_multitable_snowflake()`\n\n", - "Trains a `.KhiopsEncoder` on a snowflake multi-table dataset\n\n .. note::\n For simplicity we train from the whole dataset. To assess the performance\n one usually splits the dataset into train and test subsets.\n \n" + "Trains a `.KhiopsEncoder` on a snowflake multi-table dataset\n" ] }, { @@ -646,33 +682,38 @@ "from khiops.sklearn import KhiopsEncoder\n", "\n", "# Load the tables into dataframes\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "places_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + ")\n", "users_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", ")\n", "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"),\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", "\n", - "# Build the multitable input X\n", + "# Build the multi-table spec\n", "# Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", "# build the target column\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", " \"Accidents\": (accidents_df, \"AccidentId\"),\n", + " \"Places\": (places_df, \"AccidentId\"),\n", " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", " },\n", " \"relations\": [\n", " (\"Accidents\", \"Vehicles\"),\n", + " (\"Accidents\", \"Places\", True),\n", " (\"Vehicles\", \"Users\"),\n", " ],\n", "}\n", @@ -680,9 +721,12 @@ "# Load the target variable from the AccidentsSummary dataset\n", "y = pd.read_csv(\n", " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", + " usecols=[\"Gravity\"],\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", - ")[\"Gravity\"]\n", + ").squeeze(\n", + " \"columns\"\n", + ") # squeeze to ensure pandas.Series\n", "\n", "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", "khe = KhiopsEncoder(n_features=10)\n", @@ -707,7 +751,7 @@ "metadata": {}, 
"source": [ "### `khiops_encoder_pipeline_with_hgbc()`\n\n", - "Chains a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`\n" + "Uses a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`\n" ] }, { @@ -728,22 +772,14 @@ "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", - "# Load the dataset into a pandas dataframe\n", + "# Load the dataset into dataframes\n", "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "X = adult_df.drop(\"class\", axis=1)\n", + "y = adult_df[\"class\"]\n", "\n", - "# Split the whole dataframe into train and test (70%-30%)\n", - "adult_train_df, adult_test_df = train_test_split(\n", - " adult_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Split the dataset into:\n", - "# - the X feature table\n", - "# - the y target vector (\"class\" column)\n", - "X_train = adult_train_df.drop(\"class\", axis=1)\n", - "X_test = adult_test_df.drop(\"class\", axis=1)\n", - "y_train = adult_train_df[\"class\"]\n", - "y_test = adult_test_df[\"class\"]\n", + "# Split the dataset into train and test (70%-30%)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", "\n", "# Create the pipeline and fit it. Steps:\n", "# - The khiops supervised column encoder, generates a full-categorical table\n", @@ -754,8 +790,6 @@ " (\n", " \"onehot_enc\",\n", " ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)),\n", - " # For sklearn < 1.2, use\n", - " # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)),\n", " ),\n", " (\"hgb_clf\", HistGradientBoostingClassifier()),\n", "]\n", @@ -803,13 +837,13 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Load the secondary table of the dataset into a pandas dataframe\n", - "splice_dataset_path = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - "splice_dna_X = pd.read_csv(\n", - " os.path.join(splice_dataset_path, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", + "splice_data_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "splice_dna_df = pd.read_csv(\n", + " os.path.join(splice_data_dir, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", ")\n", "\n", "# Train with only 70% of data (for speed in this example)\n", - "X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1)\n", + "X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1)\n", "\n", "# Create the KhiopsCoclustering instance\n", "khcc = KhiopsCoclustering()\n", @@ -846,9 +880,9 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Load the secondary table of the dataset into a pandas dataframe\n", - "splice_dataset_path = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "splice_data_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", "splice_dna_X = pd.read_csv(\n", - " os.path.join(splice_dataset_path, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", + " os.path.join(splice_data_dir, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", ")\n", "\n", "# Train with only 70% of data (for speed in this example)\n", @@ -893,32 +927,24 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", "accidents_df = 
pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "X = accidents_df.drop(\"Gravity\", axis=1)\n", + "y = accidents_df[\"Gravity\"]\n", "\n", - "# Split the root dataframe into train and test\n", - "accidents_train_df, accidents_test_df = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Obtain the main X feature table and the y target vector (\"Class\" column)\n", - "y_train = accidents_train_df[\"Gravity\"]\n", - "y_test = accidents_test_df[\"Gravity\"]\n", - "X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n", - "X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n", + "# Split the dataset into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", "\n", "# Load the secondary table of the dataset into a pandas dataframe\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", "# Split the secondary dataframe with the keys of the splitted root dataframe\n", - "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", + "X_train_ids = X_train[\"AccidentId\"].to_frame()\n", + "X_test_ids = X_test[\"AccidentId\"].to_frame()\n", "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", "\n", @@ -926,16 +952,16 @@ "khc = KhiopsClassifier(key=\"AccidentId\")\n", "\n", "# Train the classifier\n", - "khc.fit([X_train_main, X_train_secondary], y_train)\n", + "khc.fit([X_train, X_train_secondary], y_train)\n", "\n", "# Predict the class on the test dataset\n", - "y_test_pred = khc.predict([X_test_main, X_test_secondary])\n", + "y_test_pred = khc.predict([X_test, X_test_secondary])\n", "print(\"Predicted classes (first 10):\")\n", "print(y_test_pred[:10])\n", "print(\"---\")\n", "\n", "# Predict the class probability on the test dataset\n", - "y_test_probas = khc.predict_proba([X_test_main, X_test_secondary])\n", + "y_test_probas = khc.predict_proba([X_test, X_test_secondary])\n", "print(\"Predicted class probabilities (first 10):\")\n", "print(y_test_probas[:10])\n", "print(\"---\")\n", @@ -946,118 +972,6 @@ "print(f\"Test accuracy = {test_accuracy}\")\n", "print(f\"Test auc = {test_auc}\")" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `khiops_classifier_multitable_star_file()`\n\n", - "Trains a `.KhiopsClassifier` with a file path based dataset\n\n .. 
warning::\n This dataset input method is **Deprecated** and will be removed in Khiops 11.\n If you need to handle large datasets that do not easily fit into memory then you\n may use the `~.khiops.core` API directly, which allows to specify file paths\n directly.\n \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports\n", - "import os\n", - "import pandas as pd\n", - "from khiops import core as kh\n", - "from khiops.sklearn import KhiopsClassifier\n", - "from sklearn import metrics\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Create output directory\n", - "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_file\")\n", - "if not os.path.exists(\"kh_samples\"):\n", - " os.mkdir(\"kh_samples\")\n", - " os.mkdir(results_dir)\n", - "else:\n", - " if not os.path.exists(results_dir):\n", - " os.mkdir(results_dir)\n", - "\n", - "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ")\n", - "\n", - "# Split the root dataframe into train and test\n", - "X_train_main, X_test_main = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Load the secondary table of the dataset into a pandas dataframe\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", - "\n", - "# Split the secondary dataframe with the keys of the splitted root dataframe\n", - "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", - "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "\n", - "# Write the train and test dataset sets to disk\n", - "# For the test file we remove the target column from the main table\n", - "X_train_main_path = os.path.join(results_dir, \"X_train_main.txt\")\n", - "X_train_main.to_csv(X_train_main_path, sep=\"\\t\", header=True, index=False)\n", - "X_train_secondary_path = os.path.join(results_dir, \"X_train_secondary.txt\")\n", - "X_train_secondary.to_csv(X_train_secondary_path, sep=\"\\t\", header=True, index=False)\n", - "X_test_main_path = os.path.join(results_dir, \"X_test_main.txt\")\n", - "y_test = X_test_main.sort_values(\"AccidentId\")[\"Gravity\"]\n", - "X_test_main.drop(columns=\"Gravity\").to_csv(\n", - " X_test_main_path, sep=\"\\t\", header=True, index=False\n", - ")\n", - "X_test_secondary_path = os.path.join(results_dir, \"X_test_secondary.txt\")\n", - "X_test_secondary.to_csv(X_test_secondary_path, sep=\"\\t\", header=True, index=False)\n", - "\n", - "# Define the dictionary of train\n", - "X_train_dataset = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_train_main_path, \"AccidentId\"),\n", - " \"Vehicles\": (X_train_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " \"format\": (\"\\t\", True),\n", - "}\n", - "X_test_dataset = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_test_main_path, \"AccidentId\"),\n", - " \"Vehicles\": (X_test_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " \"format\": (\"\\t\", True),\n", - "}\n", 
- "\n", - "# Create the classifier and fit it\n", - "khc = KhiopsClassifier(output_dir=results_dir)\n", - "khc.fit(X_train_dataset, y=\"Gravity\")\n", - "\n", - "# Predict the class in addition to the class probabilities on the test dataset\n", - "y_test_pred_path = khc.predict(X_test_dataset)\n", - "y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", - "print(\"Predicted classes (first 10):\")\n", - "print(y_test_pred[\"PredictedGravity\"].head(10))\n", - "print(\"---\")\n", - "\n", - "y_test_probas_path = khc.predict_proba(X_test_dataset)\n", - "y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", - "proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", - "print(\"Predicted class probabilities (first 10):\")\n", - "print(y_test_probas[proba_columns].head(10))\n", - "print(\"---\")\n", - "\n", - "# Evaluate accuracy and auc metrics on the test dataset\n", - "test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", - "test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityLethal\"])\n", - "print(f\"Test accuracy = {test_accuracy}\")\n", - "print(f\"Test auc = {test_auc}\")" - ] } ], "metadata": {}, diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 976db54c..8b22fb5e 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -16,7 +16,7 @@ # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: -# pylint --disable=all --enable=invalid-names estimators.py +# pylint --disable=all --enable=invalid-names samples_sklearn.py # pylint: disable=invalid-name # For ease of use the functions in this module contain (repeated) import statements @@ -145,55 +145,34 @@ def khiops_classifier_multitable_star(): import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics - from sklearn.model_selection import train_test_split - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset into pandas dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) - - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - - # Load the secondary table of the dataset into a pandas dataframe vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Create the dataset multitable specification for 
the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_train = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main, "AccidentId"), - "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), - }, - } - X_test = { + # Create the dataset spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_test_main, "AccidentId"), - "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split_dataset( + X, y, test_size=0.3, random_state=1 + ) # Train the classifier (by default it analyzes 100 multi-table features) khc = KhiopsClassifier() @@ -219,42 +198,106 @@ def khiops_classifier_multitable_star(): print(f"Test auc = {test_auc}") -def khiops_classifier_multitable_snowflake(): - """Trains a `.KhiopsClassifier` on a snowflake multi-table dataset +def khiops_classifier_multitable_star_file(): + """Trains a `.KhiopsClassifier` with a file dataset""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics - .. note:: - For simplicity we train from the whole dataset. To assess the performance one - usually splits the dataset into train and test subsets. + # Create output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file") + os.makedirs(results_dir, exist_ok=True) - """ + # Create the dataset spec + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + X = { + "main_table": "Accidents", + "tables": { + "Accidents": ( + os.path.join(accidents_data_dir, "Accidents.txt"), + "AccidentId", + ), + "Vehicles": ( + os.path.join(accidents_data_dir, "Vehicles.txt"), + ["AccidentId", "VehicleId"], + ), + }, + "format": ("\t", True), + } + + # Split the dataset into train and test + X_train, X_test = train_test_split_dataset( + X, output_dir=os.path.join(results_dir, "split"), test_size=0.3 + ) + + # Create the classifier and fit it + khc = KhiopsClassifier(output_dir=results_dir) + khc.fit(X_train, y="Gravity") + + # Predict the class in addition to the class probabilities on the test dataset + y_test_pred_path = khc.predict(X_test) + y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") + print("Predicted classes (first 10):") + print(y_test_pred["PredictedGravity"].head(10)) + print("---") + + y_test_probas_path = khc.predict_proba(X_test) + y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") + proba_columns = [col for col in y_test_probas if col.startswith("Prob")] + print("Predicted class probabilities (first 10):") + print(y_test_probas[proba_columns].head(10)) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + # Note: For roc_auc_score we have to use the "greatest" label which is "NonLethal" + y_test = pd.read_csv( + X_test["tables"]["Accidents"][0], + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ) + test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) + test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityNonLethal"]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") + + +def khiops_classifier_multitable_snowflake(): + """Trains 
a `.KhiopsClassifier` on a snowflake multi-table dataset""" # Imports import os import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics # Load the dataset tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) places_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Places.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" ) - # Build the multitable input X - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column + + # Create the dataset spec + # Note: We discard the "Gravity" column from the "Users" table to avoid a target + # leak. This is because the column was used to build the target. X = { "main_table": "Accidents", "tables": { @@ -270,16 +313,22 @@ def khiops_classifier_multitable_snowflake(): ], } - # Load the target variable from the AccidentsSummary dataset + # Load the target variable "Gravity" from the "AccidentsSummary" dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series + + # Split into train and test datasets + X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) # Train the classifier (by default it creates 1000 multi-table features) khc = KhiopsClassifier(n_trees=0) - khc.fit(X, y) + khc.fit(X_train, y_train) # Show the feature importance info print(f"Features evaluated: {khc.n_features_evaluated_}") @@ -290,23 +339,23 @@ def khiops_classifier_multitable_snowflake(): print("---") # Predict the class on the test dataset - y_pred = khc.predict(X) + y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") - print(y_pred[:10]) + print(y_test_pred[:10]) print("---") - # Predict the class probability on the train dataset - y_probas = khc.predict_proba(X) + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) print(f"Class order: {khc.classes_}") print("Predicted class probabilities (first 10):") - print(y_probas[:10]) + print(y_test_probas[:10]) print("---") - # Evaluate accuracy and auc metrics on the train dataset - train_accuracy = metrics.accuracy_score(y_pred, y) - train_auc = metrics.roc_auc_score(y, y_probas[:, 1]) - print(f"Train accuracy = {train_accuracy}") - print(f"Train auc = {train_auc}") + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test_pred, y_test) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") def khiops_classifier_sparse(): @@ -367,20 +416,9 @@ def khiops_classifier_pickle(): """Shows the serialization 
and deserialization of a `.KhiopsClassifier`""" # Imports import os - import pandas as pd import pickle - from khiops import core as kh from khiops.sklearn import KhiopsClassifier - - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop(["Class"], axis=1) - y = iris_df["Class"] - khc = KhiopsClassifier() - khc.fit(X, y) + from sklearn.datasets import load_iris # Create/clean the output directory results_dir = os.path.join("kh_samples", "khiops_classifier_pickle") @@ -390,9 +428,14 @@ def khiops_classifier_pickle(): else: os.makedirs(results_dir, exist_ok=True) + # Train the model with the Iris dataset + X, y = load_iris(return_X_y=True) + khc = KhiopsClassifier() + khc.fit(X, y) + # Pickle its content to a file - with open(khc_pickle_path, "wb") as khc_pickle_write_file: - pickle.dump(khc, khc_pickle_write_file) + with open(khc_pickle_path, "wb") as khc_pickle_output_file: + pickle.dump(khc, khc_pickle_output_file) # Unpickle it with open(khc_pickle_path, "rb") as khc_pickle_file: @@ -416,23 +459,17 @@ def khiops_regressor(): from sklearn import metrics from sklearn.model_selection import train_test_split - # Load the dataset into a pandas dataframe + # Load the "Adult" dataset and set the target to the "age" column adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("age", axis=1) + y = adult_df["age"] # Split the whole dataframe into train and test (40%-60% for speed) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.6, random_state=1 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.1, random_state=1 ) - # Split the dataset into: - # - the X feature table - # - the y target vector ("age" column) - X_train = adult_train_df.drop("age", axis=1) - X_test = adult_test_df.drop("age", axis=1) - y_train = adult_train_df["age"] - y_test = adult_test_df["age"] - # Create the regressor object khr = KhiopsRegressor() @@ -472,21 +509,14 @@ def khiops_encoder(): usually splits the dataset into train and test subsets. 
""" # Imports - import os - import pandas as pd - from khiops import core as kh from khiops.sklearn import KhiopsEncoder + from sklearn.datasets import load_iris - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop("Class", axis=1) - y = iris_df["Class"] + # Load the dataset + X, y = load_iris(return_X_y=True) # Create the encoder object - khe = KhiopsEncoder() + khe = KhiopsEncoder(transform_type_numerical="part_label") khe.fit(X, y) # Transform the training dataset @@ -494,7 +524,7 @@ def khiops_encoder(): # Print both the original and transformed features print("Original:") - print(X.head(10)) + print(X[:10]) print("---") print("Encoded feature names:") print(khe.feature_names_out_) @@ -511,51 +541,40 @@ def khiops_encoder_multitable_star(): from khiops import core as kh from khiops.sklearn import KhiopsEncoder - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset tables into dataframe + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) - - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + vehicles_df = pd.read_csv( + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { + # Build the multi-table spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] - # Create the KhiopsEncoder with 10 additional multitable features and fit it + # Create the KhiopsEncoder with 5 multitable features and fit it khe = KhiopsEncoder(n_features=10) - khe.fit(X_dataset, y) + khe.fit(X, y) # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) print("Encoded data:") - print(khe.transform(X_dataset)[:10]) + print(khe.transform(X)[:10]) def khiops_encoder_multitable_snowflake(): - """Trains a `.KhiopsEncoder` on a snowflake multi-table dataset - - .. note:: - For simplicity we train from the whole dataset. To assess the performance - one usually splits the dataset into train and test subsets. 
- """ + """Trains a `.KhiopsEncoder` on a snowflake multi-table dataset""" # Imports import os import pandas as pd @@ -563,33 +582,38 @@ def khiops_encoder_multitable_snowflake(): from khiops.sklearn import KhiopsEncoder # Load the tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + places_df = pd.read_csv( + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" + ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) - # Build the multitable input X + # Build the multi-table spec # Note: We discard the "Gravity" field from the "Users" table as it was used to # build the target column X = { "main_table": "Accidents", "tables": { "Accidents": (accidents_df, "AccidentId"), + "Places": (places_df, "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), }, "relations": [ ("Accidents", "Vehicles"), + ("Accidents", "Places", True), ("Vehicles", "Users"), ], } @@ -597,9 +621,12 @@ def khiops_encoder_multitable_snowflake(): # Load the target variable from the AccidentsSummary dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder(n_features=10) @@ -622,7 +649,7 @@ def khiops_encoder_multitable_snowflake(): # Disable line too long just to have a title linking the sklearn documentation # pylint: disable=line-too-long def khiops_encoder_pipeline_with_hgbc(): - """Chains a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`""" + """Uses a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`""" # Imports import os import pandas as pd @@ -635,23 +662,17 @@ def khiops_encoder_pipeline_with_hgbc(): from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder - # Load the dataset into a pandas dataframe + # Load the dataset into dataframes adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] - # Split the whole dataframe into train and test (70%-30%) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.3, random_state=1 + # Split the dataset into train and test (70%-30%) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=1 ) - # Split the dataset into: - # - the X feature table - # - the y target vector ("class" column) - X_train = adult_train_df.drop("class", axis=1) - X_test = adult_test_df.drop("class", axis=1) - y_train = adult_train_df["class"] - y_test = adult_test_df["class"] - # Create the pipeline and fit it. 
Steps: # - The khiops supervised column encoder, generates a full-categorical table # - One hot encoder in all columns @@ -661,8 +682,6 @@ def khiops_encoder_pipeline_with_hgbc(): ( "onehot_enc", ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)), - # For sklearn < 1.2, use - # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)), ), ("hgb_clf", HistGradientBoostingClassifier()), ] @@ -701,13 +720,13 @@ def khiops_coclustering(): from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") - splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_dna_df = pd.read_csv( + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) - X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) + X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1) # Create the KhiopsCoclustering instance khcc = KhiopsCoclustering() @@ -732,9 +751,9 @@ def khiops_coclustering_simplify(): from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) @@ -776,32 +795,28 @@ def khiops_classifier_multitable_list(): from sklearn.model_selection import train_test_split # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + X = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=1 ) - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - # Load the secondary table of the dataset into a pandas dataframe vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() + X_train_ids = X_train["AccidentId"].to_frame() + X_test_ids = X_test["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") @@ -809,16 +824,16 
@@ def khiops_classifier_multitable_list(): khc = KhiopsClassifier(key="AccidentId") # Train the classifier - khc.fit([X_train_main, X_train_secondary], y_train) + khc.fit([X_train, X_train_secondary], y_train) # Predict the class on the test dataset - y_test_pred = khc.predict([X_test_main, X_test_secondary]) + y_test_pred = khc.predict([X_test, X_test_secondary]) print("Predicted classes (first 10):") print(y_test_pred[:10]) print("---") # Predict the class probability on the test dataset - y_test_probas = khc.predict_proba([X_test_main, X_test_secondary]) + y_test_probas = khc.predict_proba([X_test, X_test_secondary]) print("Predicted class probabilities (first 10):") print(y_test_probas[:10]) print("---") @@ -830,117 +845,11 @@ def khiops_classifier_multitable_list(): print(f"Test auc = {test_auc}") -def khiops_classifier_multitable_star_file(): - """Trains a `.KhiopsClassifier` with a file path based dataset - - .. warning:: - This dataset input method is **Deprecated** and will be removed in Khiops 11. - If you need to handle large datasets that do not easily fit into memory then you - may use the `~.khiops.core` API directly, which allows to specify file paths - directly. - """ - # Imports - import os - import pandas as pd - from khiops import core as kh - from khiops.sklearn import KhiopsClassifier - from sklearn import metrics - from sklearn.model_selection import train_test_split - - # Create output directory - results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_file") - if not os.path.exists("kh_samples"): - os.mkdir("kh_samples") - os.mkdir(results_dir) - else: - if not os.path.exists(results_dir): - os.mkdir(results_dir) - - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - X_train_main, X_test_main = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Write the train and test dataset sets to disk - # For the test file we remove the target column from the main table - X_train_main_path = os.path.join(results_dir, "X_train_main.txt") - X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False) - X_train_secondary_path = os.path.join(results_dir, "X_train_secondary.txt") - X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False) - X_test_main_path = os.path.join(results_dir, "X_test_main.txt") - y_test = X_test_main.sort_values("AccidentId")["Gravity"] - X_test_main.drop(columns="Gravity").to_csv( - X_test_main_path, sep="\t", header=True, index=False - ) - X_test_secondary_path = os.path.join(results_dir, "X_test_secondary.txt") - X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False) - - # Define the dictionary of train - X_train_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main_path, 
"AccidentId"), - "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - X_test_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_test_main_path, "AccidentId"), - "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - - # Create the classifier and fit it - khc = KhiopsClassifier(output_dir=results_dir) - khc.fit(X_train_dataset, y="Gravity") - - # Predict the class in addition to the class probabilities on the test dataset - y_test_pred_path = khc.predict(X_test_dataset) - y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") - print("Predicted classes (first 10):") - print(y_test_pred["PredictedGravity"].head(10)) - print("---") - - y_test_probas_path = khc.predict_proba(X_test_dataset) - y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") - proba_columns = [col for col in y_test_probas if col.startswith("Prob")] - print("Predicted class probabilities (first 10):") - print(y_test_probas[proba_columns].head(10)) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) - test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") - - exported_samples = [ khiops_classifier, khiops_classifier_multiclass, khiops_classifier_multitable_star, + khiops_classifier_multitable_star_file, khiops_classifier_multitable_snowflake, khiops_classifier_sparse, khiops_classifier_pickle, @@ -952,15 +861,13 @@ def khiops_classifier_multitable_star_file(): khiops_coclustering, khiops_coclustering_simplify, khiops_classifier_multitable_list, - khiops_classifier_multitable_star_file, ] def execute_samples(args): """Executes all non-interactive samples""" # Create the results directory if it does not exist - if not os.path.isdir("./kh_samples"): - os.mkdir("./kh_samples") + os.makedirs("./kh_samples", exist_ok=True) # Set the user-defined samples dir if any if args.samples_dir is not None: @@ -981,7 +888,7 @@ def execute_samples(args): print(f"{len(execution_samples)} sample(s) to execute\n") for sample in execution_samples: - print(">>> Executing samples_sklearn." + sample.__name__) + print(f">>> Executing samples_sklearn.{sample.__name__}") sample.__call__() print("> Done\n") diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 3d899c3c..27892bcb 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -477,7 +477,8 @@ def _transform_deploy_model( X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -743,7 +744,8 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). 
- list: A sequence of dataframes or paths, or pairs path-separator. The @@ -1178,7 +1180,8 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -1190,9 +1193,6 @@ def predict(self, X): `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. - - *Deprecated return values* (will be removed in Khiops 11): str for - file based dataset specification. """ # Create temporary directory computation_dir = self._create_computation_dir("predict") @@ -1388,19 +1388,21 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values. + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + **Deprecated input types** (will be removed in Khiops 11): - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -1946,19 +1948,21 @@ def fit(self, X, y, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + **Deprecated input types** (will be removed in Khiops 11): - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -2052,7 +2056,8 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. 
Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -2113,7 +2118,8 @@ def predict_proba(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -2331,19 +2337,22 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + **Deprecated input types** (will be removed in Khiops 11): + + - str: A path to a data table file for file-based ``dict`` dataset + specifications. - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. Returns ------- self : `KhiopsRegressor` @@ -2411,7 +2420,8 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -2420,14 +2430,16 @@ def predict(self, X): Returns ------- - `ndarray ` + `numpy.ndarray` or str + An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. The key columns are added for - multi-table tasks. - - *Deprecated return values* (will be removed in Khiops 11): str for - file based dataset specification. + multi-table tasks. The array is in the form of: + - `numpy.ndarray` if X is :external:term:`array-like`, or dataset spec + containing `pandas.DataFrame` table. + - str (a path for the file containing the array) if X is a dataset spec + containing file-path tables. """ # Call the parent's method y_pred = super().predict(X) @@ -2666,19 +2678,21 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. 
Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + **Deprecated input types** (will be removed in Khiops 11): - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -2739,7 +2753,8 @@ def transform(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -2751,9 +2766,6 @@ def transform(self, X): `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. - - *Deprecated return values* (will be removed in Khiops 11): str for - file based dataset specification. """ # Create temporary directory computation_dir = self._create_computation_dir("transform") @@ -2801,19 +2813,22 @@ def fit_transform(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + secondary ones joined to the main table using ``key`` estimator + parameter. y : :external:term:`array-like` of shape (n_samples,) - :external:term:`array-like` object containing the target values. + The target values. + + **Deprecated input types** (will be removed in Khiops 11): - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index b5465a4a..66497dd7 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -37,6 +37,21 @@ def check_dataset_spec(ds_spec): + """Checks that a dataset spec is valid + + Parameters + ---------- + ds_spec : dict + A specification of a multi-table dataset (see :doc:`/multi_table_primer`). 
+ + Raises + ------ + TypeError + If there are objects of the spec with invalid type. + ValueError + If there are objects of the spec with invalid values. + """ + # Check the "tables" field if "tables" not in ds_spec: raise ValueError("'tables' entry missing from dataset dict spec") @@ -47,18 +62,18 @@ def check_dataset_spec(ds_spec): if len(ds_spec["tables"]) == 0: raise ValueError("'tables' dictionary cannot be empty") for table_name, table_entry in ds_spec["tables"].items(): - check_table_entry(table_name, table_entry) + _check_table_entry(table_name, table_entry) # Multi-table specific table checks if len(ds_spec["tables"]) > 1: - check_multitable_spec(ds_spec) + _check_multitable_spec(ds_spec) # Check the 'format' field if "format" in ds_spec: - check_format_entry(ds_spec["format"]) + _check_format_entry(ds_spec["format"]) -def check_table_entry(table_name, table_spec): +def _check_table_entry(table_name, table_spec): if not isinstance(table_spec, tuple): raise TypeError( type_error_message(f"'{table_name}' table entry", table_spec, tuple) @@ -80,10 +95,10 @@ def check_table_entry(table_name, table_spec): str, ) ) - check_table_key(table_name, key) + _check_table_key(table_name, key) -def check_table_key(table_name, key): +def _check_table_key(table_name, key): if key is not None: if not is_list_like(key) and not isinstance(key, str): raise TypeError( @@ -102,7 +117,7 @@ def check_table_key(table_name, key): ) -def check_multitable_spec(ds_spec): +def _check_multitable_spec(ds_spec): assert len(ds_spec) > 1 # Check the main table if "main_table" not in ds_spec: @@ -149,10 +164,10 @@ def check_multitable_spec(ds_spec): for table in ds_spec["tables"].keys() if table != ds_spec["main_table"] ] - check_relations_entry(ds_spec["main_table"], ds_spec["tables"], relations_spec) + _check_relations_entry(ds_spec["main_table"], ds_spec["tables"], relations_spec) -def check_relations_entry(main_table_name, tables_spec, relations_spec): +def _check_relations_entry(main_table_name, tables_spec, relations_spec): # Check the types and size of the relation entries if not is_list_like(relations_spec): raise TypeError( @@ -208,7 +223,7 @@ def check_relations_entry(main_table_name, tables_spec, relations_spec): ) # Check hierachical keys - check_hierarchical_keys( + _check_hierarchical_keys( i, parent_table, tables_spec[parent_table][1], @@ -217,10 +232,10 @@ def check_relations_entry(main_table_name, tables_spec, relations_spec): ) # Check there are no cycles - check_no_cycles(relations_spec, main_table_name) + _check_no_cycles(relations_spec, main_table_name) -def check_hierarchical_keys( +def _check_hierarchical_keys( relation_id, parent_table, parent_table_key, child_table, child_table_key ): """Check that the parent table's key is contained in the child table's key""" @@ -250,7 +265,7 @@ def check_hierarchical_keys( ) -def check_no_cycles(relations_spec, main_table_name): +def _check_no_cycles(relations_spec, main_table_name): """Check that there are no cycles in the 'relations' entry""" tables_to_visit = [main_table_name] tables_visited = set() @@ -268,7 +283,7 @@ def check_no_cycles(relations_spec, main_table_name): ) -def check_format_entry(format_spec): +def _check_format_entry(format_spec): if not isinstance(format_spec, tuple): raise TypeError(type_error_message("'format' entry", format_spec, tuple)) if len(format_spec) != 2: @@ -581,7 +596,7 @@ def _check_input_sequence(self, X, key=None): ) # Check the key for the main_table (it is the same for the others) - check_table_key("main_table", 
key) + _check_table_key("main_table", key) def _init_tables_from_mapping(self, X): """Initializes the table spec from a dict-like 'X'""" @@ -914,7 +929,7 @@ def create_table_files_for_khiops(self, output_dir, sort=True): Parameters ---------- - out_dir : str + output_dir : str The directory where the sorted tables will be created. Returns diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py index 01687f5e..d5c4f76d 100644 --- a/khiops/utils/helpers.py +++ b/khiops/utils/helpers.py @@ -20,10 +20,10 @@ def sort_dataset(ds_spec, output_dir=None): Parameters ---------- ds_spec: dict - The dataset dictionary specification. The tables must be either - `pandas.DataFrame` or file path references. + A dataset spec. The tables must be either `pandas.DataFrame` or file path + references. output_dir: str, optional - _Only for file datasets:_ The output directory for the sorted files. + *Only for file datasets:* The output directory for the sorted files. Notes @@ -32,6 +32,10 @@ def sort_dataset(ds_spec, output_dir=None): The sorting algorithm is mergesort, which ensures sort stability. The sorting engine for dataframes is Pandas and for file-based datasets is Khiops. + Examples + -------- + See the following functions of the ``samples.py`` documentation script: + - `samples.sort_data_tables_mt()` """ # Check the types if not is_dict_like(ds_spec): @@ -92,9 +96,38 @@ def _sort_file_table(table, sep, header, output_dir): return out_data_source +# Note: We build the splits with lists and itertools.chain avoid pylint warning about +# unbalanced-tuple-unpacking. See issue https://github.com/pylint-dev/pylint/issues/5671 + + def train_test_split_dataset( ds_spec, target_column=None, test_size=0.25, output_dir=None, **kwargs ): + """Splits a dataset spec into train and test + + Parameters + ---------- + ds_spec : ``dict`` + A dataset spec. The tables must be either `pandas.DataFrame` or file path + references. + target_column : :external:term:`array-like`, optional + The target values. + test_size : float, default 0.25 + The proportion of the dataset (between 0.0 and 1.0) to be included in the test + split. + output_dir : str, optional + *Only for file datasets:* The output directory for the split data files. + ... : + Other optional parameters for `sklearn.model_selection.train_test_split` + + + Examples + -------- + See the following functions of the ``samples_sklearn.py`` documentation script: + - `samples_sklearn.khiops_classifier_multitable_star` + - `samples_sklearn.khiops_classifier_multitable_star_file` + - `samples_sklearn.khiops_classifier_multitable_snowflake` + """ # Check the types if not is_dict_like(ds_spec): raise TypeError(type_error_message("ds_spec", ds_spec, "dict-like")) @@ -218,13 +251,13 @@ def _train_test_split_in_memory_dataset( # users would be warned. To remove when the following issue is fixed: # https://github.com/pylint-dev/pylint/issues/5671 if target_column is None: - return_tuple = itertools.chain((train_ds, test_ds)) + split = itertools.chain((train_ds, test_ds)) else: - return_tuple = itertools.chain( + split = itertools.chain( (train_ds, test_ds, train_target_column, test_target_column) ) - return return_tuple + return split def _train_test_split_file_dataset(ds, test_size, output_dir): @@ -291,4 +324,4 @@ def _train_test_split_file_dataset(ds, test_size, output_dir): # unbalanced-tuple-unpacking. This warning appears when calling the function so # users would be warned. 
To remove when the following issue is fixed: # https://github.com/pylint-dev/pylint/issues/5671 - return itertools.chain((split_dss["train"], split_dss["test"])) + return itertools.chain(split_dss.values()) From 07c0a7f22cc7d5281e651f32f696b8ffc3c4cd56 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Fri, 2 Aug 2024 19:57:37 +0200 Subject: [PATCH 11/12] Simplify Dataset state variables - Remove `target_column_type` and `target_column_dtype` members - Make `is_in_memory` and `is_multitable` properties - Minor changes in comments and renamings --- khiops/sklearn/estimators.py | 93 ++++++++++++++----------- khiops/utils/dataset.py | 129 ++++++++++++++++------------------- khiops/utils/helpers.py | 6 +- tests/test_dataset_class.py | 7 +- 4 files changed, 119 insertions(+), 116 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 27892bcb..fb5f4d22 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -148,38 +148,51 @@ def _check_categorical_target_type(ds): if ds.target_column is None: raise ValueError("Target vector is not specified.") - if ds.is_in_memory() and not ( - isinstance(ds.target_column_dtype, pd.CategoricalDtype) - or pd.api.types.is_string_dtype(ds.target_column_dtype) - or pd.api.types.is_integer_dtype(ds.target_column_dtype) - or pd.api.types.is_float_dtype(ds.target_column_dtype) + if ds.is_in_memory and not ( + isinstance(ds.target_column.dtype, pd.CategoricalDtype) + or pd.api.types.is_string_dtype(ds.target_column.dtype) + or pd.api.types.is_integer_dtype(ds.target_column.dtype) + or pd.api.types.is_float_dtype(ds.target_column.dtype) ): raise ValueError( f"'y' has invalid type '{ds.target_column_type}'. " "Only string, integer, float and categorical types " "are accepted for the target." ) - elif not ds.is_in_memory() and ds.target_column_type != "Categorical": + elif ( + not ds.is_in_memory + and ds.main_table.khiops_types[ds.target_column_id] != "Categorical" + ): raise ValueError( - f"Target column has invalid type '{ds.target_column_type}'. " + "Target column has invalid type " + f"'{ds.main_table.khiops_types[ds.target_column_id]}'. " "Only Categorical types are accepted for file datasets." ) def _check_numerical_target_type(ds): + # Check that the target column is specified if ds.target_column is None: raise ValueError("Target vector is not specified.") - if ds.is_in_memory(): - if not pd.api.types.is_numeric_dtype(ds.target_column_dtype): + + # If in-memory: Check that the column is numerical and that the values are finite + # The latter is required by sklearn + if ds.is_in_memory: + if not pd.api.types.is_numeric_dtype(ds.target_column.dtype): raise ValueError( - f"Unknown label type '{ds.target_column_type}'. " + f"Unknown label type '{ds.target_column.dtype}'. " "Expected a numerical type." ) if ds.target_column is not None: assert_all_finite(ds.target_column) - elif not ds.is_in_memory() and ds.target_column_type != "Numerical": + # Otherwise: Check the the Khiops type + elif ( + not ds.is_in_memory + and ds.main_table.khiops_types[ds.target_column_id] != "Numerical" + ): raise ValueError( - f"Target column has invalid type '{ds.target_column_type}'. " + "Target column has invalid type " + f"'{ds.main_table.khiops_types[ds.target_column_id]}'. " "Only Numerical types are accepted for file datasets." 
) @@ -384,7 +397,7 @@ def _fit(self, ds, computation_dir, **kwargs): ): self._fit_training_post_process(ds) self.is_fitted_ = True - self.is_multitable_model_ = ds.is_multitable() + self.is_multitable_model_ = ds.is_multitable def _fit_check_params(self, ds, **_): """Check the model parameters including those data dependent (in kwargs)""" @@ -395,7 +408,7 @@ def _fit_check_params(self, ds, **_): ): raise TypeError(type_error_message("key", self.key, str, "list-like")) - if not ds.is_in_memory() and self.output_dir is None: + if not ds.is_in_memory and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") def _fit_check_dataset(self, ds): @@ -529,7 +542,7 @@ def _transform_deploy_model( output_data_table_path = fs.get_child_path(output_dir, transformed_file_name) # Set the format parameters depending on the type of dataset - if deployment_ds.is_in_memory(): + if deployment_ds.is_in_memory: field_separator = "\t" header_line = True else: @@ -563,7 +576,7 @@ def _transform_deployment_post_process( self, deployment_ds, output_table_path, drop_key ): # Return a dataframe for dataframe based datasets - if deployment_ds.is_in_memory(): + if deployment_ds.is_in_memory: # Read the transformed table with the internal table settings with io.BytesIO(fs.read(output_table_path)) as output_table_stream: output_table_df = read_internal_data_table(output_table_stream) @@ -572,7 +585,7 @@ def _transform_deployment_post_process( # - Reorder the table to the original table order # - Because transformed data table file is sorted by key # - Drop the key columns if specified - if deployment_ds.is_multitable(): + if deployment_ds.is_multitable: key_df = deployment_ds.main_table.data_source[ deployment_ds.main_table.key ] @@ -822,7 +835,7 @@ def _fit_check_params(self, ds, **kwargs): ) def _fit_train_model(self, ds, computation_dir, **kwargs): - assert not ds.is_multitable(), "Coclustering not available in multitable" + assert not ds.is_multitable, "Coclustering not available in multitable" # Prepare the table files and dictionary for Khiops main_table_path, _ = ds.create_table_files_for_khiops( @@ -1217,7 +1230,7 @@ def predict(self, X): kh.get_runner().root_temp_dir = initial_runner_temp_dir # Transform to numpy.array for in-memory inputs - if ds.is_in_memory(): + if ds.is_in_memory: y_pred = y_pred.to_numpy() return y_pred @@ -1235,7 +1248,7 @@ def _transform_check_dataset(self, ds): # - They are mono-table only # - They are deployed with a multitable model whose main table contain # the keys of the input table and the secondary table is the input table - if ds.is_multitable(): + if ds.is_multitable: raise ValueError("Coclustering models not available in multi-table mode") # The "model dictionary domain" in the coclustering case it is just composed @@ -1251,14 +1264,14 @@ def _transform_check_dataset(self, ds): ) def _transform_create_deployment_dataset(self, ds, computation_dir): - assert not ds.is_multitable(), "'dataset' is multitable" + assert not ds.is_multitable, "'dataset' is multitable" # Build the multitable deployment dataset keys_table_name = f"keys_{ds.main_table.name}" deploy_dataset_spec = {} deploy_dataset_spec["main_table"] = keys_table_name deploy_dataset_spec["tables"] = {} - if ds.is_in_memory(): + if ds.is_in_memory: # Extract the keys from the main table keys_table_dataframe = pd.DataFrame( { @@ -1319,7 +1332,7 @@ def _transform_prepare_deployment_model_for_predict(self, _): def _transform_deployment_post_process( self, deployment_ds, 
output_table_path, drop_key ): - assert deployment_ds.is_multitable() + assert deployment_ds.is_multitable return super()._transform_deployment_post_process( deployment_ds, output_table_path, drop_key ) @@ -1500,7 +1513,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): # Set the format parameters depending on the type of dataset kwargs["detect_format"] = False - if ds.is_in_memory(): + if ds.is_in_memory: kwargs["field_separator"] = "\t" kwargs["header_line"] = True else: @@ -1610,12 +1623,12 @@ def _transform_check_dataset(self, ds): super()._transform_check_dataset(ds) # Check the coherence between thi input table and the model - if self.is_multitable_model_ and not ds.is_multitable(): + if self.is_multitable_model_ and not ds.is_multitable: raise ValueError( "You are trying to apply on single-table inputs a model which has " "been trained on multi-table data." ) - if not self.is_multitable_model_ and ds.is_multitable(): + if not self.is_multitable_model_ and ds.is_multitable: raise ValueError( "You are trying to apply on multi-table inputs a model which has " "been trained on single-table data." @@ -1914,10 +1927,14 @@ def __init__( self._predicted_target_meta_data_tag = "Prediction" def _is_real_target_dtype_integer(self): - assert self._original_target_dtype is not None, "Original target type not set" - return pd.api.types.is_integer_dtype(self._original_target_dtype) or ( - isinstance(self._original_target_dtype, pd.CategoricalDtype) - and pd.api.types.is_integer_dtype(self._original_target_dtype.categories) + return self._original_target_dtype is not None and ( + pd.api.types.is_integer_dtype(self._original_target_dtype) + or ( + isinstance(self._original_target_dtype, pd.CategoricalDtype) + and pd.api.types.is_integer_dtype( + self._original_target_dtype.categories + ) + ) ) def _sorted_prob_variable_names(self): @@ -1980,7 +1997,7 @@ def _fit_check_dataset(self, ds): super()._fit_check_dataset(ds) # Check that the target is for classification in in_memory_tables - if ds.is_in_memory(): + if ds.is_in_memory: current_type_of_target = type_of_target(ds.target_column) if current_type_of_target not in ["binary", "multiclass"]: raise ValueError( @@ -1988,7 +2005,7 @@ def _fit_check_dataset(self, ds): "for classification. Maybe you passed a floating point target?" ) # Check if the target has more than 1 class - if ds.is_in_memory() and len(np.unique(ds.target_column)) == 1: + if ds.is_in_memory and len(np.unique(ds.target_column)) == 1: raise ValueError( f"{self.__class__.__name__} can't train when only one class is present." 
) @@ -2001,10 +2018,10 @@ def _fit_training_post_process(self, ds): super()._fit_training_post_process(ds) # Save the target datatype - if ds.is_in_memory(): - self._original_target_dtype = ds.target_column_dtype + if ds.is_in_memory: + self._original_target_dtype = ds.target_column.dtype else: - self._original_target_dtype = np.dtype("object") + self._original_target_dtype = None # Save class values in the order of deployment self.classes_ = [] @@ -2012,7 +2029,7 @@ def _fit_training_post_process(self, ds): for key in variable.meta_data.keys: if key.startswith("TargetProb"): self.classes_.append(variable.meta_data.get_value(key)) - if self._is_real_target_dtype_integer(): + if ds.is_in_memory and self._is_real_target_dtype_integer(): self.classes_ = [int(class_value) for class_value in self.classes_] self.classes_.sort() self.classes_ = column_or_1d(self.classes_) @@ -2165,7 +2182,7 @@ def predict_proba(self, X): # For in-memory datasets: # - Reorder the columns to that of self.classes_ # - Transform to np.ndarray - if ds.is_in_memory(): + if ds.is_in_memory: assert isinstance( y_probas, (pd.DataFrame, np.ndarray) ), "y_probas is not a Pandas DataFrame nor Numpy array" @@ -2786,7 +2803,7 @@ def transform(self, X): finally: self._cleanup_computation_dir(computation_dir) kh.get_runner().root_temp_dir = initial_runner_temp_dir - if ds.is_in_memory(): + if ds.is_in_memory: return X_transformed.to_numpy(copy=False) return X_transformed diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index 66497dd7..500312ae 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -51,6 +51,9 @@ def check_dataset_spec(ds_spec): ValueError If there are objects of the spec with invalid values. """ + # Check the spec type + if not is_dict_like(ds_spec): + raise TypeError(type_error_message("ds_spec", ds_spec, Mapping)) # Check the "tables" field if "tables" not in ds_spec: @@ -118,7 +121,6 @@ def _check_table_key(table_name, key): def _check_multitable_spec(ds_spec): - assert len(ds_spec) > 1 # Check the main table if "main_table" not in ds_spec: raise ValueError( @@ -137,9 +139,9 @@ def _check_multitable_spec(ds_spec): # Check that all tables have non-None keys for table_name, (_, table_key) in ds_spec["tables"].items(): if table_key is None: - table_type = "main" if ds_spec["main_table"] == table_name else "secondary" + table_kind = "main" if ds_spec["main_table"] == table_name else "secondary" raise ValueError( - f"key of {table_type} table '{table_name}' is 'None': " + f"key of {table_kind} table '{table_name}' is 'None': " "table keys must be specified in multi-table datasets" ) @@ -239,17 +241,21 @@ def _check_hierarchical_keys( relation_id, parent_table, parent_table_key, child_table, child_table_key ): """Check that the parent table's key is contained in the child table's key""" - table_key_error = False + # Perform the check and save the error status + error_found = False if isinstance(parent_table_key, str) and isinstance(child_table_key, str): - table_key_error = child_table_key != parent_table_key + error_found = child_table_key != parent_table_key elif isinstance(parent_table_key, str) and is_list_like(child_table_key): - table_key_error = parent_table_key not in child_table_key + error_found = parent_table_key not in child_table_key elif is_list_like(parent_table_key) and is_list_like(child_table_key): - table_key_error = not set(parent_table_key).issubset(set(child_table_key)) + error_found = not set(parent_table_key).issubset(child_table_key) elif 
is_list_like(parent_table_key) and isinstance(child_table_key, str): - table_key_error = True + error_found = ( + len(parent_table_key) != 1 or child_table_key not in parent_table_key + ) - if table_key_error: + # Report any error found + if error_found: if isinstance(child_table_key, str): child_table_key_msg = f"[{child_table_key}]" else: @@ -435,8 +441,6 @@ def __init__(self, X, y=None, categorical_target=True, key=None): self.categorical_target = categorical_target self.target_column = None self.target_column_id = None - self.target_column_type = None - self.target_column_dtype = None # Only for in_memory datasets self.sep = None self.header = None @@ -535,7 +539,7 @@ def __init__(self, X, y=None, categorical_target=True, key=None): assert isinstance( self.secondary_tables, list ), "'secondary_tables' is not a list after init" - assert not self.is_multitable() or len( + assert not self.is_multitable or len( self.secondary_tables ), "'secondary_tables' is empty in a multi-table dataset" assert ( @@ -699,6 +703,7 @@ def _init_tables_from_mapping(self, X): def _init_target_column(self, y): assert self.main_table is not None assert self.secondary_tables is not None + # Check y's type # For in memory target columns: # - column_or_1d checks *and transforms* to a numpy.array if successful @@ -722,14 +727,6 @@ def _init_target_column(self, y): type_error_message("y", y, "array-like") + f" (X's tables are of type {type_message})" ) - if isinstance(self.main_table, (SparseTable, NumpyTable)) and isinstance( - y_checked, str - ): - raise TypeError( - type_error_message("y", y, "array-like") - + " (X's tables are of type numpy.ndarray" - + " or scipy.sparse.spmatrix)" - ) if isinstance(self.main_table.data_source, str) and not isinstance( y_checked, str ): @@ -742,7 +739,6 @@ def _init_target_column(self, y): # Case when y is a memory array if hasattr(y_checked, "__array__"): self.target_column = y_checked - self.target_column_dtype = self.target_column.dtype # Initialize the id of the target column if isinstance(y, pd.Series) and y.name is not None: @@ -778,14 +774,13 @@ def _init_target_column(self, y): # Force the target column type from the parameters if self.categorical_target: - self.main_table.khiops_types[self.target_column] = "Categorical" - self.target_column_type = "Categorical" + self.main_table.khiops_types[self.target_column_id] = "Categorical" else: - self.main_table.khiops_types[self.target_column] = "Numerical" - self.target_column_type = "Numerical" + self.main_table.khiops_types[self.target_column_id] = "Numerical" + @property def is_in_memory(self): - """Tests whether the dataset is in-memory + """bool : ``True`` if the dataset is in-memory A dataset is in-memory if it is constituted either of only pandas.DataFrame tables, numpy.ndarray, or scipy.sparse.spmatrix tables. @@ -793,28 +788,22 @@ def is_in_memory(self): return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable)) + @property def table_type(self): - """Returns the table type of the dataset tables + """type : The table type of this dataset's tables - Returns - ------- - type - The type of the tables in the dataset. Possible values: - - `PandasTable` - - `NumpyTable` - - `SparseTable` - - `FileTable` + Possible values: + + - `PandasTable` + - `NumpyTable` + - `SparseTable` + - `FileTable` """ return type(self.main_table) + @property def is_multitable(self): - """Tests whether the dataset is a multi-table one - - Returns - ------- - bool - ``True`` if the dataset is multi-table. 
- """ + """bool : ``True`` if the dataset is multitable""" return self.secondary_tables is not None and len(self.secondary_tables) > 0 def to_spec(self): @@ -831,7 +820,7 @@ def to_spec(self): if self.relations: ds_spec["relations"] = [] ds_spec["relations"].extend(self.relations) - if self.table_type() == FileTable: + if self.table_type == FileTable: ds_spec["format"] = (self.sep, self.header) return ds_spec @@ -880,7 +869,7 @@ def create_khiops_dictionary_domain(self): dictionary_domain.add_dictionary(main_dictionary) # For in-memory datasets: Add the target variable if available - if self.is_in_memory() and self.target_column is not None: + if self.is_in_memory and self.target_column is not None: variable = kh.Variable() variable.name = get_khiops_variable_name(self.target_column_id) if self.categorical_target: @@ -945,15 +934,18 @@ def create_table_files_for_khiops(self, output_dir, sort=True): # - The caller specifies not to do it (sort = False) # - The dataset is mono-table and the main table has no key sort_main_table = sort and ( - self.is_multitable() or self.main_table.key is not None + self.is_multitable or self.main_table.key is not None ) - if self.is_in_memory(): + + # In-memory dataset: Create the table files and add the target column + if self.is_in_memory: main_table_path = self.main_table.create_table_file_for_khiops( output_dir, sort=sort_main_table, target_column=self.target_column, target_column_id=self.target_column_id, ) + # File dataset: Create the table files (the target column is in the file) else: main_table_path = self.main_table.create_table_file_for_khiops( output_dir, @@ -973,6 +965,9 @@ def __repr__(self): return str(self.create_khiops_dictionary_domain()) +# pylint: enable=invalid-name + + class DatasetTable(ABC): """A generic dataset table""" @@ -1046,7 +1041,7 @@ def create_khiops_dictionary(self): dictionary = kh.Dictionary() dictionary.name = self.name if self.key is not None: - dictionary.key = list(self.key) + dictionary.key = self.key # For each column add a Khiops variable to the dictionary for column_id in self.column_ids: @@ -1065,18 +1060,16 @@ def create_khiops_dictionary(self): class PandasTable(DatasetTable): - """Table encapsulating the features dataframe X and the target labels y - - X is of type pandas.DataFrame. y is array-like. + """DatasetTable encapsulating a pandas dataframe Parameters ---------- name : str Name for the table. dataframe : `pandas.DataFrame` - The data frame to be encapsulated. - key : list-like of str, optional - The names of the columns composing the key + The data frame to be encapsulated. It must be non-empty. + key : list of str, optional + The names of the columns composing the key. """ def __init__(self, name, dataframe, key=None): @@ -1144,7 +1137,7 @@ def create_table_file_for_khiops( output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") # Write the output dataframe - output_dataframe = self._create_dataframe_copy() + output_dataframe = self.data_source.copy() output_names = { column_id: get_khiops_variable_name(column_id) for column_id in self.column_ids @@ -1173,22 +1166,18 @@ def create_table_file_for_khiops( return output_table_path - def _create_dataframe_copy(self): - """Creates an in memory copy of the dataframe""" - return self.data_source.copy() - class NumpyTable(DatasetTable): - """Table encapsulating (X,y) pair with types (ndarray, ndarray) + """DatasetTable encapsulating a NumPy array Parameters ---------- name : str Name for the table. 
- array : :external:term:`array-like` of shape (n_samples, n_features_in) + array : `numpy.ndarray` of shape (n_samples, n_features_in) The data frame to be encapsulated. key : :external:term`array-like` of int, optional - The names of the columns composing the key + The names of the columns composing the key. """ def __init__(self, name, array, key=None): @@ -1259,10 +1248,7 @@ def create_table_file_for_khiops( class SparseTable(DatasetTable): - """Table encapsulating feature matrix X and target array y - - X is of type scipy.sparse.spmatrix. - y is array-like. + """DatasetTable encapsulating a SciPy sparse matrix Parameters ---------- @@ -1270,8 +1256,8 @@ class SparseTable(DatasetTable): Name for the table. matrix : `scipy.sparse.spmatrix` The sparse matrix to be encapsulated. - key : list-like of str, optional - The names of the columns composing the key + key : list of str, optional + The names of the columns composing the key. """ def __init__(self, name, matrix, key=None): @@ -1405,7 +1391,7 @@ def create_table_file_for_khiops( class FileTable(DatasetTable): - """A table representing a delimited text file + """DatasetTable encapsulating a delimited text data file Parameters ---------- @@ -1413,12 +1399,12 @@ class FileTable(DatasetTable): Name for the table. path : str Path of the file containing the table. + key : list-like of str, optional + The names of the columns composing the key. sep : str, optional Field separator character. If not specified it will be inferred from the file. header : bool, optional - Indicates if the table - key : list-like of str, optional - The names of the columns composing the key + Indicates if the table. """ def __init__( @@ -1471,7 +1457,6 @@ def __init__( ) # Set the column names and types - assert json_domain["dictionaries"][0]["name"] == self.name variables = json_domain["dictionaries"][0]["variables"] self.column_ids = [var["name"] for var in variables] self.khiops_types = {var["name"]: var["type"] for var in variables} diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py index d5c4f76d..e8c4d192 100644 --- a/khiops/utils/helpers.py +++ b/khiops/utils/helpers.py @@ -45,7 +45,7 @@ def sort_dataset(ds_spec, output_dir=None): ds = Dataset(ds_spec) # Check special arguments in function of the dataset - if ds.table_type() == FileTable and output_dir is None: + if ds.table_type == FileTable and output_dir is None: raise ValueError("'output_dir' must be specified for file based datasets") # Make a copy of the dataset (note: data sources are just reference) @@ -136,7 +136,7 @@ def train_test_split_dataset( ds = Dataset(ds_spec) # Check the parameter coherence - if not ds.is_in_memory(): + if not ds.is_in_memory: if target_column is not None: raise ValueError("'target_column' cannot be used with file path datasets") if output_dir is None: @@ -145,7 +145,7 @@ def train_test_split_dataset( raise TypeError(type_error_message("output_dir", output_dir, str)) # Perform the split for each type of dataset - if ds.is_in_memory(): + if ds.is_in_memory: # Obtain the keys for the other test_train_split function sklearn_split_params = {} for param in ("train_size", "random_state", "shuffle", "stratify"): diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 9662e224..be0034c6 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -171,7 +171,10 @@ def create_multitable_star_data_files(self, main_table_path, secondary_table_pat secondary_table.to_csv(secondary_table_path, sep="\t", index=False) def 
create_multitable_snowflake_dataframes(self): + # Set the random seed for reproducibility np.random.seed(31416) + + # Create the main table main_table_data = { "User_ID": [ "60B2Xk_3Fw", @@ -189,6 +192,7 @@ def create_multitable_snowflake_dataframes(self): } main_table = pd.DataFrame(main_table_data) + # Create the secondary tables secondary_table_data_1 = { "User_ID": np.random.choice(main_table["User_ID"], 20), "VAR_1": np.random.choice(["a", "b", "c", "d"], 20), @@ -197,7 +201,6 @@ def create_multitable_snowflake_dataframes(self): "VAR_4": np.round(np.random.rand(20).tolist(), 2), } secondary_table_1 = pd.DataFrame(secondary_table_data_1) - secondary_table_data_2 = { "User_ID": np.random.choice( main_table["User_ID"], len(main_table), replace=False @@ -210,7 +213,6 @@ def create_multitable_snowflake_dataframes(self): "VAR_4": np.round(np.random.rand(len(main_table)).tolist(), 2), } secondary_table_2 = pd.DataFrame(secondary_table_data_2) - tertiary_table_data = { "User_ID": np.random.choice(main_table["User_ID"], 100), "VAR_1": np.random.choice(["a", "b", "c", "d"], 100), @@ -218,7 +220,6 @@ def create_multitable_snowflake_dataframes(self): "VAR_3": np.round(np.random.rand(100).tolist(), 2), } tertiary_table = pd.DataFrame(tertiary_table_data) - quaternary_table_data = { "User_ID": np.random.choice(main_table["User_ID"], 50), "VAR_1": np.random.choice(["a", "b", "c", "d"], 50), From 558866cde239989f2ea4659022c2b55d8fc9317b Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 26 Aug 2024 10:44:59 +0200 Subject: [PATCH 12/12] Update CHANGELOG.md --- CHANGELOG.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98a89962..14269dc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,17 @@ - Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1. - Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists. +## 10.2.2.5 - Unreleased + +### Added + +- (General) `train_test_split_dataset` helper function to ease the splitting in train/test for + multi-table datasets. +- (General) `sort_dataset` helper function to ease the sorting by key of multi-table datasets. + ## 10.2.2.4 - 2024-08-05 -## Added +### Added - (`sklearn`) Sklearn's attributes for supervised estimators. ## 10.2.2.3 - 2024-08-02
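
A minimal usage sketch of the public dataset-spec helpers touched by this patch series: check_dataset_spec (khiops/utils/dataset.py) and train_test_split_dataset (khiops/utils/helpers.py), the latter announced in the CHANGELOG entry above. The import paths are assumed from the patch's file layout, and the tables, columns and target values (Clients, Orders, ClientId, Churn) are hypothetical illustration data, not anything defined in the Khiops samples.

# Hedged sketch: build a two-table (star schema) dataset spec, validate it,
# and split it into train/test parts. Names and data below are hypothetical.
import pandas as pd

from khiops.utils.dataset import check_dataset_spec        # assumed import path
from khiops.utils.helpers import train_test_split_dataset  # assumed import path

# Main table and one secondary table, linked by the "ClientId" key
clients_df = pd.DataFrame(
    {
        "ClientId": [1, 2, 3, 4],
        "Age": [33, 41, 27, 58],
        "Churn": ["no", "yes", "no", "yes"],
    }
)
orders_df = pd.DataFrame(
    {
        "ClientId": [1, 1, 2, 3, 4, 4],
        "Amount": [10.0, 5.5, 8.0, 12.5, 3.0, 7.25],
    }
)

# Dataset spec: the target is passed separately, so it is dropped from the
# main table; without a "relations" entry a star schema is assumed
ds_spec = {
    "main_table": "Clients",
    "tables": {
        "Clients": (clients_df.drop(columns="Churn"), "ClientId"),
        "Orders": (orders_df, "ClientId"),
    },
}
y = clients_df["Churn"]

# Raises TypeError/ValueError on a malformed spec, per its new docstring
check_dataset_spec(ds_spec)

# With a target column the split yields four items (train/test datasets plus
# train/test targets), built with itertools.chain in this patch
ds_train, ds_test, y_train, y_test = train_test_split_dataset(
    ds_spec, target_column=y, test_size=0.25, random_state=1
)

Extra keyword arguments (here random_state) are forwarded to sklearn.model_selection.train_test_split for in-memory datasets; for file-path datasets, output_dir must be given and target_column cannot be used, as enforced by the checks in train_test_split_dataset.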