diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst
index 2afed2c9..5d9c4ee1 100644
--- a/doc/samples/samples_sklearn.rst
+++ b/doc/samples/samples_sklearn.rst
@@ -111,9 +111,8 @@ Samples
 
 .. code-block:: python
 
     def khiops_classifier_sparse():
-        # Load 3 classes of the 20newsgroups dataset
-        categories = ["comp.graphics", "sci.space", "misc.forsale"]
+        categories = ["comp.graphics", "sci.space", "misc.forsale", "alt.atheism"]
         data_train, y_train = fetch_20newsgroups(
             subset="train",
             categories=categories,
@@ -125,15 +124,18 @@ Samples
             return_X_y=True,
         )
 
-        # Extract features from the training data using a sparse vectorizer
-        vectorizer = HashingVectorizer(n_features=2**10, stop_words="english")
+        # Extract features from the train and test data using a sparse vectorizer
+        vectorizer = HashingVectorizer(n_features=2048, stop_words="english")
         X_train = vectorizer.fit_transform(data_train)
-
-        # Extract features from the test data using the same vectorizer
         X_test = vectorizer.transform(data_test)
 
-        # Create the classifier object
-        khc = KhiopsClassifier()
+        # Print density of the intermediary datasets
+        print(f"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}")
+        print(f"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}")
+        print("---")
+
+        # Create the classifier object (no trees)
+        khc = KhiopsClassifier(n_trees=0)
 
         # Train the classifier
         khc.fit(X_train, y_train)
diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb
index dd541edb..b87c1f50 100644
--- a/khiops/samples/samples_sklearn.ipynb
+++ b/khiops/samples/samples_sklearn.ipynb
@@ -102,10 +102,15 @@
    "outputs": [],
    "source": [
     "def khiops_classifier_sparse():\n",
-    "    \"\"\"Trains a `.KhiopsClassifier` on a monotable sparse matrix\"\"\"\n",
+    "    \"\"\"Trains a `.KhiopsClassifier` on a monotable sparse matrix\n",
     "\n",
+    "    .. note::\n",
+    "        No intermediary dense data is used by Khiops because it supports sparse data\n",
+    "        natively.\n",
+    "\n",
+    "    \"\"\"\n",
     "    # Load 3 classes of the 20newsgroups dataset\n",
-    "    categories = [\"comp.graphics\", \"sci.space\", \"misc.forsale\"]\n",
+    "    categories = [\"comp.graphics\", \"sci.space\", \"misc.forsale\", \"alt.atheism\"]\n",
     "    data_train, y_train = fetch_20newsgroups(\n",
     "        subset=\"train\",\n",
     "        categories=categories,\n",
@@ -117,15 +122,18 @@
     "        return_X_y=True,\n",
     "    )\n",
     "\n",
-    "    # Extract features from the training data using a sparse vectorizer\n",
-    "    vectorizer = HashingVectorizer(n_features=2**10, stop_words=\"english\")\n",
+    "    # Extract features from the train and test data using a sparse vectorizer\n",
+    "    vectorizer = HashingVectorizer(n_features=2048, stop_words=\"english\")\n",
     "    X_train = vectorizer.fit_transform(data_train)\n",
-    "\n",
-    "    # Extract features from the test data using the same vectorizer\n",
     "    X_test = vectorizer.transform(data_test)\n",
     "\n",
-    "    # Create the classifier object\n",
-    "    khc = KhiopsClassifier()\n",
+    "    # Print density of the intermediary datasets\n",
+    "    print(f\"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}\")\n",
+    "    print(f\"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}\")\n",
+    "    print(\"---\")\n",
+    "\n",
+    "    # Create the classifier object (no trees)\n",
+    "    khc = KhiopsClassifier(n_trees=0)\n",
     "\n",
     "    # Train the classifier\n",
     "    khc.fit(X_train, y_train)\n",
diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py
index 112bd95f..7089a165 100644
--- a/khiops/samples/samples_sklearn.py
+++ b/khiops/samples/samples_sklearn.py
@@ -92,10 +92,15 @@ def khiops_classifier():
 
 
 def khiops_classifier_sparse():
-    """Trains a `.KhiopsClassifier` on a monotable sparse matrix"""
+    """Trains a `.KhiopsClassifier` on a monotable sparse matrix
+    .. note::
+        No intermediary dense data is used by Khiops because it supports sparse data
+        natively.
+
+    """
     # Load 3 classes of the 20newsgroups dataset
-    categories = ["comp.graphics", "sci.space", "misc.forsale"]
+    categories = ["comp.graphics", "sci.space", "misc.forsale", "alt.atheism"]
     data_train, y_train = fetch_20newsgroups(
         subset="train",
         categories=categories,
@@ -107,15 +112,18 @@ def khiops_classifier_sparse():
         return_X_y=True,
     )
 
-    # Extract features from the training data using a sparse vectorizer
-    vectorizer = HashingVectorizer(n_features=2**10, stop_words="english")
+    # Extract features from the train and test data using a sparse vectorizer
+    vectorizer = HashingVectorizer(n_features=2048, stop_words="english")
     X_train = vectorizer.fit_transform(data_train)
-
-    # Extract features from the test data using the same vectorizer
    X_test = vectorizer.transform(data_test)
 
-    # Create the classifier object
-    khc = KhiopsClassifier()
+    # Print density of the intermediary datasets
+    print(f"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}")
+    print(f"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}")
+    print("---")
+
+    # Create the classifier object (no trees)
+    khc = KhiopsClassifier(n_trees=0)
 
     # Train the classifier
     khc.fit(X_train, y_train)
diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
index 31979614..f672aba1 100644
--- a/khiops/sklearn/estimators.py
+++ b/khiops/sklearn/estimators.py
@@ -305,6 +305,15 @@ def fit(self, X, y=None, **kwargs):
         try:
             categorical_target = kwargs.get("categorical_target", True)
             dataset = Dataset(X, y, categorical_target=categorical_target, key=self.key)
+            if not dataset.is_in_memory():
+                warnings.warn(
+                    deprecation_message(
+                        "File-path dataset sklearn input",
+                        "11.0.0",
+                        "in-memory datasets or khiops.core API",
+                        quote=False,
+                    ),
+                )
             self._fit(dataset, computation_dir, **kwargs)
         # Undefine any attributes to pass to "not fitted"
         except:
@@ -527,8 +536,18 @@ def _transform_deploy_model(
 
     def _transform_check_dataset(self, dataset):
         """Checks the dataset before deploying a model on them"""
-        if not dataset.is_in_memory() and self.output_dir is None:
-            raise ValueError("'output_dir' is not set but dataset is file-based")
+        if not dataset.is_in_memory():
+            warnings.warn(
+                deprecation_message(
+                    "File-path dataset input",
+                    "11.0.0",
+                    "in-memory datasets or khiops.core API",
+                    quote=False,
+                ),
+                stacklevel=4,
+            )
+            if self.output_dir is None:
+                raise ValueError("'output_dir' is not set but dataset is file-based")
 
     def _transform_deployment_post_process(
         self, deployment_dataset, output_table_path, drop_key
diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py
index 32c915b3..500ec30e 100644
--- a/khiops/sklearn/tables.py
+++ b/khiops/sklearn/tables.py
@@ -619,15 +619,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True):
 
         # Initialize a file dataset
         if isinstance(main_table_source, str):
-            warnings.warn(
-                deprecation_message(
-                    "File-path dataset input",
-                    "11.0.0",
-                    "dataframe-based dataset or khiops.core API",
-                    quote=False,
-                ),
-                stacklevel=4,
-            )
            # Obtain the file format parameters
            if "format" in X:
                self.sep, self.header = X["format"]
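
Note on the density figures introduced by the sample changes above: for the scipy sparse matrix returned by HashingVectorizer, the .size attribute is the number of stored (non-zero) entries, so dividing it by rows * cols gives the fraction of cells that are actually materialized. A minimal, self-contained sketch of the same computation on a toy matrix (the csr_matrix below is only an illustration, not part of the patch):

    # Density as printed in khiops_classifier_sparse: stored entries / total cells
    from scipy.sparse import csr_matrix

    X = csr_matrix([[0.0, 1.0, 0.0], [2.0, 0.0, 0.0]])  # 2 stored values out of 6 cells
    print(f"density: {X.size / (X.shape[0] * X.shape[1])}")  # -> 0.3333333333333333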