Skip to content

Commit

Permalink
Move file dataset deprecation warning to sklearn.estimators
Browse files Browse the repository at this point in the history
  • Loading branch information
folmos-at-orange committed Apr 24, 2024
1 parent 1c177ec commit 54486d2
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 35 deletions.
18 changes: 10 additions & 8 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,8 @@ Samples
.. code-block:: python
def khiops_classifier_sparse():
# Load a subset of the classes of the 20newsgroups dataset
categories = ["comp.graphics", "sci.space", "misc.forsale"]
categories = ["comp.graphics", "sci.space", "misc.forsale", "alt.atheism"]
data_train, y_train = fetch_20newsgroups(
subset="train",
categories=categories,
Expand All @@ -125,15 +124,18 @@ Samples
return_X_y=True,
)
# Extract features from the training data using a sparse vectorizer
vectorizer = HashingVectorizer(n_features=2**10, stop_words="english")
# Extract features from the train and test data using a sparse vectorizer
vectorizer = HashingVectorizer(n_features=2048, stop_words="english")
X_train = vectorizer.fit_transform(data_train)
# Extract features from the test data using the same vectorizer
X_test = vectorizer.transform(data_test)
# Create the classifier object
khc = KhiopsClassifier()
# Print density of the intermediary datasets
print(f"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}")
print(f"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}")
print("---")
# Create the classifier object (no trees)
khc = KhiopsClassifier(n_trees=0)
# Train the classifier
khc.fit(X_train, y_train)
Expand Down
24 changes: 16 additions & 8 deletions khiops/samples/samples_sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,15 @@
"outputs": [],
"source": [
"def khiops_classifier_sparse():\n",
" \"\"\"Trains a `.KhiopsClassifier` on a monotable sparse matrix\"\"\"\n",
" \"\"\"Trains a `.KhiopsClassifier` on a monotable sparse matrix\n",
"\n",
" .. note::\n",
" No intermediary dense data is used by Khiops because it supports sparse data\n",
" natively.\n",
"\n",
" \"\"\"\n",
" # Load 3 classes of the 20newsgroups dataset\n",
" categories = [\"comp.graphics\", \"sci.space\", \"misc.forsale\"]\n",
" categories = [\"comp.graphics\", \"sci.space\", \"misc.forsale\", \"alt.atheism\"]\n",
" data_train, y_train = fetch_20newsgroups(\n",
" subset=\"train\",\n",
" categories=categories,\n",
Expand All @@ -117,15 +122,18 @@
" return_X_y=True,\n",
" )\n",
"\n",
" # Extract features from the training data using a sparse vectorizer\n",
" vectorizer = HashingVectorizer(n_features=2**10, stop_words=\"english\")\n",
" # Extract features from the train and test data using a sparse vectorizer\n",
" vectorizer = HashingVectorizer(n_features=2048, stop_words=\"english\")\n",
" X_train = vectorizer.fit_transform(data_train)\n",
"\n",
" # Extract features from the test data using the same vectorizer\n",
" X_test = vectorizer.transform(data_test)\n",
"\n",
" # Create the classifier object\n",
" khc = KhiopsClassifier()\n",
" # Print density of the intermediary datasets\n",
" print(f\"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}\")\n",
" print(f\"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}\")\n",
" print(\"---\")\n",
"\n",
" # Create the classifier object (no trees)\n",
" khc = KhiopsClassifier(n_trees=0)\n",
"\n",
" # Train the classifier\n",
" khc.fit(X_train, y_train)\n",
Expand Down
24 changes: 16 additions & 8 deletions khiops/samples/samples_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,15 @@ def khiops_classifier():


def khiops_classifier_sparse():
"""Trains a `.KhiopsClassifier` on a monotable sparse matrix"""
"""Trains a `.KhiopsClassifier` on a monotable sparse matrix
.. note::
No intermediary dense data is used by Khiops because it supports sparse data
natively.
"""
# Load a subset of the classes of the 20newsgroups dataset
categories = ["comp.graphics", "sci.space", "misc.forsale"]
categories = ["comp.graphics", "sci.space", "misc.forsale", "alt.atheism"]
data_train, y_train = fetch_20newsgroups(
subset="train",
categories=categories,
Expand All @@ -107,15 +112,18 @@ def khiops_classifier_sparse():
return_X_y=True,
)

# Extract features from the training data using a sparse vectorizer
vectorizer = HashingVectorizer(n_features=2**10, stop_words="english")
# Extract features from the train and test data using a sparse vectorizer
vectorizer = HashingVectorizer(n_features=2048, stop_words="english")
X_train = vectorizer.fit_transform(data_train)

# Extract features from the test data using the same vectorizer
X_test = vectorizer.transform(data_test)

# Create the classifier object
khc = KhiopsClassifier()
# Print density of the intermediary datasets
print(f"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}")
print(f"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}")
print("---")

# Create the classifier object (no trees)
khc = KhiopsClassifier(n_trees=0)

# Train the classifier
khc.fit(X_train, y_train)
Expand Down
23 changes: 21 additions & 2 deletions khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,15 @@ def fit(self, X, y=None, **kwargs):
try:
categorical_target = kwargs.get("categorical_target", True)
dataset = Dataset(X, y, categorical_target=categorical_target, key=self.key)
if not dataset.is_in_memory():
warnings.warn(
deprecation_message(
"File-path dataset sklearn input",
"11.0.0",
"in-memory datasets or khiops.core API",
quote=False,
),
)
self._fit(dataset, computation_dir, **kwargs)
# Undefine any attributes to pass to "not fitted"
except:
Expand Down Expand Up @@ -527,8 +536,18 @@ def _transform_deploy_model(

def _transform_check_dataset(self, dataset):
"""Checks the dataset before deploying a model on them"""
if not dataset.is_in_memory() and self.output_dir is None:
raise ValueError("'output_dir' is not set but dataset is file-based")
if not dataset.is_in_memory():
warnings.warn(
deprecation_message(
"File-path dataset input",
"11.0.0",
"in-memory datasets or khiops.core API",
quote=False,
),
stacklevel=4,
)
if self.output_dir is None:
raise ValueError("'output_dir' is not set but dataset is file-based")

def _transform_deployment_post_process(
self, deployment_dataset, output_table_path, drop_key
Expand Down
9 changes: 0 additions & 9 deletions khiops/sklearn/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,15 +619,6 @@ def _init_tables_from_mapping(self, X, y=None, categorical_target=True):

# Initialize a file dataset
if isinstance(main_table_source, str):
warnings.warn(
deprecation_message(
"File-path dataset input",
"11.0.0",
"dataframe-based dataset or khiops.core API",
quote=False,
),
stacklevel=4,
)
# Obtain the file format parameters
if "format" in X:
self.sep, self.header = X["format"]
Expand Down

0 comments on commit 54486d2

Please sign in to comment.