diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 62e0fee2..ff2a227c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,13 +33,16 @@ repos: rev: 0.29.0 hooks: - id: check-github-workflows + name: gh-workflows args: [--verbose] - id: check-github-actions + name: gh-actions args: [--verbose] - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: 3.0.0 hooks: - id: shellcheck + name: shellcheck - repo: local hooks: - id: samples-generation diff --git a/CHANGELOG.md b/CHANGELOG.md index 98a89962..14269dc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,17 @@ - Example: 10.2.1.4 is the 5th version that supports khiops 10.2.1. - Internals: Changes in *Internals* sections are unlikely to be of interest for data scientists. +## 10.2.2.5 - Unreleased + +### Added + +- (General) `train_test_split_dataset` helper function to ease the splitting in train/test for + multi-table datasets. +- (General) `sort_dataset` helper function to ease the sorting by key of multi-table datasets. + ## 10.2.2.4 - 2024-08-05 -## Added +### Added - (`sklearn`) Sklearn's attributes for supervised estimators. ## 10.2.2.3 - 2024-08-02 diff --git a/doc/convert_samples.py b/doc/convert_samples.py index f1741cf3..7feb6743 100644 --- a/doc/convert_samples.py +++ b/doc/convert_samples.py @@ -67,7 +67,7 @@ def create_rest_page_header(script_name): subtitle += ":py:mod:`khiops.core` module." else: title = "Samples sklearn" - subtitle += ":py:mod:`khiops.sklearn` module." + subtitle += ":py:mod:`khiops.sklearn ` module." return ( ":orphan:\n" "\n" diff --git a/doc/core/index.rst b/doc/core/index.rst index 14987e01..74c33bee 100644 --- a/doc/core/index.rst +++ b/doc/core/index.rst @@ -20,9 +20,9 @@ Main Modules :recursive: :nosignatures: - khiops.core.api - khiops.core.dictionary - khiops.core.analysis_results - khiops.core.coclustering_results - khiops.core.exceptions - khiops.core.helpers + api + dictionary + analysis_results + coclustering_results + exceptions + helpers diff --git a/doc/create-doc b/doc/create-doc index 17b2d363..1de0c1ac 100755 --- a/doc/create-doc +++ b/doc/create-doc @@ -90,21 +90,18 @@ fi # Create the coursework materials echo "Creating ZIP files" -(cd "$KHIOPS_TUTORIAL_REPO_DIR" && cp -r data helper_functions.py "../$tutorials_dir") cd "$tutorials_dir" mkdir -p exercises touch exercises/.dummy # Create a dummy so the "exercises" directory is created on unzip -zip "core_tutorials_solutions.zip" Core*.ipynb helper_functions.py data/*/* exercises/.dummy -zip "sklearn_tutorials_solutions.zip" Sklearn*.ipynb helper_functions.py data/*/* exercises/.dummy +zip "core_tutorials_solutions.zip" Core*.ipynb data/*/* exercises/.dummy +zip "sklearn_tutorials_solutions.zip" Sklearn*.ipynb data/*/* exercises/.dummy cd "$KHIOPS_TUTORIAL_REPO_DIR" python create-coursework.py cd coursework mkdir -p exercises touch exercises/.dummy # Create a dummy so the "exercises" directory is created on unzip -zip "../../$tutorials_dir/core_tutorials.zip" \ - Core*.ipynb helper_functions.py data/*/* exercises/.dummy -zip "../../$tutorials_dir/sklearn_tutorials.zip" \ - Sklearn*.ipynb helper_functions.py data/*/* exercises/.dummy +zip "../../$tutorials_dir/core_tutorials.zip" Core*.ipynb data/*/* exercises/.dummy +zip "../../$tutorials_dir/sklearn_tutorials.zip" Sklearn*.ipynb data/*/* exercises/.dummy cd "../.." 
# Create the documentation with Sphinx diff --git a/doc/internal/index.rst b/doc/internal/index.rst index 743d9615..db8301e0 100644 --- a/doc/internal/index.rst +++ b/doc/internal/index.rst @@ -3,17 +3,23 @@ Internals These are internal modules with no "data science" functionality. Their documentation is available for completeness. +.. currentmodule:: khiops.utils .. autosummary:: :nosignatures: :toctree: generated - khiops.sklearn.tables - khiops.core.internals.common - khiops.core.internals.filesystems - khiops.core.internals.io - khiops.core.internals.runner - khiops.core.internals.scenario - khiops.core.internals.task - khiops.core.internals.types - khiops.core.internals.version + dataset +.. currentmodule:: khiops.core.internals +.. autosummary:: + :nosignatures: + :toctree: generated + + common + filesystems + io + runner + scenario + task + types + version diff --git a/doc/samples/samples.rst b/doc/samples/samples.rst index 4a64ee4a..5f35064b 100644 --- a/doc/samples/samples.rst +++ b/doc/samples/samples.rst @@ -1185,6 +1185,34 @@ Samples output_data_table_path, sort_variables=["AccidentId", "VehicleId"], ) +.. autofunction:: sort_data_tables_mt +.. code-block:: python + + # Imports + import os + from khiops.utils.helpers import sort_dataset + + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "sort_data_tables_mt") + + # Build the dataset spec + ds_spec = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_table_path, "AccidentId"), + "Vehicles": (vehicles_table_path, ["AccidentId", "VehicleId"]), + "Users": (users_table_path, ["AccidentId", "VehicleId"]), + "Places": (places_table_path, "AccidentId"), + }, + } + + # Sort the dataset + sort_dataset(ds_spec, output_dir=results_dir) .. autofunction:: extract_keys_from_data_table .. code-block:: python diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 80df3fe4..b2e7fc87 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -5,7 +5,7 @@ Samples sklearn =============== -The code snippets on this page demonstrate the basic use of the :py:mod:`khiops.sklearn` module. +The code snippets on this page demonstrate the basic use of the :py:mod:`khiops.sklearn ` module. 
Script and Jupyter notebook --------------------------- @@ -152,55 +152,32 @@ Samples import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics - from sklearn.model_selection import train_test_split - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset into pandas dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_train = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main, "AccidentId"), - "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), - }, - } - X_test = { + # Create the dataset spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_test_main, "AccidentId"), - "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split_dataset( + X, y, test_size=0.3, random_state=1 + ) # Train the classifier (by default it analyzes 100 multi-table features) khc = KhiopsClassifier() @@ -224,6 +201,73 @@ Samples test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") +.. autofunction:: khiops_classifier_multitable_star_file +.. 
code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics + + # Create output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file") + os.makedirs(results_dir, exist_ok=True) + + # Create the dataset spec + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + X = { + "main_table": "Accidents", + "tables": { + "Accidents": ( + os.path.join(accidents_data_dir, "Accidents.txt"), + "AccidentId", + ), + "Vehicles": ( + os.path.join(accidents_data_dir, "Vehicles.txt"), + ["AccidentId", "VehicleId"], + ), + }, + "format": ("\t", True), + } + + # Split the dataset into train and test + X_train, X_test = train_test_split_dataset( + X, output_dir=os.path.join(results_dir, "split"), test_size=0.3 + ) + + # Create the classifier and fit it + khc = KhiopsClassifier(output_dir=results_dir) + khc.fit(X_train, y="Gravity") + + # Predict the class in addition to the class probabilities on the test dataset + y_test_pred_path = khc.predict(X_test) + y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") + print("Predicted classes (first 10):") + print(y_test_pred["PredictedGravity"].head(10)) + print("---") + + y_test_probas_path = khc.predict_proba(X_test) + y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") + proba_columns = [col for col in y_test_probas if col.startswith("Prob")] + print("Predicted class probabilities (first 10):") + print(y_test_probas[proba_columns].head(10)) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + # Note: For roc_auc_score we have to use the "greatest" label which is "NonLethal" + y_test = pd.read_csv( + X_test["tables"]["Accidents"][0], + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ) + test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) + test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityNonLethal"]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_multitable_snowflake .. 
code-block:: python @@ -232,29 +276,31 @@ Samples import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics # Load the dataset tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) places_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Places.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" ) - # Build the multitable input X - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column + + # Create the dataset spec + # Note: We discard the "Gravity" column from the "Users" table to avoid a target + # leak. This is because the column was used to build the target. X = { "main_table": "Accidents", "tables": { @@ -270,16 +316,22 @@ Samples ], } - # Load the target variable from the AccidentsSummary dataset + # Load the target variable "Gravity" from the "AccidentsSummary" dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series + + # Split into train and test datasets + X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) # Train the classifier (by default it creates 1000 multi-table features) khc = KhiopsClassifier(n_trees=0) - khc.fit(X, y) + khc.fit(X_train, y_train) # Show the feature importance info print(f"Features evaluated: {khc.n_features_evaluated_}") @@ -290,23 +342,23 @@ Samples print("---") # Predict the class on the test dataset - y_pred = khc.predict(X) + y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") - print(y_pred[:10]) + print(y_test_pred[:10]) print("---") - # Predict the class probability on the train dataset - y_probas = khc.predict_proba(X) + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) print(f"Class order: {khc.classes_}") print("Predicted class probabilities (first 10):") - print(y_probas[:10]) + print(y_test_probas[:10]) print("---") - # Evaluate accuracy and auc metrics on the train dataset - train_accuracy = metrics.accuracy_score(y_pred, y) - train_auc = metrics.roc_auc_score(y, y_probas[:, 1]) - print(f"Train accuracy = {train_accuracy}") - print(f"Train auc = {train_auc}") + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test_pred, y_test) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_sparse .. 
code-block:: python @@ -365,20 +417,9 @@ Samples # Imports import os - import pandas as pd import pickle - from khiops import core as kh from khiops.sklearn import KhiopsClassifier - - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop(["Class"], axis=1) - y = iris_df["Class"] - khc = KhiopsClassifier() - khc.fit(X, y) + from sklearn.datasets import load_iris # Create/clean the output directory results_dir = os.path.join("kh_samples", "khiops_classifier_pickle") @@ -388,9 +429,14 @@ Samples else: os.makedirs(results_dir, exist_ok=True) + # Train the model with the Iris dataset + X, y = load_iris(return_X_y=True) + khc = KhiopsClassifier() + khc.fit(X, y) + # Pickle its content to a file - with open(khc_pickle_path, "wb") as khc_pickle_write_file: - pickle.dump(khc, khc_pickle_write_file) + with open(khc_pickle_path, "wb") as khc_pickle_output_file: + pickle.dump(khc, khc_pickle_output_file) # Unpickle it with open(khc_pickle_path, "rb") as khc_pickle_file: @@ -413,22 +459,14 @@ Samples from sklearn import metrics from sklearn.model_selection import train_test_split - # Load the dataset into a pandas dataframe + # Load the "Adult" dataset and set the target to the "age" column adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("age", axis=1) + y = adult_df["age"] # Split the whole dataframe into train and test (40%-60% for speed) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.6, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("age" column) - X_train = adult_train_df.drop("age", axis=1) - X_test = adult_test_df.drop("age", axis=1) - y_train = adult_train_df["age"] - y_test = adult_test_df["age"] + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1) # Create the regressor object khr = KhiopsRegressor() @@ -459,21 +497,14 @@ Samples .. 
code-block:: python # Imports - import os - import pandas as pd - from khiops import core as kh from khiops.sklearn import KhiopsEncoder + from sklearn.datasets import load_iris - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop("Class", axis=1) - y = iris_df["Class"] + # Load the dataset + X, y = load_iris(return_X_y=True) # Create the encoder object - khe = KhiopsEncoder() + khe = KhiopsEncoder(transform_type_numerical="part_label") khe.fit(X, y) # Transform the training dataset @@ -481,7 +512,7 @@ Samples # Print both the original and transformed features print("Original:") - print(X.head(10)) + print(X[:10]) print("---") print("Encoded feature names:") print(khe.feature_names_out_) @@ -497,42 +528,34 @@ Samples from khiops import core as kh from khiops.sklearn import KhiopsEncoder - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset tables into dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { + # Build the multi-table spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] - # Create the KhiopsEncoder with 10 additional multitable features and fit it + # Create the KhiopsEncoder with 10 multitable features and fit it khe = KhiopsEncoder(n_features=10) - khe.fit(X_dataset, y) + khe.fit(X, y) # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) print("Encoded data:") - print(khe.transform(X_dataset)[:10]) + print(khe.transform(X)[:10]) .. autofunction:: khiops_encoder_multitable_snowflake .. 
code-block:: python @@ -543,33 +566,38 @@ Samples from khiops.sklearn import KhiopsEncoder # Load the tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + places_df = pd.read_csv( + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" + ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) - # Build the multitable input X + # Build the multi-table spec # Note: We discard the "Gravity" field from the "Users" table as it was used to # build the target column X = { "main_table": "Accidents", "tables": { "Accidents": (accidents_df, "AccidentId"), + "Places": (places_df, "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), }, "relations": [ ("Accidents", "Vehicles"), + ("Accidents", "Places", True), ("Vehicles", "Users"), ], } @@ -577,9 +605,12 @@ Samples # Load the target variable from the AccidentsSummary dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder(n_features=10) @@ -612,22 +643,14 @@ Samples from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder - # Load the dataset into a pandas dataframe + # Load the dataset into dataframes adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] - # Split the whole dataframe into train and test (70%-30%) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.3, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("class" column) - X_train = adult_train_df.drop("class", axis=1) - X_test = adult_test_df.drop("class", axis=1) - y_train = adult_train_df["class"] - y_test = adult_test_df["class"] + # Split the dataset into train and test (70%-30%) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # Create the pipeline and fit it. 
Steps: # - The khiops supervised column encoder, generates a full-categorical table @@ -638,8 +661,6 @@ Samples ( "onehot_enc", ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)), - # For sklearn < 1.2, use - # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)), ), ("hgb_clf", HistGradientBoostingClassifier()), ] @@ -674,13 +695,13 @@ Samples from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") - splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_dna_df = pd.read_csv( + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) - X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) + X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1) # Create the KhiopsCoclustering instance khcc = KhiopsCoclustering() @@ -704,9 +725,9 @@ Samples from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) @@ -738,32 +759,24 @@ Samples from sklearn.model_selection import train_test_split # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + X = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() + X_train_ids = X_train["AccidentId"].to_frame() + X_test_ids = X_test["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") @@ -771,16 +784,16 @@ Samples khc = KhiopsClassifier(key="AccidentId") # Train the classifier - khc.fit([X_train_main, 
X_train_secondary], y_train) + khc.fit([X_train, X_train_secondary], y_train) # Predict the class on the test dataset - y_test_pred = khc.predict([X_test_main, X_test_secondary]) + y_test_pred = khc.predict([X_test, X_test_secondary]) print("Predicted classes (first 10):") print(y_test_pred[:10]) print("---") # Predict the class probability on the test dataset - y_test_probas = khc.predict_proba([X_test_main, X_test_secondary]) + y_test_probas = khc.predict_proba([X_test, X_test_secondary]) print("Predicted class probabilities (first 10):") print(y_test_probas[:10]) print("---") @@ -790,102 +803,3 @@ Samples test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") -.. autofunction:: khiops_classifier_multitable_star_file -.. code-block:: python - - # Imports - import os - import pandas as pd - from khiops import core as kh - from khiops.sklearn import KhiopsClassifier - from sklearn import metrics - from sklearn.model_selection import train_test_split - - # Create output directory - results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_file") - if not os.path.exists("kh_samples"): - os.mkdir("kh_samples") - os.mkdir(results_dir) - else: - if not os.path.exists(results_dir): - os.mkdir(results_dir) - - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - X_train_main, X_test_main = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Write the train and test dataset sets to disk - # For the test file we remove the target column from the main table - X_train_main_path = os.path.join(results_dir, "X_train_main.txt") - X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False) - X_train_secondary_path = os.path.join(results_dir, "X_train_secondary.txt") - X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False) - X_test_main_path = os.path.join(results_dir, "X_test_main.txt") - y_test = X_test_main.sort_values("AccidentId")["Gravity"] - X_test_main.drop(columns="Gravity").to_csv( - X_test_main_path, sep="\t", header=True, index=False - ) - X_test_secondary_path = os.path.join(results_dir, "X_test_secondary.txt") - X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False) - - # Define the dictionary of train - X_train_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main_path, "AccidentId"), - "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - X_test_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_test_main_path, "AccidentId"), - "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - - # Create the classifier and 
fit it - khc = KhiopsClassifier(output_dir=results_dir) - khc.fit(X_train_dataset, y="Gravity") - - # Predict the class in addition to the class probabilities on the test dataset - y_test_pred_path = khc.predict(X_test_dataset) - y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") - print("Predicted classes (first 10):") - print(y_test_pred["PredictedGravity"].head(10)) - print("---") - - y_test_probas_path = khc.predict_proba(X_test_dataset) - y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") - proba_columns = [col for col in y_test_probas if col.startswith("Prob")] - print("Predicted class probabilities (first 10):") - print(y_test_probas[proba_columns].head(10)) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) - test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") diff --git a/doc/sklearn/index.rst b/doc/sklearn/index.rst index e5f05932..27e3dcf7 100644 --- a/doc/sklearn/index.rst +++ b/doc/sklearn/index.rst @@ -8,13 +8,13 @@ khiops.sklearn from khiops.sklearn import KhiopsClassifier clf = KhiopsClassifier() -.. currentmodule:: khiops +.. currentmodule:: khiops.sklearn .. autosummary:: :toctree: generated :recursive: :nosignatures: - khiops.sklearn.estimators + estimators Related Docs ------------ diff --git a/doc/tools/index.rst b/doc/tools/index.rst index 4dbdf1f3..162cc3be 100644 --- a/doc/tools/index.rst +++ b/doc/tools/index.rst @@ -7,4 +7,5 @@ These are auxiliary tools for the Khiops Python library. :toctree: generated :nosignatures: - khiops.tools + utils.helpers + tools diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index e15b8f37..d29adfa2 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -72,7 +72,7 @@ def get_dir_status(a_dir): return status -def check_samples_dir(samples_dir): +def _check_samples_dir(samples_dir): # Warn if there are problems with the samples_dir samples_dir_status = get_dir_status(samples_dir) download_msg = ( @@ -295,20 +295,15 @@ def _check_executable(bin_path): def get_linux_distribution_name(): - """Detect Linux distribution name + """Detect the Linux distribution name - Parses the `NAME` variable defined in the `/etc/os-release` or - `/usr/lib/os-release` files and converts it to lowercase. + Parses the ``NAME`` variable defined in the ``/etc/os-release`` or + ``/usr/lib/os-release`` files and converts it to lowercase. 
Returns ------- str Name of the Linux distribution, converted to lowercase - - Raises - ------ - OSError - If neither `/etc/os-release` nor `/usr/lib/os-release` are found """ def get_linux_distribution_from_os_release_file(os_release_file_path): @@ -1530,13 +1525,13 @@ def _tool_path(self, tool_name): def _set_samples_dir(self, samples_dir): """Checks and sets the samples directory""" - check_samples_dir(samples_dir) + _check_samples_dir(samples_dir) super()._set_samples_dir(samples_dir) def _get_samples_dir(self): # Check the samples dir once (the check emits only warnings) if not self._samples_dir_checked: - check_samples_dir(self._samples_dir) + _check_samples_dir(self._samples_dir) self._samples_dir_checked = True return self._samples_dir diff --git a/khiops/samples/samples.ipynb b/khiops/samples/samples.ipynb index 53b2df7b..bccde760 100644 --- a/khiops/samples/samples.ipynb +++ b/khiops/samples/samples.ipynb @@ -1563,6 +1563,47 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `sort_data_tables_mt()`\n\n", + "Sorts a multi-table dataset by its default keys using the dedicated helper\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "from khiops.utils.helpers import sort_dataset\n", + "\n", + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "users_table_path = os.path.join(accidents_dir, \"Users.txt\")\n", + "places_table_path = os.path.join(accidents_dir, \"Places.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"sort_data_tables_mt\")\n", + "\n", + "# Build the dataset spec\n", + "ds_spec = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (accidents_table_path, \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_table_path, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Users\": (users_table_path, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Places\": (places_table_path, \"AccidentId\"),\n", + " },\n", + "}\n", + "\n", + "# Sort the dataset\n", + "sort_dataset(ds_spec, output_dir=results_dir)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/khiops/samples/samples.py b/khiops/samples/samples.py index 0ca90133..da8b6288 100644 --- a/khiops/samples/samples.py +++ b/khiops/samples/samples.py @@ -1329,6 +1329,35 @@ def sort_data_table_expert(): ) +def sort_data_tables_mt(): + """Sorts a multi-table dataset by its default keys using the dedicated helper""" + # Imports + import os + from khiops.utils.helpers import sort_dataset + + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "sort_data_tables_mt") + + # Build the dataset spec + ds_spec = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_table_path, "AccidentId"), + "Vehicles": (vehicles_table_path, ["AccidentId", "VehicleId"]), + "Users": (users_table_path, ["AccidentId", "VehicleId"]), + "Places": (places_table_path, "AccidentId"), + }, + } + + # Sort the dataset + sort_dataset(ds_spec, 
output_dir=results_dir) + + def extract_keys_from_data_table(): """Extracts the keys from a database @@ -1662,6 +1691,7 @@ def build_deployed_dictionary(): deploy_regressor_for_metrics, sort_data_table, sort_data_table_expert, + sort_data_tables_mt, extract_keys_from_data_table, train_coclustering, simplify_coclustering, diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 2aa029fb..3c94880a 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -164,55 +164,32 @@ "import pandas as pd\n", "from khiops import core as kh\n", "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", "from sklearn import metrics\n", - "from sklearn.model_selection import train_test_split\n", "\n", - "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "# Load the dataset into pandas dataframes\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", - "# Split the root dataframe into train and test\n", - "accidents_train_df, accidents_test_df = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Obtain the main X feature table and the y target vector (\"Class\" column)\n", - "y_train = accidents_train_df[\"Gravity\"]\n", - "y_test = accidents_test_df[\"Gravity\"]\n", - "X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n", - "X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n", - "\n", - "# Load the secondary table of the dataset into a pandas dataframe\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", - "\n", - "# Split the secondary dataframe with the keys of the splitted root dataframe\n", - "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", - "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "\n", - "# Create the dataset multitable specification for the train/test split\n", - "# We specify each table with a name and a tuple (dataframe, key_columns)\n", - "X_train = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_train_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_train_secondary, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - "}\n", - "X_test = {\n", + "# Create the dataset spec and the target\n", + "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (X_test_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_test_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " },\n", "}\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split_dataset(\n", + " X, y, test_size=0.3, random_state=1\n", + ")\n", "\n", "# Train 
the classifier (by default it analyzes 100 multi-table features)\n", "khc = KhiopsClassifier()\n", @@ -238,12 +215,92 @@ "print(f\"Test auc = {test_auc}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_star_file()`\n\n", + "Trains a `.KhiopsClassifier` with a file dataset\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", + "from sklearn import metrics\n", + "\n", + "# Create output directory\n", + "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_star_file\")\n", + "os.makedirs(results_dir, exist_ok=True)\n", + "\n", + "# Create the dataset spec\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "X = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " \"AccidentId\",\n", + " ),\n", + " \"Vehicles\": (\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", + " [\"AccidentId\", \"VehicleId\"],\n", + " ),\n", + " },\n", + " \"format\": (\"\\t\", True),\n", + "}\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test = train_test_split_dataset(\n", + " X, output_dir=os.path.join(results_dir, \"split\"), test_size=0.3\n", + ")\n", + "\n", + "# Create the classifier and fit it\n", + "khc = KhiopsClassifier(output_dir=results_dir)\n", + "khc.fit(X_train, y=\"Gravity\")\n", + "\n", + "# Predict the class in addition to the class probabilities on the test dataset\n", + "y_test_pred_path = khc.predict(X_test)\n", + "y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[\"PredictedGravity\"].head(10))\n", + "print(\"---\")\n", + "\n", + "y_test_probas_path = khc.predict_proba(X_test)\n", + "y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", + "proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[proba_columns].head(10))\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "# Note: For roc_auc_score we have to use the \"greatest\" label which is \"NonLethal\"\n", + "y_test = pd.read_csv(\n", + " X_test[\"tables\"][\"Accidents\"][0],\n", + " usecols=[\"Gravity\"],\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityNonLethal\"])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "### `khiops_classifier_multitable_snowflake()`\n\n", - "Trains a `.KhiopsClassifier` on a snowflake multi-table dataset\n\n .. note::\n For simplicity we train from the whole dataset. 
To assess the performance one\n usually splits the dataset into train and test subsets.\n\n \n" + "Trains a `.KhiopsClassifier` on a snowflake multi-table dataset\n" ] }, { @@ -257,29 +314,31 @@ "import pandas as pd\n", "from khiops import core as kh\n", "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", "from sklearn import metrics\n", "\n", "# Load the dataset tables into dataframes\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", "users_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", ")\n", "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"),\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", "places_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", ")\n", - "# Build the multitable input X\n", - "# Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", - "# build the target column\n", + "\n", + "# Create the dataset spec\n", + "# Note: We discard the \"Gravity\" column from the \"Users\" table to avoid a target\n", + "# leak. 
This is because the column was used to build the target.\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", @@ -295,16 +354,22 @@ " ],\n", "}\n", "\n", - "# Load the target variable from the AccidentsSummary dataset\n", + "# Load the target variable \"Gravity\" from the \"AccidentsSummary\" dataset\n", "y = pd.read_csv(\n", " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", + " usecols=[\"Gravity\"],\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", - ")[\"Gravity\"]\n", + ").squeeze(\n", + " \"columns\"\n", + ") # squeeze to ensure pandas.Series\n", + "\n", + "# Split into train and test datasets\n", + "X_train, X_test, y_train, y_test = train_test_split_dataset(X, y)\n", "\n", "# Train the classifier (by default it creates 1000 multi-table features)\n", "khc = KhiopsClassifier(n_trees=0)\n", - "khc.fit(X, y)\n", + "khc.fit(X_train, y_train)\n", "\n", "# Show the feature importance info\n", "print(f\"Features evaluated: {khc.n_features_evaluated_}\")\n", @@ -315,23 +380,23 @@ "print(\"---\")\n", "\n", "# Predict the class on the test dataset\n", - "y_pred = khc.predict(X)\n", + "y_test_pred = khc.predict(X_test)\n", "print(\"Predicted classes (first 10):\")\n", - "print(y_pred[:10])\n", + "print(y_test_pred[:10])\n", "print(\"---\")\n", "\n", - "# Predict the class probability on the train dataset\n", - "y_probas = khc.predict_proba(X)\n", + "# Predict the class probability on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", "print(f\"Class order: {khc.classes_}\")\n", "print(\"Predicted class probabilities (first 10):\")\n", - "print(y_probas[:10])\n", + "print(y_test_probas[:10])\n", "print(\"---\")\n", "\n", - "# Evaluate accuracy and auc metrics on the train dataset\n", - "train_accuracy = metrics.accuracy_score(y_pred, y)\n", - "train_auc = metrics.roc_auc_score(y, y_probas[:, 1])\n", - "print(f\"Train accuracy = {train_accuracy}\")\n", - "print(f\"Train auc = {train_auc}\")" + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test_pred, y_test)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" ] }, { @@ -416,20 +481,9 @@ "source": [ "# Imports\n", "import os\n", - "import pandas as pd\n", "import pickle\n", - "from khiops import core as kh\n", "from khiops.sklearn import KhiopsClassifier\n", - "\n", - "# Load the dataset into a pandas dataframe\n", - "iris_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - "iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", - "\n", - "# Train the model with the whole dataset\n", - "X = iris_df.drop([\"Class\"], axis=1)\n", - "y = iris_df[\"Class\"]\n", - "khc = KhiopsClassifier()\n", - "khc.fit(X, y)\n", + "from sklearn.datasets import load_iris\n", "\n", "# Create/clean the output directory\n", "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_pickle\")\n", @@ -439,9 +493,14 @@ "else:\n", " os.makedirs(results_dir, exist_ok=True)\n", "\n", + "# Train the model with the Iris dataset\n", + "X, y = load_iris(return_X_y=True)\n", + "khc = KhiopsClassifier()\n", + "khc.fit(X, y)\n", + "\n", "# Pickle its content to a file\n", - "with open(khc_pickle_path, \"wb\") as khc_pickle_write_file:\n", - " pickle.dump(khc, khc_pickle_write_file)\n", + "with open(khc_pickle_path, \"wb\") as khc_pickle_output_file:\n", + " pickle.dump(khc, khc_pickle_output_file)\n", "\n", "# Unpickle it\n", 
"with open(khc_pickle_path, \"rb\") as khc_pickle_file:\n", @@ -477,22 +536,14 @@ "from sklearn import metrics\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# Load the dataset into a pandas dataframe\n", + "# Load the \"Adult\" dataset and set the target to the \"age\" column\n", "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "X = adult_df.drop(\"age\", axis=1)\n", + "y = adult_df[\"age\"]\n", "\n", "# Split the whole dataframe into train and test (40%-60% for speed)\n", - "adult_train_df, adult_test_df = train_test_split(\n", - " adult_df, test_size=0.6, random_state=1\n", - ")\n", - "\n", - "# Split the dataset into:\n", - "# - the X feature table\n", - "# - the y target vector (\"age\" column)\n", - "X_train = adult_train_df.drop(\"age\", axis=1)\n", - "X_test = adult_test_df.drop(\"age\", axis=1)\n", - "y_train = adult_train_df[\"age\"]\n", - "y_test = adult_test_df[\"age\"]\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)\n", "\n", "# Create the regressor object\n", "khr = KhiopsRegressor()\n", @@ -536,21 +587,14 @@ "outputs": [], "source": [ "# Imports\n", - "import os\n", - "import pandas as pd\n", - "from khiops import core as kh\n", "from khiops.sklearn import KhiopsEncoder\n", + "from sklearn.datasets import load_iris\n", "\n", - "# Load the dataset into a pandas dataframe\n", - "iris_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - "iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", - "\n", - "# Train the model with the whole dataset\n", - "X = iris_df.drop(\"Class\", axis=1)\n", - "y = iris_df[\"Class\"]\n", + "# Load the dataset\n", + "X, y = load_iris(return_X_y=True)\n", "\n", "# Create the encoder object\n", - "khe = KhiopsEncoder()\n", + "khe = KhiopsEncoder(transform_type_numerical=\"part_label\")\n", "khe.fit(X, y)\n", "\n", "# Transform the training dataset\n", @@ -558,7 +602,7 @@ "\n", "# Print both the original and transformed features\n", "print(\"Original:\")\n", - "print(X.head(10))\n", + "print(X[:10])\n", "print(\"---\")\n", "print(\"Encoded feature names:\")\n", "print(khe.feature_names_out_)\n", @@ -587,42 +631,34 @@ "from khiops import core as kh\n", "from khiops.sklearn import KhiopsEncoder\n", "\n", - "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "# Load the dataset tables into dataframe\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", - "# Obtain the root X feature table and the y target vector (\"Class\" column)\n", - "X_main = accidents_df.drop(\"Gravity\", axis=1)\n", - "y = accidents_df[\"Gravity\"]\n", - "\n", - "# Load the secondary table of the dataset into a pandas dataframe\n", - "X_secondary = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", - "\n", - "# Create the dataset multitable specification for the train/test split\n", - "# We specify each table with a name and a tuple (dataframe, key_columns)\n", - "X_dataset = {\n", + "# Build the multi-table spec and 
the target\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (X_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " },\n", "}\n", + "y = accidents_df[\"Gravity\"]\n", "\n", - "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", + "# Create the KhiopsEncoder with 10 multitable features and fit it\n", "khe = KhiopsEncoder(n_features=10)\n", - "khe.fit(X_dataset, y)\n", + "khe.fit(X, y)\n", "\n", "# Transform the train dataset\n", "print(\"Encoded feature names:\")\n", "print(khe.feature_names_out_)\n", "print(\"Encoded data:\")\n", - "print(khe.transform(X_dataset)[:10])" + "print(khe.transform(X)[:10])" ] }, { @@ -630,7 +666,7 @@ "metadata": {}, "source": [ "### `khiops_encoder_multitable_snowflake()`\n\n", - "Trains a `.KhiopsEncoder` on a snowflake multi-table dataset\n\n .. note::\n For simplicity we train from the whole dataset. To assess the performance\n one usually splits the dataset into train and test subsets.\n \n" + "Trains a `.KhiopsEncoder` on a snowflake multi-table dataset\n" ] }, { @@ -646,33 +682,38 @@ "from khiops.sklearn import KhiopsEncoder\n", "\n", "# Load the tables into dataframes\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "places_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + ")\n", "users_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", ")\n", "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"),\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", "\n", - "# Build the multitable input X\n", + "# Build the multi-table spec\n", "# Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", "# build the target column\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", " \"Accidents\": (accidents_df, \"AccidentId\"),\n", + " \"Places\": (places_df, \"AccidentId\"),\n", " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", " },\n", " \"relations\": [\n", " (\"Accidents\", \"Vehicles\"),\n", + " (\"Accidents\", \"Places\", True),\n", " (\"Vehicles\", \"Users\"),\n", " ],\n", "}\n", @@ -680,9 +721,12 @@ "# Load the target variable from the AccidentsSummary dataset\n", "y = pd.read_csv(\n", " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", + " usecols=[\"Gravity\"],\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", - ")[\"Gravity\"]\n", + ").squeeze(\n", + " \"columns\"\n", + ") # squeeze to ensure pandas.Series\n", "\n", "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", "khe = KhiopsEncoder(n_features=10)\n", @@ -707,7 +751,7 @@ "metadata": {}, 
"source": [ "### `khiops_encoder_pipeline_with_hgbc()`\n\n", - "Chains a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`\n" + "Uses a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`\n" ] }, { @@ -728,22 +772,14 @@ "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", - "# Load the dataset into a pandas dataframe\n", + "# Load the dataset into dataframes\n", "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "X = adult_df.drop(\"class\", axis=1)\n", + "y = adult_df[\"class\"]\n", "\n", - "# Split the whole dataframe into train and test (70%-30%)\n", - "adult_train_df, adult_test_df = train_test_split(\n", - " adult_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Split the dataset into:\n", - "# - the X feature table\n", - "# - the y target vector (\"class\" column)\n", - "X_train = adult_train_df.drop(\"class\", axis=1)\n", - "X_test = adult_test_df.drop(\"class\", axis=1)\n", - "y_train = adult_train_df[\"class\"]\n", - "y_test = adult_test_df[\"class\"]\n", + "# Split the dataset into train and test (70%-30%)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", "\n", "# Create the pipeline and fit it. Steps:\n", "# - The khiops supervised column encoder, generates a full-categorical table\n", @@ -754,8 +790,6 @@ " (\n", " \"onehot_enc\",\n", " ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)),\n", - " # For sklearn < 1.2, use\n", - " # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)),\n", " ),\n", " (\"hgb_clf\", HistGradientBoostingClassifier()),\n", "]\n", @@ -803,13 +837,13 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Load the secondary table of the dataset into a pandas dataframe\n", - "splice_dataset_path = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - "splice_dna_X = pd.read_csv(\n", - " os.path.join(splice_dataset_path, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", + "splice_data_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "splice_dna_df = pd.read_csv(\n", + " os.path.join(splice_data_dir, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", ")\n", "\n", "# Train with only 70% of data (for speed in this example)\n", - "X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1)\n", + "X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1)\n", "\n", "# Create the KhiopsCoclustering instance\n", "khcc = KhiopsCoclustering()\n", @@ -846,9 +880,9 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Load the secondary table of the dataset into a pandas dataframe\n", - "splice_dataset_path = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "splice_data_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", "splice_dna_X = pd.read_csv(\n", - " os.path.join(splice_dataset_path, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", + " os.path.join(splice_data_dir, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", ")\n", "\n", "# Train with only 70% of data (for speed in this example)\n", @@ -893,32 +927,24 @@ "from sklearn.model_selection import train_test_split\n", "\n", "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", "accidents_df = 
pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", " encoding=\"latin1\",\n", ")\n", + "X = accidents_df.drop(\"Gravity\", axis=1)\n", + "y = accidents_df[\"Gravity\"]\n", "\n", - "# Split the root dataframe into train and test\n", - "accidents_train_df, accidents_test_df = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Obtain the main X feature table and the y target vector (\"Class\" column)\n", - "y_train = accidents_train_df[\"Gravity\"]\n", - "y_test = accidents_test_df[\"Gravity\"]\n", - "X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n", - "X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n", + "# Split the dataset into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", "\n", "# Load the secondary table of the dataset into a pandas dataframe\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", "# Split the secondary dataframe with the keys of the splitted root dataframe\n", - "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", + "X_train_ids = X_train[\"AccidentId\"].to_frame()\n", + "X_test_ids = X_test[\"AccidentId\"].to_frame()\n", "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", "\n", @@ -926,16 +952,16 @@ "khc = KhiopsClassifier(key=\"AccidentId\")\n", "\n", "# Train the classifier\n", - "khc.fit([X_train_main, X_train_secondary], y_train)\n", + "khc.fit([X_train, X_train_secondary], y_train)\n", "\n", "# Predict the class on the test dataset\n", - "y_test_pred = khc.predict([X_test_main, X_test_secondary])\n", + "y_test_pred = khc.predict([X_test, X_test_secondary])\n", "print(\"Predicted classes (first 10):\")\n", "print(y_test_pred[:10])\n", "print(\"---\")\n", "\n", "# Predict the class probability on the test dataset\n", - "y_test_probas = khc.predict_proba([X_test_main, X_test_secondary])\n", + "y_test_probas = khc.predict_proba([X_test, X_test_secondary])\n", "print(\"Predicted class probabilities (first 10):\")\n", "print(y_test_probas[:10])\n", "print(\"---\")\n", @@ -946,118 +972,6 @@ "print(f\"Test accuracy = {test_accuracy}\")\n", "print(f\"Test auc = {test_auc}\")" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `khiops_classifier_multitable_star_file()`\n\n", - "Trains a `.KhiopsClassifier` with a file path based dataset\n\n .. 
warning::\n This dataset input method is **Deprecated** and will be removed in Khiops 11.\n If you need to handle large datasets that do not easily fit into memory then you\n may use the `~.khiops.core` API directly, which allows to specify file paths\n directly.\n \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports\n", - "import os\n", - "import pandas as pd\n", - "from khiops import core as kh\n", - "from khiops.sklearn import KhiopsClassifier\n", - "from sklearn import metrics\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Create output directory\n", - "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_file\")\n", - "if not os.path.exists(\"kh_samples\"):\n", - " os.mkdir(\"kh_samples\")\n", - " os.mkdir(results_dir)\n", - "else:\n", - " if not os.path.exists(results_dir):\n", - " os.mkdir(results_dir)\n", - "\n", - "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ")\n", - "\n", - "# Split the root dataframe into train and test\n", - "X_train_main, X_test_main = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - ")\n", - "\n", - "# Load the secondary table of the dataset into a pandas dataframe\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", - "\n", - "# Split the secondary dataframe with the keys of the splitted root dataframe\n", - "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", - "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "\n", - "# Write the train and test dataset sets to disk\n", - "# For the test file we remove the target column from the main table\n", - "X_train_main_path = os.path.join(results_dir, \"X_train_main.txt\")\n", - "X_train_main.to_csv(X_train_main_path, sep=\"\\t\", header=True, index=False)\n", - "X_train_secondary_path = os.path.join(results_dir, \"X_train_secondary.txt\")\n", - "X_train_secondary.to_csv(X_train_secondary_path, sep=\"\\t\", header=True, index=False)\n", - "X_test_main_path = os.path.join(results_dir, \"X_test_main.txt\")\n", - "y_test = X_test_main.sort_values(\"AccidentId\")[\"Gravity\"]\n", - "X_test_main.drop(columns=\"Gravity\").to_csv(\n", - " X_test_main_path, sep=\"\\t\", header=True, index=False\n", - ")\n", - "X_test_secondary_path = os.path.join(results_dir, \"X_test_secondary.txt\")\n", - "X_test_secondary.to_csv(X_test_secondary_path, sep=\"\\t\", header=True, index=False)\n", - "\n", - "# Define the dictionary of train\n", - "X_train_dataset = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_train_main_path, \"AccidentId\"),\n", - " \"Vehicles\": (X_train_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " \"format\": (\"\\t\", True),\n", - "}\n", - "X_test_dataset = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_test_main_path, \"AccidentId\"),\n", - " \"Vehicles\": (X_test_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " \"format\": (\"\\t\", True),\n", - "}\n", 
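The file-path dataset spec above is exactly the input style that the new `train_test_split_dataset` helper (used in the samples added further down in this diff) splits directly, instead of splitting and writing the root and secondary tables by hand. A minimal sketch of that call, assuming `X` is a file-based spec shaped like the one above and using an illustrative output directory:

from khiops.utils.helpers import train_test_split_dataset

# For file-based specs the split tables are written under output_dir and two
# file-based specs are returned; the target column stays in the main table and
# is later passed to fit() by name, e.g. khc.fit(X_train, y="Gravity").
X_train, X_test = train_test_split_dataset(
    X, output_dir="kh_samples/split", test_size=0.3
)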
- "\n", - "# Create the classifier and fit it\n", - "khc = KhiopsClassifier(output_dir=results_dir)\n", - "khc.fit(X_train_dataset, y=\"Gravity\")\n", - "\n", - "# Predict the class in addition to the class probabilities on the test dataset\n", - "y_test_pred_path = khc.predict(X_test_dataset)\n", - "y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", - "print(\"Predicted classes (first 10):\")\n", - "print(y_test_pred[\"PredictedGravity\"].head(10))\n", - "print(\"---\")\n", - "\n", - "y_test_probas_path = khc.predict_proba(X_test_dataset)\n", - "y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", - "proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", - "print(\"Predicted class probabilities (first 10):\")\n", - "print(y_test_probas[proba_columns].head(10))\n", - "print(\"---\")\n", - "\n", - "# Evaluate accuracy and auc metrics on the test dataset\n", - "test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", - "test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityLethal\"])\n", - "print(f\"Test accuracy = {test_accuracy}\")\n", - "print(f\"Test auc = {test_auc}\")" - ] } ], "metadata": {}, diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 976db54c..8b22fb5e 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -16,7 +16,7 @@ # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: -# pylint --disable=all --enable=invalid-names estimators.py +# pylint --disable=all --enable=invalid-names samples_sklearn.py # pylint: disable=invalid-name # For ease of use the functions in this module contain (repeated) import statements @@ -145,55 +145,34 @@ def khiops_classifier_multitable_star(): import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics - from sklearn.model_selection import train_test_split - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset into pandas dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) - - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - - # Load the secondary table of the dataset into a pandas dataframe vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Create the dataset multitable specification for 
the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_train = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main, "AccidentId"), - "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), - }, - } - X_test = { + # Create the dataset spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_test_main, "AccidentId"), - "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split_dataset( + X, y, test_size=0.3, random_state=1 + ) # Train the classifier (by default it analyzes 100 multi-table features) khc = KhiopsClassifier() @@ -219,42 +198,106 @@ def khiops_classifier_multitable_star(): print(f"Test auc = {test_auc}") -def khiops_classifier_multitable_snowflake(): - """Trains a `.KhiopsClassifier` on a snowflake multi-table dataset +def khiops_classifier_multitable_star_file(): + """Trains a `.KhiopsClassifier` with a file dataset""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics - .. note:: - For simplicity we train from the whole dataset. To assess the performance one - usually splits the dataset into train and test subsets. + # Create output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file") + os.makedirs(results_dir, exist_ok=True) - """ + # Create the dataset spec + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + X = { + "main_table": "Accidents", + "tables": { + "Accidents": ( + os.path.join(accidents_data_dir, "Accidents.txt"), + "AccidentId", + ), + "Vehicles": ( + os.path.join(accidents_data_dir, "Vehicles.txt"), + ["AccidentId", "VehicleId"], + ), + }, + "format": ("\t", True), + } + + # Split the dataset into train and test + X_train, X_test = train_test_split_dataset( + X, output_dir=os.path.join(results_dir, "split"), test_size=0.3 + ) + + # Create the classifier and fit it + khc = KhiopsClassifier(output_dir=results_dir) + khc.fit(X_train, y="Gravity") + + # Predict the class in addition to the class probabilities on the test dataset + y_test_pred_path = khc.predict(X_test) + y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") + print("Predicted classes (first 10):") + print(y_test_pred["PredictedGravity"].head(10)) + print("---") + + y_test_probas_path = khc.predict_proba(X_test) + y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") + proba_columns = [col for col in y_test_probas if col.startswith("Prob")] + print("Predicted class probabilities (first 10):") + print(y_test_probas[proba_columns].head(10)) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + # Note: For roc_auc_score we have to use the "greatest" label which is "NonLethal" + y_test = pd.read_csv( + X_test["tables"]["Accidents"][0], + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ) + test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) + test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityNonLethal"]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") + + +def khiops_classifier_multitable_snowflake(): + """Trains 
a `.KhiopsClassifier` on a snowflake multi-table dataset""" # Imports import os import pandas as pd from khiops import core as kh from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset from sklearn import metrics # Load the dataset tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) places_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Places.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" ) - # Build the multitable input X - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column + + # Create the dataset spec + # Note: We discard the "Gravity" column from the "Users" table to avoid a target + # leak. This is because the column was used to build the target. X = { "main_table": "Accidents", "tables": { @@ -270,16 +313,22 @@ def khiops_classifier_multitable_snowflake(): ], } - # Load the target variable from the AccidentsSummary dataset + # Load the target variable "Gravity" from the "AccidentsSummary" dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series + + # Split into train and test datasets + X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) # Train the classifier (by default it creates 1000 multi-table features) khc = KhiopsClassifier(n_trees=0) - khc.fit(X, y) + khc.fit(X_train, y_train) # Show the feature importance info print(f"Features evaluated: {khc.n_features_evaluated_}") @@ -290,23 +339,23 @@ def khiops_classifier_multitable_snowflake(): print("---") # Predict the class on the test dataset - y_pred = khc.predict(X) + y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") - print(y_pred[:10]) + print(y_test_pred[:10]) print("---") - # Predict the class probability on the train dataset - y_probas = khc.predict_proba(X) + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) print(f"Class order: {khc.classes_}") print("Predicted class probabilities (first 10):") - print(y_probas[:10]) + print(y_test_probas[:10]) print("---") - # Evaluate accuracy and auc metrics on the train dataset - train_accuracy = metrics.accuracy_score(y_pred, y) - train_auc = metrics.roc_auc_score(y, y_probas[:, 1]) - print(f"Train accuracy = {train_accuracy}") - print(f"Train auc = {train_auc}") + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test_pred, y_test) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") def khiops_classifier_sparse(): @@ -367,20 +416,9 @@ def khiops_classifier_pickle(): """Shows the serialization 
and deserialization of a `.KhiopsClassifier`""" # Imports import os - import pandas as pd import pickle - from khiops import core as kh from khiops.sklearn import KhiopsClassifier - - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop(["Class"], axis=1) - y = iris_df["Class"] - khc = KhiopsClassifier() - khc.fit(X, y) + from sklearn.datasets import load_iris # Create/clean the output directory results_dir = os.path.join("kh_samples", "khiops_classifier_pickle") @@ -390,9 +428,14 @@ def khiops_classifier_pickle(): else: os.makedirs(results_dir, exist_ok=True) + # Train the model with the Iris dataset + X, y = load_iris(return_X_y=True) + khc = KhiopsClassifier() + khc.fit(X, y) + # Pickle its content to a file - with open(khc_pickle_path, "wb") as khc_pickle_write_file: - pickle.dump(khc, khc_pickle_write_file) + with open(khc_pickle_path, "wb") as khc_pickle_output_file: + pickle.dump(khc, khc_pickle_output_file) # Unpickle it with open(khc_pickle_path, "rb") as khc_pickle_file: @@ -416,23 +459,17 @@ def khiops_regressor(): from sklearn import metrics from sklearn.model_selection import train_test_split - # Load the dataset into a pandas dataframe + # Load the "Adult" dataset and set the target to the "age" column adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("age", axis=1) + y = adult_df["age"] # Split the whole dataframe into train and test (40%-60% for speed) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.6, random_state=1 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.1, random_state=1 ) - # Split the dataset into: - # - the X feature table - # - the y target vector ("age" column) - X_train = adult_train_df.drop("age", axis=1) - X_test = adult_test_df.drop("age", axis=1) - y_train = adult_train_df["age"] - y_test = adult_test_df["age"] - # Create the regressor object khr = KhiopsRegressor() @@ -472,21 +509,14 @@ def khiops_encoder(): usually splits the dataset into train and test subsets. 
""" # Imports - import os - import pandas as pd - from khiops import core as kh from khiops.sklearn import KhiopsEncoder + from sklearn.datasets import load_iris - # Load the dataset into a pandas dataframe - iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop("Class", axis=1) - y = iris_df["Class"] + # Load the dataset + X, y = load_iris(return_X_y=True) # Create the encoder object - khe = KhiopsEncoder() + khe = KhiopsEncoder(transform_type_numerical="part_label") khe.fit(X, y) # Transform the training dataset @@ -494,7 +524,7 @@ def khiops_encoder(): # Print both the original and transformed features print("Original:") - print(X.head(10)) + print(X[:10]) print("---") print("Encoded feature names:") print(khe.feature_names_out_) @@ -511,51 +541,40 @@ def khiops_encoder_multitable_star(): from khiops import core as kh from khiops.sklearn import KhiopsEncoder - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the dataset tables into dataframe + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) - - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + vehicles_df = pd.read_csv( + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { + # Build the multi-table spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] - # Create the KhiopsEncoder with 10 additional multitable features and fit it + # Create the KhiopsEncoder with 5 multitable features and fit it khe = KhiopsEncoder(n_features=10) - khe.fit(X_dataset, y) + khe.fit(X, y) # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) print("Encoded data:") - print(khe.transform(X_dataset)[:10]) + print(khe.transform(X)[:10]) def khiops_encoder_multitable_snowflake(): - """Trains a `.KhiopsEncoder` on a snowflake multi-table dataset - - .. note:: - For simplicity we train from the whole dataset. To assess the performance - one usually splits the dataset into train and test subsets. 
- """ + """Trains a `.KhiopsEncoder` on a snowflake multi-table dataset""" # Imports import os import pandas as pd @@ -563,33 +582,38 @@ def khiops_encoder_multitable_snowflake(): from khiops.sklearn import KhiopsEncoder # Load the tables into dataframes - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + places_df = pd.read_csv( + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" + ) users_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t", encoding="latin1", ) - # Build the multitable input X + # Build the multi-table spec # Note: We discard the "Gravity" field from the "Users" table as it was used to # build the target column X = { "main_table": "Accidents", "tables": { "Accidents": (accidents_df, "AccidentId"), + "Places": (places_df, "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), }, "relations": [ ("Accidents", "Vehicles"), + ("Accidents", "Places", True), ("Vehicles", "Users"), ], } @@ -597,9 +621,12 @@ def khiops_encoder_multitable_snowflake(): # Load the target variable from the AccidentsSummary dataset y = pd.read_csv( os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze( + "columns" + ) # squeeze to ensure pandas.Series # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder(n_features=10) @@ -622,7 +649,7 @@ def khiops_encoder_multitable_snowflake(): # Disable line too long just to have a title linking the sklearn documentation # pylint: disable=line-too-long def khiops_encoder_pipeline_with_hgbc(): - """Chains a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`""" + """Uses a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`""" # Imports import os import pandas as pd @@ -635,23 +662,17 @@ def khiops_encoder_pipeline_with_hgbc(): from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder - # Load the dataset into a pandas dataframe + # Load the dataset into dataframes adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] - # Split the whole dataframe into train and test (70%-30%) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.3, random_state=1 + # Split the dataset into train and test (70%-30%) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=1 ) - # Split the dataset into: - # - the X feature table - # - the y target vector ("class" column) - X_train = adult_train_df.drop("class", axis=1) - X_test = adult_test_df.drop("class", axis=1) - y_train = adult_train_df["class"] - y_test = adult_test_df["class"] - # Create the pipeline and fit it. 
Steps: # - The khiops supervised column encoder, generates a full-categorical table # - One hot encoder in all columns @@ -661,8 +682,6 @@ def khiops_encoder_pipeline_with_hgbc(): ( "onehot_enc", ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)), - # For sklearn < 1.2, use - # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)), ), ("hgb_clf", HistGradientBoostingClassifier()), ] @@ -701,13 +720,13 @@ def khiops_coclustering(): from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") - splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_dna_df = pd.read_csv( + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) - X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) + X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1) # Create the KhiopsCoclustering instance khcc = KhiopsCoclustering() @@ -732,9 +751,9 @@ def khiops_coclustering_simplify(): from sklearn.model_selection import train_test_split # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") splice_dna_X = pd.read_csv( - os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) @@ -776,32 +795,28 @@ def khiops_classifier_multitable_list(): from sklearn.model_selection import train_test_split # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + X = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=1 ) - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - # Load the secondary table of the dataset into a pandas dataframe vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() + X_train_ids = X_train["AccidentId"].to_frame() + X_test_ids = X_test["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") @@ -809,16 +824,16 
@@ def khiops_classifier_multitable_list(): khc = KhiopsClassifier(key="AccidentId") # Train the classifier - khc.fit([X_train_main, X_train_secondary], y_train) + khc.fit([X_train, X_train_secondary], y_train) # Predict the class on the test dataset - y_test_pred = khc.predict([X_test_main, X_test_secondary]) + y_test_pred = khc.predict([X_test, X_test_secondary]) print("Predicted classes (first 10):") print(y_test_pred[:10]) print("---") # Predict the class probability on the test dataset - y_test_probas = khc.predict_proba([X_test_main, X_test_secondary]) + y_test_probas = khc.predict_proba([X_test, X_test_secondary]) print("Predicted class probabilities (first 10):") print(y_test_probas[:10]) print("---") @@ -830,117 +845,11 @@ def khiops_classifier_multitable_list(): print(f"Test auc = {test_auc}") -def khiops_classifier_multitable_star_file(): - """Trains a `.KhiopsClassifier` with a file path based dataset - - .. warning:: - This dataset input method is **Deprecated** and will be removed in Khiops 11. - If you need to handle large datasets that do not easily fit into memory then you - may use the `~.khiops.core` API directly, which allows to specify file paths - directly. - """ - # Imports - import os - import pandas as pd - from khiops import core as kh - from khiops.sklearn import KhiopsClassifier - from sklearn import metrics - from sklearn.model_selection import train_test_split - - # Create output directory - results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_file") - if not os.path.exists("kh_samples"): - os.mkdir("kh_samples") - os.mkdir(results_dir) - else: - if not os.path.exists(results_dir): - os.mkdir(results_dir) - - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - X_train_main, X_test_main = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Write the train and test dataset sets to disk - # For the test file we remove the target column from the main table - X_train_main_path = os.path.join(results_dir, "X_train_main.txt") - X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False) - X_train_secondary_path = os.path.join(results_dir, "X_train_secondary.txt") - X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False) - X_test_main_path = os.path.join(results_dir, "X_test_main.txt") - y_test = X_test_main.sort_values("AccidentId")["Gravity"] - X_test_main.drop(columns="Gravity").to_csv( - X_test_main_path, sep="\t", header=True, index=False - ) - X_test_secondary_path = os.path.join(results_dir, "X_test_secondary.txt") - X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False) - - # Define the dictionary of train - X_train_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main_path, 
"AccidentId"), - "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - X_test_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_test_main_path, "AccidentId"), - "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - - # Create the classifier and fit it - khc = KhiopsClassifier(output_dir=results_dir) - khc.fit(X_train_dataset, y="Gravity") - - # Predict the class in addition to the class probabilities on the test dataset - y_test_pred_path = khc.predict(X_test_dataset) - y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") - print("Predicted classes (first 10):") - print(y_test_pred["PredictedGravity"].head(10)) - print("---") - - y_test_probas_path = khc.predict_proba(X_test_dataset) - y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") - proba_columns = [col for col in y_test_probas if col.startswith("Prob")] - print("Predicted class probabilities (first 10):") - print(y_test_probas[proba_columns].head(10)) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) - test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") - - exported_samples = [ khiops_classifier, khiops_classifier_multiclass, khiops_classifier_multitable_star, + khiops_classifier_multitable_star_file, khiops_classifier_multitable_snowflake, khiops_classifier_sparse, khiops_classifier_pickle, @@ -952,15 +861,13 @@ def khiops_classifier_multitable_star_file(): khiops_coclustering, khiops_coclustering_simplify, khiops_classifier_multitable_list, - khiops_classifier_multitable_star_file, ] def execute_samples(args): """Executes all non-interactive samples""" # Create the results directory if it does not exist - if not os.path.isdir("./kh_samples"): - os.mkdir("./kh_samples") + os.makedirs("./kh_samples", exist_ok=True) # Set the user-defined samples dir if any if args.samples_dir is not None: @@ -981,7 +888,7 @@ def execute_samples(args): print(f"{len(execution_samples)} sample(s) to execute\n") for sample in execution_samples: - print(">>> Executing samples_sklearn." 
+ sample.__name__) + print(f">>> Executing samples_sklearn.{sample.__name__}") sample.__call__() print("> Done\n") diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index d863e532..fb5f4d22 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -50,7 +50,12 @@ is_list_like, type_error_message, ) -from khiops.sklearn.tables import Dataset, read_internal_data_table +from khiops.utils.dataset import ( + Dataset, + FileTable, + get_khiops_variable_name, + read_internal_data_table, +) # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: @@ -86,70 +91,110 @@ def _extract_basic_dictionary(dictionary): def _check_dictionary_compatibility( model_dictionary, - dataset_dictionary, + ds_dictionary, estimator_class_name, + target_variable_name=None, ): # Prefix for all error messages - error_msg_prefix = f"X contains incompatible table '{dataset_dictionary.name}'" - - # Save variable arrays and their size - model_variables = model_dictionary.variables - dataset_variables = dataset_dictionary.variables + error_msg_prefix = ( + f"Model {estimator_class_name} incompatible with " + f"table '{ds_dictionary.name}'" + ) + + # Put the variable names in sets + model_variable_names = {var.name for var in model_dictionary.variables} + ds_variable_names = {var.name for var in ds_dictionary.variables} + + # The only feature that may be missing of the dataset is the target + model_var_names_not_in_ds = model_variable_names - ds_variable_names + if len(model_var_names_not_in_ds) > 0: + if target_variable_name is None: + effective_model_var_names_not_in_ds = model_var_names_not_in_ds + else: + effective_model_var_names_not_in_ds = model_var_names_not_in_ds - { + target_variable_name + } + if len(effective_model_var_names_not_in_ds) > 0: + raise ValueError( + f"{error_msg_prefix}: Missing features: " + f"{effective_model_var_names_not_in_ds}." + ) - # Error if different number of variables - if len(model_variables) != len(dataset_variables): + # Raise an error if there are extra features in the input + ds_var_names_not_in_model = ds_variable_names - model_variable_names + if len(ds_var_names_not_in_model) > 0: raise ValueError( - f"{error_msg_prefix}: It has " - f"{len(dataset_variables)} feature(s) but {estimator_class_name} " - f"is expecting {len(model_variables)}. Reshape your data." + f"{error_msg_prefix}: Features not in model: {ds_var_names_not_in_model}." ) - # Check variables: Must have same name and type - for var_index, (model_variable, dataset_variable) in enumerate( - zip(model_variables, dataset_variables) - ): - if model_variable.name != dataset_variable.name: - raise ValueError( - f"{error_msg_prefix}: Feature #{var_index} should be named " - f"'{model_variable.name}' " - f"instead of '{dataset_variable.name}'" - ) - if model_variable.type != dataset_variable.type: - raise ValueError( - f"{error_msg_prefix}: Feature #{var_index} should convertible to " - f"'{model_variable.type}' " - f"instead of '{dataset_variable.type}'" - ) + # Check the type + for ds_var in ds_dictionary.variables: + model_var = model_dictionary.get_variable(ds_var.name) + if ds_var.type != model_var.type: + if model_var.type == "Categorical": + warnings.warn( + f"X contains variable '{ds_var.name}' which was deemed " + "numerical. It will be coerced to categorical." 
+ ) + else: + raise ValueError( + f"{error_msg_prefix}: Khiops type for variable " + f"'{ds_var.name}' should be '{model_var.type}' " + f"not '{ds_var.type}'" + ) + +def _check_categorical_target_type(ds): + if ds.target_column is None: + raise ValueError("Target vector is not specified.") -def _check_categorical_target_type(dataset): - assert ( - dataset.main_table.target_column_id is not None - ), "Target column not specified in dataset." - if not ( - isinstance(dataset.target_column_type, pd.CategoricalDtype) - or pd.api.types.is_string_dtype(dataset.target_column_type) - or pd.api.types.is_integer_dtype(dataset.target_column_type) - or pd.api.types.is_float_dtype(dataset.target_column_type) + if ds.is_in_memory and not ( + isinstance(ds.target_column.dtype, pd.CategoricalDtype) + or pd.api.types.is_string_dtype(ds.target_column.dtype) + or pd.api.types.is_integer_dtype(ds.target_column.dtype) + or pd.api.types.is_float_dtype(ds.target_column.dtype) ): raise ValueError( - f"'y' has invalid type '{dataset.target_column_type}'. " + f"'y' has invalid type '{ds.target_column_type}'. " "Only string, integer, float and categorical types " "are accepted for the target." ) + elif ( + not ds.is_in_memory + and ds.main_table.khiops_types[ds.target_column_id] != "Categorical" + ): + raise ValueError( + "Target column has invalid type " + f"'{ds.main_table.khiops_types[ds.target_column_id]}'. " + "Only Categorical types are accepted for file datasets." + ) + +def _check_numerical_target_type(ds): + # Check that the target column is specified + if ds.target_column is None: + raise ValueError("Target vector is not specified.") -def _check_numerical_target_type(dataset): - assert ( - dataset.main_table.target_column_id is not None - ), "Target column not specified in dataset." - if not pd.api.types.is_numeric_dtype(dataset.target_column_type): + # If in-memory: Check that the column is numerical and that the values are finite + # The latter is required by sklearn + if ds.is_in_memory: + if not pd.api.types.is_numeric_dtype(ds.target_column.dtype): + raise ValueError( + f"Unknown label type '{ds.target_column.dtype}'. " + "Expected a numerical type." + ) + if ds.target_column is not None: + assert_all_finite(ds.target_column) + # Otherwise: Check the the Khiops type + elif ( + not ds.is_in_memory + and ds.main_table.khiops_types[ds.target_column_id] != "Numerical" + ): raise ValueError( - f"Unknown label type '{dataset.target_column_type}'. " - "Expected a numerical type." + "Target column has invalid type " + f"'{ds.main_table.khiops_types[ds.target_column_id]}'. " + "Only Numerical types are accepted for file datasets." ) - if dataset.is_in_memory() and dataset.main_table.target_column is not None: - assert_all_finite(dataset.main_table.target_column) def _cleanup_dir(target_dir): @@ -323,12 +368,12 @@ def fit(self, X, y=None, **kwargs): return self - def _fit(self, dataset, computation_dir, **kwargs): + def _fit(self, ds, computation_dir, **kwargs): """Template pattern of a fit method Parameters ---------- - dataset : `Dataset` + ds : `Dataset` The learning dataset. computation_dir : str Path or URI where the Khiops computation results will be stored. 
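In the rewritten `_check_dictionary_compatibility` above, the old per-index comparison gives way to name-based set checks: model features absent from X (other than the target) raise, X features unknown to the model raise, and a column the model expects as Categorical but X supplies as Numerical only triggers a coercion warning. A self-contained sketch of that decision logic, using plain dicts of variable types instead of Khiops dictionary objects (function and variable names here are illustrative, not the library API):

import warnings


def check_compatibility(model_types, ds_types, target_name=None):
    """Sketch of the name- and type-based compatibility rules."""
    # Features the model expects but X lacks: only the target may be missing
    missing = set(model_types) - set(ds_types)
    if target_name is not None:
        missing -= {target_name}
    if missing:
        raise ValueError(f"Missing features: {missing}.")

    # Features present in X that the model has never seen
    extra = set(ds_types) - set(model_types)
    if extra:
        raise ValueError(f"Features not in model: {extra}.")

    # Type mismatches: warn when the model side is Categorical, fail otherwise
    for name, ds_type in ds_types.items():
        model_type = model_types[name]
        if ds_type != model_type:
            if model_type == "Categorical":
                warnings.warn(f"'{name}' will be coerced to categorical.")
            else:
                raise ValueError(
                    f"'{name}' should be '{model_type}', not '{ds_type}'."
                )


# The fitted target ("class") may be absent from X at predict time without raising
check_compatibility(
    model_types={"Age": "Numerical", "City": "Categorical", "class": "Categorical"},
    ds_types={"Age": "Numerical", "City": "Categorical"},
    target_name="class",
)

The asymmetry matches the warning text above: numeric values can always be re-read as category labels, whereas arbitrary labels cannot in general be parsed back into numbers.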
@@ -336,25 +381,25 @@ def _fit(self, dataset, computation_dir, **kwargs): The called methods are reimplemented in concrete sub-classes """ # Check model parameters - self._fit_check_params(dataset, **kwargs) + self._fit_check_params(ds, **kwargs) # Check the dataset - self._fit_check_dataset(dataset) + self._fit_check_dataset(ds) # Train the model - self._fit_train_model(dataset, computation_dir, **kwargs) - self.n_features_in_ = dataset.main_table.n_features() + self._fit_train_model(ds, computation_dir, **kwargs) + self.n_features_in_ = ds.main_table.n_features() # If the main attributes are of the proper type finish the fitting # Otherwise it means there was an abort (early return) of the previous steps if isinstance(self.model_, kh.DictionaryDomain) and isinstance( self.model_report_, kh.KhiopsJSONObject ): - self._fit_training_post_process(dataset) + self._fit_training_post_process(ds) self.is_fitted_ = True - self.is_multitable_model_ = dataset.is_multitable() + self.is_multitable_model_ = ds.is_multitable - def _fit_check_params(self, dataset, **_): + def _fit_check_params(self, ds, **_): """Check the model parameters including those data dependent (in kwargs)""" if ( self.key is not None @@ -363,79 +408,77 @@ def _fit_check_params(self, dataset, **_): ): raise TypeError(type_error_message("key", self.key, str, "list-like")) - if not dataset.is_in_memory() and self.output_dir is None: + if not ds.is_in_memory and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") - def _fit_check_dataset(self, dataset): + def _fit_check_dataset(self, ds): """Checks the pre-conditions of the tables to build the model""" - if ( - dataset.main_table.n_samples is not None - and dataset.main_table.n_samples <= 1 - ): + if ds.main_table.n_samples is not None and ds.main_table.n_samples <= 1: raise ValueError( "Table contains one sample or less. It must contain at least 2." ) @abstractmethod - def _fit_train_model(self, dataset, computation_dir, **kwargs): + def _fit_train_model(self, ds, computation_dir, **kwargs): """Builds the model with one or more calls to khiops.core.api It must return the path of the ``.kdic`` Khiops model file and the JSON report. 
""" @abstractmethod - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): """Loads the model's data from Khiops files into the object""" def _transform( self, - dataset, + ds, computation_dir, _transform_create_deployment_model_fun, drop_key, + transformed_file_name, ): """Generic template method to implement transform, predict and predict_proba""" # Check if the model is fitted check_is_fitted(self) # Check if the dataset is consistent with the model - self._transform_check_dataset(dataset) + self._transform_check_dataset(ds) # Create a deployment dataset # Note: The input dataset is not necessarily ready to be deployed - deployment_dataset = self._transform_create_deployment_dataset( - dataset, computation_dir - ) + deployment_ds = self._transform_create_deployment_dataset(ds, computation_dir) # Create a deployment dictionary - deployment_dictionary_domain = _transform_create_deployment_model_fun() + deployment_dictionary_domain = _transform_create_deployment_model_fun(ds) # Deploy the model output_table_path = self._transform_deploy_model( - deployment_dataset, + deployment_ds, deployment_dictionary_domain, self.model_main_dictionary_name_, computation_dir, + transformed_file_name, ) # Post-process to return the correct output type return self._transform_deployment_post_process( - deployment_dataset, output_table_path, drop_key + deployment_ds, output_table_path, drop_key ) - def _transform_create_deployment_dataset(self, dataset, _): + def _transform_create_deployment_dataset(self, ds, _): """Creates if necessary a new dataset to execute the model deployment The default behavior is to return the same dataset. """ - return dataset + return ds def _transform_deploy_model( self, - deployment_dataset, + deployment_ds, model_dictionary_domain, model_dictionary_name, computation_dir, + transformed_file_name, ): """Deploys a generic Khiops transformation model @@ -447,7 +490,8 @@ def _transform_deploy_model( X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. 
The @@ -469,7 +513,7 @@ def _transform_deploy_model( ( main_table_path, secondary_table_paths, - ) = deployment_dataset.create_table_files_for_khiops( + ) = deployment_ds.create_table_files_for_khiops( computation_dir, sort=self.auto_sort ) @@ -495,15 +539,15 @@ def _transform_deploy_model( # Set output path files output_dir = self._get_output_dir(computation_dir) log_file_path = fs.get_child_path(output_dir, "khiops.log") - output_data_table_path = fs.get_child_path(output_dir, "transformed.txt") + output_data_table_path = fs.get_child_path(output_dir, transformed_file_name) # Set the format parameters depending on the type of dataset - if deployment_dataset.is_in_memory(): + if deployment_ds.is_in_memory: field_separator = "\t" header_line = True else: - field_separator = deployment_dataset.main_table.sep - header_line = deployment_dataset.main_table.header + field_separator = deployment_ds.main_table.sep + header_line = deployment_ds.main_table.header # Call to core function deploy_model kh.deploy_model( @@ -523,16 +567,16 @@ def _transform_deploy_model( return output_data_table_path - def _transform_check_dataset(self, dataset): + def _transform_check_dataset(self, ds): """Checks the dataset before deploying a model on them""" - if not dataset.is_in_memory() and self.output_dir is None: + if ds.table_type == FileTable and self.output_dir is None: raise ValueError("'output_dir' is not set but dataset is file-based") def _transform_deployment_post_process( - self, deployment_dataset, output_table_path, drop_key + self, deployment_ds, output_table_path, drop_key ): # Return a dataframe for dataframe based datasets - if deployment_dataset.is_in_memory(): + if deployment_ds.is_in_memory: # Read the transformed table with the internal table settings with io.BytesIO(fs.read(output_table_path)) as output_table_stream: output_table_df = read_internal_data_table(output_table_stream) @@ -541,16 +585,16 @@ def _transform_deployment_post_process( # - Reorder the table to the original table order # - Because transformed data table file is sorted by key # - Drop the key columns if specified - if deployment_dataset.is_multitable(): - key_df = deployment_dataset.main_table.dataframe[ - deployment_dataset.main_table.key + if deployment_ds.is_multitable: + key_df = deployment_ds.main_table.data_source[ + deployment_ds.main_table.key ] output_table_df_or_path = key_df.merge( - output_table_df, on=deployment_dataset.main_table.key + output_table_df, on=deployment_ds.main_table.key ) if drop_key: output_table_df_or_path.drop( - deployment_dataset.main_table.key, axis=1, inplace=True + deployment_ds.main_table.key, axis=1, inplace=True ) # On mono-table: Return the read dataframe as-is else: @@ -713,7 +757,8 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. 
The @@ -739,7 +784,7 @@ def fit(self, X, y=None, **kwargs): """ return super().fit(X, y=y, **kwargs) - def _fit_check_params(self, dataset, **kwargs): + def _fit_check_params(self, ds, **kwargs): # Check that at least one of the build methods parameters is set if not ( self.build_name_var or self.build_distance_vars or self.build_frequency_vars @@ -762,7 +807,7 @@ def _fit_check_params(self, dataset, **kwargs): raise TypeError( type_error_message(f"columns[{i}]", column_id, str) ) - if column_id not in dataset.main_table.column_ids: + if column_id not in ds.main_table.column_ids: raise ValueError(f"columns[{i}] ('{column_id}') not found in X") # Check that 'id_column': @@ -774,7 +819,7 @@ def _fit_check_params(self, dataset, **kwargs): raise ValueError("'id_column' is a mandatory parameter") if not isinstance(id_column, str): raise TypeError(type_error_message("key_columns", id_column, str)) - if id_column not in dataset.main_table.column_ids: + if id_column not in ds.main_table.column_ids: raise ValueError(f"id column '{id_column}' not found in X") # Deprecate the 'max_part_numbers' parameter @@ -789,11 +834,11 @@ def _fit_check_params(self, dataset, **kwargs): ) ) - def _fit_train_model(self, dataset, computation_dir, **kwargs): - assert not dataset.is_multitable(), "Coclustering not available in multitable" + def _fit_train_model(self, ds, computation_dir, **kwargs): + assert not ds.is_multitable, "Coclustering not available in multitable" # Prepare the table files and dictionary for Khiops - main_table_path, _ = dataset.create_table_files_for_khiops( + main_table_path, _ = ds.create_table_files_for_khiops( computation_dir, sort=self.auto_sort ) @@ -807,12 +852,12 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): elif self.variables is not None: variables = self.variables else: - variables = list(dataset.main_table.column_ids) + variables = list(ds.main_table.column_ids) # Train the coclustering model coclustering_file_path = kh.train_coclustering( - dataset.create_khiops_dictionary_domain(), - dataset.main_table.name, + ds.create_khiops_dictionary_domain(), + ds.main_table.name, main_table_path, variables, output_dir, @@ -854,18 +899,16 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): # Create a multi-table dictionary from the schema of the table # The root table contains the key of the table and points to the main table - tmp_domain = dataset.create_khiops_dictionary_domain() - main_table_dictionary = tmp_domain.get_dictionary(dataset.main_table.name) + tmp_domain = ds.create_khiops_dictionary_domain() + main_table_dictionary = tmp_domain.get_dictionary(ds.main_table.name) if not main_table_dictionary.key: main_table_dictionary.key = [self.model_id_column] - main_table_dictionary.name = ( - f"{self._khiops_model_prefix}{dataset.main_table.name}" - ) + main_table_dictionary.name = f"{self._khiops_model_prefix}{ds.main_table.name}" self.model_main_dictionary_name_ = ( - f"{self._khiops_model_prefix}Keys_{dataset.main_table.name}" + f"{self._khiops_model_prefix}Keys_{ds.main_table.name}" ) self.model_secondary_table_variable_name = ( - f"{self._khiops_model_prefix}{dataset.main_table.name}" + f"{self._khiops_model_prefix}{ds.main_table.name}" ) self._create_coclustering_model_domain( tmp_domain, coclustering_file_path, output_dir @@ -888,7 +931,7 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): self.model_report_ = simplified_cc.model_report_ self.model_report_raw_ = self.model_report_.json_data - def _fit_training_post_process(self, 
dataset): + def _fit_training_post_process(self, ds): assert ( len(self.model_.dictionaries) == 2 ), "'model_' does not have exactly 2 dictionaries" @@ -1046,16 +1089,16 @@ def _simplify( ) # Get dataset dictionary from model; it should not be root - dataset_dictionary = self.model_.get_dictionary( + ds_dictionary = self.model_.get_dictionary( self.model_secondary_table_variable_name ) assert ( - not dataset_dictionary.root + not ds_dictionary.root ), "Dataset dictionary in the coclustering model should not be root" - if not dataset_dictionary.key: - dataset_dictionary.key = self.model_id_column + if not ds_dictionary.key: + ds_dictionary.key = self.model_id_column domain = DictionaryDomain() - domain.add_dictionary(dataset_dictionary) + domain.add_dictionary(ds_dictionary) simplified_coclustering_file_path = fs.get_child_path( output_dir, "Coclustering.khcj" ) @@ -1150,7 +1193,8 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -1162,9 +1206,6 @@ def predict(self, X): `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. - - *Deprecated return values* (will be removed in Khiops 11): str for - file based dataset specification. """ # Create temporary directory computation_dir = self._create_computation_dir("predict") @@ -1172,15 +1213,16 @@ def predict(self, X): kh.get_runner().root_temp_dir = computation_dir # Create the input dataset - dataset = Dataset(X) + ds = Dataset(X) # Call the template transform method try: y_pred = super()._transform( - dataset, + ds, computation_dir, self._transform_prepare_deployment_model_for_predict, False, + "predict.txt", ) # Cleanup and restore the runner's temporary dir finally: @@ -1188,25 +1230,25 @@ def predict(self, X): kh.get_runner().root_temp_dir = initial_runner_temp_dir # Transform to numpy.array for in-memory inputs - if dataset.is_in_memory(): + if ds.is_in_memory: y_pred = y_pred.to_numpy() return y_pred - def _transform_check_dataset(self, dataset): + def _transform_check_dataset(self, ds): """Checks the tables before deploying a model on them""" assert ( len(self.model_.dictionaries) == 2 ), "'model' does not have exactly 2 dictionaries" # Call the parent method - super()._transform_check_dataset(dataset) + super()._transform_check_dataset(ds) # Coclustering models are special: # - They are mono-table only # - They are deployed with a multitable model whose main table contain # the keys of the input table and the secondary table is the input table - if dataset.is_multitable(): + if ds.is_multitable: raise ValueError("Coclustering models not available in multi-table mode") # The "model dictionary domain" in the coclustering case it is just composed @@ -1217,23 +1259,23 @@ def _transform_check_dataset(self, dataset): if dictionary.name != self.model_main_dictionary_name_: _check_dictionary_compatibility( dictionary, - dataset.main_table.create_khiops_dictionary(), + ds.main_table.create_khiops_dictionary(), self.__class__.__name__, ) - def _transform_create_deployment_dataset(self, dataset, computation_dir): - assert not 
dataset.is_multitable(), "'dataset' is multitable" + def _transform_create_deployment_dataset(self, ds, computation_dir): + assert not ds.is_multitable, "'dataset' is multitable" # Build the multitable deployment dataset - keys_table_name = f"keys_{dataset.main_table.name}" + keys_table_name = f"keys_{ds.main_table.name}" deploy_dataset_spec = {} deploy_dataset_spec["main_table"] = keys_table_name deploy_dataset_spec["tables"] = {} - if dataset.is_in_memory(): + if ds.is_in_memory: # Extract the keys from the main table keys_table_dataframe = pd.DataFrame( { - self.model_id_column: dataset.main_table.dataframe[ + self.model_id_column: ds.main_table.data_source[ self.model_id_column ].unique() } @@ -1244,20 +1286,20 @@ def _transform_create_deployment_dataset(self, dataset, computation_dir): keys_table_dataframe, self.model_id_column, ) - deploy_dataset_spec["tables"][dataset.main_table.name] = ( - dataset.main_table.dataframe, + deploy_dataset_spec["tables"][ds.main_table.name] = ( + ds.main_table.data_source, self.model_id_column, ) else: # Create the table to extract the keys (sorted) - keyed_dataset = dataset.copy() + keyed_dataset = ds.copy() keyed_dataset.main_table.key = [self.model_id_column] main_table_path = keyed_dataset.main_table.create_table_file_for_khiops( computation_dir, sort=self.auto_sort ) # Create a table storing the main table keys - keys_table_name = f"keys_{dataset.main_table.name}" + keys_table_name = f"keys_{ds.main_table.name}" keys_table_file_path = fs.get_child_path( computation_dir, f"raw_{keys_table_name}.txt" ) @@ -1266,33 +1308,33 @@ def _transform_create_deployment_dataset(self, dataset, computation_dir): keyed_dataset.main_table.name, main_table_path, keys_table_file_path, - header_line=dataset.header, - field_separator=dataset.sep, - output_header_line=dataset.header, - output_field_separator=dataset.sep, + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, trace=self.verbose, ) deploy_dataset_spec["tables"][keys_table_name] = ( keys_table_file_path, self.model_id_column, ) - deploy_dataset_spec["tables"][dataset.main_table.name] = ( - dataset.main_table.path, + deploy_dataset_spec["tables"][ds.main_table.name] = ( + ds.main_table.data_source, self.model_id_column, ) - deploy_dataset_spec["format"] = (dataset.sep, dataset.header) + deploy_dataset_spec["format"] = (ds.sep, ds.header) return Dataset(deploy_dataset_spec) - def _transform_prepare_deployment_model_for_predict(self): - return self.model_ + def _transform_prepare_deployment_model_for_predict(self, _): + return self.model_.copy() def _transform_deployment_post_process( - self, deployment_dataset, output_table_path, drop_key + self, deployment_ds, output_table_path, drop_key ): - assert deployment_dataset.is_multitable() + assert deployment_ds.is_multitable return super()._transform_deployment_post_process( - deployment_dataset, output_table_path, drop_key + deployment_ds, output_table_path, drop_key ) def fit_predict(self, X, y=None, **kwargs): @@ -1340,12 +1382,12 @@ def __init__( def _more_tags(self): return {"require_y": True} - def _fit_check_dataset(self, dataset): - super()._fit_check_dataset(dataset) - self._check_target_type(dataset) + def _fit_check_dataset(self, ds): + super()._fit_check_dataset(ds) + self._check_target_type(ds) @abstractmethod - def _check_target_type(self, dataset): + def _check_target_type(self, ds): """Checks that the target type has the correct type for the estimator""" def fit(self, X, y=None, 
**kwargs): @@ -1359,19 +1401,21 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values. + y : :external:term:`array-like` of shape (n_samples,) + The target values. - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + **Deprecated input types** (will be removed in Khiops 11): + + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -1386,9 +1430,9 @@ def fit(self, X, y=None, **kwargs): super().fit(X, y=y, **kwargs) return self - def _fit_check_params(self, dataset, **kwargs): + def _fit_check_params(self, ds, **kwargs): # Call parent method - super()._fit_check_params(dataset, **kwargs) + super()._fit_check_params(ds, **kwargs) # Check supervised estimator parameters if not isinstance(self.n_features, int): @@ -1404,10 +1448,10 @@ def _fit_check_params(self, dataset, **kwargs): if self.n_pairs < 0: raise ValueError("'n_pairs' must be positive") - def _fit_train_model(self, dataset, computation_dir, **kwargs): + def _fit_train_model(self, ds, computation_dir, **kwargs): # Train the model with Khiops train_args, train_kwargs = self._fit_prepare_training_function_inputs( - dataset, computation_dir + ds, computation_dir ) report_file_path, model_kdic_file_path = self._fit_core_training_function( *train_args, **train_kwargs @@ -1429,33 +1473,29 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): def _fit_core_training_function(self, *args, **kwargs): """A wrapper to the khiops.core training function for the estimator""" - def _fit_prepare_training_function_inputs(self, dataset, computation_dir): + def _fit_prepare_training_function_inputs(self, ds, computation_dir): # Set output path files output_dir = self._get_output_dir(computation_dir) log_file_path = fs.get_child_path(output_dir, "khiops.log") - main_table_path, secondary_table_paths = dataset.create_table_files_for_khiops( + main_table_path, secondary_table_paths = ds.create_table_files_for_khiops( computation_dir, sort=self.auto_sort ) # Build the 'additional_data_tables' argument - dataset_domain = dataset.create_khiops_dictionary_domain() - secondary_data_paths = dataset_domain.extract_data_paths( - dataset.main_table.name - ) + ds_domain = ds.create_khiops_dictionary_domain() + secondary_data_paths = ds_domain.extract_data_paths(ds.main_table.name) additional_data_tables = {} for data_path in secondary_data_paths: - dictionary = dataset_domain.get_dictionary_at_data_path(data_path) + dictionary = ds_domain.get_dictionary_at_data_path(data_path) additional_data_tables[data_path] = secondary_table_paths[dictionary.name] # Build the mandatory arguments args = [ - dataset.create_khiops_dictionary_domain(), - dataset.main_table.name, + 
ds.create_khiops_dictionary_domain(), + ds.main_table.name, main_table_path, - dataset.main_table.get_khiops_variable_name( - dataset.main_table.target_column_id - ), + get_khiops_variable_name(ds.target_column_id), output_dir, ] @@ -1473,12 +1513,12 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir): # Set the format parameters depending on the type of dataset kwargs["detect_format"] = False - if dataset.is_in_memory(): + if ds.is_in_memory: kwargs["field_separator"] = "\t" kwargs["header_line"] = True else: - kwargs["field_separator"] = dataset.main_table.sep - kwargs["header_line"] = dataset.main_table.header + kwargs["field_separator"] = ds.main_table.sep + kwargs["header_line"] = ds.main_table.header # Rename parameters to be compatible with khiops.core kwargs["max_constructed_variables"] = kwargs.pop("n_features") @@ -1495,14 +1535,12 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir): return args, kwargs - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call parent method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Set the target variable name - self.model_target_variable_name_ = dataset.main_table.get_khiops_variable_name( - dataset.main_table.target_column_id - ) + self.model_target_variable_name_ = get_khiops_variable_name(ds.target_column_id) # Verify it has at least one dictionary and a root dictionary in multi-table if len(self.model_.dictionaries) == 1: @@ -1516,17 +1554,11 @@ def _fit_training_post_process(self, dataset): initial_dictionary_name = dictionary.name.replace( self._khiops_model_prefix, "", 1 ) - if initial_dictionary_name == dataset.main_table.name: + if initial_dictionary_name == ds.main_table.name: self.model_main_dictionary_name_ = dictionary.name if self.model_main_dictionary_name_ is None: raise ValueError("No model dictionary after Khiops call") - # Remove the target variable in the model dictionary - model_main_dictionary = self.model_.get_dictionary( - self.model_main_dictionary_name_ - ) - model_main_dictionary.remove_variable(self.model_target_variable_name_) - # Extract, from the preparation reports, the number of evaluated features, # their names and their levels univariate_preparation_report = self.model_report_.preparation_report @@ -1584,29 +1616,29 @@ def _fit_training_post_process(self, dataset): self.feature_evaluated_importances_ = np.array([x[1] for x in combined]) self.n_features_evaluated_ = len(combined) - def _transform_check_dataset(self, dataset): - assert isinstance(dataset, Dataset), "'dataset' is not 'Dataset'" + def _transform_check_dataset(self, ds): + assert isinstance(ds, Dataset), "'ds' is not 'Dataset'" # Call the parent method - super()._transform_check_dataset(dataset) + super()._transform_check_dataset(ds) # Check the coherence between thi input table and the model - if self.is_multitable_model_ and not dataset.is_multitable(): + if self.is_multitable_model_ and not ds.is_multitable: raise ValueError( "You are trying to apply on single-table inputs a model which has " "been trained on multi-table data." ) - if not self.is_multitable_model_ and dataset.is_multitable(): + if not self.is_multitable_model_ and ds.is_multitable: raise ValueError( "You are trying to apply on multi-table inputs a model which has " "been trained on single-table data." 
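# Editor's note: roughly the khiops.core call that the args/kwargs prepared
# above amount to for a mono-table, in-memory dataset. This is a hedged sketch
# using the Adult sample shipped with Khiops, not the estimator's exact output.
import os
import khiops.core as kh

adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
report_path, model_kdic_path = kh.train_predictor(
    os.path.join(adult_dir, "Adult.kdic"),  # dictionary file or DictionaryDomain
    "Adult",                                # main dictionary name
    os.path.join(adult_dir, "Adult.txt"),   # main table file
    "class",                                # Khiops variable name of the target
    "kh_results",                           # output directory
    detect_format=False,
    field_separator="\t",                   # in-memory tables are dumped tab-separated
    header_line=True,
    additional_data_tables={},              # data path -> secondary table file
    max_constructed_variables=100,          # mapped from the 'n_features' parameter
)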
) # Error if different number of dictionaries - dataset_domain = dataset.create_khiops_dictionary_domain() - if len(self.model_.dictionaries) != len(dataset_domain.dictionaries): + ds_domain = ds.create_khiops_dictionary_domain() + if len(self.model_.dictionaries) != len(ds_domain.dictionaries): raise ValueError( - f"X has {len(dataset_domain.dictionaries)} table(s), " + f"X has {len(ds_domain.dictionaries)} table(s), " f"but {self.__class__.__name__} is expecting " f"{len(self.model_.dictionaries)}" ) @@ -1615,13 +1647,14 @@ def _transform_check_dataset(self, dataset): # Note: Name checking is omitted for the main table _check_dictionary_compatibility( _extract_basic_dictionary(self._get_main_dictionary()), - dataset.main_table.create_khiops_dictionary(), + ds.main_table.create_khiops_dictionary(), self.__class__.__name__, + target_variable_name=self.model_target_variable_name_, ) # Multi-table model: Check name and dictionary coherence of secondary tables dataset_secondary_tables_by_name = { - table.name: table for table in dataset.secondary_tables + table.name: table for table in ds.secondary_tables } for dictionary in self.model_.dictionaries: assert dictionary.name.startswith(self._khiops_model_prefix), ( @@ -1686,14 +1719,15 @@ def predict(self, X): try: # Create the input dataset - dataset = Dataset(X, key=self.key) + ds = Dataset(X, key=self.key) # Call the template transform method y_pred = super()._transform( - dataset, + ds, computation_dir, self._transform_prepare_deployment_model_for_predict, True, + "predict.txt", ) # Cleanup and restore the runner's temporary dir finally: @@ -1707,10 +1741,13 @@ def predict(self, X): assert isinstance(y_pred, (str, pd.DataFrame)), "Expected str or DataFrame" return y_pred - def _transform_prepare_deployment_model_for_predict(self): + def _transform_prepare_deployment_model_for_predict(self, ds): assert ( self._predicted_target_meta_data_tag is not None ), "Predicted target metadata tag is not set" + assert hasattr( + self, "model_main_dictionary_name_" + ), "Model main dictionary name has not been set" # Create a copy of the model dictionary using only the predicted target # Also activate the key to reorder the output in the multitable case @@ -1723,6 +1760,12 @@ def _transform_prepare_deployment_model_for_predict(self): variable.used = True else: variable.used = False + + # Remove the target variable if it is not present in the input dataset + # Note: We use `list` to avoid a warning of numpy about the `in` operator + if self.model_target_variable_name_ not in list(ds.main_table.column_ids): + model_dictionary.remove_variable(self.model_target_variable_name_) + return model_copy def get_feature_used_statistics(self, modeling_report): @@ -1884,10 +1927,14 @@ def __init__( self._predicted_target_meta_data_tag = "Prediction" def _is_real_target_dtype_integer(self): - assert self._original_target_type is not None, "Original target type not set" - return pd.api.types.is_integer_dtype(self._original_target_type) or ( - isinstance(self._original_target_type, pd.CategoricalDtype) - and pd.api.types.is_integer_dtype(self._original_target_type.categories) + return self._original_target_dtype is not None and ( + pd.api.types.is_integer_dtype(self._original_target_dtype) + or ( + isinstance(self._original_target_dtype, pd.CategoricalDtype) + and pd.api.types.is_integer_dtype( + self._original_target_dtype.categories + ) + ) ) def _sorted_prob_variable_names(self): @@ -1918,19 +1965,21 @@ def fit(self, X, y, **kwargs): X : :external:term:`array-like` of 
shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + **Deprecated input types** (will be removed in Khiops 11): - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -1940,26 +1989,23 @@ def fit(self, X, y, **kwargs): kwargs["categorical_target"] = True return super().fit(X, y, **kwargs) - def _check_target_type(self, dataset): - _check_categorical_target_type(dataset) + def _check_target_type(self, ds): + _check_categorical_target_type(ds) - def _fit_check_dataset(self, dataset): + def _fit_check_dataset(self, ds): # Call the parent method - super()._fit_check_dataset(dataset) + super()._fit_check_dataset(ds) # Check that the target is for classification in in_memory_tables - if dataset.is_in_memory(): - current_type_of_target = type_of_target(dataset.main_table.target_column) + if ds.is_in_memory: + current_type_of_target = type_of_target(ds.target_column) if current_type_of_target not in ["binary", "multiclass"]: raise ValueError( f"Unknown label type: '{current_type_of_target}' " "for classification. Maybe you passed a floating point target?" ) # Check if the target has more than 1 class - if ( - dataset.is_in_memory() - and len(np.unique(dataset.main_table.target_column)) == 1 - ): + if ds.is_in_memory and len(np.unique(ds.target_column)) == 1: raise ValueError( f"{self.__class__.__name__} can't train when only one class is present." ) @@ -1967,12 +2013,15 @@ def _fit_check_dataset(self, dataset): def _fit_core_training_function(self, *args, **kwargs): return kh.train_predictor(*args, **kwargs) - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call the parent's method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Save the target datatype - self._original_target_type = dataset.target_column_type + if ds.is_in_memory: + self._original_target_dtype = ds.target_column.dtype + else: + self._original_target_dtype = None # Save class values in the order of deployment self.classes_ = [] @@ -1980,7 +2029,7 @@ def _fit_training_post_process(self, dataset): for key in variable.meta_data.keys: if key.startswith("TargetProb"): self.classes_.append(variable.meta_data.get_value(key)) - if self._is_real_target_dtype_integer(): + if ds.is_in_memory and self._is_real_target_dtype_integer(): self.classes_ = [int(class_value) for class_value in self.classes_] self.classes_.sort() self.classes_ = column_or_1d(self.classes_) @@ -2024,7 +2073,8 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. 
Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -2051,18 +2101,25 @@ def predict(self, X): y_pred = y_pred.to_numpy(copy=False).ravel() # If integer and string just transform - if pd.api.types.is_integer_dtype(self._original_target_type): - y_pred = y_pred.astype(self._original_target_type) - elif pd.api.types.is_string_dtype(self._original_target_type): + if pd.api.types.is_integer_dtype(self._original_target_dtype): + y_pred = y_pred.astype(self._original_target_dtype) + # If str transform to str + # Note: If the original type is None then it was learned with a file dataset + elif self._original_target_dtype is None or pd.api.types.is_string_dtype( + self._original_target_dtype + ): y_pred = y_pred.astype(str, copy=False) # If category first coerce the type to the categories' type else: - assert pd.api.types.is_categorical_dtype(self._original_target_type) + assert isinstance(self._original_target_dtype, pd.CategoricalDtype), ( + "_original_target_dtype is not categorical" + f", it is '{self._original_target_dtype}'" + ) if pd.api.types.is_integer_dtype( - self._original_target_type.categories.dtype + self._original_target_dtype.categories.dtype ): y_pred = y_pred.astype( - self._original_target_type.categories.dtype, copy=False + self._original_target_dtype.categories.dtype, copy=False ) else: y_pred = y_pred.astype(str, copy=False) @@ -2078,7 +2135,8 @@ def predict_proba(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. 
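# Editor's note: the dtype bookkeeping behind '_original_target_dtype',
# illustrated with plain pandas calls. Integer targets (including categorical
# targets with integer categories) are cast back to integers at predict time;
# other in-memory targets are returned as strings.
import pandas as pd

int_target = pd.Series([0, 1, 1, 0])
cat_target = pd.Series([0, 1, 1, 0], dtype=pd.CategoricalDtype([0, 1]))
str_target = pd.Series(["no", "yes"])

assert pd.api.types.is_integer_dtype(int_target.dtype)
assert isinstance(cat_target.dtype, pd.CategoricalDtype)
assert pd.api.types.is_integer_dtype(cat_target.dtype.categories)
assert pd.api.types.is_string_dtype(str_target.dtype)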
The @@ -2108,12 +2166,13 @@ def predict_proba(self, X): # Call the generic transfrom method try: - dataset = Dataset(X, key=self.key) + ds = Dataset(X, key=self.key) y_probas = self._transform( - dataset, + ds, computation_dir, self._transform_prepare_deployment_model_for_predict_proba, True, + "predict_proba.txt", ) # Cleanup and restore the runner's temporary dir finally: @@ -2123,7 +2182,7 @@ def predict_proba(self, X): # For in-memory datasets: # - Reorder the columns to that of self.classes_ # - Transform to np.ndarray - if dataset.is_in_memory(): + if ds.is_in_memory: assert isinstance( y_probas, (pd.DataFrame, np.ndarray) ), "y_probas is not a Pandas DataFrame nor Numpy array" @@ -2134,7 +2193,11 @@ def predict_proba(self, X): assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray" return y_probas - def _transform_prepare_deployment_model_for_predict_proba(self): + def _transform_prepare_deployment_model_for_predict_proba(self, ds): + assert hasattr( + self, "model_target_variable_name_" + ), "Target variable name has not been set" + # Create a copy of the model dictionary with only the probabilities used # We also activate the key to reorder the output in the multitable case model_copy = self.model_.copy() @@ -2148,6 +2211,11 @@ def _transform_prepare_deployment_model_for_predict_proba(self): else: variable.used = False + # Remove the target variable if it is not present in the input dataset + # Note: We use `list` to avoid a warning of numpy about the `in` operator + if self.model_target_variable_name_ not in list(ds.main_table.column_ids): + model_dictionary.remove_variable(self.model_target_variable_name_) + return model_copy @@ -2286,19 +2354,22 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + **Deprecated input types** (will be removed in Khiops 11): + + - str: A path to a data table file for file-based ``dict`` dataset + specifications. - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. 
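# Editor's note: a small usage sketch (assumes the Khiops backend is installed
# and that the classifier is exported as khiops.sklearn.KhiopsClassifier). For
# in-memory inputs predict_proba returns a numpy array whose column j is the
# probability of classes_[j].
import pandas as pd
from khiops.sklearn import KhiopsClassifier

X = pd.DataFrame({"f1": [1, 2, 3, 4, 5, 6], "f2": [0.1, 0.4, 0.2, 0.9, 0.3, 0.8]})
y = pd.Series(["a", "b", "a", "b", "a", "b"], name="target")

clf = KhiopsClassifier(n_trees=0).fit(X, y)
probas = clf.predict_proba(X)
print(clf.classes_)   # column order of 'probas' follows this attribute
print(probas.shape)   # (6, 2)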
Returns ------- self : `KhiopsRegressor` @@ -2312,9 +2383,9 @@ def fit(self, X, y=None, **kwargs): def _fit_core_training_function(self, *args, **kwargs): return kh.train_predictor(*args, **kwargs) - def _fit_train_model(self, dataset, computation_dir, **kwargs): + def _fit_train_model(self, ds, computation_dir, **kwargs): # Call the parent method - super()._fit_train_model(dataset, computation_dir, **kwargs) + super()._fit_train_model(ds, computation_dir, **kwargs) # Warn when there are no informative variables if self.model_report_.preparation_report.informative_variable_number == 0: @@ -2323,9 +2394,9 @@ def _fit_train_model(self, dataset, computation_dir, **kwargs): "The fitted model is the mean regressor." ) - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call parent method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Remove variables depending on the target variables_to_eliminate = [] @@ -2349,8 +2420,8 @@ def _fit_training_post_process(self, dataset): self.feature_used_importances_ = feature_used_importances_ self.n_features_used_ = len(self.feature_used_names_) - def _check_target_type(self, dataset): - _check_numerical_target_type(dataset) + def _check_target_type(self, ds): + _check_numerical_target_type(ds) # Deactivate useless super delegation because the method have different docstring # pylint: disable=useless-super-delegation @@ -2366,7 +2437,8 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -2375,14 +2447,16 @@ def predict(self, X): Returns ------- - `ndarray ` + `numpy.ndarray` or str + An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. The key columns are added for - multi-table tasks. - - *Deprecated return values* (will be removed in Khiops 11): str for - file based dataset specification. + multi-table tasks. The array is in the form of: + - `numpy.ndarray` if X is :external:term:`array-like`, or dataset spec + containing `pandas.DataFrame` table. + - str (a path for the file containing the array) if X is a dataset spec + containing file-path tables. """ # Call the parent's method y_pred = super().predict(X) @@ -2567,9 +2641,9 @@ def _numerical_transform_method(self): ) return _transform_types_numerical[self.transform_type_numerical] - def _fit_check_params(self, dataset, **kwargs): + def _fit_check_params(self, ds, **kwargs): # Call parent method - super()._fit_check_params(dataset, **kwargs) + super()._fit_check_params(ds, **kwargs) # Check 'transform_type_categorical' parameter if not isinstance(self.transform_type_categorical, str): @@ -2601,11 +2675,11 @@ def _fit_check_params(self, dataset, **kwargs): "cannot be both None with n_trees == 0." 
) - def _check_target_type(self, dataset): + def _check_target_type(self, ds): if self.categorical_target: - _check_categorical_target_type(dataset) + _check_categorical_target_type(ds) else: - _check_numerical_target_type(dataset) + _check_numerical_target_type(ds) def _fit_core_training_function(self, *args, **kwargs): return kh.train_recoder(*args, **kwargs) @@ -2621,19 +2695,21 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are secondary ones joined to the main table using ``key`` estimator parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + **Deprecated input types** (will be removed in Khiops 11): - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -2645,10 +2721,10 @@ def fit(self, X, y=None, **kwargs): # pylint: enable=useless-super-delegation - def _fit_prepare_training_function_inputs(self, dataset, computation_dir): + def _fit_prepare_training_function_inputs(self, ds, computation_dir): # Call the parent method args, kwargs = super()._fit_prepare_training_function_inputs( - dataset, computation_dir + ds, computation_dir ) # Rename encoder parameters, delete unused ones @@ -2664,9 +2740,9 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir): return args, kwargs - def _fit_training_post_process(self, dataset): + def _fit_training_post_process(self, ds): # Call parent method - super()._fit_training_post_process(dataset) + super()._fit_training_post_process(ds) # Eliminate the target variable from the main dictionary self._get_main_dictionary() @@ -2674,7 +2750,7 @@ def _fit_training_post_process(self, dataset): # Save the encoded feature names self.feature_names_out_ = [] for variable in self._get_main_dictionary().variables: - if variable.used: + if variable.used and variable.name != ds.target_column_id: self.feature_names_out_.append(variable.name) # Activate the key columns in multitable @@ -2694,7 +2770,8 @@ def transform(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The @@ -2706,9 +2783,6 @@ def transform(self, X): `ndarray ` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. - - *Deprecated return values* (will be removed in Khiops 11): str for - file based dataset specification. 
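# Editor's note: a usage sketch for the encoder (assumes the Khiops backend is
# installed and that the estimator is exported as khiops.sklearn.KhiopsEncoder).
# For in-memory inputs transform returns a numpy array of the recoded columns,
# and feature_names_out_ lists the encoded features without the target.
import pandas as pd
from khiops.sklearn import KhiopsEncoder

X = pd.DataFrame({"f1": ["u", "v", "u", "w", "v", "u"], "f2": [1.0, 3.5, 2.2, 0.7, 4.1, 2.9]})
y = pd.Series(["a", "b", "a", "b", "a", "b"], name="target")

enc = KhiopsEncoder().fit(X, y)
X_recoded = enc.transform(X)      # numpy.ndarray
print(enc.feature_names_out_)     # names of the encoded columns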
""" # Create temporary directory computation_dir = self._create_computation_dir("transform") @@ -2717,21 +2791,37 @@ def transform(self, X): # Create and transform the dataset try: - dataset = Dataset(X, key=self.key) + ds = Dataset(X, key=self.key) X_transformed = super()._transform( - dataset, + ds, computation_dir, - self.model_.copy, + self._transform_prepare_deployment_model, True, + "transform.txt", ) # Cleanup and restore the runner's temporary dir finally: self._cleanup_computation_dir(computation_dir) kh.get_runner().root_temp_dir = initial_runner_temp_dir - if dataset.is_in_memory(): + if ds.is_in_memory: return X_transformed.to_numpy(copy=False) return X_transformed + def _transform_prepare_deployment_model(self, ds): + assert hasattr( + self, "model_target_variable_name_" + ), "Target variable name has not been set" + + # Create a copy of the model dictionary domain with the target variable + # if it is not present in the input dataset + # Note: We use `list` to avoid a warning of numpy about the `in` operator + model_copy = self.model_.copy() + model_dictionary = model_copy.get_dictionary(self.model_main_dictionary_name_) + if self.model_target_variable_name_ not in list(ds.main_table.column_ids): + model_dictionary.remove_variable(self.model_target_variable_name_) + + return model_copy + def fit_transform(self, X, y=None, **kwargs): """Fit and transforms its inputs @@ -2740,19 +2830,22 @@ def fit_transform(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in Khiops 11): + + **Deprecated input types** (will be removed in Khiops 11): - tuple: A pair (``path_to_file``, ``separator``). - list: A sequence of dataframes or paths, or pairs path-separator. The first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + secondary ones joined to the main table using ``key`` estimator + parameter. y : :external:term:`array-like` of shape (n_samples,) - :external:term:`array-like` object containing the target values. + The target values. + + **Deprecated input types** (will be removed in Khiops 11): - **Deprecated input modes** (will be removed in Khiops 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- diff --git a/khiops/sklearn/tables.py b/khiops/sklearn/tables.py deleted file mode 100644 index b4709671..00000000 --- a/khiops/sklearn/tables.py +++ /dev/null @@ -1,1613 +0,0 @@ -###################################################################################### -# Copyright (c) 2024 Orange. All rights reserved. # -# This software is distributed under the BSD 3-Clause-clear License, the text of # -# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or # -# see the "LICENSE.md" file for more details. 
# -###################################################################################### -"""Classes for handling diverse data tables""" -import csv -import io -import warnings -from abc import ABC, abstractmethod -from collections.abc import Iterable, Mapping, Sequence - -import numpy as np -import pandas as pd -import scipy.sparse as sp -from sklearn.utils import check_array -from sklearn.utils.validation import column_or_1d - -import khiops.core as kh -import khiops.core.internals.filesystems as fs -from khiops.core.dictionary import VariableBlock -from khiops.core.internals.common import ( - deprecation_message, - is_dict_like, - is_list_like, - type_error_message, -) - -# Disable PEP8 variable names because of scikit-learn X,y conventions -# To capture invalid-names other than X,y run: -# pylint --disable=all --enable=invalid-names tables.py -# pylint: disable=invalid-name - - -def get_khiops_type(numpy_type): - """Translates a numpy type to a Khiops dictionary type - - Parameters - ---------- - numpy_type : `numpy.dtype`: - Numpy type of the column - - Returns - ------- - str - Khiops type name. Either "Categorical", "Numerical" or "Timestamp" - """ - lower_numpy_type = str(numpy_type).lower() - - # timedelta64 and datetime64 types - if "time" in lower_numpy_type: - return "Timestamp" - # float, int, uint types - elif "int" in lower_numpy_type or "float" in lower_numpy_type: - return "Numerical" - # bool_ and object, character, bytes_, str_, void, record and other types - else: - return "Categorical" - - -def read_internal_data_table(file_path_or_stream): - """Reads into a DataFrame a data table file with the internal format settings - - The table is read with the following settings: - - - Use tab as separator - - Read the column names from the first line - - Use '"' as quote character - - Use `csv.QUOTE_MINIMAL` - - double quoting enabled (quotes within quotes can be escaped with '""') - - UTF-8 encoding - - Parameters - ---------- - file_path_or_stream : str or file object - The path of the internal data table file to be read or a readable file - object. - - Returns - ------- - `pandas.DataFrame` - The dataframe representation. - """ - return pd.read_csv( - file_path_or_stream, - sep="\t", - header=0, - quotechar='"', - quoting=csv.QUOTE_MINIMAL, - doublequote=True, - encoding="utf-8", - ) - - -def write_internal_data_table(dataframe, file_path_or_stream): - """Writes a DataFrame to data table file with the internal format settings - - The table is written with the following settings: - - - Use tab as separator - - Write the column names on the first line - - Use '"' as quote character - - Use `csv.QUOTE_MINIMAL` - - double quoting enabled (quotes within quotes can be escaped with '""') - - UTF-8 encoding - - The index is not written - - Parameters - ---------- - dataframe : `pandas.DataFrame` - The dataframe to write. - file_path_or_stream : str or file object - The path of the internal data table file to be written or a writable file - object. - """ - dataframe.to_csv( - file_path_or_stream, - sep="\t", - header=True, - quotechar='"', - quoting=csv.QUOTE_MINIMAL, - doublequote=True, - encoding="utf-8", - index=False, - ) - - -class Dataset: - """A representation of a dataset - - Parameters - ---------- - X : `pandas.DataFrame` or dict (**Deprecated types**: tuple and list) - Either: - - A single dataframe - - A ``dict`` dataset specification - y : `pandas.Series` or str, optional - The target column. 
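# Editor's note: a round trip using the same settings as the internal
# read/write helpers above: tab separator, header line, minimal quoting with
# doubled quotes and no index column (UTF-8 applies when writing real files).
import csv
import io
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "comment": ['says "hi"', "plain"]})

buffer = io.StringIO()
df.to_csv(buffer, sep="\t", header=True, quotechar='"',
          quoting=csv.QUOTE_MINIMAL, doublequote=True, index=False)
buffer.seek(0)
df_back = pd.read_csv(buffer, sep="\t", header=0, quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, doublequote=True)
assert df.equals(df_back)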
- categorical_target : bool, default True - ``True`` if the vector ``y`` should be considered as a categorical variable. If - ``False`` it is considered as numeric. Ignored if ``y`` is ``None``. - key : str - The name of the key column for all tables. - **Deprecated:** Will be removed in pyKhiops 11. - """ - - def __init__(self, X, y=None, categorical_target=True, key=None): - # Initialize members - self.main_table = None - self.secondary_tables = None - self.relations = None - self.sep = None - self.header = None - - # Initialization from different types of input "X" - # A single pandas dataframe - if isinstance(X, pd.DataFrame): - self._init_tables_from_dataframe( - X, y, categorical_target=categorical_target - ) - # A single numpy array (or compatible object) - elif hasattr(X, "__array__"): - self._init_tables_from_numpy_array( - X, - y, - categorical_target=categorical_target, - ) - # A scipy.sparse.spmatrix - elif isinstance(X, sp.spmatrix): - self._init_tables_from_sparse_matrix( - X, y, categorical_target=categorical_target - ) - # Special rejection for scipy.sparse.sparray (to pass the sklearn tests) - # Note: We don't use scipy.sparse.sparray because it is not implemented in scipy - # 1.10 which is the latest supporting py3.8 - elif isinstance( - X, - ( - sp.bsr_array, - sp.coo_array, - sp.csc_array, - sp.csr_array, - sp.dia_array, - sp.dok_array, - sp.lil_array, - ), - ): - check_array(X, accept_sparse=False) - # A tuple spec - elif isinstance(X, tuple): - warnings.warn( - deprecation_message( - "Tuple dataset input", - "11.0.0", - replacement="dict dataset spec", - quote=False, - ), - stacklevel=3, - ) - self._init_tables_from_tuple(X, y, categorical_target=categorical_target) - # A sequence - # We try first for compatible python arrays then the deprecated sequences spec - elif is_list_like(X): - # Try to transform to a numerical array with sklearn's check_array - # On failure we try the old deprecated sequence interface - # When the old list interface is eliminated this will considerably reduce - # this branch's code - try: - X_checked = check_array(X, ensure_2d=True, force_all_finite=False) - self._init_tables_from_numpy_array( - X_checked, y, categorical_target=categorical_target - ) - except ValueError: - warnings.warn( - deprecation_message( - "List dataset input", - "11.0.0", - replacement="dict dataset spec", - quote=False, - ), - stacklevel=3, - ) - self._init_tables_from_sequence(X, y, key=key) - # A dict specification - elif is_dict_like(X): - self._init_tables_from_mapping(X, y) - # Fail if X is not recognized - else: - raise TypeError( - type_error_message("X", X, "array-like", tuple, Sequence, Mapping) - ) - - assert self.main_table is not None, "'main_table' is 'None' after init" - assert isinstance( - self.secondary_tables, list - ), "'secondary_tables' is not a list after init" - assert not self.is_multitable() or len( - self.secondary_tables - ), "'secondary_tables' is empty in a multi-table dataset" - - def _init_tables_from_dataframe(self, X, y=None, categorical_target=True): - """Initializes the dataset from a 'X' of type pandas.DataFrame""" - assert isinstance(X, pd.DataFrame), "'X' must be a pandas.DataFrame" - if y is not None and not hasattr(y, "__array__"): - raise TypeError(type_error_message("y", y, "array-like")) - self.main_table = PandasTable( - "main_table", X, target_column=y, categorical_target=categorical_target - ) - self.secondary_tables = [] - - def _init_tables_from_sparse_matrix(self, X, y=None, categorical_target=True): - """Initializes 
the dataset from a 'X' of type scipy.sparse.spmatrix""" - assert isinstance(X, sp.spmatrix), "'X' must be a scipy.sparse.spmatrix" - if y is not None and not hasattr(y, "__array__"): - raise TypeError(type_error_message("y", y, "array-like")) - - self.main_table = SparseTable( - "main_table", X, target_column=y, categorical_target=categorical_target - ) - self.secondary_tables = [] - - def _init_tables_from_numpy_array(self, X, y=None, categorical_target=True): - assert hasattr( - X, "__array__" - ), "'X' must be a numpy.ndarray or implement __array__" - - if y is not None: - y_checked = column_or_1d(y, warn=True) - else: - y_checked = None - self.main_table = NumpyTable( - "main_table", - X, - target_column=y_checked, - categorical_target=categorical_target, - ) - self.secondary_tables = [] - - def _init_tables_from_tuple(self, X, y=None, categorical_target=True): - """Initializes the spec from a 'X' of type tuple""" - assert isinstance(X, tuple), "'X' must be a tuple" - - # Check the input tuple - self._check_input_tuple(X, y) - - # Obtain path and separator - path, sep = X - - # Initialization - self.main_table = FileTable( - "main_table", - categorical_target=categorical_target, - target_column_id=y, - path=path, - sep=sep, - ) - self.secondary_tables = [] - - def _check_input_tuple(self, X, y=None): - if len(X) != 2: - raise ValueError(f"'X' tuple input must have length 2 not {len(X)}") - if not isinstance(X[0], str): - raise TypeError(type_error_message("X[0]", X[0], str)) - if not isinstance(X[1], str): - raise TypeError(type_error_message("X[1]", X[1], str)) - if y is not None and not isinstance(y, str): - raise TypeError(type_error_message("y", y, str)) - - def _init_tables_from_sequence(self, X, y=None, categorical_target=True, key=None): - """Initializes the spec from a list-like 'X'""" - assert is_list_like(X), "'X' must be a list-like" - - # Check the input sequence - self._check_input_sequence(X, y, key=key) - - # Initialize the tables - if isinstance(X[0], pd.DataFrame): - self.main_table = PandasTable( - "main_table", - X[0], - target_column=y, - categorical_target=categorical_target, - key=key, - ) - self.secondary_tables = [] - for index, dataframe in enumerate(X[1:], start=1): - self.secondary_tables.append( - PandasTable(f"secondary_table_{index:02d}", dataframe, key=key) - ) - else: - self.main_table = FileTable( - "main_table", - X[0], - target_column_id=y, - categorical_target=categorical_target, - key=key, - ) - self.secondary_tables = [] - for index, table_path in enumerate(X[1:], start=1): - self.secondary_tables.append( - FileTable(f"secondary_table_{index:02d}", table_path, key=key) - ) - # Create a list of relations - main_table_name = self.main_table.name - self.relations = [ - (main_table_name, table.name, False) for table in self.secondary_tables - ] - - def _check_input_sequence(self, X, y=None, key=None): - # Check the first table - if len(X) == 0: - raise ValueError("'X' must be a non-empty sequence") - if not isinstance(X[0], (str, pd.DataFrame)): - raise TypeError(type_error_message("X[0]", X[0], str, pd.DataFrame)) - - # Check that the secondary table types are coherent with that of the first - main_table_type = type(X[0]) - for i, secondary_X in enumerate(X[1:], start=1): - if not isinstance(secondary_X, main_table_type): - raise TypeError( - type_error_message(f"X[{i}]", X[i], main_table_type) - + " as the first table in X" - ) - - # Check the type of y - if y is not None: - if isinstance(X[0], str) and not isinstance(y, str): - raise 
TypeError(type_error_message("y", y, str)) - elif isinstance(X[0], pd.DataFrame) and not isinstance(y, pd.Series): - raise TypeError(type_error_message("y", y, pd.Series)) - - # Check the type of key - if not is_list_like(key) and not isinstance(key, str): - raise TypeError(type_error_message("key", key, "list-like", str)) - if is_list_like(key): - for column_index, column_name in enumerate(key): - if not isinstance(column_name, str): - raise TypeError( - type_error_message( - f"key[{column_index}]", key[column_index], str - ) - ) - - def _init_tables_from_mapping(self, X, y=None, categorical_target=True): - """Initializes the table spec from a dict-like 'X'""" - assert is_dict_like(X), "'X' must be dict-like" - - # Check the input mapping - self._check_input_mapping(X, y) - - # Initialize tables - if len(X["tables"]) == 1: - main_table_name = list(X["tables"])[0] - main_table_source, main_table_key = list(X["tables"].values())[0] - if isinstance(main_table_key, str): - main_table_key = [main_table_key] - else: - main_table_name = X["main_table"] - main_table_source, main_table_key = X["tables"][main_table_name] - - # Case of paths - if isinstance(main_table_source, str): - warnings.warn( - deprecation_message( - "File-path dataset input", - "11.0.0", - "dataframe-based dataset or khiops.core API", - quote=False, - ), - stacklevel=4, - ) - if "format" in X: - self.sep, self.header = X["format"] - else: - self.sep = "\t" - self.header = True - self.main_table = FileTable( - main_table_name, - main_table_source, - target_column_id=y, - categorical_target=categorical_target, - key=main_table_key, - sep=self.sep, - header=self.header, - ) - self.secondary_tables = [] - for table_name, (table_source, table_key) in X["tables"].items(): - if isinstance(table_key, str): - table_key = [table_key] - if table_name != main_table_name: - self.secondary_tables.append( - FileTable( - table_name, - table_source, - key=table_key, - sep=self.sep, - header=self.header, - ) - ) - # Case of dataframes - elif isinstance(main_table_source, pd.DataFrame): - self.main_table = PandasTable( - main_table_name, - main_table_source, - key=main_table_key, - target_column=y, - categorical_target=categorical_target, - ) - self.secondary_tables = [] - for table_name, (table_source, table_key) in X["tables"].items(): - if table_name != main_table_name: - self.secondary_tables.append( - PandasTable(table_name, table_source, key=table_key) - ) - # Case of sparse matrices - elif isinstance(main_table_source, sp.spmatrix): - self.main_table = SparseTable( - main_table_name, - main_table_source, - key=main_table_key, - target_column=y, - categorical_target=categorical_target, - ) - self.secondary_tables = [] - # Case of numpyarray - else: - self.main_table = NumpyTable( - main_table_name, - main_table_source, - target_column=y, - categorical_target=categorical_target, - ) - if len(X["tables"]) > 1: - raise ValueError( - "Multi-table schemas are only allowed " - "with pandas dataframe source tables." - ) - self.secondary_tables = [] - - if "relations" not in X: - # the schema is by default 'star' - # create a list of relations [(main_table, secondary_table, False), ...] 
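# Editor's note: an illustrative multi-table dict spec with an explicit
# 'relations' list (snowflake schema); table and column names are placeholders.
# Each relation is (parent, child) or (parent, child, is_one_to_one); without
# the third element it defaults to False, i.e. a 0..n 'Table' relation.
import pandas as pd

orders = pd.DataFrame({"order_id": [1, 2]})
lines = pd.DataFrame({"order_id": [1, 1, 2], "line_id": [1, 2, 1]})
details = pd.DataFrame({"order_id": [1, 1, 2], "line_id": [1, 2, 1], "note": ["a", "b", "c"]})

ds_spec = {
    "main_table": "orders",
    "tables": {
        "orders": (orders, "order_id"),
        "lines": (lines, ["order_id", "line_id"]),
        "details": (details, ["order_id", "line_id"]),
    },
    "relations": [
        ("orders", "lines"),           # 0..n relation -> 'Table' variable
        ("lines", "details", True),    # 1-1 relation  -> 'Entity' variable
    ],
}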
- self.relations = [ - (self.main_table.name, table.name, False) - for table in self.secondary_tables - ] - else: - # the schema could be 'star' or 'snowflake' - # unify the size of all relation tuples - # by adding 'False' to non-entities - # check user-specified relations - self._check_relations(X) - relations = [] - for relation in X["relations"]: - parent, child = relation[:2] - relations.append( - ( - parent, - child, - relation[2] if len(relation) == 3 else False, - ) - ) - self.relations = relations - - def _check_cycle_exists(self, relations, main_table_name): - """Check existence of a cycle into 'relations'""" - tables_to_visit = [main_table_name] - tables_visited = set() - while tables_to_visit: - current_table = tables_to_visit.pop(0) - tables_visited.add(current_table) - for relation in relations: - parent_table, child_table = relation[:2] - if parent_table == current_table: - tables_to_visit.append(child_table) - if tables_visited.intersection(tables_to_visit): - raise ValueError( - f"Relations at X['relations'] contain a cycle which" - f" includes the relation '{relation}'" - ) - - def _check_relation_keys(self, X, left_table_name, right_table_name): - """Check coherence of keys""" - _, left_table_key = X["tables"][left_table_name] - _, right_table_key = X["tables"][right_table_name] - table_key_error = False - if isinstance(left_table_key, str) and isinstance(right_table_key, str): - table_key_error = right_table_key != left_table_key - elif isinstance(left_table_key, str) and is_list_like(right_table_key): - table_key_error = left_table_key not in right_table_key - elif is_list_like(left_table_key) and is_list_like(right_table_key): - table_key_error = not set(left_table_key).issubset(set(right_table_key)) - elif is_list_like(left_table_key) and isinstance(right_table_key, str): - table_key_error = True - - if table_key_error: - if isinstance(right_table_key, str): - right_table_key_msg = f"[{right_table_key}]" - else: - right_table_key_msg = f"[{', '.join(right_table_key)}]" - if isinstance(left_table_key, str): - left_table_key_msg = f"[{left_table_key}]" - else: - left_table_key_msg = f"[{', '.join(left_table_key)}]" - raise ValueError( - f"key for table '{right_table_name}' " - f"{right_table_key_msg} is incompatible with " - f"that of table " - f"'{left_table_name}' {left_table_key_msg}" - ) - - def _check_relations(self, X): - """Check relations""" - main_table_name = X["main_table"] - relations = X["relations"] - parents_and_children = [relation[:2] for relation in relations] - for relation in relations: - parent_table, child_table = relation[:2] - for table in (parent_table, child_table): - if not isinstance(table, str): - raise TypeError( - type_error_message("Table of a relation", table, str) - ) - if parent_table == child_table: - raise ValueError( - f"Tables in relation '({parent_table}, {child_table})' " - f"are the same. They must be different." - ) - if parents_and_children.count(relation[:2]) > 1: - raise ValueError( - f"Relation '({parent_table}, {child_table})' occurs " - f"'{parents_and_children.count(relation[:2])}' times. " - f"Each relation must be unique." - ) - if not parent_table in X["tables"].keys(): - raise ValueError( - f"X['tables'] does not contain a table named '{parent_table}'. " - f"All tables in X['relations'] must be declared in X['tables']" - ) - if not child_table in X["tables"].keys(): - raise ValueError( - f"X['tables'] does not contain a table named '{child_table}'. " - f"All tables in X['relations'] must be declared in X['tables']." 
- ) - if len(relation) == 3: - is_one_to_one_relation = relation[2] - if not isinstance(is_one_to_one_relation, bool): - raise TypeError( - type_error_message( - f"1-1 flag for relation " - f"({parent_table}, {child_table})", - is_one_to_one_relation, - bool, - ) - ) - self._check_relation_keys(X, parent_table, child_table) - self._check_cycle_exists(relations, main_table_name) - - def _check_input_mapping(self, X, y=None): - # Check the "tables" field (basic) - if "tables" not in X: - raise ValueError("Mandatory key 'tables' missing from dict 'X'") - if not is_dict_like(X["tables"]): - raise TypeError(type_error_message("X['tables']", X["tables"], Mapping)) - if len(X["tables"]) == 0: - raise ValueError("X['tables'] cannot be empty") - - # Check coherence of each table specification - for table_name, table_input in X["tables"].items(): - if not isinstance(table_input, tuple): - raise TypeError( - type_error_message( - f"Table input at X['tables']['{table_name}']", - table_input, - tuple, - ) - ) - if len(table_input) != 2: - raise ValueError( - f"Table input tuple at X['tables']['{table_name}'] " - f"must have size 2 not {len(table_input)}" - ) - table_source, table_key = table_input - if not isinstance( - table_source, (pd.DataFrame, sp.spmatrix, str) - ) and not hasattr(table_source, "__array__"): - raise TypeError( - type_error_message( - f"Table source at X['tables']['{table_name}']", - table_source, - "array-like or scipy.sparse.spmatrix", - str, - ) - ) - if ( - table_key is not None - and not is_list_like(table_key) - and not isinstance(table_key, str) - ): - raise TypeError( - type_error_message( - f"Table key at X['tables']['{table_name}']", - table_key, - str, - Sequence, - ) - ) - - if table_key is not None: - for column_name in table_key: - if not isinstance(column_name, str): - raise TypeError( - type_error_message( - "Column name of table key " - f"at X['tables']['{table_name}']", - column_name, - str, - ) - ) - - # Multi-table specific table checks - if len(X["tables"]) > 1: - # Check the "main_table" field - if "main_table" not in X: - raise ValueError( - "'main_table' must be specified for multi-table datasets" - ) - if not isinstance(X["main_table"], str): - raise TypeError( - type_error_message("X['main_table']", X["main_table"], str) - ) - if X["main_table"] not in X["tables"]: - raise ValueError( - f"X['main_table'] ({X['main_table']}) " - f"must be present in X['tables']" - ) - main_table_source, main_table_key = X["tables"][X["main_table"]] - if main_table_key is None: - raise ValueError("key of the root table is 'None'") - if len(main_table_key) == 0: - raise ValueError( - "key of the root table must be non-empty for multi-table datasets" - ) - - # Check that all secondary tables have non-None keys - for table_name, (_, table_key) in X["tables"].items(): - if table_name != X["main_table"] and table_key is None: - raise ValueError( - f"key of the secondary table '{table_name}' is 'None':" - " table keys must be specified in multitable datasets" - ) - - if "relations" in X: - # check the 'relations' field - if not is_list_like(X["relations"]): - raise TypeError( - type_error_message( - "Relations at X['tables']['relations']", - X["relations"], - "list-like", - ) - ) - else: - for relation in X["relations"]: - if not isinstance(relation, tuple): - raise TypeError( - type_error_message("Relation", relation, tuple) - ) - if len(relation) not in (2, 3): - raise ValueError( - f"A relation must be of size 2 or 3, " - f"not {len(relation)}" - ) - - # Check the 'format' 
field - if "format" in X: - if not isinstance(X["format"], tuple): - raise TypeError(type_error_message("X['format']", X["format"], tuple)) - if not isinstance(X["format"][0], str): - raise TypeError( - type_error_message("X['format'] 1st element", X["format"][0], str) - ) - if not isinstance(X["format"][1], bool): - raise TypeError( - type_error_message("X['format'] 2nd element", X["format"][1], bool) - ) - sep, _ = X["format"][0], X["format"][1] - if len(sep) != 1: - raise ValueError(f"Separator must be a single character. Value: {sep}") - - # Check the target coherence with X's tables - if y is not None: - if len(X["tables"]) == 1: - main_table_source, _ = list(X["tables"].values())[0] - else: - main_table_source, _ = X["tables"][X["main_table"]] - if ( - isinstance(main_table_source, pd.DataFrame) - and not isinstance(y, pd.Series) - and not isinstance(y, pd.DataFrame) - ): - raise TypeError( - type_error_message("y", y, pd.Series, pd.DataFrame) - + " (X's tables are of type pandas.DataFrame)" - ) - if ( - isinstance(main_table_source, sp.spmatrix) - or hasattr(main_table_source, "__array__") - ) and not hasattr(y, "__array__"): - raise TypeError( - type_error_message("y", y, "array-like") - + " (X's tables are of type numpy.ndarray" - + " or scipy.sparse.spmatrix)" - ) - if isinstance(main_table_source, str) and not isinstance(y, str): - raise TypeError( - type_error_message("y", y, str) - + " (X's tables are of type str [file paths])" - ) - - def is_in_memory(self): - """Tests whether the dataset is in memory - - A dataset is in memory if it is constituted either of only pandas.DataFrame - tables, numpy.ndarray, or scipy.sparse.spmatrix tables. - - Returns - ------- - bool - `True` if the dataset is constituted of pandas.DataFrame tables. - """ - return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable)) - - def is_multitable(self): - """Tests whether the dataset is a multi-table one - - Returns - ------- - bool - ``True`` if the dataset is multi-table. 
- """ - return self.secondary_tables is not None and len(self.secondary_tables) > 0 - - def copy(self): - """Creates a copy of the dataset - - Referenced dataframes in tables are copied as references - """ - dataset_spec = {} - dataset_spec["main_table"] = self.main_table.name - dataset_spec["tables"] = {} - if self.is_in_memory(): - dataset_spec["tables"][self.main_table.name] = ( - self.main_table.dataframe, - self.main_table.key, - ) - for table in self.secondary_tables: - dataset_spec["tables"][table.name] = (table.dataframe, table.key) - else: - dataset_spec["tables"][self.main_table.name] = ( - self.main_table.path, - self.main_table.key, - ) - for table in self.secondary_tables: - dataset_spec["tables"][table.name] = (table.path, table.key) - dataset_spec["format"] = (self.sep, self.header) - return Dataset(dataset_spec) - - def create_khiops_dictionary_domain(self): - """Creates a Khiops dictionary domain representing this dataset - - Returns - ------- - `.DictionaryDomain` - The dictionary domain object representing this dataset - """ - assert self.main_table is not None, "'main_table' must be initialized" - - # Create root dictionary and add it to the domain - dictionary_domain = kh.DictionaryDomain() - root_dictionary = self.main_table.create_khiops_dictionary() - dictionary_domain.add_dictionary(root_dictionary) - - # Create the dictionaries for each secondary table and the table variables in - # root dictionary that point to each secondary table - # This is performed using a breadth-first-search over the graph of relations - # Note: In general 'name' and 'object_type' fields of Variable can be different - if self.secondary_tables: - root_dictionary.root = True - table_names = [table.name for table in self.secondary_tables] - tables_to_visit = [self.main_table.name] - while tables_to_visit: - current_table = tables_to_visit.pop(0) - for relation in self.relations: - parent_table, child_table, is_one_to_one_relation = relation - if parent_table == current_table: - tables_to_visit.append(child_table) - parent_table_name = parent_table - index_table = table_names.index(child_table) - table = self.secondary_tables[index_table] - parent_table_dictionary = dictionary_domain.get_dictionary( - parent_table_name - ) - dictionary = table.create_khiops_dictionary() - dictionary_domain.add_dictionary(dictionary) - table_variable = kh.Variable() - if is_one_to_one_relation: - table_variable.type = "Entity" - else: - table_variable.type = "Table" - table_variable.name = table.name - table_variable.object_type = table.name - parent_table_dictionary.add_variable(table_variable) - return dictionary_domain - - def create_table_files_for_khiops(self, target_dir, sort=True): - """Prepares the tables of the dataset to be used by Khiops - - If this is a multi-table dataset it will create sorted copies the tables. - - Parameters - ---------- - target_dir : str - The directory where the sorted tables will be created - - Returns - ------- - tuple - A tuple containing: - - - The path of the main table - - A dictionary containing the relation [table-name -> file-path] for the - secondary tables. The dictionary is empty for monotable datasets. 
- """ - # Sort the main table unless: - # - The caller specifies not to do it (sort = False) - # - The dataset is mono-table and the main table has no key - sort_main_table = sort and ( - self.is_multitable() or self.main_table.key is not None - ) - main_table_path = self.main_table.create_table_file_for_khiops( - target_dir, sort=sort_main_table - ) - - # Create a copy of each secondary table - secondary_table_paths = {} - for table in self.secondary_tables: - secondary_table_paths[table.name] = table.create_table_file_for_khiops( - target_dir, sort=sort - ) - return main_table_path, secondary_table_paths - - @property - def target_column_type(self): - """The target column's type""" - if self.main_table.target_column_id is None: - raise ValueError("Target column is not set") - if self.is_in_memory(): - return self.main_table.target_column.dtype - else: - return self.main_table.table_sample_df.dtypes[ - self.main_table.target_column_id - ] - - def __repr__(self): - return str(self.create_khiops_dictionary_domain()) - - -class DatasetTable(ABC): - """A generic dataset table""" - - def __init__(self, name, categorical_target=True, key=None): - # Check input - if not isinstance(name, str): - raise TypeError(type_error_message("name", name, str)) - if not name: - raise ValueError("'name' cannot be empty") - if key is not None: - if not is_list_like(key) and not isinstance(key, (str, int)): - raise TypeError(type_error_message("key", key, str, int, "list-like")) - if is_list_like(key): - for column_index, column_id in enumerate(key): - if not isinstance(column_id, (str, int)): - raise TypeError( - type_error_message( - f"key[{column_index}]", column_id, str, int - ) - + f" at table '{name}'" - ) - - # Initialization (must be completed by concrete sub-classes) - self.name = name - self.categorical_target = categorical_target - if is_list_like(key) or key is None: - self.key = key - else: - self.key = [key] - self.target_column_id = None - self.column_ids = None - self.khiops_types = None - self.n_samples = None - - def check_key(self): - """Checks that the key columns exist""" - if self.key is not None: - if not is_list_like(self.key): - raise TypeError( - type_error_message("key", self.key, str, int, "list-like") - ) - for column_name in self.key: - if column_name not in self.column_ids: - raise ValueError( - f"Column '{column_name}' not present in table '{self.name}'" - ) - - @abstractmethod - def create_table_file_for_khiops(self, output_dir, sort=True): - """Creates a copy of the table at the specified directory""" - - def n_features(self): - """Returns the number of features of the table - - The target column does not count. 
- """ - return len(self.column_ids) - - def create_khiops_dictionary(self): - """Creates a Khiops dictionary representing this table - - Returns - ------- - `.Dictionary`: - The Khiops Dictionary object describing this table's schema - - """ - assert self.column_ids is not None, "Dataset column list is None" - assert self.key is None or is_list_like(self.key), "'key' is not list-like" - - # Create dictionary object - dictionary = kh.Dictionary() - dictionary.name = self.name - if self.key is not None: - dictionary.key = list(self.key) - - # For each column add a Khiops variable to the dictionary - for column_id in self._get_all_column_ids(): - variable = kh.Variable() - - # Set the variable name for string and integer column indexes - if isinstance(column_id, str): - variable.name = str(column_id) - else: - assert isinstance(column_id, (np.int64, int)) - variable.name = f"Var{column_id}" - - # Set the type of the column/variable - # Case of a column in the key : Set to categorical - if self.key is not None and column_id in self.key: - variable.type = "Categorical" - # Case of the target column: Set to specified type - elif column_id == self.target_column_id: - assert self.target_column_id is not None - if self.categorical_target: - variable.type = "Categorical" - else: - variable.type = "Numerical" - # The rest of columns: Obtain the type from dtypes - else: - variable.type = self.khiops_types[column_id] - dictionary.add_variable(variable) - return dictionary - - @abstractmethod - def _get_all_column_ids(self): - """Returns the column ids including the target""" - - -class PandasTable(DatasetTable): - """Table encapsulating the features dataframe X and the target labels y - - X is of type pandas.DataFrame. - y is of type pandas.Series or pandas.DataFrame. - - Parameters - ---------- - name : str - Name for the table. - dataframe : `pandas.DataFrame` - The data frame to be encapsulated. - key : list-like of str, optional - The names of the columns composing the key - target_column : :external:term:`array-like`, optional - The array containing the target column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. - """ - - def __init__( - self, name, dataframe, key=None, target_column=None, categorical_target=True - ): - # Call the parent method - super().__init__(name, categorical_target=categorical_target, key=key) - - # Check inputs specific to this sub-class - if not isinstance(dataframe, pd.DataFrame): - raise TypeError(type_error_message("dataframe", dataframe, pd.DataFrame)) - if dataframe.shape[0] == 0: - raise ValueError("'dataframe' is empty") - if target_column is not None: - if not hasattr(target_column, "__array__"): - raise TypeError( - type_error_message("target_column", target_column, "array-like") - ) - if isinstance(target_column, pd.Series): - if ( - target_column.name is not None - and target_column.name in dataframe.columns - ): - raise ValueError( - f"Target series name '{target_column.name}' " - f"is already present in dataframe : {list(dataframe.columns)}" - ) - elif isinstance(target_column, pd.DataFrame): - number_of_target_columns = len(target_column.columns) - if number_of_target_columns != 1: - raise ValueError( - "Target dataframe should contain exactly one column. " - f"It contains {number_of_target_columns}." 
- ) - target_column = target_column.iloc[:, 0] - - # Initialize the attributes - self.dataframe = dataframe - self.n_samples = len(self.dataframe) - - # Initialize feature columns and verify their types - self.column_ids = self.dataframe.columns.values - if not np.issubdtype(self.column_ids.dtype, np.integer): - if np.issubdtype(self.column_ids.dtype, object): - for i, column_id in enumerate(self.column_ids): - if not isinstance(column_id, str): - raise TypeError( - f"Dataframe column ids must be either all integers or " - f"all strings. Column id at index {i} ('{column_id}') is" - f" of type '{type(column_id).__name__}'" - ) - else: - raise TypeError( - f"Dataframe column ids must be either all integers or " - f"all strings. The column index has dtype " - f"'{self.column_ids.dtype}'" - ) - - # Initialize Khiops types - self.khiops_types = { - column_id: get_khiops_type(self.dataframe.dtypes[column_id]) - for column_id in self.column_ids - } - - # Initialize target column (if any) - self.target_column = target_column - if self.target_column is not None: - if ( - isinstance(self.target_column, pd.Series) - and self.target_column.name is not None - ): - self.target_column_id = target_column.name - else: - if pd.api.types.is_integer_dtype(self.column_ids): - self.target_column_id = self.column_ids[-1] + 1 - else: - assert pd.api.types.is_string_dtype(self.column_ids) - self.target_column_id = "UnknownTargetColumn" - - # Check key integrity - self.check_key() - - def __repr__(self): - dtypes_str = ( - str(self.dataframe.dtypes).replace("\n", ", ")[:-16].replace(" ", ":") - ) - return ( - f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " - f"dtypes={dtypes_str}; target={self.target_column_id}>" - ) - - def _get_all_column_ids(self): - if self.target_column is not None: - all_column_ids = list(self.column_ids) + [self.target_column_id] - else: - all_column_ids = list(self.column_ids) - return all_column_ids - - def get_khiops_variable_name(self, column_id): - """Return the khiops variable name associated to a column id""" - assert column_id == self.target_column_id or column_id in self.column_ids - if isinstance(column_id, str): - variable_name = column_id - else: - assert isinstance(column_id, np.int64) - variable_name = f"Var{column_id}" - return variable_name - - def create_table_file_for_khiops(self, output_dir, sort=True): - assert not sort or self.key is not None, "Cannot sort table without a key" - assert not sort or is_list_like( - self.key - ), "Cannot sort table with a key is that is not list-like" - assert not sort or len(self.key) > 0, "Cannot sort table with an empty key" - - # Create the output table resource object - output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") - - # Write the output dataframe - output_dataframe = self._create_dataframe_copy() - - # Sort by key if requested (as string) - if sort: - output_dataframe.sort_values( - by=self.key, - key=lambda array: array.astype("str"), - inplace=True, - kind="mergesort", - ) - - # Write the dataframe to an internal table file - with io.StringIO() as output_dataframe_stream: - write_internal_data_table(output_dataframe, output_dataframe_stream) - fs.write( - output_table_path, output_dataframe_stream.getvalue().encode("utf-8") - ) - - return output_table_path - - def _create_dataframe_copy(self): - """Creates an in memory copy of the dataframe with the target column""" - # Create a copy of the dataframe and add a copy of the target column (if any) - if self.target_column is not None: - if ( - 
isinstance(self.target_column, pd.Series) - and self.target_column.name is not None - ): - output_target_column = self.target_column.reset_index(drop=True) - else: - output_target_column = pd.Series( - self.target_column, name=self.target_column_id - ) - output_dataframe = pd.concat( - [self.dataframe.reset_index(drop=True), output_target_column], - axis=1, - ) - else: - output_dataframe = self.dataframe.copy() - - # Rename the columns - output_dataframe_column_names = {} - for column_id in self._get_all_column_ids(): - output_dataframe_column_names[column_id] = self.get_khiops_variable_name( - column_id - ) - output_dataframe.rename( - output_dataframe_column_names, axis="columns", inplace=True - ) - - return output_dataframe - - -class NumpyTable(DatasetTable): - """Table encapsulating (X,y) pair with types (ndarray, ndarray) - - Parameters - ---------- - name : str - Name for the table. - array : :external:term:`array-like` of shape (n_samples, n_features_in) - The data frame to be encapsulated. - key : :external:term`array-like` of int, optional - The names of the columns composing the key - target_column : :external:term:`array-like` of shape (n_samples,) , optional - The series representing the target column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. - """ - - def __init__( - self, name, array, key=None, target_column=None, categorical_target=True - ): - # Call the parent method - super().__init__(name, key=key, categorical_target=categorical_target) - - # Check the array's types and shape - if not hasattr(array, "__array__"): - raise TypeError(type_error_message("array", array, np.ndarray)) - - # Check (and potentially transform with a copy) the array's data - checked_array = check_array(array, ensure_2d=True, force_all_finite=False) - - # Check the target's types and shape - if target_column is not None: - checked_target_column = column_or_1d(target_column, warn=True) - - # Initialize the members - self.array = checked_array - self.column_ids = list(range(self.array.shape[1])) - self.target_column_id = self.array.shape[1] - if target_column is not None: - self.target_column = checked_target_column - else: - self.target_column = None - self.categorical_target = categorical_target - self.khiops_types = { - column_id: get_khiops_type(self.array.dtype) - for column_id in self.column_ids - } - self.n_samples = len(self.array) - - def __repr__(self): - dtype_str = str(self.array.dtype) - return ( - f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " - f"dtype={dtype_str}; target={self.target_column_id}>" - ) - - def _get_all_column_ids(self): - n_columns = len(self.column_ids) - if self.target_column is not None: - n_columns += 1 - return list(range(n_columns)) - - def get_khiops_variable_name(self, column_id): - """Return the khiops variable name associated to a column id""" - assert column_id == self.target_column_id or column_id in self.column_ids - if isinstance(column_id, str): - variable_name = column_id - else: - assert isinstance(column_id, (np.int64, int)) - variable_name = f"Var{column_id}" - return variable_name - - def create_table_file_for_khiops(self, output_dir, sort=True): - assert not sort or self.key is not None, "Cannot sort table without a key" - assert not sort or is_list_like( - self.key - ), "Cannot sort table with a key is that is not list-like" - assert not sort or len(self.key) > 0, "Cannot sort table with an empty key" - - # Create the output table resource object - output_table_path = 
fs.get_child_path(output_dir, f"{self.name}.txt") - - # Write the output dataframe - output_dataframe = pd.DataFrame(self.array.copy()) - output_dataframe.columns = [f"Var{column_id}" for column_id in self.column_ids] - if self.target_column is not None: - output_dataframe[f"Var{self.target_column_id}"] = self.target_column - - # Sort by key if requested (as string) - if sort: - np.sort( - output_dataframe, - by=self.key, - key=lambda array: array.astype("str"), - inplace=True, - kind="mergesort", - ) - - # Write the dataframe to an internal table file - with io.StringIO() as output_dataframe_stream: - write_internal_data_table(output_dataframe, output_dataframe_stream) - fs.write( - output_table_path, output_dataframe_stream.getvalue().encode("utf-8") - ) - - return output_table_path - - -class SparseTable(DatasetTable): - """Table encapsulating feature matrix X and target array y - - X is of type scipy.sparse.spmatrix. - y is array-like. - - Parameters - ---------- - name : str - Name for the table. - matrix : `scipy.sparse.spmatrix` - The sparse matrix to be encapsulated. - key : list-like of str, optional - The names of the columns composing the key - target_column : :external:term:`array-like`, optional - The array containing the target column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. - """ - - def __init__( - self, name, matrix, key=None, target_column=None, categorical_target=True - ): - assert key is None, "'key' must be unset for sparse matrix tables" - # Call the parent method - super().__init__(name, key=key, categorical_target=categorical_target) - - # Check the sparse matrix types - if not isinstance(matrix, sp.spmatrix): - raise TypeError( - type_error_message("matrix", matrix, "scipy.sparse.spmatrix") - ) - if not np.issubdtype(matrix.dtype, np.number): - raise TypeError( - type_error_message("'matrix' dtype", matrix.dtype, "numeric") - ) - - # Check the target's types - if target_column is not None and not hasattr(target_column, "__array__"): - raise TypeError( - type_error_message("target_column", target_column, "array-like") - ) - - # Initialize the members - self.matrix = matrix - self.column_ids = list(range(self.matrix.shape[1])) - self.target_column_id = self.matrix.shape[1] - self.target_column = target_column - self.categorical_target = categorical_target - self.khiops_types = { - column_id: get_khiops_type(self.matrix.dtype) - for column_id in self.column_ids - } - self.n_samples = self.matrix.shape[0] - - def __repr__(self): - dtype_str = str(self.matrix.dtype) - return ( - f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " - f"dtype={dtype_str}; target={self.target_column_id}>" - ) - - def create_khiops_dictionary(self): - """Creates a Khiops dictionary representing this sparse table - - Adds metadata to each sparse variable - - Returns - ------- - `.Dictionary`: - The Khiops Dictionary object describing this table's schema - - """ - - # create dictionary as usual - dictionary = super().create_khiops_dictionary() - - # create variable block for containing the sparse variables - variable_block = VariableBlock() - variable_block.name = "SparseVariables" - - # For each variable, add metadata, named `VarKey` - variable_names = [variable.name for variable in dictionary.variables] - target_column_variable_name = self.get_khiops_variable_name( - self.target_column_id - ) - for i, variable_name in enumerate(variable_names, 1): - if variable_name != target_column_variable_name: - variable = 
dictionary.remove_variable(variable_name) - variable.meta_data.add_value("VarKey", i) - variable_block.add_variable(variable) - dictionary.add_variable_block(variable_block) - - return dictionary - - def _get_all_column_ids(self): - n_columns = len(self.column_ids) - if self.target_column is not None: - n_columns += 1 - return list(range(n_columns)) - - def get_khiops_variable_name(self, column_id): - """Return the khiops variable name associated to a column id""" - assert column_id == self.target_column_id or column_id in self.column_ids - if isinstance(column_id, str): - variable_name = column_id - else: - assert isinstance(column_id, (np.int64, int)) - variable_name = f"Var{column_id}" - return variable_name - - def _flatten(self, iterable): - if isinstance(iterable, Iterable): - for iterand in iterable: - if isinstance(iterand, Iterable): - yield from self._flatten(iterand) - else: - yield iterand - - def _write_sparse_block(self, row_index, stream, target=None): - assert row_index in range( - self.matrix.shape[0] - ), "'row_index' must be coherent with the shape of the sparse matrix" - if target is not None: - assert target in self.target_column, "'target' must be in the target column" - stream.write(f"{target}\t") - row = self.matrix.getrow(row_index) - # Variable indices are not always sorted in `row.indices` - # Khiops needs variable indices to be sorted - sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") - - # Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API - # is not homogeneous with other sparse matrices: it stores - # opaque Python lists as elements - # Thus: - # - if isinstance(self.matrix, sp.lil_matrix) and Python 3.8, then - # row.data is np.array([list([...])]) - # - else, row.data is np.array([...]) - # TODO: remove this flattening once Python 3.8 support is dropped - sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[ - sorted_indices.argsort() - ] - for variable_index, variable_value in zip(sorted_indices, sorted_data): - stream.write(f"{variable_index + 1}:{variable_value} ") - stream.write("\n") - - def create_table_file_for_khiops(self, output_dir, sort=True): - # Create the output table resource object - output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") - - # Write the sparse matrix to an internal table file - with io.StringIO() as output_sparse_matrix_stream: - if self.target_column is not None: - target_column_name = self.get_khiops_variable_name( - self.target_column_id - ) - output_sparse_matrix_stream.write( - f"{target_column_name}\tSparseVariables\n" - ) - for target, row_index in zip( - self.target_column, range(self.matrix.shape[0]) - ): - self._write_sparse_block( - row_index, output_sparse_matrix_stream, target=target - ) - else: - output_sparse_matrix_stream.write("SparseVariables\n") - for row_index in range(self.matrix.shape[0]): - self._write_sparse_block(row_index, output_sparse_matrix_stream) - fs.write( - output_table_path, - output_sparse_matrix_stream.getvalue().encode("utf-8"), - ) - - return output_table_path - - -class FileTable(DatasetTable): - """A table representing a delimited text file - - Parameters - ---------- - name : str - Name for the table. - path : str - Path of the file containing the table. - sep : str, optional - Field separator character. If not specified it will be inferred from the file. 
- header : bool, optional - Indicates if the table - key : list-like of str, optional - The names of the columns composing the key - target_column_id : str, optional - Name of the target variable column. - categorical_target : bool, default ``True``. - ``True`` if the target column is categorical. - """ - - def __init__( - self, - name, - path, - target_column_id=None, - categorical_target=True, - key=None, - sep="\t", - header=True, - ): - # Initialize parameters - super().__init__(name=name, categorical_target=categorical_target, key=key) - - # Check inputs specific to this sub-class - if not isinstance(path, str): - raise TypeError(type_error_message("path", path, str)) - if not fs.exists(path): - raise ValueError(f"Non-existent data table file: {path}") - - # Initialize members specific to this sub-class - self.path = path - self.sep = sep - self.header = header - self.target_column_id = target_column_id - - # Obtain the columns and their types from a sample of the data table - # We build the sample by reading the first 100 rows / 4MB of the file - table_file_head_contents = fs.read(self.path, size=4096 * 1024 - 1) - with io.BytesIO(table_file_head_contents) as table_file_head_contents_stream: - self.table_sample_df = pd.read_csv( - table_file_head_contents_stream, - nrows=100, - sep=self.sep, - header=0 if self.header else None, - ) - - # Raise error if there is no data in the table - if self.table_sample_df.shape[0] == 0: - raise ValueError(f"Empty data table file: {self.path}") - - # Save the columns and their types - self.column_ids = self.table_sample_df.columns.values - self.khiops_types = { - column_id: get_khiops_type(data_type) - for column_id, data_type in self.table_sample_df.dtypes.items() - } - - # Check key integrity - self.check_key() - - def _get_all_column_ids(self): - return list(self.column_ids) - - def get_khiops_variable_name(self, column_id): - assert column_id in self._get_all_column_ids() - return column_id - - def create_table_file_for_khiops(self, output_dir, sort=True): - assert not sort or self.key is not None, "key is 'None'" - - # Create the input and output file resources - if sort: - output_table_file_path = fs.get_child_path( - output_dir, f"sorted_{self.name}.txt" - ) - else: - output_table_file_path = fs.get_child_path( - output_dir, f"copy_{self.name}.txt" - ) - - # Fail if they have the same path - if output_table_file_path == self.path: - raise ValueError(f"Cannot overwrite this table's path: {self.path}") - - # Create a sorted copy if requested - if sort: - # Create the sorting dictionary domain - sort_dictionary_domain = kh.DictionaryDomain() - sort_dictionary_domain.add_dictionary(self.create_khiops_dictionary()) - - # Delegate the sorting and copy to khiops.core.sort_data_table - # We use the same input format of the original table - kh.sort_data_table( - sort_dictionary_domain, - self.name, - self.path, - output_table_file_path, - self.key, - field_separator=self.sep, - header_line=self.header, - output_field_separator=self.sep, - output_header_line=self.header, - ) - - # Otherwise copy the contents to the output file - else: - fs.write(output_table_file_path, fs.read(self.path)) - - return output_table_file_path diff --git a/khiops/utils/__init__.py b/khiops/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py new file mode 100644 index 00000000..500312ae --- /dev/null +++ b/khiops/utils/dataset.py @@ -0,0 +1,1508 @@ 
+###################################################################################### +# Copyright (c) 2024 Orange. All rights reserved. # +# This software is distributed under the BSD 3-Clause-clear License, the text of # +# which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or # +# see the "LICENSE.md" file for more details. # +###################################################################################### +"""Classes for handling diverse data tables""" +import csv +import io +import json +import os +import warnings +from abc import ABC, abstractmethod +from collections.abc import Iterable, Mapping, Sequence + +import numpy as np +import pandas as pd +from scipy import sparse as sp +from sklearn.utils import check_array +from sklearn.utils.validation import column_or_1d + +import khiops.core as kh +import khiops.core.internals.filesystems as fs +from khiops.core.dictionary import VariableBlock +from khiops.core.exceptions import KhiopsRuntimeError +from khiops.core.internals.common import ( + deprecation_message, + is_dict_like, + is_list_like, + type_error_message, +) + +# Disable PEP8 variable names because of scikit-learn X,y conventions +# To capture invalid-names other than X,y run: +# pylint --disable=all --enable=invalid-names dataset.py +# pylint: disable=invalid-name + + +def check_dataset_spec(ds_spec): + """Checks that a dataset spec is valid + + Parameters + ---------- + ds_spec : dict + A specification of a multi-table dataset (see :doc:`/multi_table_primer`). + + Raises + ------ + TypeError + If there are objects of the spec with invalid type. + ValueError + If there are objects of the spec with invalid values. + """ + # Check the spec type + if not is_dict_like(ds_spec): + raise TypeError(type_error_message("ds_spec", ds_spec, Mapping)) + + # Check the "tables" field + if "tables" not in ds_spec: + raise ValueError("'tables' entry missing from dataset dict spec") + if not is_dict_like(ds_spec["tables"]): + raise TypeError( + type_error_message("'tables' entry", ds_spec["tables"], Mapping) + ) + if len(ds_spec["tables"]) == 0: + raise ValueError("'tables' dictionary cannot be empty") + for table_name, table_entry in ds_spec["tables"].items(): + _check_table_entry(table_name, table_entry) + + # Multi-table specific table checks + if len(ds_spec["tables"]) > 1: + _check_multitable_spec(ds_spec) + + # Check the 'format' field + if "format" in ds_spec: + _check_format_entry(ds_spec["format"]) + + +def _check_table_entry(table_name, table_spec): + if not isinstance(table_spec, tuple): + raise TypeError( + type_error_message(f"'{table_name}' table entry", table_spec, tuple) + ) + if len(table_spec) != 2: + raise ValueError( + f"'{table_name}' table entry must have size 2, not {len(table_spec)}" + ) + source, key = table_spec + if not isinstance(source, (pd.DataFrame, sp.spmatrix, str)) and not hasattr( + source, "__array__" + ): + raise TypeError( + type_error_message( + f"'{table_name}' table's source", + source, + "array-like", + "scipy.sparse.spmatrix", + str, + ) + ) + _check_table_key(table_name, key) + + +def _check_table_key(table_name, key): + if key is not None: + if not is_list_like(key) and not isinstance(key, str): + raise TypeError( + type_error_message(f"'{table_name}' table's key", key, str, Sequence) + ) + if len(key) == 0: + raise ValueError(f"'{table_name}' table's key is empty") + for column_name in key: + if not isinstance(column_name, str): + raise TypeError( + type_error_message( + f"'{table_name}' table's key column name", + 
column_name, + str, + ) + ) + + +def _check_multitable_spec(ds_spec): + # Check the main table + if "main_table" not in ds_spec: + raise ValueError( + "'main_table' entry must be specified for multi-table datasets" + ) + if not isinstance(ds_spec["main_table"], str): + raise TypeError( + type_error_message("'main_table' entry", ds_spec["main_table"], str) + ) + if ds_spec["main_table"] not in ds_spec["tables"]: + raise ValueError( + f"A table entry with the main table's name ('{ds_spec['main_table']}') " + f"must be present in the 'tables' dictionary" + ) + + # Check that all tables have non-None keys + for table_name, (_, table_key) in ds_spec["tables"].items(): + if table_key is None: + table_kind = "main" if ds_spec["main_table"] == table_name else "secondary" + raise ValueError( + f"key of {table_kind} table '{table_name}' is 'None': " + "table keys must be specified in multi-table datasets" + ) + + # Check that all the tables have the same type as the main + main_table_type = type(ds_spec["tables"][ds_spec["main_table"]][0]) + for table_name, (table_source, _) in ds_spec["tables"].items(): + if table_name != ds_spec["main_table"]: + if not isinstance(table_source, main_table_type): + raise ValueError( + f"Secondary table '{table_name}' has type " + f"'{type(table_source).__name__}' which is different from the " + f"main table's type '{main_table_type.__name__}'." + ) + + # If the 'relations' entry exists check it + if "relations" in ds_spec: + relations_spec = ds_spec["relations"] + # Otherwise build a star schema relations spec and check it + else: + relations_spec = [ + (ds_spec["main_table"], table) + for table in ds_spec["tables"].keys() + if table != ds_spec["main_table"] + ] + _check_relations_entry(ds_spec["main_table"], ds_spec["tables"], relations_spec) + + +def _check_relations_entry(main_table_name, tables_spec, relations_spec): + # Check the types and size of the relation entries + if not is_list_like(relations_spec): + raise TypeError( + type_error_message("'relations' entry", relations_spec, "list-like") + ) + for i, relation in enumerate(relations_spec, 1): + # Check that the relation is a 2 or 3 tuple + if not isinstance(relation, tuple): + raise TypeError(type_error_message("Relation", relation, tuple)) + if len(relation) not in (2, 3): + raise ValueError(f"A relation must be of size 2 or 3, not {len(relation)}") + + # Check the types of the tuple contents + parent_table, child_table = relation[:2] + if not isinstance(parent_table, str): + raise TypeError( + type_error_message(f"Relation #{i}'s parent table", parent_table, str) + ) + if not isinstance(child_table, str): + raise TypeError( + type_error_message(f"Relation #{i}'s child table", child_table, str) + ) + if len(relation) == 3 and not isinstance(relation[2], bool): + raise TypeError( + type_error_message( + f"Relation #{i} ({parent_table}, {child_table}) 1-1 flag", + relation[2], + bool, + ) + ) + + # Check structure and coherence with the rest of the spec + parents_and_children = [relation[:2] for relation in relations_spec] + for i, relation in enumerate(relations_spec, 1): + parent_table, child_table = relation[:2] + if parent_table == child_table: + raise ValueError( + f"Relation #{i}'s tables are equal: ({parent_table}, {child_table}). " + "They must be different." + ) + for table in (parent_table, child_table): + if not table in tables_spec.keys(): + raise ValueError( + f"Relation #{i} ({parent_table}, {child_table}) contains " + f"non-existent table '{table}'. 
All relation tables must exist " + "in the 'tables' entry." + ) + if parents_and_children.count(relation[:2]) > 1: + raise ValueError( + f"Relation #{i} ({parent_table}, {child_table}) occurs " + f"{parents_and_children.count(relation[:2])} times. " + f"Each relation must be unique." + ) + + # Check hierachical keys + _check_hierarchical_keys( + i, + parent_table, + tables_spec[parent_table][1], + child_table, + tables_spec[child_table][1], + ) + + # Check there are no cycles + _check_no_cycles(relations_spec, main_table_name) + + +def _check_hierarchical_keys( + relation_id, parent_table, parent_table_key, child_table, child_table_key +): + """Check that the parent table's key is contained in the child table's key""" + # Perform the check and save the error status + error_found = False + if isinstance(parent_table_key, str) and isinstance(child_table_key, str): + error_found = child_table_key != parent_table_key + elif isinstance(parent_table_key, str) and is_list_like(child_table_key): + error_found = parent_table_key not in child_table_key + elif is_list_like(parent_table_key) and is_list_like(child_table_key): + error_found = not set(parent_table_key).issubset(child_table_key) + elif is_list_like(parent_table_key) and isinstance(child_table_key, str): + error_found = ( + len(parent_table_key) != 1 or child_table_key not in parent_table_key + ) + + # Report any error found + if error_found: + if isinstance(child_table_key, str): + child_table_key_msg = f"[{child_table_key}]" + else: + child_table_key_msg = f"[{', '.join(child_table_key)}]" + if isinstance(parent_table_key, str): + parent_table_key_msg = f"[{parent_table_key}]" + else: + parent_table_key_msg = f"[{', '.join(parent_table_key)}]" + raise ValueError( + f"Relation #{relation_id} child table '{child_table}' " + f"key ({child_table_key_msg}) does not contain that of parent table " + f"'{parent_table}' ({parent_table_key_msg})." + ) + + +def _check_no_cycles(relations_spec, main_table_name): + """Check that there are no cycles in the 'relations' entry""" + tables_to_visit = [main_table_name] + tables_visited = set() + while tables_to_visit: + current_table = tables_to_visit.pop(0) + tables_visited.add(current_table) + for relation in relations_spec: + parent_table, child_table = relation[:2] + if parent_table == current_table: + tables_to_visit.append(child_table) + if tables_visited.intersection(tables_to_visit): + raise ValueError( + "'relations' entry contains a cycle that includes " + f"the relation ({parent_table}, {child_table})." + ) + + +def _check_format_entry(format_spec): + if not isinstance(format_spec, tuple): + raise TypeError(type_error_message("'format' entry", format_spec, tuple)) + if len(format_spec) != 2: + raise ValueError( + f"'format' entry must be a tuple of size 2, not {len(format_spec)}" + ) + sep, header = format_spec + if not isinstance(sep, str): + raise TypeError( + type_error_message("'format' tuple's 1st element (separator)", sep, str) + ) + if not isinstance(header, bool): + raise TypeError( + type_error_message("'format' tuple's 2nd element (header)", header, bool) + ) + if len(sep) != 1: + raise ValueError(f"'format' separator must be a single char, got '{sep}'") + + +def get_khiops_type(numpy_type): + """Translates a numpy dtype to a Khiops dictionary type + + Parameters + ---------- + numpy_type : `numpy.dtype`: + Numpy type of the column + + Returns + ------- + str + Khiops type name. 
Either "Categorical", "Numerical" or "Timestamp" + """ + lower_numpy_type = str(numpy_type).lower() + + # timedelta64 and datetime64 types + if "datetime64" in lower_numpy_type or "timedelta64" in lower_numpy_type: + khiops_type = "Timestamp" + # float, int, uint types + elif "int" in lower_numpy_type or "float" in lower_numpy_type: + khiops_type = "Numerical" + # bool_ and object, character, bytes_, str_, void, record and other types + else: + khiops_type = "Categorical" + + return khiops_type + + +def get_khiops_variable_name(column_id): + """Return the khiops variable name associated to a column id""" + if isinstance(column_id, str): + variable_name = column_id + else: + assert isinstance(column_id, np.int64) + variable_name = f"Var{column_id}" + return variable_name + + +def read_internal_data_table(file_path_or_stream): + """Reads into a DataFrame a data table file with the internal format settings + + The table is read with the following settings: + + - Use tab as separator + - Read the column names from the first line + - Use '"' as quote character + - Use `csv.QUOTE_MINIMAL` + - double quoting enabled (quotes within quotes can be escaped with '""') + - UTF-8 encoding + + Parameters + ---------- + file_path_or_stream : str or file object + The path of the internal data table file to be read or a readable file + object. + + Returns + ------- + `pandas.DataFrame` + The dataframe representation. + """ + return pd.read_csv( + file_path_or_stream, + sep="\t", + header=0, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + encoding="utf-8", + ) + + +def write_internal_data_table(dataframe, file_path_or_stream): + """Writes a DataFrame to data table file with the internal format settings + + The table is written with the following settings: + + - Use tab as separator + - Write the column names on the first line + - Use '"' as quote character + - Use `csv.QUOTE_MINIMAL` + - double quoting enabled (quotes within quotes can be escaped with '""') + - UTF-8 encoding + - The index is not written + + Parameters + ---------- + dataframe : `pandas.DataFrame` + The dataframe to write. + file_path_or_stream : str or file object + The path of the internal data table file to be written or a writable file + object. + """ + dataframe.to_csv( + file_path_or_stream, + sep="\t", + header=True, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + encoding="utf-8", + index=False, + ) + + +class Dataset: + """A representation of a dataset + + Parameters + ---------- + X : `pandas.DataFrame` or dict (**Deprecated types**: tuple and list) + Either: + - A single dataframe + - A ``dict`` dataset specification + y : `pandas.Series` or str, optional + The target column. + categorical_target : bool, default True + ``True`` if the vector ``y`` should be considered as a categorical variable. If + ``False`` it is considered as numeric. Ignored if ``y`` is ``None``. + key : str + The name of the key column for all tables. + **Deprecated:** Will be removed in khiops-python 11. 
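+
+    Example
+    -------
+    A minimal multi-table sketch; ``clients_df`` and ``calls_df`` are hypothetical
+    dataframes sharing the key column ``id``, and ``y`` is a pandas series with the
+    target values aligned with ``clients_df``::
+
+        ds_spec = {
+            "main_table": "clients",
+            "tables": {
+                "clients": (clients_df, "id"),
+                "calls": (calls_df, "id"),
+            },
+        }
+        dataset = Dataset(ds_spec, y=y)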
+ """ + + def __init__(self, X, y=None, categorical_target=True, key=None): + # Initialize members + self.main_table = None + self.secondary_tables = None + self.relations = None + self.categorical_target = categorical_target + self.target_column = None + self.target_column_id = None + self.sep = None + self.header = None + + # Initialization from different types of input "X" + # A single pandas dataframe + if isinstance(X, pd.DataFrame): + self.main_table = PandasTable("main_table", X) + self.secondary_tables = [] + # A single numpy array (or compatible object) + elif hasattr(X, "__array__"): + self.main_table = NumpyTable("main_table", X) + self.secondary_tables = [] + # A scipy.sparse.spmatrix + elif isinstance(X, sp.spmatrix): + self.main_table = SparseTable("main_table", X) + self.secondary_tables = [] + # Special rejection for scipy.sparse.sparray (to pass the sklearn tests) + # Note: We don't use scipy.sparse.sparray because it is not implemented in scipy + # 1.10 which is the latest supporting py3.8 + elif isinstance( + X, + ( + sp.bsr_array, + sp.coo_array, + sp.csc_array, + sp.csr_array, + sp.dia_array, + sp.dok_array, + sp.lil_array, + ), + ): + check_array(X, accept_sparse=False) + # A tuple spec + elif isinstance(X, tuple): + warnings.warn( + deprecation_message( + "Tuple dataset input", + "11.0.0", + replacement="dict dataset spec", + quote=False, + ), + stacklevel=3, + ) + # Check the input tuple + self._check_input_tuple(X) + + # Obtain path and separator + path, sep = X + + # Initialization + self.main_table = FileTable("main_table", path=path, sep=sep) + self.secondary_tables = [] + + # A dataset sequence spec + # We try first for compatible python arrays then the deprecated sequences spec + elif is_list_like(X): + # Try to transform to a numerical array with sklearn's check_array + # On failure we try the old deprecated sequence interface + # When the old list interface is eliminated this will considerably reduce + # this branch's code + try: + X_checked = check_array(X, ensure_2d=True, force_all_finite=False) + self.main_table = NumpyTable("main_table", X_checked) + self.secondary_tables = [] + except ValueError: + warnings.warn( + deprecation_message( + "List dataset input", + "11.0.0", + replacement="dict dataset spec", + quote=False, + ), + stacklevel=3, + ) + self._init_tables_from_sequence(X, key=key) + # A a dataset dict spec + elif is_dict_like(X): + self._init_tables_from_mapping(X) + # Fail if X is not recognized + else: + raise TypeError( + type_error_message("X", X, "array-like", tuple, Sequence, Mapping) + ) + + # Initialization of the target column if any + if y is not None: + self._init_target_column(y) + + # Index the tables by name + self._tables_by_name = { + table.name: table for table in [self.main_table] + self.secondary_tables + } + + # Post-conditions + assert self.main_table is not None, "'main_table' is 'None' after init" + assert isinstance( + self.secondary_tables, list + ), "'secondary_tables' is not a list after init" + assert not self.is_multitable or len( + self.secondary_tables + ), "'secondary_tables' is empty in a multi-table dataset" + assert ( + y is None or self.target_column is not None + ), "'y' is set but 'target_column' is None" + + def _check_input_tuple(self, X): + if len(X) != 2: + raise ValueError(f"'X' tuple input must have length 2 not {len(X)}") + if not isinstance(X[0], str): + raise TypeError(type_error_message("X[0]", X[0], str)) + if not isinstance(X[1], str): + raise TypeError(type_error_message("X[1]", X[1], str)) + + 
def _init_tables_from_sequence(self, X, key=None): + """Initializes the spec from a list-like 'X'""" + assert is_list_like(X), "'X' must be a list-like" + + # Check the input sequence + self._check_input_sequence(X, key=key) + + # Initialize the tables + if isinstance(X[0], pd.DataFrame): + self.main_table = PandasTable("main_table", X[0], key=key) + self.secondary_tables = [] + for index, dataframe in enumerate(X[1:], start=1): + self.secondary_tables.append( + PandasTable(f"secondary_table_{index:02d}", dataframe, key=key) + ) + else: + self.main_table = FileTable("main_table", X[0], key=key) + self.secondary_tables = [] + for index, table_path in enumerate(X[1:], start=1): + self.secondary_tables.append( + FileTable(f"secondary_table_{index:02d}", table_path, key=key) + ) + + # Create a list of relations + main_table_name = self.main_table.name + self.relations = [ + (main_table_name, table.name, False) for table in self.secondary_tables + ] + + def _check_input_sequence(self, X, key=None): + # Check the first table + if len(X) == 0: + raise ValueError("'X' must be a non-empty sequence") + if not isinstance(X[0], (str, pd.DataFrame)): + raise TypeError(type_error_message("X[0]", X[0], str, pd.DataFrame)) + + # Check that the secondary table types are coherent with that of the first + main_table_type = type(X[0]) + for i, secondary_X in enumerate(X[1:], start=1): + if not isinstance(secondary_X, main_table_type): + raise TypeError( + type_error_message(f"Table at index {i}", X[i], main_table_type) + + " as the first table in X" + ) + + # Check the key for the main_table (it is the same for the others) + _check_table_key("main_table", key) + + def _init_tables_from_mapping(self, X): + """Initializes the table spec from a dict-like 'X'""" + assert is_dict_like(X), "'X' must be dict-like" + + # Check the input mapping + check_dataset_spec(X) + + # Initialize tables objects + if len(X["tables"]) == 1: + main_table_name = list(X["tables"])[0] + main_table_source, main_table_key = list(X["tables"].values())[0] + if isinstance(main_table_key, str): + main_table_key = [main_table_key] + else: + main_table_name = X["main_table"] + main_table_source, main_table_key = X["tables"][main_table_name] + + # Initialize a file dataset + if isinstance(main_table_source, str): + # Obtain the file format parameters + if "format" in X: + self.sep, self.header = X["format"] + else: + self.sep = "\t" + self.header = True + + # Initialize the tables + self.main_table = FileTable( + main_table_name, + main_table_source, + key=main_table_key, + sep=self.sep, + header=self.header, + ) + self.secondary_tables = [] + for table_name, (table_source, table_key) in X["tables"].items(): + if isinstance(table_key, str): + table_key = [table_key] + if table_name != main_table_name: + self.secondary_tables.append( + FileTable( + table_name, + table_source, + key=table_key, + sep=self.sep, + header=self.header, + ) + ) + # Initialize a Pandas dataset + elif isinstance(main_table_source, pd.DataFrame): + self.main_table = PandasTable( + main_table_name, + main_table_source, + key=main_table_key, + ) + self.secondary_tables = [] + for table_name, (table_source, table_key) in X["tables"].items(): + if table_name != main_table_name: + self.secondary_tables.append( + PandasTable(table_name, table_source, key=table_key) + ) + # Initialize a sparse dataset (monotable) + elif isinstance(main_table_source, sp.spmatrix): + self.main_table = SparseTable( + main_table_name, + main_table_source, + key=main_table_key, + ) + 
self.secondary_tables = [] + # Initialize a numpyarray dataset (monotable) + else: + self.main_table = NumpyTable( + main_table_name, + main_table_source, + ) + if len(X["tables"]) > 1: + raise ValueError( + "Multi-table schemas are only allowed " + "with pandas dataframe source tables" + ) + self.secondary_tables = [] + + # If the relations are not specified intialize to a star schema + if "relations" not in X: + self.relations = [ + (self.main_table.name, table.name, False) + for table in self.secondary_tables + ] + # Otherwise initialize the relations in the spec + else: + relations = [] + for relation in X["relations"]: + parent, child = relation[:2] + relations.append( + (parent, child, relation[2] if len(relation) == 3 else False) + ) + self.relations = relations + + def _init_target_column(self, y): + assert self.main_table is not None + assert self.secondary_tables is not None + + # Check y's type + # For in memory target columns: + # - column_or_1d checks *and transforms* to a numpy.array if successful + # - warn=True in column_or_1d is necessary to pass sklearn checks + if isinstance(y, str): + y_checked = y + else: + y_checked = column_or_1d(y, warn=True) + + # Check the target type coherence with those of X's tables + if isinstance( + self.main_table, (PandasTable, SparseTable, NumpyTable) + ) and isinstance(y_checked, str): + if isinstance(self.main_table, PandasTable): + type_message = "pandas.DataFrame" + elif isinstance(self.main_table, SparseTable): + type_message = "scipy.sparse.spmatrix" + else: + type_message = "numpy.ndarray" + raise TypeError( + type_error_message("y", y, "array-like") + + f" (X's tables are of type {type_message})" + ) + if isinstance(self.main_table.data_source, str) and not isinstance( + y_checked, str + ): + raise TypeError( + type_error_message("y", y, str) + + " (X's tables are of type str [file paths])" + ) + + # Initialize the members related to the target + # Case when y is a memory array + if hasattr(y_checked, "__array__"): + self.target_column = y_checked + + # Initialize the id of the target column + if isinstance(y, pd.Series) and y.name is not None: + self.target_column_id = y.name + elif isinstance(y, pd.DataFrame): + self.target_column_id = y.columns[0] + else: + if pd.api.types.is_integer_dtype(self.main_table.column_ids): + self.target_column_id = self.main_table.column_ids[-1] + 1 + else: + assert pd.api.types.is_string_dtype(self.main_table.column_ids) + self.target_column_id = "UnknownTargetColumn" + + # Fail if there is a column in the main_table with the target column's name + if self.target_column_id in self.main_table.column_ids: + raise ValueError( + f"Target column name '{self.target_column_id}' " + f"is already present in the main table. " + f"Column names: {list(self.main_table.column_ids)}" + ) + # Case when y is column id: Set both the column and the id to it + else: + assert isinstance(y, str), type_error_message("y", y, str) + self.target_column = y + self.target_column_id = y + + # Check the target column exists in the main table + if self.target_column_id not in self.main_table.column_ids: + raise ValueError( + f"Target column '{self.target_column}' not present in main table. 
" + f"Column names: {list(self.main_table.column_ids)}'" + ) + + # Force the target column type from the parameters + if self.categorical_target: + self.main_table.khiops_types[self.target_column_id] = "Categorical" + else: + self.main_table.khiops_types[self.target_column_id] = "Numerical" + + @property + def is_in_memory(self): + """bool : ``True`` if the dataset is in-memory + + A dataset is in-memory if it is constituted either of only pandas.DataFrame + tables, numpy.ndarray, or scipy.sparse.spmatrix tables. + """ + + return isinstance(self.main_table, (PandasTable, NumpyTable, SparseTable)) + + @property + def table_type(self): + """type : The table type of this dataset's tables + + Possible values: + + - `PandasTable` + - `NumpyTable` + - `SparseTable` + - `FileTable` + """ + return type(self.main_table) + + @property + def is_multitable(self): + """bool : ``True`` if the dataset is multitable""" + return self.secondary_tables is not None and len(self.secondary_tables) > 0 + + def to_spec(self): + """Returns a dictionary specification of this dataset""" + ds_spec = {} + ds_spec["main_table"] = self.main_table.name + ds_spec["tables"] = {} + ds_spec["tables"][self.main_table.name] = ( + self.main_table.data_source, + self.main_table.key, + ) + for table in self.secondary_tables: + ds_spec["tables"][table.name] = (table.data_source, table.key) + if self.relations: + ds_spec["relations"] = [] + ds_spec["relations"].extend(self.relations) + if self.table_type == FileTable: + ds_spec["format"] = (self.sep, self.header) + + return ds_spec + + def copy(self): + """Creates a copy of the dataset + + Referenced pandas.DataFrame's, numpy.nparray's and scipy.sparse.spmatrix's in + tables are copied as references. + """ + return Dataset(self.to_spec()) + + def get_table(self, table_name): + """Returns a table by its name + + Parameters + ---------- + table_name: str + The name of the table to be retrieved. + + Returns + ------- + `DatasetTable` + The table object for the specified name. + + Raises + ------ + `KeyError` + If there is no table with the specified name. 
+        """
+        return self._tables_by_name[table_name]
+
+    def create_khiops_dictionary_domain(self):
+        """Creates a Khiops dictionary domain representing this dataset
+
+        Returns
+        -------
+        `.DictionaryDomain`
+            The dictionary domain object representing this dataset
+        """
+        assert self.main_table is not None, "'main_table' must be initialized"
+
+        # Create root dictionary and add it to the domain
+        dictionary_domain = kh.DictionaryDomain()
+        main_dictionary = self.main_table.create_khiops_dictionary()
+        dictionary_domain.add_dictionary(main_dictionary)
+
+        # For in-memory datasets: Add the target variable if available
+        if self.is_in_memory and self.target_column is not None:
+            variable = kh.Variable()
+            variable.name = get_khiops_variable_name(self.target_column_id)
+            if self.categorical_target:
+                variable.type = "Categorical"
+            else:
+                variable.type = "Numerical"
+            main_dictionary.add_variable(variable)
+
+        # Create the dictionaries for each secondary table and the table variables in
+        # the root dictionary that point to each secondary table
+        # This is performed using a breadth-first-search over the graph of relations
+        # Note: In general 'name' and 'object_type' fields of Variable can be different
+        if self.secondary_tables:
+            main_dictionary.root = True
+            table_names = [table.name for table in self.secondary_tables]
+            tables_to_visit = [self.main_table.name]
+            while tables_to_visit:
+                current_table = tables_to_visit.pop(0)
+                for relation in self.relations:
+                    parent_table, child_table, is_one_to_one_relation = relation
+                    if parent_table == current_table:
+                        tables_to_visit.append(child_table)
+                        parent_table_name = parent_table
+                        index_table = table_names.index(child_table)
+                        table = self.secondary_tables[index_table]
+                        parent_table_dictionary = dictionary_domain.get_dictionary(
+                            parent_table_name
+                        )
+                        dictionary = table.create_khiops_dictionary()
+                        dictionary_domain.add_dictionary(dictionary)
+                        table_variable = kh.Variable()
+                        if is_one_to_one_relation:
+                            table_variable.type = "Entity"
+                        else:
+                            table_variable.type = "Table"
+                        table_variable.name = table.name
+                        table_variable.object_type = table.name
+                        parent_table_dictionary.add_variable(table_variable)
+
+        return dictionary_domain
+
+    def create_table_files_for_khiops(self, output_dir, sort=True):
+        """Prepares the tables of the dataset to be used by Khiops
+
+        If this is a multi-table dataset, it will create sorted copies of the tables.
+
+        Parameters
+        ----------
+        output_dir : str
+            The directory where the sorted tables will be created.
+
+        Returns
+        -------
+        tuple
+            A tuple containing:
+
+            - The path of the main table
+            - A dictionary containing the relation [table-name -> file-path] for the
+              secondary tables. The dictionary is empty for monotable datasets.
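+
+        Example
+        -------
+        A usage sketch, assuming ``dataset`` is a `Dataset` instance and
+        ``"./khiops_tmp"`` is a writable directory::
+
+            main_path, secondary_paths = dataset.create_table_files_for_khiops(
+                "./khiops_tmp"
+            )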
+ """ + # Sort the main table unless: + # - The caller specifies not to do it (sort = False) + # - The dataset is mono-table and the main table has no key + sort_main_table = sort and ( + self.is_multitable or self.main_table.key is not None + ) + + # In-memory dataset: Create the table files and add the target column + if self.is_in_memory: + main_table_path = self.main_table.create_table_file_for_khiops( + output_dir, + sort=sort_main_table, + target_column=self.target_column, + target_column_id=self.target_column_id, + ) + # File dataset: Create the table files (the target column is in the file) + else: + main_table_path = self.main_table.create_table_file_for_khiops( + output_dir, + sort=sort_main_table, + ) + + # Create a copy of each secondary table + secondary_table_paths = {} + for table in self.secondary_tables: + secondary_table_paths[table.name] = table.create_table_file_for_khiops( + output_dir, sort=sort + ) + + return main_table_path, secondary_table_paths + + def __repr__(self): + return str(self.create_khiops_dictionary_domain()) + + +# pylint: enable=invalid-name + + +class DatasetTable(ABC): + """A generic dataset table""" + + def __init__(self, name, key=None): + # Check input + if not isinstance(name, str): + raise TypeError(type_error_message("name", name, str)) + if not name: + raise ValueError("'name' cannot be empty") + if key is not None: + if not is_list_like(key) and not isinstance(key, (str, int)): + raise TypeError(type_error_message("key", key, str, int, "list-like")) + if is_list_like(key): + for column_index, column_id in enumerate(key): + if not isinstance(column_id, (str, int)): + raise TypeError( + type_error_message( + f"key[{column_index}]", column_id, str, int + ) + + f" at table '{name}'" + ) + + # Initialization (must be completed by concrete sub-classes) + self.name = name + self.data_source = None + if is_list_like(key) or key is None: + self.key = key + else: + self.key = [key] + self.column_ids = None + self.khiops_types = None + self.n_samples = None + + def check_key(self): + """Checks that the key columns exist""" + if self.key is not None: + if not is_list_like(self.key): + raise TypeError( + type_error_message("key", self.key, str, int, "list-like") + ) + for column_name in self.key: + if column_name not in self.column_ids: + raise ValueError( + f"Column '{column_name}' not present in table '{self.name}'" + ) + + @abstractmethod + def create_table_file_for_khiops(self, output_dir, sort=True): + """Creates a copy of the table at the specified directory""" + + def n_features(self): + """Returns the number of features of the table + + The target column does not count. 
+ """ + return len(self.column_ids) + + def create_khiops_dictionary(self): + """Creates a Khiops dictionary representing this table + + Returns + ------- + `.Dictionary`: + The Khiops Dictionary object describing this table's schema + + """ + assert self.column_ids is not None, "Dataset column list is None" + assert self.key is None or is_list_like(self.key), "'key' is not list-like" + + # Create dictionary object + dictionary = kh.Dictionary() + dictionary.name = self.name + if self.key is not None: + dictionary.key = self.key + + # For each column add a Khiops variable to the dictionary + for column_id in self.column_ids: + variable = kh.Variable() + variable.name = get_khiops_variable_name(column_id) + + # Set the type of the column/variable + # Case of a column in the key : Set to categorical + if self.key is not None and column_id in self.key: + variable.type = "Categorical" + # The rest of columns: Obtain the type from dtypes + else: + variable.type = self.khiops_types[column_id] + dictionary.add_variable(variable) + return dictionary + + +class PandasTable(DatasetTable): + """DatasetTable encapsulating a pandas dataframe + + Parameters + ---------- + name : str + Name for the table. + dataframe : `pandas.DataFrame` + The data frame to be encapsulated. It must be non-empty. + key : list of str, optional + The names of the columns composing the key. + """ + + def __init__(self, name, dataframe, key=None): + # Call the parent method + super().__init__(name, key=key) + + # Check inputs specific to this sub-class + if not isinstance(dataframe, pd.DataFrame): + raise TypeError(type_error_message("dataframe", dataframe, pd.DataFrame)) + if dataframe.shape[0] == 0: + raise ValueError("'dataframe' is empty") + + # Initialize the attributes + self.data_source = dataframe + self.n_samples = len(self.data_source) + + # Initialize feature columns and verify their types + self.column_ids = self.data_source.columns.values + if not np.issubdtype(self.column_ids.dtype, np.integer): + if np.issubdtype(self.column_ids.dtype, object): + for i, column_id in enumerate(self.column_ids): + if not isinstance(column_id, str): + raise TypeError( + f"Dataframe column ids must be either all integers or " + f"all strings. Column id at index {i} ('{column_id}') is" + f" of type '{type(column_id).__name__}'" + ) + else: + raise TypeError( + f"Dataframe column ids must be either all integers or " + f"all strings. 
The column index has dtype "
+                    f"'{self.column_ids.dtype}'"
+                )
+
+        # Initialize Khiops types
+        self.khiops_types = {
+            column_id: get_khiops_type(self.data_source.dtypes[column_id])
+            for column_id in self.column_ids
+        }
+
+        # Check key integrity
+        self.check_key()
+
+    def __repr__(self):
+        dtypes_str = (
+            str(self.data_source.dtypes).replace("\n", ", ")[:-16].replace(" ", ":")
+        )
+        return (
+            f"<{self.__class__.__name__}; cols={list(self.column_ids)}; "
+            f"dtypes={dtypes_str}>"
+        )
+
+    def create_table_file_for_khiops(
+        self, output_dir, sort=True, target_column=None, target_column_id=None
+    ):
+        assert not sort or self.key is not None, "Cannot sort table without a key"
+        assert not sort or is_list_like(
+            self.key
+        ), "Cannot sort table with a key that is not list-like"
+        assert not sort or len(self.key) > 0, "Cannot sort table with an empty key"
+        assert target_column is not None or target_column_id is None
+        assert target_column_id is not None or target_column is None
+
+        # Create the output table resource object
+        output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt")
+
+        # Write the output dataframe
+        output_dataframe = self.data_source.copy()
+        output_names = {
+            column_id: get_khiops_variable_name(column_id)
+            for column_id in self.column_ids
+        }
+        output_dataframe.rename(columns=output_names, inplace=True)
+        if target_column is not None:
+            output_dataframe[get_khiops_variable_name(target_column_id)] = (
+                target_column.copy()
+            )
+
+        # Sort by key if requested (as string)
+        if sort:
+            output_dataframe.sort_values(
+                by=self.key,
+                key=lambda array: array.astype("str"),
+                inplace=True,
+                kind="mergesort",
+            )
+
+        # Write the dataframe to an internal table file
+        with io.StringIO() as output_dataframe_stream:
+            write_internal_data_table(output_dataframe, output_dataframe_stream)
+            fs.write(
+                output_table_path, output_dataframe_stream.getvalue().encode("utf-8")
+            )
+
+        return output_table_path
+
+
+class NumpyTable(DatasetTable):
+    """DatasetTable encapsulating a NumPy array
+
+    Parameters
+    ----------
+    name : str
+        Name for the table.
+    array : `numpy.ndarray` of shape (n_samples, n_features_in)
+        The array to be encapsulated.
+    key : :external:term:`array-like` of int, optional
+        The names of the columns composing the key.
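+
+    Example
+    -------
+    A minimal sketch with a small synthetic array::
+
+        import numpy as np
+
+        table = NumpyTable("main_table", np.array([[1.0, 2.0], [3.0, 4.0]]))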
+ """ + + def __init__(self, name, array, key=None): + # Call the parent method + super().__init__(name, key=key) + + # Check the array's types and shape + if not hasattr(array, "__array__"): + raise TypeError(type_error_message("array", array, np.ndarray)) + + # Initialize the members + self.data_source = check_array(array, ensure_2d=True, force_all_finite=False) + self.column_ids = column_or_1d(range(self.data_source.shape[1])) + self.khiops_types = { + column_id: get_khiops_type(self.data_source.dtype) + for column_id in self.column_ids + } + self.n_samples = len(self.data_source) + + def __repr__(self): + dtype_str = str(self.data_source.dtype) + return ( + f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " + f"dtype={dtype_str}; target={self.target_column_id}>" + ) + + def create_table_file_for_khiops( + self, output_dir, sort=True, target_column=None, target_column_id=None + ): + assert not sort or self.key is not None, "Cannot sort table without a key" + assert not sort or is_list_like( + self.key + ), "Cannot sort table with a key is that is not list-like" + assert not sort or len(self.key) > 0, "Cannot sort table with an empty key" + + # Create the output table resource object + output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") + + # Write the output dataframe + # Note: This is not optimized for memory. + output_dataframe = pd.DataFrame(self.data_source.copy()) + output_dataframe.columns = [ + get_khiops_variable_name(column_id) for column_id in self.column_ids + ] + if target_column is not None: + output_dataframe[get_khiops_variable_name(target_column_id)] = ( + target_column.copy() + ) + + # Sort by key if requested (as string) + if sort: + np.sort( + output_dataframe, + by=self.key, + key=lambda array: array.astype("str"), + inplace=True, + kind="mergesort", + ) + + # Write the dataframe to an internal table file + with io.StringIO() as output_dataframe_stream: + write_internal_data_table(output_dataframe, output_dataframe_stream) + fs.write( + output_table_path, output_dataframe_stream.getvalue().encode("utf-8") + ) + + return output_table_path + + +class SparseTable(DatasetTable): + """DatasetTable encapsulating a SciPy sparse matrix + + Parameters + ---------- + name : str + Name for the table. + matrix : `scipy.sparse.spmatrix` + The sparse matrix to be encapsulated. + key : list of str, optional + The names of the columns composing the key. 
+ """ + + def __init__(self, name, matrix, key=None): + assert key is None, "'key' must be unset for sparse matrix tables" + # Call the parent method + super().__init__(name, key=key) + + # Check the sparse matrix types + if not isinstance(matrix, sp.spmatrix): + raise TypeError( + type_error_message("matrix", matrix, "scipy.sparse.spmatrix") + ) + if not np.issubdtype(matrix.dtype, np.number): + raise TypeError( + type_error_message("'matrix' dtype", matrix.dtype, "numeric") + ) + + # Initialize the members + self.data_source = matrix + self.column_ids = column_or_1d(range(matrix.shape[1])) + self.khiops_types = { + column_id: get_khiops_type(matrix.dtype) for column_id in self.column_ids + } + self.n_samples = self.data_source.shape[0] + + def __repr__(self): + dtype_str = str(self.data_source.dtype) + return ( + f"<{self.__class__.__name__}; cols={list(self.column_ids)}; " + f"dtype={dtype_str}>" + ) + + def create_khiops_dictionary(self): + """Creates a Khiops dictionary representing this sparse table + + Adds metadata to each sparse variable + + Returns + ------- + `.Dictionary`: + The Khiops Dictionary object describing this table's schema + + """ + + # create dictionary as usual + dictionary = super().create_khiops_dictionary() + + # create variable block for containing the sparse variables + variable_block = VariableBlock() + variable_block.name = "SparseVariables" + + # For each variable, add metadata, named `VarKey` + variable_names = [variable.name for variable in dictionary.variables] + for i, variable_name in enumerate(variable_names, 1): + variable = dictionary.remove_variable(variable_name) + variable.meta_data.add_value("VarKey", i) + variable_block.add_variable(variable) + dictionary.add_variable_block(variable_block) + + return dictionary + + def _flatten(self, iterable): + if isinstance(iterable, Iterable): + for iterand in iterable: + if isinstance(iterand, Iterable): + yield from self._flatten(iterand) + else: + yield iterand + + def _write_sparse_block(self, row_index, stream, target_value=None): + + # Access the sparse row + row = self.data_source.getrow(row_index) + # Variable indices are not always sorted in `row.indices` + # Khiops needs variable indices to be sorted + sorted_indices = np.sort(row.nonzero()[1], axis=-1, kind="mergesort") + + # Flatten row for Python < 3.9 scipy.sparse.lil_matrix whose API + # is not homogeneous with other sparse matrices: it stores + # opaque Python lists as elements + # Thus: + # - if isinstance(self.data_source, sp.lil_matrix) and Python 3.8, then + # row.data is np.array([list([...])]) + # - else, row.data is np.array([...]) + # TODO: remove this flattening once Python 3.8 support is dropped + sorted_data = np.fromiter(self._flatten(row.data), row.data.dtype)[ + sorted_indices.argsort() + ] + for variable_index, variable_value in zip(sorted_indices, sorted_data): + stream.write(f"{variable_index + 1}:{variable_value} ") + + # Write the target value at the end of the record if available + if target_value is not None: + stream.write(f"\t{target_value}\n") + else: + stream.write("\n") + + def create_table_file_for_khiops( + self, output_dir, sort=True, target_column=None, target_column_id=None + ): + assert target_column is not None or target_column_id is None + assert target_column_id is not None or target_column is None + + # Create the output table resource object + output_table_path = fs.get_child_path(output_dir, f"{self.name}.txt") + + # Write the sparse matrix to an internal table file + with io.StringIO() as 
output_sparse_matrix_stream: + if target_column is not None: + output_sparse_matrix_stream.write( + f"SparseVariables\t{get_khiops_variable_name(target_column_id)}\n" + ) + for target_value, row_index in zip( + target_column, range(self.data_source.shape[0]) + ): + self._write_sparse_block( + row_index, + output_sparse_matrix_stream, + target_value=target_value, + ) + else: + output_sparse_matrix_stream.write("SparseVariables\n") + for row_index in range(self.data_source.shape[0]): + self._write_sparse_block(row_index, output_sparse_matrix_stream) + fs.write( + output_table_path, + output_sparse_matrix_stream.getvalue().encode("utf-8"), + ) + + return output_table_path + + +class FileTable(DatasetTable): + """DatasetTable encapsulating a delimited text data file + + Parameters + ---------- + name : str + Name for the table. + path : str + Path of the file containing the table. + key : list-like of str, optional + The names of the columns composing the key. + sep : str, optional + Field separator character. If not specified it will be inferred from the file. + header : bool, optional + Indicates if the table. + """ + + def __init__( + self, + name, + path, + key=None, + sep="\t", + header=True, + ): + # Initialize parameters + super().__init__(name=name, key=key) + + # Check the parameters specific to this sub-class + if not isinstance(path, str): + raise TypeError(type_error_message("path", path, str)) + if not fs.exists(path): + raise ValueError(f"Non-existent data table file: {path}") + + # Initialize members specific to this sub-class + self.data_source = path + self.sep = sep + self.header = header + + # Build a dictionary file from the input data table + # Note: We use export_dictionary_as_json instead of read_dictionary_file + # because it makes fail the sklearn mocked tests (this is technical debt) + try: + tmp_kdic_path = kh.get_runner().create_temp_file("file_table_", ".kdic") + tmp_kdicj_path = kh.get_runner().create_temp_file("file_table_", ".kdicj") + kh.build_dictionary_from_data_table( + self.data_source, + self.name, + tmp_kdic_path, + field_separator=self.sep, + header_line=header, + ) + kh.export_dictionary_as_json(tmp_kdic_path, tmp_kdicj_path) + with open(tmp_kdicj_path, encoding="utf8") as tmp_kdicj: + json_domain = json.load(tmp_kdicj) + finally: + os.remove(tmp_kdic_path) + os.remove(tmp_kdicj_path) + + # Alert the user if the parsing failed + if len(json_domain["dictionaries"]) == 0: + raise KhiopsRuntimeError( + f"Failed to build a dictionary " + f"from data table file: {self.data_source}" + ) + + # Set the column names and types + variables = json_domain["dictionaries"][0]["variables"] + self.column_ids = [var["name"] for var in variables] + self.khiops_types = {var["name"]: var["type"] for var in variables} + + # Check key integrity + self.check_key() + + def create_table_file_for_khiops(self, output_dir, sort=True): + assert not sort or self.key is not None, "key is 'None'" + + # Create the input and output file resources + if sort: + output_table_file_path = fs.get_child_path( + output_dir, f"sorted_{self.name}.txt" + ) + else: + output_table_file_path = fs.get_child_path( + output_dir, f"copy_{self.name}.txt" + ) + + # Fail if they have the same path + if output_table_file_path == self.data_source: + raise ValueError(f"Cannot overwrite this table's path: {self.data_source}") + + # Create a sorted copy if requested + if sort: + # Create the sorting dictionary domain + sort_dictionary_domain = kh.DictionaryDomain() + 
sort_dictionary_domain.add_dictionary(self.create_khiops_dictionary()) + + # Delegate the sorting and copy to khiops.core.sort_data_table + # We use the same input format of the original table + kh.sort_data_table( + sort_dictionary_domain, + self.name, + self.data_source, + output_table_file_path, + self.key, + field_separator=self.sep, + header_line=self.header, + output_field_separator=self.sep, + output_header_line=self.header, + ) + + # Otherwise copy the contents to the output file + else: + fs.write(output_table_file_path, fs.read(self.data_source)) + + return output_table_file_path diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py new file mode 100644 index 00000000..e8c4d192 --- /dev/null +++ b/khiops/utils/helpers.py @@ -0,0 +1,327 @@ +"""General helper functions""" + +import itertools +import os + +from sklearn.model_selection import train_test_split + +from khiops import core as kh +from khiops.core.internals.common import is_dict_like, type_error_message +from khiops.utils.dataset import Dataset, FileTable, PandasTable + + +def sort_dataset(ds_spec, output_dir=None): + """Sorts a dataset by its table key columns + + + The dataset may be multi-table or not. If it is monotable the key of the only table + must be specified. + + Parameters + ---------- + ds_spec: dict + A dataset spec. The tables must be either `pandas.DataFrame` or file path + references. + output_dir: str, optional + *Only for file datasets:* The output directory for the sorted files. + + + Notes + ----- + + The sorting algorithm is mergesort, which ensures sort stability. The sorting engine + for dataframes is Pandas and for file-based datasets is Khiops. + + Examples + -------- + See the following functions of the ``samples.py`` documentation script: + - `samples.sort_data_tables_mt()` + """ + # Check the types + if not is_dict_like(ds_spec): + raise TypeError(type_error_message("ds_spec", ds_spec, "dict-like")) + + # Build the dataset + ds = Dataset(ds_spec) + + # Check special arguments in function of the dataset + if ds.table_type == FileTable and output_dir is None: + raise ValueError("'output_dir' must be specified for file based datasets") + + # Make a copy of the dataset (note: data sources are just reference) + out_ds = ds.copy() + + # Replace each datasource with the sorted table + for table in [out_ds.main_table] + out_ds.secondary_tables: + if isinstance(table, PandasTable): + table.data_source = _sort_df_table(table) + else: + assert isinstance(table, FileTable) + table.data_source = _sort_file_table(table, ds.sep, ds.header, output_dir) + + return out_ds.to_spec() + + +def _sort_df_table(table): + assert isinstance(table, PandasTable), type_error_message( + "table", table, PandasTable + ) + out_data_source = table.data_source.sort_values( + by=table.key, + key=lambda array: array.astype("str"), + inplace=False, + kind="mergesort", + ) + + return out_data_source + + +def _sort_file_table(table, sep, header, output_dir): + assert isinstance(table, FileTable), type_error_message("table", table, FileTable) + domain = kh.DictionaryDomain() + dictionary = table.create_khiops_dictionary() + domain.add_dictionary(dictionary) + out_data_source = os.path.join(output_dir, f"{dictionary.name}.txt") + kh.sort_data_table( + domain, + dictionary.name, + table.data_source, + out_data_source, + field_separator=sep, + header_line=header, + output_field_separator=sep, + output_header_line=header, + ) + + return out_data_source + + +# Note: We build the splits with lists and itertools.chain avoid pylint 
warning about +# unbalanced-tuple-unpacking. See issue https://github.com/pylint-dev/pylint/issues/5671 + + +def train_test_split_dataset( + ds_spec, target_column=None, test_size=0.25, output_dir=None, **kwargs +): + """Splits a dataset spec into train and test + + Parameters + ---------- + ds_spec : ``dict`` + A dataset spec. The tables must be either `pandas.DataFrame` or file path + references. + target_column : :external:term:`array-like`, optional + The target values. + test_size : float, default 0.25 + The proportion of the dataset (between 0.0 and 1.0) to be included in the test + split. + output_dir : str, optional + *Only for file datasets:* The output directory for the split data files. + ... : + Other optional parameters for `sklearn.model_selection.train_test_split` + + + Examples + -------- + See the following functions of the ``samples_sklearn.py`` documentation script: + - `samples_sklearn.khiops_classifier_multitable_star` + - `samples_sklearn.khiops_classifier_multitable_star_file` + - `samples_sklearn.khiops_classifier_multitable_snowflake` + """ + # Check the types + if not is_dict_like(ds_spec): + raise TypeError(type_error_message("ds_spec", ds_spec, "dict-like")) + + # Build the dataset for the feature table + ds = Dataset(ds_spec) + + # Check the parameter coherence + if not ds.is_in_memory: + if target_column is not None: + raise ValueError("'target_column' cannot be used with file path datasets") + if output_dir is None: + raise ValueError("'output_dir' must be specified for file path datasets") + if not isinstance(output_dir, str): + raise TypeError(type_error_message("output_dir", output_dir, str)) + + # Perform the split for each type of dataset + if ds.is_in_memory: + # Obtain the keys for the other test_train_split function + sklearn_split_params = {} + for param in ("train_size", "random_state", "shuffle", "stratify"): + if param in kwargs: + sklearn_split_params[param] = kwargs[param] + + if target_column is None: + train_ds, test_ds = _train_test_split_in_memory_dataset( + ds, + target_column, + test_size=test_size, + split_params=sklearn_split_params, + ) + train_target_column = None + test_target_column = None + else: + train_ds, test_ds, train_target_column, test_target_column = ( + _train_test_split_in_memory_dataset( + ds, + target_column, + test_size=test_size, + split_params=sklearn_split_params, + ) + ) + else: + train_ds, test_ds = _train_test_split_file_dataset(ds, test_size, output_dir) + train_target_column = None + test_target_column = None + + # Create the return tuple + # Note: We use `itertools.chain` to avoid pylint false positive about + # unbalanced-tuple-unpacking. This warning appears when calling the function so + # users would be warned. 
To remove when the following issue is fixed: + # https://github.com/pylint-dev/pylint/issues/5671 + if target_column is None: + split = itertools.chain((train_ds.to_spec(), test_ds.to_spec())) + else: + split = itertools.chain( + ( + train_ds.to_spec(), + test_ds.to_spec(), + train_target_column, + test_target_column, + ) + ) + + return split + + +def _train_test_split_in_memory_dataset( + ds, target_column, test_size, split_params=None +): + # Create shallow copies of the feature dataset + train_ds = ds.copy() + test_ds = ds.copy() + + # Split the main table and the target (if any) + if target_column is None: + train_ds.main_table.data_source, test_ds.main_table.data_source = ( + train_test_split( + ds.main_table.data_source, test_size=test_size, **split_params + ) + ) + train_target_column = None + test_target_column = None + else: + ( + train_ds.main_table.data_source, + test_ds.main_table.data_source, + train_target_column, + test_target_column, + ) = train_test_split( + ds.main_table.data_source, + target_column, + test_size=test_size, + **split_params, + ) + + # Split the secondary tables tables + # Note: The tables are traversed in BFS + todo_relations = [ + relation for relation in ds.relations if relation[0] == ds.main_table.name + ] + while todo_relations: + current_parent_table_name, current_child_table_name, _ = todo_relations.pop(0) + for relation in ds.relations: + parent_table_name, _, _ = relation + if parent_table_name == current_child_table_name: + todo_relations.append(relation) + + for new_ds in (train_ds, test_ds): + origin_child_table = ds.get_table(current_child_table_name) + new_child_table = new_ds.get_table(current_child_table_name) + new_parent_table = new_ds.get_table(current_parent_table_name) + new_parent_key_cols_df = new_parent_table.data_source[new_parent_table.key] + new_child_table.data_source = new_parent_key_cols_df.merge( + origin_child_table.data_source, on=new_parent_table.key + ) + + # Build the return value + # Note: We use `itertools.chain` to avoid pylint false positive about + # unbalanced-tuple-unpacking. This warning appears when calling the function so + # users would be warned. 
To remove when the following issue is fixed: + # https://github.com/pylint-dev/pylint/issues/5671 + if target_column is None: + split = itertools.chain((train_ds, test_ds)) + else: + split = itertools.chain( + (train_ds, test_ds, train_target_column, test_target_column) + ) + + return split + + +def _train_test_split_file_dataset(ds, test_size, output_dir): + domain = ds.create_khiops_dictionary_domain() + secondary_data_paths = domain.extract_data_paths(ds.main_table.name) + additional_data_tables = {} + output_additional_data_tables = { + "train": {}, + "test": {}, + } + # Initialize the split datasets as copies of the original one + split_dss = { + "train": ds.copy(), + "test": ds.copy(), + } + for split, split_ds in split_dss.items(): + split_ds.main_table.data_source = os.path.join( + output_dir, split, f"{split_ds.main_table.name}.txt" + ) + + for data_path in secondary_data_paths: + dictionary = domain.get_dictionary_at_data_path(data_path) + table = ds.get_table(dictionary.name) + additional_data_tables[data_path] = table.data_source + for ( + split, + split_output_additional_data_tables, + ) in output_additional_data_tables.items(): + data_table_path = os.path.join(output_dir, split, f"{table.name}.txt") + split_output_additional_data_tables[data_path] = data_table_path + split_dss[split].get_table(table.name).data_source = data_table_path + + # Construct the split with Khiops by deploying a idempotent model with selection + kh.deploy_model( + domain, + ds.main_table.name, + ds.main_table.data_source, + split_dss["train"].main_table.data_source, + additional_data_tables=additional_data_tables, + output_additional_data_tables=output_additional_data_tables["train"], + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, + sample_percentage=100.0 * (1 - test_size), + sampling_mode="Include sample", + ) + kh.deploy_model( + domain, + ds.main_table.name, + ds.main_table.data_source, + split_dss["test"].main_table.data_source, + additional_data_tables=additional_data_tables, + output_additional_data_tables=output_additional_data_tables["test"], + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, + sample_percentage=100.0 * (1 - test_size), + sampling_mode="Exclude sample", + ) + + # Note: We use `itertools.chain` to avoid pylint false positive about + # unbalanced-tuple-unpacking. This warning appears when calling the function so + # users would be warned. 
To remove when the following issue is fixed: + # https://github.com/pylint-dev/pylint/issues/5671 + return itertools.chain(split_dss.values()) diff --git a/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/transformed.txt b/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/Adult/KhiopsClassifier/predict.txt diff --git a/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transformed.txt b/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transform.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/Adult/KhiopsEncoder/transform.txt diff --git a/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/transformed.txt b/tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/Adult/KhiopsRegressor/predict.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsClassifier/predict.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transform.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsCoclustering/transform.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transform.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsEncoder/transform.txt diff --git a/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/transformed.txt b/tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/predict.txt similarity index 100% rename from tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/transformed.txt rename to tests/resources/sklearn/results/ref_predictions/SpliceJunction/KhiopsRegressor/predict.txt diff --git a/tests/test_dataset_class.py b/tests/test_dataset_class.py index 3a0d417f..be0034c6 100644 --- a/tests/test_dataset_class.py +++ b/tests/test_dataset_class.py @@ -8,6 +8,7 @@ import os import shutil import unittest +import warnings import numpy as np import pandas as pd @@ -16,10 +17,10 @@ from pandas.testing import assert_frame_equal from sklearn import datasets -from khiops.sklearn.tables import Dataset +from khiops.utils.dataset import Dataset -class 
KhiopsConsistensyOfFilesAndDictionariesWithInputDataTests(unittest.TestCase): +class DatasetInputOutputConsistencyTests(unittest.TestCase): """Test consistency of the created files with the input data The following tests allow to verify that: @@ -85,7 +86,7 @@ def create_monotable_dataframe(self): "2019-03-29", "2019-03-30", "2019-03-31", - ] + ], ), "New": [ True, @@ -170,6 +171,10 @@ def create_multitable_star_data_files(self, main_table_path, secondary_table_pat secondary_table.to_csv(secondary_table_path, sep="\t", index=False) def create_multitable_snowflake_dataframes(self): + # Set the random seed for reproducibility + np.random.seed(31416) + + # Create the main table main_table_data = { "User_ID": [ "60B2Xk_3Fw", @@ -187,6 +192,7 @@ def create_multitable_snowflake_dataframes(self): } main_table = pd.DataFrame(main_table_data) + # Create the secondary tables secondary_table_data_1 = { "User_ID": np.random.choice(main_table["User_ID"], 20), "VAR_1": np.random.choice(["a", "b", "c", "d"], 20), @@ -195,7 +201,6 @@ def create_multitable_snowflake_dataframes(self): "VAR_4": np.round(np.random.rand(20).tolist(), 2), } secondary_table_1 = pd.DataFrame(secondary_table_data_1) - secondary_table_data_2 = { "User_ID": np.random.choice( main_table["User_ID"], len(main_table), replace=False @@ -208,7 +213,6 @@ def create_multitable_snowflake_dataframes(self): "VAR_4": np.round(np.random.rand(len(main_table)).tolist(), 2), } secondary_table_2 = pd.DataFrame(secondary_table_data_2) - tertiary_table_data = { "User_ID": np.random.choice(main_table["User_ID"], 100), "VAR_1": np.random.choice(["a", "b", "c", "d"], 100), @@ -216,7 +220,6 @@ def create_multitable_snowflake_dataframes(self): "VAR_3": np.round(np.random.rand(100).tolist(), 2), } tertiary_table = pd.DataFrame(tertiary_table_data) - quaternary_table_data = { "User_ID": np.random.choice(main_table["User_ID"], 50), "VAR_1": np.random.choice(["a", "b", "c", "d"], 50), @@ -255,55 +258,53 @@ def create_multitable_snowflake_data_files( tertiary_table.to_csv(tertiary_table_path, sep="\t", index=False) quaternary_table.to_csv(quaternary_table_path, sep="\t", index=False) - def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema): + def create_fixture_ds_spec(self, output_dir, data_type, multitable, schema): if not multitable: if data_type == "df": - reference_table = self.create_monotable_dataframe() - features = reference_table.drop(["class"], axis=1) - dataset_spec = { + ref_table = self.create_monotable_dataframe() + features = ref_table.drop(["class"], axis=1) + ds_spec = { "main_table": "Reviews", "tables": {"Reviews": (features, "User_ID")}, } - label = reference_table["class"] + label = ref_table["class"] else: assert data_type == "file" - reference_table_path = os.path.join(output_dir, "Reviews.csv") - self.create_monotable_data_file(reference_table_path) - dataset_spec = { + ref_table_path = os.path.join(output_dir, "Reviews.csv") + self.create_monotable_data_file(ref_table_path) + ds_spec = { "main_table": "Reviews", - "tables": {"Reviews": (reference_table_path, "User_ID")}, + "tables": {"Reviews": (ref_table_path, "User_ID")}, "format": ("\t", True), } label = "class" elif schema == "star": if data_type == "df": ( - reference_main_table, - reference_secondary_table, + ref_main_table, + ref_secondary_table, ) = self.create_multitable_star_dataframes() - features_reference_main_table = reference_main_table.drop( - "class", axis=1 - ) - dataset_spec = { + features_ref_main_table = ref_main_table.drop("class", 
axis=1) + ds_spec = { "main_table": "id_class", "tables": { - "id_class": (features_reference_main_table, "User_ID"), - "logs": (reference_secondary_table, "User_ID"), + "id_class": (features_ref_main_table, "User_ID"), + "logs": (ref_secondary_table, "User_ID"), }, } - label = reference_main_table["class"] + label = ref_main_table["class"] else: assert data_type == "file" - reference_main_table_path = os.path.join(output_dir, "id_class.csv") - reference_secondary_table_path = os.path.join(output_dir, "logs.csv") + ref_main_table_path = os.path.join(output_dir, "id_class.csv") + ref_secondary_table_path = os.path.join(output_dir, "logs.csv") self.create_multitable_star_data_files( - reference_main_table_path, reference_secondary_table_path + ref_main_table_path, ref_secondary_table_path ) - dataset_spec = { + ds_spec = { "main_table": "id_class", "tables": { - "id_class": (reference_main_table_path, "User_ID"), - "logs": (reference_secondary_table_path, "User_ID"), + "id_class": (ref_main_table_path, "User_ID"), + "logs": (ref_secondary_table_path, "User_ID"), }, "format": ("\t", True), } @@ -312,30 +313,28 @@ def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema) assert schema == "snowflake" if data_type == "df": ( - reference_main_table, - reference_secondary_table_1, - reference_secondary_table_2, - reference_tertiary_table, - reference_quaternary_table, + ref_main_table, + ref_secondary_table_1, + ref_secondary_table_2, + ref_tertiary_table, + ref_quaternary_table, ) = self.create_multitable_snowflake_dataframes() - features_reference_main_table = reference_main_table.drop( - "class", axis=1 - ) - dataset_spec = { + features_ref_main_table = ref_main_table.drop("class", axis=1) + ds_spec = { "main_table": "A", "tables": { "D": ( - reference_tertiary_table, + ref_tertiary_table, ["User_ID", "VAR_1", "VAR_2"], ), - "B": (reference_secondary_table_1, ["User_ID", "VAR_1"]), + "B": (ref_secondary_table_1, ["User_ID", "VAR_1"]), "E": ( - reference_quaternary_table, + ref_quaternary_table, ["User_ID", "VAR_1", "VAR_2", "VAR_3"], ), - "C": (reference_secondary_table_2, ["User_ID"]), - "A": (features_reference_main_table, "User_ID"), + "C": (ref_secondary_table_2, ["User_ID"]), + "A": (features_ref_main_table, "User_ID"), }, "relations": [ ("B", "D", False), @@ -344,40 +343,40 @@ def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema) ("A", "B", False), ], } - label = reference_main_table["class"] + label = ref_main_table["class"] else: assert data_type == "file" - reference_main_table_path = os.path.join(output_dir, "A.csv") - reference_secondary_table_path_1 = os.path.join(output_dir, "B.csv") - reference_secondary_table_path_2 = os.path.join(output_dir, "C.csv") - reference_tertiary_table_path = os.path.join(output_dir, "D.csv") - reference_quaternary_table_path = os.path.join(output_dir, "E.csv") + ref_main_table_path = os.path.join(output_dir, "A.csv") + ref_secondary_table_path_1 = os.path.join(output_dir, "B.csv") + ref_secondary_table_path_2 = os.path.join(output_dir, "C.csv") + ref_tertiary_table_path = os.path.join(output_dir, "D.csv") + ref_quaternary_table_path = os.path.join(output_dir, "E.csv") self.create_multitable_snowflake_data_files( - reference_main_table_path, - reference_secondary_table_path_1, - reference_secondary_table_path_2, - reference_tertiary_table_path, - reference_quaternary_table_path, + ref_main_table_path, + ref_secondary_table_path_1, + ref_secondary_table_path_2, + ref_tertiary_table_path, + 
ref_quaternary_table_path, ) - dataset_spec = { + ds_spec = { "main_table": "A", "tables": { "B": ( - reference_secondary_table_path_1, + ref_secondary_table_path_1, ["User_ID", "VAR_1"], ), "E": ( - reference_quaternary_table_path, + ref_quaternary_table_path, ["User_ID", "VAR_1", "VAR_2", "VAR_3"], ), "C": ( - reference_secondary_table_path_2, + ref_secondary_table_path_2, ["User_ID"], ), - "A": (reference_main_table_path, "User_ID"), + "A": (ref_main_table_path, "User_ID"), "D": ( - reference_tertiary_table_path, + ref_tertiary_table_path, ["User_ID", "VAR_1", "VAR_2"], ), }, @@ -391,12 +390,12 @@ def create_fixture_dataset_spec(self, output_dir, data_type, multitable, schema) } label = "class" - return dataset_spec, label + return ds_spec, label - def get_reference_dictionaries(self, multitable, schema=None): - reference_dictionaries = [] + def get_ref_var_types(self, multitable, data_type="df", schema=None): + ref_var_types = {} if not multitable: - reference_dictionary = { + ref_var_types["Reviews"] = { "User_ID": "Categorical", "Age": "Numerical", "Clothing ID": "Numerical", @@ -407,32 +406,42 @@ def get_reference_dictionaries(self, multitable, schema=None): "Positive Feedback average": "Numerical", "class": "Categorical", } - reference_dictionaries.extend([reference_dictionary]) + # Special type changes for file datasets: + # - "Date" field from "Timestamp" to "Date", the type Khiops detects + # - "Recommended IND" field from "Numerical" to "Categorical" because + # Khiops doesn't parse it well + if data_type == "file": + ref_var_types["Reviews"]["Date"] = "Date" + ref_var_types["Reviews"]["Recommended IND"] = "Categorical" + warnings.warn("Changed field `Recommended IND` to avoid a Khiops bug") elif schema == "star": - reference_main_dictionary = { + ref_var_types["id_class"] = { "User_ID": "Categorical", "class": "Categorical", "logs": "Table", } - reference_secondary_dictionary = { + ref_var_types["logs"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Numerical", "VAR_3": "Numerical", "VAR_4": "Numerical", } - reference_dictionaries.extend( - [reference_main_dictionary, reference_secondary_dictionary] - ) + # Special change for the file type: + # - logs.VAR_3 is binary and detected as "Categorical" by Khiops + if data_type == "file": + ref_var_types["logs"]["VAR_3"] = "Categorical" else: - assert schema == "snowflake" - reference_main_dictionary = { + assert ( + schema == "snowflake" + ), f"'schema' should be 'snowflake' not '{schema}'" + ref_var_types["A"] = { "User_ID": "Categorical", "class": "Categorical", "B": "Table", "C": "Entity", } - reference_secondary_dictionary_1 = { + ref_var_types["B"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Numerical", @@ -440,55 +449,52 @@ def get_reference_dictionaries(self, multitable, schema=None): "VAR_4": "Numerical", "D": "Table", } - reference_secondary_dictionary_2 = { + ref_var_types["C"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Numerical", "VAR_3": "Numerical", "VAR_4": "Numerical", } - reference_tertiary_dictionary = { + ref_var_types["D"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Categorical", "VAR_3": "Numerical", "E": "Table", } - reference_quaternary_dictionary = { + ref_var_types["E"] = { "User_ID": "Categorical", "VAR_1": "Categorical", "VAR_2": "Categorical", "VAR_3": "Categorical", "VAR_4": "Categorical", } - reference_dictionaries.extend( - [ - reference_main_dictionary, - reference_secondary_dictionary_1, - reference_secondary_dictionary_2, 
- reference_tertiary_dictionary, - reference_quaternary_dictionary, - ] - ) + # Special change for the file type: + # - B.VAR_3 is binary and detected as "Categorical" by Khiops + # - C.VAR_3 is binary and detected as "Categorical" by Khiops + if data_type == "file": + ref_var_types["B"]["VAR_3"] = "Categorical" + ref_var_types["C"]["VAR_3"] = "Categorical" - return reference_dictionaries + return ref_var_types def test_dataset_is_correctly_built(self): """Test that the dataset structure is consistent with the input spec""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=True, schema="snowflake" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) self.assertEqual(dataset.main_table.name, "A") self.assertEqual(len(dataset.secondary_tables), 4) - dataset_secondary_table_names = set( + dataset_secondary_table_names = { secondary_table.name for secondary_table in dataset.secondary_tables - ) + } self.assertEqual(dataset_secondary_table_names, {"B", "C", "D", "E"}) self.assertEqual(len(dataset.relations), 4) - spec_relations = dataset_spec["relations"] + spec_relations = ds_spec["relations"] for relation, spec_relation in zip(dataset.relations, spec_relations): self.assertEqual(relation[:2], spec_relation[:2]) if len(spec_relation) == 3: @@ -496,34 +502,34 @@ def test_dataset_is_correctly_built(self): else: self.assertFalse(relation[2]) - def test_created_file_from_dataframe_monotable(self): + def test_out_file_from_dataframe_monotable(self): """Test consistency of the created data file with the input dataframe - This test verifies that the content of the input dataframe is equal to that of the csv file created by khiops.sklearn. 
""" # Create a monotable dataset object from fixture data - spec, y = self.create_fixture_dataset_spec( + spec, y = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=False, schema=None ) dataset = Dataset(spec, y=y) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - created_table = pd.read_csv(created_table_path, sep="\t") + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + out_table = pd.read_csv(out_table_path, sep="\t") # Cast "Date" columns to datetime as we don't automatically recognize dates - created_table["Date"] = created_table["Date"].astype("datetime64[ns]") - reference_table = spec["tables"]["Reviews"][0] - reference_table["class"] = y + out_table["Date"] = out_table["Date"].astype("datetime64[ns]") + ref_table = spec["tables"]["Reviews"][0] + ref_table["class"] = y # Check that the dataframes are equal assert_frame_equal( - created_table, - reference_table.sort_values(by="User_ID").reset_index(drop=True), + ref_table.sort_values(by="User_ID").reset_index(drop=True), + out_table, ) - def test_created_file_from_numpy_array_monotable(self): + def test_out_file_from_numpy_array_monotable(self): """Test consistency of the created data file with the input numpy array""" # Create a monotable dataset from a numpy array iris = datasets.load_iris() @@ -531,14 +537,12 @@ def test_created_file_from_numpy_array_monotable(self): dataset = Dataset(spec, y=iris.target, categorical_target=True) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - created_table = np.loadtxt( - created_table_path, delimiter="\t", skiprows=1, ndmin=2 - ) + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + out_table = np.loadtxt(out_table_path, delimiter="\t", skiprows=1, ndmin=2) # Check that the arrays are equal assert_equal( - created_table, + out_table, np.concatenate( (iris.data, iris.target.reshape(len(iris.target), 1)), axis=1 ), @@ -557,12 +561,14 @@ def _create_test_sparse_matrix_with_target(self): return sparse_matrix, target_array def _load_khiops_sparse_file(self, stream): - # skip header + # Skip header next(stream) + + # Read the sparse file target_vector = [] feature_matrix = [] for line in stream: - target, features = line.split(b"\t") + features, target_value = line.split(b"\t") feature_row = np.zeros(100) for feature in features.strip().split(b" "): indexed_feature = feature.split(b":") @@ -575,12 +581,12 @@ def _load_khiops_sparse_file(self, stream): feature_index, feature_value = indexed_feature feature_row[int(feature_index) - 1] = float(feature_value) feature_matrix.append(feature_row) - target_vector.append(float(target)) + target_vector.append(float(target_value)) target_array = np.array(target_vector) sparse_matrix = sp.csr_matrix(feature_matrix) return sparse_matrix, target_array - def test_created_file_from_sparse_matrix_monotable(self): + def test_out_file_from_sparse_matrix_monotable(self): """Test consistency of the created data file with the input sparse matrix""" # Load input sparse matrix and target array @@ -594,10 +600,10 @@ def test_created_file_from_sparse_matrix_monotable(self): X=input_sparse_matrix, y=input_target, categorical_target=True ) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - with open(created_table_path, "rb") as created_table_stream: + out_table_path, 
_ = dataset.create_table_files_for_khiops(self.output_dir) + with open(out_table_path, "rb") as out_table_stream: sparse_matrix, target_array = self._load_khiops_sparse_file( - created_table_stream + out_table_stream ) # Check that the arrays are equal @@ -614,7 +620,7 @@ def test_created_file_from_sparse_matrix_monotable(self): ), ) - def test_created_file_from_sparse_matrix_monotable_specification(self): + def test_out_file_from_sparse_matrix_monotable_specification(self): """Test consistency of the created data file with the input sparse matrix""" # Load input sparse matrix and target array @@ -628,10 +634,10 @@ def test_created_file_from_sparse_matrix_monotable_specification(self): dataset = Dataset(spec, y=input_target, categorical_target=True) # Create and load the intermediary Khiops file - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - with open(created_table_path, "rb") as created_table_stream: + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + with open(out_table_path, "rb") as out_table_stream: sparse_matrix, target_array = self._load_khiops_sparse_file( - created_table_stream + out_table_stream ) # Check that the arrays are equal @@ -648,31 +654,31 @@ def test_created_file_from_sparse_matrix_monotable_specification(self): ), ) - def test_created_file_from_data_file_monotable(self): + def test_out_file_from_data_file_monotable(self): """Test consistency of the created data file with the input data file - This test verifies that the content of the input data file is equal to that of the csv file created by khiops.sklearn. """ # Create the test dataset - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=self.output_dir, data_type="file", multitable=False, schema=None ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) - created_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) - created_table = pd.read_csv(created_table_path, sep="\t") + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) + out_table = pd.read_csv(out_table_path, sep="\t") - reference_table_path = dataset_spec["tables"]["Reviews"][0] - reference_table = pd.read_csv(reference_table_path, sep="\t") + ref_table_path = ds_spec["tables"]["Reviews"][0] + ref_table = pd.read_csv(ref_table_path, sep="\t") # Check that the dataframes are equal assert_frame_equal( - created_table, - reference_table.sort_values(by="User_ID").reset_index(drop=True), + ref_table.sort_values(by="User_ID").reset_index(drop=True), + out_table, ) - def test_created_files_from_dataframes_multitable_star(self): + def test_out_files_from_dataframes_multitable_star(self): """Test consistency of the created data files with the input dataframes - This test verifies that the content of the input dataframes, defined through a @@ -680,10 +686,10 @@ def test_created_files_from_dataframes_multitable_star(self): schema of the dataset is "star". 
""" # Create the test dataset - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=True, schema="star" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) # Create the Khiops intermediary files ( @@ -693,487 +699,237 @@ def test_created_files_from_dataframes_multitable_star(self): # Load the intermediary files secondary_table_path = secondary_table_paths["logs"] - created_main_table = pd.read_csv(main_table_path, sep="\t") - created_secondary_table = pd.read_csv(secondary_table_path, sep="\t") + out_main_table = pd.read_csv(main_table_path, sep="\t") + out_secondary_table = pd.read_csv(secondary_table_path, sep="\t") - reference_main_table = dataset_spec["tables"]["id_class"][0] - reference_main_table["class"] = label - reference_secondary_table = dataset_spec["tables"]["logs"][0] + ref_main_table = ds_spec["tables"]["id_class"][0] + ref_main_table["class"] = label + ref_secondary_table = ds_spec["tables"]["logs"][0] # Clean created test data assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) assert_frame_equal( - created_secondary_table.sort_values( - by=created_secondary_table.columns.tolist(), ascending=True + ref_secondary_table.sort_values( + by=ref_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_secondary_table.sort_values( - by=reference_secondary_table.columns.tolist(), ascending=True + out_secondary_table.sort_values( + by=out_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_files_from_data_files_multitable_star(self): + def test_out_files_from_data_files_multitable_star(self): """Test consistency of the created data files with the input data files - This test verifies that the content of the input data files, defined through a dictionary, is equal to that of the csv files created by khiops.sklearn. The schema of the dataset is "star". 
""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=self.output_dir, data_type="file", multitable=True, schema="star" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) main_table_path, dico_secondary_table = dataset.create_table_files_for_khiops( self.output_dir ) secondary_table_path = dico_secondary_table["logs"] - created_main_table = pd.read_csv(main_table_path, sep="\t") - created_secondary_table = pd.read_csv(secondary_table_path, sep="\t") - - reference_table_path = dataset_spec["tables"]["id_class"][0] - reference_main_table = pd.read_csv(reference_table_path, sep="\t") - reference_secondary_table_path = dataset_spec["tables"]["logs"][0] - reference_secondary_table = pd.read_csv( - reference_secondary_table_path, sep="\t" - ) + out_main_table = pd.read_csv(main_table_path, sep="\t") + out_secondary_table = pd.read_csv(secondary_table_path, sep="\t") + + ref_table_path = ds_spec["tables"]["id_class"][0] + ref_main_table = pd.read_csv(ref_table_path, sep="\t") + ref_secondary_table_path = ds_spec["tables"]["logs"][0] + ref_secondary_table = pd.read_csv(ref_secondary_table_path, sep="\t") # assertions assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) assert_frame_equal( - created_secondary_table.sort_values( - by=created_secondary_table.columns.tolist(), ascending=True + ref_secondary_table.sort_values( + by=ref_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_secondary_table.sort_values( - by=reference_secondary_table.columns.tolist(), ascending=True + out_secondary_table.sort_values( + by=out_secondary_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_files_from_dataframes_multitable_snowflake(self): + def test_out_files_from_dataframes_multitable_snowflake(self): """Test consistency of the created data files with the input dataframes - This test verifies that the content of the input dataframes, defined through a dictionary, is equal to that of the csv files created by khiops.sklearn. The schema of the dataset is "snowflake". 
""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=None, data_type="df", multitable=True, schema="snowflake" ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) ( main_table_path, additional_table_paths, ) = dataset.create_table_files_for_khiops(self.output_dir) - created_main_table = pd.read_csv(main_table_path, sep="\t") - reference_main_table = dataset_spec["tables"]["A"][0] - reference_main_table["class"] = label + out_main_table = pd.read_csv(main_table_path, sep="\t") + ref_main_table = ds_spec["tables"]["A"][0] + ref_main_table["class"] = label # assertions assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) additional_table_names = list(additional_table_paths.keys()) for name in additional_table_names: additional_table_path = additional_table_paths[name] - created_additional_table = pd.read_csv(additional_table_path, sep="\t") - reference_additional_table = dataset_spec["tables"][name][0] + out_additional_table = pd.read_csv(additional_table_path, sep="\t") + ref_additional_table = ds_spec["tables"][name][0] assert_frame_equal( - created_additional_table.sort_values( - by=created_additional_table.columns.tolist(), ascending=True + ref_additional_table.sort_values( + by=ref_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_additional_table.sort_values( - by=reference_additional_table.columns.tolist(), ascending=True + out_additional_table.sort_values( + by=out_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_files_from_data_files_multitable_snowflake(self): + def test_out_files_from_data_files_multitable_snowflake(self): """Test consistency of the created s with the input data files - This test verifies that the content of the input data files, defined through a dictionary, is equal to that of the csv files created by khiops.sklearn. The schema of the dataset is "snowflake". 
""" - dataset_spec, label = self.create_fixture_dataset_spec( + ds_spec, label = self.create_fixture_ds_spec( output_dir=self.output_dir, data_type="file", multitable=True, schema="snowflake", ) - dataset = Dataset(dataset_spec, label) + dataset = Dataset(ds_spec, label) main_table_path, additional_table_paths = dataset.create_table_files_for_khiops( self.output_dir ) - created_main_table = pd.read_csv(main_table_path, sep="\t") - reference_main_table_path = dataset_spec["tables"]["A"][0] - reference_main_table = pd.read_csv(reference_main_table_path, sep="\t") + out_main_table = pd.read_csv(main_table_path, sep="\t") + ref_main_table_path = ds_spec["tables"]["A"][0] + ref_main_table = pd.read_csv(ref_main_table_path, sep="\t") # assertions assert_frame_equal( - created_main_table, - reference_main_table.sort_values(by="User_ID", ascending=True).reset_index( + ref_main_table.sort_values(by="User_ID", ascending=True).reset_index( drop=True ), + out_main_table, ) additional_table_names = list(additional_table_paths.keys()) for name in additional_table_names: additional_table_path = additional_table_paths[name] - created_additional_table = pd.read_csv(additional_table_path, sep="\t") - reference_additional_table_path = dataset_spec["tables"][name][0] - reference_additional_table = pd.read_csv( - reference_additional_table_path, sep="\t" - ) + out_additional_table = pd.read_csv(additional_table_path, sep="\t") + ref_additional_table_path = ds_spec["tables"][name][0] + ref_additional_table = pd.read_csv(ref_additional_table_path, sep="\t") assert_frame_equal( - created_additional_table.sort_values( - by=created_additional_table.columns.tolist(), ascending=True + out_additional_table.sort_values( + by=out_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), - reference_additional_table.sort_values( - by=reference_additional_table.columns.tolist(), ascending=True + ref_additional_table.sort_values( + by=ref_additional_table.columns.tolist(), ascending=True ).reset_index(drop=True), ) - def test_created_dictionary_from_dataframe_monotable(self): - """Test consistency of the created dictionary with the input dataframe - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input monotable dataset. Data is here provided through a dataframe. - """ - - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=None, data_type="df", multitable=False, schema=None - ) - - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_dictionary = created_dictionary_domain.dictionaries[0] - created_dictionary_variable_types = { - var.name: var.type for var in created_dictionary.variables - } - reference_dictionary_variable_types = self.get_reference_dictionaries( - multitable=False - )[0] - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 1) - self.assertEqual(created_dictionary.name, "Reviews") - self.assertEqual(created_dictionary.root, False) - self.assertEqual(len(created_dictionary.key), 1) - self.assertEqual( - created_dictionary_variable_types, reference_dictionary_variable_types - ) - - def test_created_dictionary_from_data_file_monotable(self): - """Test consistency of the created dictionary with the input data file - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input monotable dataset. 
Data is here provided through a data file. - """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=self.output_dir, data_type="file", multitable=False, schema=None - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_dictionary = created_dictionary_domain.dictionaries[0] - created_dictionary_variable_types = { - var.name: var.type for var in created_dictionary.variables - } - reference_dictionary_variable_types = self.get_reference_dictionaries( - multitable=False - )[0] - reference_dictionary_variable_types["Date"] = "Categorical" - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 1) - self.assertEqual(created_dictionary.name, "Reviews") - self.assertEqual(created_dictionary.root, False) - self.assertEqual(len(created_dictionary.key), 1) - self.assertEqual( - created_dictionary_variable_types, reference_dictionary_variable_types - ) - - def test_created_dictionary_from_dataframes_multitable_star(self): - """Test consistency of the created dictionaries with the input dataframes - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input multitable dataset. Data is here provided through dataframes - and its schema is "star". - """ - - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=None, data_type="df", multitable=True, schema="star" - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_main_dictionary = created_dictionary_domain.dictionaries[0] - created_secondary_dictionary = created_dictionary_domain.dictionaries[1] - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 2) - self.assertEqual(created_main_dictionary.name, "id_class") - self.assertEqual(created_secondary_dictionary.name, "logs") - self.assertEqual(created_main_dictionary.root, True) - self.assertEqual(created_secondary_dictionary.root, False) - self.assertEqual(created_main_dictionary.key[0], "User_ID") - - created_main_dictionary_variable_types = { - var.name: var.type for var in created_main_dictionary.variables - } - created_secondary_dictionary_variable_types = { - var.name: var.type for var in created_secondary_dictionary.variables - } - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="star" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types = reference_dictionaries[1] - - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) - self.assertEqual( - created_secondary_dictionary_variable_types, - reference_secondary_dictionary_variable_types, - ) - - def test_created_dictionary_from_data_files_multitable_star(self): - """Test consistency of the created dictionaries with the input data files - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input multitable dataset. Data is here provided through data files - and its schema is "star". 
- """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=self.output_dir, data_type="file", multitable=True, schema="star" - ) - - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - created_main_dictionary = created_dictionary_domain.dictionaries[0] - created_secondary_dictionary = created_dictionary_domain.dictionaries[1] - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 2) - self.assertEqual(created_main_dictionary.name, "id_class") - self.assertEqual(created_secondary_dictionary.name, "logs") - self.assertEqual(created_main_dictionary.root, True) - self.assertEqual(created_secondary_dictionary.root, False) - self.assertEqual(created_main_dictionary.key[0], "User_ID") - - created_main_dictionary_variable_types = { - var.name: var.type for var in created_main_dictionary.variables - } - created_secondary_dictionary_variable_types = { - var.name: var.type for var in created_secondary_dictionary.variables - } - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="star" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types = reference_dictionaries[1] - - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) - self.assertEqual( - created_secondary_dictionary_variable_types, - reference_secondary_dictionary_variable_types, - ) - - def test_created_dictionary_from_dataframes_multitable_snowflake(self): - """Test consistency of the created dictionaries with the input dataframes - - - This test verifies that the dictionary file (.kdic) created by - khiops.sklearn contains information that is consistent with the - input multitable dataset. Data is here provided through dataframes - and its schema is "snowflake". 
- """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=None, data_type="df", multitable=True, schema="snowflake" - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - table_names = dataset_spec["tables"].keys() - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 5) - for name in table_names: - created_dictionary = created_dictionary_domain.get_dictionary(name) - self.assertEqual(created_dictionary.name, name) - if name == "A": - self.assertEqual(created_dictionary.root, True) - self.assertEqual( - created_dictionary.key[0], dataset_spec["tables"][name][1] + def test_create_khiops_domain(self): + """Test consistency of the dataset method create_khiops_domain""" + fixtures = [ + { + "output_dir": None, + "data_type": "df", + "multitable": False, + "schema": None, + }, + { + "output_dir": self.output_dir, + "data_type": "file", + "multitable": False, + "schema": None, + }, + { + "output_dir": None, + "data_type": "df", + "multitable": True, + "schema": "star", + }, + { + "output_dir": self.output_dir, + "data_type": "file", + "multitable": True, + "schema": "star", + }, + { + "output_dir": None, + "data_type": "df", + "multitable": True, + "schema": "snowflake", + }, + { + "output_dir": self.output_dir, + "data_type": "file", + "multitable": True, + "schema": "snowflake", + }, + ] + + for fixture in fixtures: + with self.subTest(**fixture): + ds = Dataset(*self.create_fixture_ds_spec(**fixture)) + ref_var_types = self.get_ref_var_types( + multitable=fixture["multitable"], + data_type=fixture["data_type"], + schema=fixture["schema"], ) - else: - self.assertEqual(created_dictionary.root, False) - self.assertEqual( - created_dictionary.key, dataset_spec["tables"][name][1] - ) - - created_main_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("A").variables - } - created_secondary_dictionary_variable_types_1 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("B").variables - } - created_secondary_dictionary_variable_types_2 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("C").variables - } - created_tertiary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("D").variables - } - created_quaternary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("E").variables - } - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="snowflake" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types_1 = reference_dictionaries[1] - reference_secondary_dictionary_variable_types_2 = reference_dictionaries[2] - reference_tertiary_dictionary_variable_types = reference_dictionaries[3] - reference_quaternary_dictionary_variable_types = reference_dictionaries[4] + self._test_domain_coherence(ds, ref_var_types) - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) - self.assertEqual( - created_secondary_dictionary_variable_types_1, - reference_secondary_dictionary_variable_types_1, - ) - self.assertEqual( - created_secondary_dictionary_variable_types_2, - reference_secondary_dictionary_variable_types_2, - ) - self.assertEqual( - created_tertiary_dictionary_variable_types, - reference_tertiary_dictionary_variable_types, - 
) - self.assertEqual( - created_quaternary_dictionary_variable_types, - reference_quaternary_dictionary_variable_types, - ) + def _test_domain_coherence(self, ds, ref_var_types): + # Create the dictionary domain associated to the fixture dataset + out_domain = ds.create_khiops_dictionary_domain() - def test_created_dictionary_from_data_files_multitable_snowflake(self): - """Test consistency of the created dictionaries with the input data files + # Check that the domain has the same number of tables as the dataset + self.assertEqual(len(out_domain.dictionaries), 1 + len(ds.secondary_tables)) - - This test verifies that the dictionary file created by khiops.sklearn - contains information that is consistent with the input multitable dataset. - Data is here provided through data files and its schema is "snowflake". - """ - dataset_spec, label = self.create_fixture_dataset_spec( - output_dir=self.output_dir, - data_type="file", - multitable=True, - schema="snowflake", - ) - dataset = Dataset(dataset_spec, label) - created_dictionary_domain = dataset.create_khiops_dictionary_domain() - table_names = dataset_spec["tables"].keys() - - # assertions - self.assertEqual(len(created_dictionary_domain.dictionaries), 5) - for name in table_names: - created_dictionary = created_dictionary_domain.get_dictionary(name) - self.assertEqual(created_dictionary.name, name) - - if name == "A": - self.assertEqual(created_dictionary.root, True) - self.assertEqual( - created_dictionary.key[0], dataset_spec["tables"][name][1] - ) - - else: - self.assertEqual(created_dictionary.root, False) - self.assertEqual( - created_dictionary.key, dataset_spec["tables"][name][1] - ) - - created_main_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("A").variables - } - - created_secondary_dictionary_variable_types_1 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("B").variables - } - - created_secondary_dictionary_variable_types_2 = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("C").variables + # Check that the domain has the same table names as the reference + ref_table_names = { + table.name for table in [ds.main_table] + ds.secondary_tables } + out_table_names = {dictionary.name for dictionary in out_domain.dictionaries} + self.assertEqual(ref_table_names, out_table_names) - created_tertiary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("D").variables - } - - created_quaternary_dictionary_variable_types = { - var.name: var.type - for var in created_dictionary_domain.get_dictionary("E").variables - } - - reference_dictionaries = self.get_reference_dictionaries( - multitable=True, schema="snowflake" - ) - reference_main_dictionary_variable_types = reference_dictionaries[0] - reference_secondary_dictionary_variable_types_1 = reference_dictionaries[1] - reference_secondary_dictionary_variable_types_2 = reference_dictionaries[2] - reference_tertiary_dictionary_variable_types = reference_dictionaries[3] - reference_quaternary_dictionary_variable_types = reference_dictionaries[4] - - # assertions - self.assertEqual( - created_main_dictionary_variable_types, - reference_main_dictionary_variable_types, - ) + # Check that the output domain has a root table iff the dataset is multitable self.assertEqual( - created_secondary_dictionary_variable_types_1, - reference_secondary_dictionary_variable_types_1, - ) - self.assertEqual( - 
created_secondary_dictionary_variable_types_2, - reference_secondary_dictionary_variable_types_2, - ) - self.assertEqual( - created_tertiary_dictionary_variable_types, - reference_tertiary_dictionary_variable_types, - ) - self.assertEqual( - created_quaternary_dictionary_variable_types, - reference_quaternary_dictionary_variable_types, - ) + ds.is_multitable, out_domain.get_dictionary(ds.main_table.name).root + ) + + # Check that: + # - the table keys are the same as the dataset + # - the domain has the same variable names as the reference + for table in [ds.main_table] + ds.secondary_tables: + with self.subTest(table=table.name): + self.assertEqual(table.key, out_domain.get_dictionary(table.name).key) + out_dictionary_var_types = { + var.name: var.type + for var in out_domain.get_dictionary(table.name).variables + } + self.assertEqual(ref_var_types[table.name], out_dictionary_var_types) diff --git a/tests/test_dataset_errors.py b/tests/test_dataset_errors.py index 3cd09f40..457d6529 100644 --- a/tests/test_dataset_errors.py +++ b/tests/test_dataset_errors.py @@ -14,12 +14,12 @@ import pandas as pd from khiops.core.internals.common import type_error_message -from khiops.sklearn.tables import Dataset, FileTable, PandasTable +from khiops.utils.dataset import Dataset, FileTable, PandasTable # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: -# pylint --disable=all --enable=invalid-names estimators.py +# pylint --disable=all --enable=invalid-names test_dataset_errors.py # pylint: disable=invalid-name class AnotherType(object): """A placeholder class that is not of any basic type to test TypeError's""" @@ -448,119 +448,67 @@ def test_y_type_must_be_str_or_array_like_1d(self): dataframe.to_csv(table_path, sep="\t", index=False) tuple_spec = (table_path, "\t") bad_y = dataframe["class"] - expected_msg = type_error_message("y", bad_y, str) + expected_msg = ( + type_error_message("y", bad_y, str) + + " (X's tables are of type str [file paths])" + ) self.assert_dataset_fails(tuple_spec, bad_y, TypeError, expected_msg) # Test when X is a dataframe: expects array-like - bad_y = AnotherType() - expected_msg = type_error_message("y", bad_y, "array-like") - self.assert_dataset_fails(dataframe, bad_y, TypeError, expected_msg) - - ######################### - # Tests for X dict spec # - ######################### - - def test_dict_spec_relations_must_be_list_like(self): - """Test Dataset raising TypeError when dict spec "relations" is a dict-like""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"] = AnotherType() - expected_msg = type_error_message( - "Relations at X['tables']['relations']", - bad_spec["relations"], - "list-like", - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_relations_must_be_tuple(self): - """Test Dataset raising TypeError when a relation is not a tuple""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = AnotherType() - expected_msg = type_error_message("Relation", bad_spec["relations"][0], "tuple") - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_relations_must_be_of_size_2_or_3(self): - """Test Dataset raising ValueError when a relation is not of size 2 or 3""" - bad_spec, y = self.create_fixture_dataset_spec() - for size in [0, 1, 4, 5]: - bad_spec["relations"][0] = tuple((f"Table{i}" for i in range(size))) - expected_msg = f"A relation must be of size 2 or 3, not {size}" - with 
self.subTest(tuple_size=size): - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) - - def test_dict_spec_table_relation_must_be_str(self): - """Test Dataset raising TypeError when a relation table is not a str""" - # Test the error in the left table - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = (AnotherType(), "BTable") - expected_msg = type_error_message( - "Table of a relation", bad_spec["relations"][0][0], str - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - # Test the error in the right table - bad_spec["relations"][0] = ("ATable", AnotherType()) - expected_msg = type_error_message( - "Table of a relation", bad_spec["relations"][0][1], str - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_entiy_flag_relation_must_be_bool(self): - """Test Dataset raising TypeError when the entity flag is not boolean""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = ("B", "D", AnotherType()) - expected_msg = type_error_message( - "1-1 flag for relation (B, D)", bad_spec["relations"][0][2], bool - ) - self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) - - def test_dict_spec_relation_tables_must_not_be_the_same(self): - """Test Dataset raising TypeError when tables of a relation are the same""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = ("Table", "Table") + bad_y = "TargetColumn" expected_msg = ( - "Tables in relation '(Table, Table)' are the same. " - "They must be different." + type_error_message("y", bad_y, "array-like") + + " (X's tables are of type pandas.DataFrame)" ) - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + self.assert_dataset_fails(dataframe, bad_y, TypeError, expected_msg) - def test_dict_spec_relation_table_must_be_in_table_list(self): - """Test Dataset raising ValueError when a rel. table is not in the table list""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"][0] = ("NonExistentTable", "D") - expected_msg = ( - "X['tables'] does not contain a table named 'NonExistentTable'. " - "All tables in X['relations'] must be declared in X['tables']" + def test_df_dataset_fails_if_target_column_is_already_in_the_features(self): + """Test in-memory table failing when the target is already in the features""" + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) + features_table = spec["tables"]["Reviews"][0] + bad_y = features_table["Recommended IND"] + with self.assertRaises(ValueError) as context: + Dataset(spec, bad_y) + output_error_msg = str(context.exception) + expected_msg_prefix = ( + "Target column name 'Recommended IND' is already present in the main table." ) - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + self.assertIn(expected_msg_prefix, output_error_msg) - def test_dict_spec_relation_must_appear_once(self): - """Test Dataset raising ValueError if a relation appears more than once""" - bad_spec, y = self.create_fixture_dataset_spec() - bad_spec["relations"].append(("B", "D")) - expected_msg = ( - "Relation '(B, D)' occurs '2' times. Each relation must be unique." 
- ) - self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + def test_file_dataset_fails_if_table_does_not_contain_the_target_column(self): + """Test FileTable failing if the table does not contain the target column""" + table_path = os.path.join(self.output_dir, "table.csv") + table = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + table.to_csv(table_path, sep="\t", index=False) + with self.assertRaises(ValueError) as context: + Dataset({"tables": {"main_table": (table_path, None)}}, y="TargetColumn") + output_error_msg = str(context.exception) + expected_msg_prefix = "Target column 'TargetColumn' not present in" + self.assertIn(expected_msg_prefix, output_error_msg) + + ##################################### + # Tests for dictionary dataset spec # + ##################################### def test_dict_spec_key_tables_must_be_present(self): """Test Dataset raising ValueError if the 'tables' key is missing""" bad_spec, y = self.create_fixture_dataset_spec() del bad_spec["tables"] - expected_msg = "Mandatory key 'tables' missing from dict 'X'" + expected_msg = "'tables' entry missing from dataset dict spec" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_key_tables_must_be_mapping(self): """Test Dataset raising TypeError if the 'tables' key is not a mapping""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"] = AnotherType() - expected_msg = type_error_message("X['tables']", bad_spec["tables"], Mapping) + expected_msg = type_error_message("'tables' entry", bad_spec["tables"], Mapping) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) def test_dict_spec_table_list_cannot_be_empty(self): """Test Dataset raising ValueError if the 'tables' key is empty""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"] = {} - expected_msg = "X['tables'] cannot be empty" + expected_msg = "'tables' dictionary cannot be empty" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_table_input_type_must_be_a_tuple(self): @@ -568,7 +516,7 @@ def test_dict_spec_table_input_type_must_be_a_tuple(self): bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"]["D"] = list(bad_spec["tables"]["D"]) expected_msg = type_error_message( - "Table input at X['tables']['D']", bad_spec["tables"]["D"], tuple + "'D' table entry", bad_spec["tables"]["D"], tuple ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) @@ -576,7 +524,7 @@ def test_dict_spec_table_input_tuple_must_have_size_2(self): """Test Dataset raising ValueError when a table entry is a tuple of size != 2""" bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"]["D"] = (*bad_spec["tables"]["D"], "AnotherT", "YetAnotherT") - expected_msg = "Table input tuple at X['tables']['D'] must have size 2 not 4" + expected_msg = "'D' table entry must have size 2, not 4" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_source_table_type_must_be_adequate(self): @@ -584,9 +532,10 @@ def test_dict_spec_source_table_type_must_be_adequate(self): bad_spec, y = self.create_fixture_dataset_spec() bad_spec["tables"]["D"] = (AnotherType(), bad_spec["tables"]["D"][-1]) expected_msg = type_error_message( - "Table source at X['tables']['D']", + "'D' table's source", bad_spec["tables"]["D"][0], - "array-like or scipy.sparse.spmatrix", + "array-like", + "scipy.sparse.spmatrix", str, ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) @@ -596,7 +545,7 @@ def 
test_dict_spec_table_key_must_be_str_or_sequence(self):
         bad_spec, y = self.create_fixture_dataset_spec()
         bad_spec["tables"]["D"] = (bad_spec["tables"]["D"][0], AnotherType())
         expected_msg = type_error_message(
-            "Table key at X['tables']['D']",
+            "'D' table's key",
             bad_spec["tables"]["D"][1],
             str,
             Sequence,
@@ -610,15 +559,15 @@ def test_dict_spec_table_key_column_type_must_be_str(self):
         bad_key = ["User_ID", AnotherType(), "VAR_2"]
         bad_spec["tables"]["D"] = (dataframe, bad_key)
         expected_msg = type_error_message(
-            "Column name of table key at X['tables']['D']", bad_key[1], str
+            "'D' table's key column name", bad_key[1], str
         )
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
 
     def test_dict_spec_main_table_must_be_specified_for_multitable_datasets(self):
-        """Test Dataset raising ValueError if 'main_table' is not a key in a MT spec"""
+        """Test Dataset raising ValueError if 'main_table' is not a key in an MT spec"""
         bad_spec, y = self.create_fixture_dataset_spec()
         del bad_spec["main_table"]
-        expected_msg = "'main_table' must be specified for multi-table datasets"
+        expected_msg = "'main_table' entry must be specified for multi-table datasets"
         self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
 
     def test_dict_spec_main_table_must_be_str(self):
@@ -626,7 +575,7 @@ def test_dict_spec_main_table_must_be_str(self):
         bad_spec, y = self.create_fixture_dataset_spec()
         bad_spec["main_table"] = 1
         expected_msg = type_error_message(
-            "X['main_table']", bad_spec["main_table"], str
+            "'main_table' entry", bad_spec["main_table"], str
         )
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
 
@@ -634,51 +583,75 @@ def test_dict_spec_main_table_not_declared_in_tables(self):
         """Test Dataset raising ValueError if the main table is not in the table list"""
         bad_spec, y = self.create_fixture_dataset_spec()
         del bad_spec["tables"][bad_spec["main_table"]]
-        expected_msg = "X['main_table'] (A) must be present in X['tables']"
+        expected_msg = (
+            "A table entry with the main table's name ('A') "
+            "must be present in the 'tables' dictionary"
+        )
         self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
 
-    def test_dic_spec_main_table_key_must_be_specified(self):
-        """Test Dataset raising ValueError if a MT spec doesn't have a main table key"""
+    def test_dict_spec_main_table_key_must_be_specified(self):
+        """Test Dataset raises ValueError if an MT spec doesn't have a main table key"""
         bad_spec, y = self.create_fixture_dataset_spec()
         dataframe, _ = bad_spec["tables"][bad_spec["main_table"]]
         bad_spec["tables"][bad_spec["main_table"]] = (dataframe, None)
-        expected_msg = "key of the root table is 'None'"
+        expected_msg = (
+            "key of main table 'A' is 'None': "
+            "table keys must be specified in multi-table datasets"
+        )
         self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
 
-    def test_dict_spec_main_table_key_must_be_non_empty_for_multitable_datasets(self):
-        """Test Dataset raising ValueError if a MT spec have an empty main table key"""
+    def test_dict_spec_table_key_must_be_non_empty_for_multitable_datasets(self):
+        """Test Dataset raising ValueError if an MT spec has an empty table key"""
         bad_spec, y = self.create_fixture_dataset_spec()
         dataframe, _ = bad_spec["tables"][bad_spec["main_table"]]
         bad_spec["tables"][bad_spec["main_table"]] = (dataframe, [])
-        expected_msg = (
-            "key of the root table must be non-empty for multi-table datasets"
-        )
+        expected_msg = f"'{bad_spec['main_table']}' table's key is empty"
         self.assert_dataset_fails(bad_spec, y, ValueError, 
expected_msg)
 
     def test_dict_spec_secondary_table_key_must_be_specified(self):
-        """Test Dataset raising ValueError if a MT spec doesn't have a sec. table key"""
+        """Test Dataset raises ValueError if an MT spec doesn't have a sec. table key"""
         bad_spec, y = self.create_fixture_dataset_spec()
         dataframe, _ = bad_spec["tables"]["D"]
         bad_spec["tables"]["D"] = (dataframe, None)
         expected_msg = (
-            "key of the secondary table 'D' is 'None': "
-            "table keys must be specified in multitable datasets"
+            "key of secondary table 'D' is 'None': "
+            "table keys must be specified in multi-table datasets"
         )
         self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
 
+    def test_dict_spec_secondary_tables_must_have_the_same_type_as_the_main_table(self):
+        """Test Dataset raising ValueError if main and sec. tables' types don't match"""
+        bad_spec, _ = self.create_fixture_dataset_spec()
+        alt_spec, _ = self.create_fixture_dataset_spec(
+            data_type="file", output_dir=self.output_dir
+        )
+        bad_spec["tables"]["D"] = alt_spec["tables"]["D"]
+        expected_msg = (
+            "Secondary table 'D' has type 'str' which is different "
+            "from the main table's type 'DataFrame'."
+        )
+        self.assert_dataset_fails(bad_spec, None, ValueError, expected_msg)
+
     def test_dict_spec_format_must_be_tuple(self):
         """Test Dataset raising a TypeError if the format field is not a tuple"""
         bad_spec, y = self.create_fixture_dataset_spec()
         bad_spec["format"] = AnotherType()
-        expected_msg = type_error_message("X['format']", bad_spec["format"], tuple)
+        expected_msg = type_error_message("'format' entry", bad_spec["format"], tuple)
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
 
+    def test_dict_spec_format_must_have_size_2(self):
+        """Test Dataset raising a ValueError if its 'format' entry is not of size 2"""
+        bad_spec, y = self.create_fixture_dataset_spec()
+        bad_spec["format"] = (",", True, AnotherType(), AnotherType(), AnotherType())
+        expected_msg = "'format' entry must be a tuple of size 2, not 5"
+        self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
+
     def test_dict_spec_format_tuple_1st_element_must_be_str(self):
         """Test Dataset raising a TypeError if any of the format fields are not str"""
         bad_spec, y = self.create_fixture_dataset_spec()
         bad_spec["format"] = (AnotherType(), True)
         expected_msg = type_error_message(
-            "X['format'] 1st element", bad_spec["format"][0], str
+            "'format' tuple's 1st element (separator)", bad_spec["format"][0], str
         )
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
 
@@ -690,7 +663,7 @@ def test_dict_spec_format_tuple_2nd_element_must_be_bool(self):
         )
         bad_spec["format"] = (",", AnotherType())
         expected_msg = type_error_message(
-            "X['format'] 2nd element", bad_spec["format"][1], bool
+            "'format' tuple's 2nd element (header)", bad_spec["format"][1], bool
         )
         self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
 
@@ -701,15 +674,15 @@ def test_dict_spec_format_tuple_1st_element_must_be_a_single_character(self):
             data_type="file",
         )
         bad_spec["format"] = (";;", True)
-        expected_msg = "Separator must be a single character. 
Value: ;;" + expected_msg = "'format' separator must be a single char, got ';;'" self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) def test_dict_spec_y_type_must_be_series_or_df_when_x_is_df_spec(self): - """Test Dataset raising TypeError if X a is df-dict-spec and y isn't a Series""" + """Test Dataset raising TypeError if X a is ds-spec and y isn't array-like""" spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) - bad_y = AnotherType() + bad_y = "TargetColumnName" expected_msg = ( - type_error_message("y", bad_y, pd.Series, pd.DataFrame) + type_error_message("y", bad_y, "array-like") + " (X's tables are of type pandas.DataFrame)" ) self.assert_dataset_fails(spec, bad_y, TypeError, expected_msg) @@ -719,42 +692,27 @@ def test_dict_spec_y_must_be_str_when_x_is_file_spec(self): spec, _ = self.create_fixture_dataset_spec( output_dir=self.output_dir, data_type="file" ) - bad_y = AnotherType() + bad_y = np.array([1, 2, 3]) expected_msg = ( type_error_message("y", bad_y, str) + " (X's tables are of type str [file paths])" ) self.assert_dataset_fails(spec, bad_y, TypeError, expected_msg) - def test_dict_spec_target_column_must_be_specified_to_be_accessed(self): - """Test Dataset raising ValueError when accessing a non specified target col""" - # Disable pointless statement because it is necessary for the test - # pylint: disable=pointless-statement - spec, _ = self.create_fixture_dataset_spec( - output_dir=self.output_dir, data_type="file", multitable=False, schema=None - ) - dataset = Dataset(spec, None) - with self.assertRaises(ValueError) as context: - dataset.target_column_type - output_error_msg = str(context.exception) - expected_error_msg = "Target column is not set" - self.assertEqual(output_error_msg, expected_error_msg) - def test_dict_spec_table_name_must_be_str(self): """Test Dataset raising TypeError when a table name is not a str""" - spec, y = self.create_fixture_dataset_spec(multitable=False, schema=None) + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) features_table = spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: PandasTable( AnotherType(), features_table, - target_column=y, ) output_error_msg = str(context.exception) expected_msg = type_error_message("name", AnotherType(), str) self.assertEqual(output_error_msg, expected_msg) - def test_dict_spec_table_name_is_empty_string(self): + def test_dict_spec_table_nameis_empty_string(self): """Test Dataset raising ValueError when a table name is empty""" spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) with self.assertRaises(ValueError) as context: @@ -767,41 +725,149 @@ def test_dict_spec_key_type_must_be_str_or_list_like(self): """Test Dataset raising TypeError when a key is not of the proper type""" bad_key = AnotherType() expected_error_msg = type_error_message("key", bad_key, str, int, "list-like") - dataset_spec, label = self.create_fixture_dataset_spec( + dataset_spec, _ = self.create_fixture_dataset_spec( multitable=False, schema=None ) features_table = dataset_spec["tables"]["Reviews"][0] with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=label, - categorical_target=True, - key=bad_key, - ) + PandasTable("reviews", features_table, key=bad_key) output_error_msg = str(context.exception) self.assertEqual(output_error_msg, expected_error_msg) def test_dict_spec_key_column_type_must_be_str_or_int(self): """Test Dataset raising TypeError when a key column is 
not of the proper type"""
-        bad_key = {"not-a-str-or-int": []}
+        bad_key = [AnotherType()]
         expected_error_msg = (
-            type_error_message("key[0]", bad_key, str, int) + " at table 'reviews'"
+            type_error_message("key[0]", AnotherType(), str, int)
+            + " at table 'reviews'"
         )
-        dataset_spec, label = self.create_fixture_dataset_spec(
+        dataset_spec, _ = self.create_fixture_dataset_spec(
             multitable=False, schema=None
         )
         features_table = dataset_spec["tables"]["Reviews"][0]
         with self.assertRaises(TypeError) as context:
-            PandasTable(
-                "reviews",
-                features_table,
-                target_column=label,
-                categorical_target=True,
-                key=[bad_key],
-            )
+            PandasTable("reviews", features_table, key=bad_key)
         output_error_msg = str(context.exception)
-        self.assertEqual(output_error_msg, expected_error_msg)
+        self.assertEqual(expected_error_msg, output_error_msg)
+
+    def test_dict_spec_relations_must_be_list_like(self):
+        """Test Dataset raising TypeError when dict spec "relations" is a dict-like"""
+        bad_spec, y = self.create_fixture_dataset_spec()
+        bad_spec["relations"] = AnotherType()
+        expected_msg = type_error_message(
+            "'relations' entry",
+            bad_spec["relations"],
+            "list-like",
+        )
+        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
+
+    def test_dict_spec_relations_must_be_tuple(self):
+        """Test Dataset raising TypeError when a relation is not a tuple"""
+        bad_spec, y = self.create_fixture_dataset_spec()
+        bad_spec["relations"][0] = AnotherType()
+        expected_msg = type_error_message("Relation", bad_spec["relations"][0], "tuple")
+        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
+
+    def test_dict_spec_relations_must_be_of_size_2_or_3(self):
+        """Test Dataset raising ValueError when a relation is not of size 2 or 3"""
+        bad_spec, y = self.create_fixture_dataset_spec()
+        for size in [0, 1, 4, 5]:
+            bad_spec["relations"][0] = tuple((f"Table{i}" for i in range(size)))
+            expected_msg = f"A relation must be of size 2 or 3, not {size}"
+            with self.subTest(tuple_size=size):
+                self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg)
+
+    def test_dict_spec_table_relation_must_be_str(self):
+        """Test Dataset raising TypeError when a relation table is not a str"""
+        # Test the error in the parent table
+        bad_spec, y = self.create_fixture_dataset_spec()
+        first_relation = bad_spec["relations"][0]
+        bad_spec["relations"][0] = (AnotherType(), "D")
+        expected_msg = type_error_message(
+            "Relation #1's parent table", bad_spec["relations"][0][0], str
+        )
+        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
+
+        # Test the error in the child table
+        bad_spec["relations"][0] = first_relation
+        bad_spec["relations"][1] = ("A", AnotherType())
+        expected_msg = type_error_message(
+            "Relation #2's child table", bad_spec["relations"][1][1], str
+        )
+        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
+
+    def test_dict_spec_entity_flag_relation_must_be_bool(self):
+        """Test Dataset raising TypeError when the entity flag is not boolean"""
+        bad_spec, y = self.create_fixture_dataset_spec()
+        bad_spec["relations"][0] = ("B", "D", AnotherType())
+        expected_msg = type_error_message(
+            "Relation #1 (B, D) 1-1 flag", bad_spec["relations"][0][2], bool
+        )
+        self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg)
+
+    def test_dict_spec_relation_tables_must_not_be_the_same(self):
+        """Test Dataset raising ValueError when tables of a relation are equal"""
+        bad_spec, y = self.create_fixture_dataset_spec()
+        bad_spec["relations"][0] = ("Table", "Table")
+        expected_msg = (
+            "Relation 
#1's tables are equal: (Table, Table). They must be different." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_table_must_be_in_table_list(self): + """Test Dataset raising ValueError when a rel. table is not in the table list""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"][0] = ("NonExistentTable", "D") + expected_msg = ( + "Relation #1 (NonExistentTable, D) contains " + "non-existent table 'NonExistentTable'. " + "All relation tables must exist in the 'tables' entry." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_must_appear_once(self): + """Test Dataset raising ValueError if a relation appears more than once""" + bad_spec, y = self.create_fixture_dataset_spec() + bad_spec["relations"].append(("B", "D")) + expected_msg = ( + "Relation #1 (B, D) occurs 2 times. Each relation must be unique." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_non_hierarchical_key(self): + """Test Dataset raising ValueError on non-hierarchical table keys""" + ref_spec, y = self.create_fixture_dataset_spec() + bad_spec = { + "main_table": "B", + "tables": { + "A": ref_spec["tables"]["A"], + "B": ref_spec["tables"]["B"], + "C": ref_spec["tables"]["C"], + }, + "relations": [("A", "C"), ("B", "A")], + } + expected_msg = ( + "Relation #2 child table 'A' key ([User_ID]) " + "does not contain that of parent table 'B' ([User_ID, VAR_1])." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) + + def test_dict_spec_relation_cycle(self): + """Test Dataset raising ValueError when there is a relation cycle""" + ref_spec, y = self.create_fixture_dataset_spec() + bad_spec = { + "main_table": "A", + "tables": { + "A": ref_spec["tables"]["A"], + "B": ref_spec["tables"]["B"], + "C": ref_spec["tables"]["C"], + }, + "relations": [("A", "C"), ("A", "B"), ("C", "A")], + } + expected_msg = ( + "'relations' entry contains a cycle that includes " "the relation (C, A)." + ) + self.assert_dataset_fails(bad_spec, y, ValueError, expected_msg) ############################ # Tests for DatasetTable's # @@ -818,59 +884,18 @@ def test_pandas_table_input_type_must_be_dataframe(self): def test_pandas_table_input_table_must_not_be_empty(self): """Test PandasTable raising ValueError if the input dataframe is empty""" with self.assertRaises(ValueError) as context: - PandasTable( - "reviews", - pd.DataFrame(), - target_column="class", - ) + PandasTable("reviews", pd.DataFrame()) output_error_msg = str(context.exception) expected_msg = "'dataframe' is empty" self.assertEqual(output_error_msg, expected_msg) - def test_pandas_table_target_column_must_be_series(self): - """Test PandasTable raising TypeError if the input target col. 
isn't a Series""" - dataset_spec, _ = self.create_fixture_dataset_spec( - multitable=False, schema=None - ) - features_table = dataset_spec["tables"]["Reviews"][0] - with self.assertRaises(TypeError) as context: - PandasTable( - "reviews", - features_table, - target_column=AnotherType(), - ) - output_error_msg = str(context.exception) - expected_msg = type_error_message("target_column", AnotherType(), "array-like") - self.assertEqual(output_error_msg, expected_msg) - - def test_pandas_table_fails_if_target_column_is_already_in_the_features(self): - """Test in-memory table failing when the target is already in the features""" - dataset_spec, _ = self.create_fixture_dataset_spec( - multitable=False, schema=None - ) - features_table = dataset_spec["tables"]["Reviews"][0] - y = features_table["Recommended IND"] - with self.assertRaises(ValueError) as context: - PandasTable( - "reviews", - features_table, - target_column=y, - ) - output_error_msg = str(context.exception) - expected_msg = ( - "Target series name 'Recommended IND' is already present in" - " dataframe : ['User_ID', 'Age', 'Clothing ID', 'Date', 'New'," - " 'Title', 'Recommended IND', 'Positive Feedback average']" - ) - self.assertEqual(output_error_msg, expected_msg) - def test_pandas_table_column_ids_must_all_be_int_or_str(self): """Test that in-memory dataset all columns ids must be int or str""" - spec, y = self.create_fixture_dataset_spec(multitable=False, schema=None) + spec, _ = self.create_fixture_dataset_spec(multitable=False, schema=None) features_table = spec["tables"]["Reviews"][0] features_table.rename(columns={"User_ID": 1}, inplace=True) with self.assertRaises(TypeError) as context: - PandasTable("reviews", features_table, target_column=y) + PandasTable("reviews", features_table) output_error_msg = str(context.exception) expected_msg = ( "Dataframe column ids must be either all integers or all " @@ -881,21 +906,10 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self): def test_file_table_fails_with_non_existent_table_file(self): """Test FileTable failing when it is created with a non-existent file""" with self.assertRaises(ValueError) as context: - FileTable("reviews", "Review.csv", target_column_id="class") + FileTable("reviews", "Review.csv") output_error_msg = str(context.exception) expected_msg = "Non-existent data table file: Review.csv" - self.assertEqual(output_error_msg, expected_msg) - - def test_file_table_fails_with_empty_table_file(self): - """Test FileTable failing if it is created with an empty table""" - table_path = os.path.join(self.output_dir, "empty_table.csv") - table = pd.DataFrame(columns=["a", "b"]) - table.to_csv(table_path, sep="\t", index=False) - with self.assertRaises(ValueError) as context: - FileTable("empty_table", table_path, target_column_id="class") - output_error_msg = str(context.exception) - expected_msg_prefix = "Empty data table file" - self.assertIn(expected_msg_prefix, output_error_msg) + self.assertEqual(expected_msg, output_error_msg) def test_file_table_internal_file_creation_fails_on_an_existing_path(self): """Test FileTable failing to create an internal file to a existing path""" @@ -905,21 +919,16 @@ def test_file_table_internal_file_creation_fails_on_an_existing_path(self): old_file_path = spec["tables"]["Reviews"][0] new_file_path = old_file_path.replace("Reviews.csv", "copy_Reviews.txt") os.rename(old_file_path, new_file_path) - file_table = FileTable( - "Reviews", - new_file_path, - target_column_id="class", - key="User_ID", - ) + file_table = 
FileTable("Reviews", new_file_path, key="User_ID") with self.assertRaises(ValueError) as context: file_table.create_table_file_for_khiops(self.output_dir, sort=False) output_error_msg = str(context.exception) expected_msg_prefix = "Cannot overwrite this table's path" self.assertIn(expected_msg_prefix, output_error_msg) - #################################################### - # Tests for X tuple and sequence spec (deprecated) # - #################################################### + ########################################################## + # Tests for tuple and sequence dataset spec (deprecated) # + ########################################################## def test_tuple_spec_must_have_length_2(self): """Test that `.Dataset` raises `ValueError` when the tuple is not of size 2""" @@ -967,6 +976,7 @@ def test_sequence_spec_must_be_str_or_df(self): # Test that the second element is not str bad_spec = ["table_1", AnotherType()] expected_msg = ( - type_error_message("X[1]", bad_spec[1], str) + " as the first table in X" + type_error_message("Table at index 1", bad_spec[1], str) + + " as the first table in X" ) self.assert_dataset_fails(bad_spec, y, TypeError, expected_msg) diff --git a/tests/test_helper_functions.py b/tests/test_helper_functions.py index 86d43aa6..233d720f 100644 --- a/tests/test_helper_functions.py +++ b/tests/test_helper_functions.py @@ -5,10 +5,16 @@ # see the "LICENSE.md" file for more details. # ###################################################################################### """Tests for checking the output types of predictors""" +import contextlib +import io +import tempfile import unittest +import pandas as pd + from khiops.core.dictionary import DictionaryDomain from khiops.core.helpers import build_multi_table_dictionary_domain +from khiops.utils.helpers import sort_dataset, train_test_split_dataset class KhiopsHelperFunctions(unittest.TestCase): @@ -91,3 +97,513 @@ def test_build_multi_table_dictionary_domain(self): for test_var, ref_var in zip(test_dict.variables, ref_dict.variables): self.assertEqual(test_var.name, ref_var.name) self.assertEqual(test_var.type, ref_var.type) + + def test_sort_dataset_dataframe(self): + """Tests that the sort_dataset function works for dataframe datasets""" + # Create the fixture dataset + clients_df = pd.read_csv(io.StringIO(UNSORTED_CLIENTS_CSV)) + calls_df = pd.read_csv(io.StringIO(UNSORTED_CALLS_CSV)) + connections_df = pd.read_csv(io.StringIO(UNSORTED_CONNECTIONS_CSV)) + ds_spec = { + "main_table": "clients", + "tables": { + "clients": (clients_df, ["id"]), + "calls": (calls_df, ["id", "call_id"]), + "connections": (connections_df, ["id", "call_id"]), + }, + "relations": [("clients", "calls", False), ("calls", "connections", False)], + } + + # Call the sort_dataset function + sorted_ds_spec = sort_dataset(ds_spec) + ref_sorted_table_dfs = { + "clients": pd.read_csv(io.StringIO(CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(CALLS_CSV)), + "connections": pd.read_csv(io.StringIO(CONNECTIONS_CSV)), + } + + # Check that the structure of the sorted dataset + self._assert_dataset_keeps_structure(ds_spec, sorted_ds_spec) + + # Check that the table specs are the equivalent and the tables are sorted + for table_name in ds_spec["tables"]: + # Check that the dataframes are equal (ignoring the index) + self._assert_frame_equal( + ref_sorted_table_dfs[table_name].reset_index(drop=True), + sorted_ds_spec["tables"][table_name][0].reset_index(drop=True), + ) + + def test_sort_dataset_file(self): + """Tests that the sort_dataset 
function works for file datasets"""
+        # Create an execution context for temporary files and directories
+        with contextlib.ExitStack() as exit_stack:
+            # Create temporary files and a temporary directory
+            clients_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile())
+            calls_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile())
+            connections_csv_file = exit_stack.enter_context(
+                tempfile.NamedTemporaryFile()
+            )
+            tmp_dir = exit_stack.enter_context(tempfile.TemporaryDirectory())
+
+            # Create the fixture dataset
+            clients_csv_file.write(bytes(UNSORTED_CLIENTS_CSV, encoding="ascii"))
+            calls_csv_file.write(bytes(UNSORTED_CALLS_CSV, encoding="ascii"))
+            connections_csv_file.write(
+                bytes(UNSORTED_CONNECTIONS_CSV, encoding="ascii")
+            )
+            clients_csv_file.flush()
+            calls_csv_file.flush()
+            connections_csv_file.flush()
+            ds_spec = {
+                "main_table": "clients",
+                "tables": {
+                    "clients": (clients_csv_file.name, ["id"]),
+                    "calls": (calls_csv_file.name, ["id", "call_id"]),
+                    "connections": (connections_csv_file.name, ["id", "call_id"]),
+                },
+                "relations": [
+                    ("clients", "calls", False),
+                    ("calls", "connections", False),
+                ],
+                "format": (",", True),
+            }
+
+            # Call the sort_dataset function
+            sorted_ds_spec = sort_dataset(ds_spec, output_dir=tmp_dir)
+
+            # Check the structure of the sorted dataset
+            self._assert_dataset_keeps_structure(ds_spec, sorted_ds_spec)
+
+            # Check that the table specs are equivalent and the tables are sorted
+            ref_sorted_tables = {
+                "clients": CLIENTS_CSV,
+                "calls": CALLS_CSV,
+                "connections": CONNECTIONS_CSV,
+            }
+            for table_name, _ in ds_spec["tables"].items():
+                # Read the contents of the sorted table to a list of strings
+                sorted_table_spec = sorted_ds_spec["tables"][table_name]
+                sorted_table_file = exit_stack.enter_context(
+                    open(sorted_table_spec[0], encoding="ascii")
+                )
+                sorted_table = sorted_table_file.readlines()
+
+                # Transform the reference table string to a list of strings
+                ref_sorted_table = ref_sorted_tables[table_name].splitlines(
+                    keepends=True
+                )
+
+                # Check that the sorted table is equal to the reference
+                self.assertEqual(ref_sorted_table, sorted_table)
+
+    def test_train_test_split_dataset_dataframe(self):
+        """Tests that the train_test_split_dataset function works for df datasets"""
+        # Create the fixture dataset
+        clients_df = pd.read_csv(io.StringIO(CLIENTS_CSV))
+        calls_df = pd.read_csv(io.StringIO(CALLS_CSV))
+        connections_df = pd.read_csv(io.StringIO(CONNECTIONS_CSV))
+        ds_spec = {
+            "main_table": "clients",
+            "tables": {
+                "clients": (clients_df.drop("class", axis=1), ["id"]),
+                "calls": (calls_df, ["id", "call_id"]),
+                "connections": (connections_df, ["id", "call_id"]),
+            },
+            "relations": [("clients", "calls", False), ("calls", "connections", False)],
+        }
+        y = clients_df["class"]
+
+        # Execute the train/test split function
+        ds_spec_train, ds_spec_test, y_train, y_test = train_test_split_dataset(
+            ds_spec, y, test_size=0.5, random_state=31614
+        )
+
+        # Check that the targets are the same as the reference
+        ref_y_train = pd.read_csv(io.StringIO(TRAIN_DF_TARGET_CSV))["class"]
+        ref_y_test = pd.read_csv(io.StringIO(TEST_DF_TARGET_CSV))["class"]
+        self._assert_series_equal(ref_y_train, y_train.reset_index()["class"])
+        self._assert_series_equal(ref_y_test, y_test.reset_index()["class"])
+
+        # Check that the dataset spec structure is the same
+        self._assert_dataset_keeps_structure(ds_spec_train, ds_spec)
+        self._assert_dataset_keeps_structure(ds_spec_test, ds_spec)
+
+        # Check that the table contents 
match those of the references
+        split_ds_specs = {
+            "train": ds_spec_train,
+            "test": ds_spec_test,
+        }
+        ref_table_dfs = {
+            "train": {
+                "clients": pd.read_csv(io.StringIO(TRAIN_DF_CLIENTS_CSV)),
+                "calls": pd.read_csv(io.StringIO(TRAIN_DF_CALLS_CSV)),
+                "connections": pd.read_csv(io.StringIO(TRAIN_DF_CONNECTIONS_CSV)),
+            },
+            "test": {
+                "clients": pd.read_csv(io.StringIO(TEST_DF_CLIENTS_CSV)),
+                "calls": pd.read_csv(io.StringIO(TEST_DF_CALLS_CSV)),
+                "connections": pd.read_csv(io.StringIO(TEST_DF_CONNECTIONS_CSV)),
+            },
+        }
+        for split, ref_tables in ref_table_dfs.items():
+            for table_name in ds_spec["tables"]:
+                with self.subTest(split=split, table_name=table_name):
+                    self._assert_frame_equal(
+                        split_ds_specs[split]["tables"][table_name][0].reset_index(
+                            drop=True
+                        ),
+                        ref_tables[table_name].reset_index(drop=True),
+                    )
+
+    def test_train_test_split_dataset_file(self):
+        """Tests that the train_test_split_dataset function works for file datasets"""
+        # Create an execution context for temporary files and directories
+        with contextlib.ExitStack() as exit_stack:
+            # Create temporary files and a temporary directory
+            clients_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile())
+            calls_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile())
+            connections_csv_file = exit_stack.enter_context(
+                tempfile.NamedTemporaryFile()
+            )
+            tmp_dir = exit_stack.enter_context(tempfile.TemporaryDirectory())
+
+            # Create the fixture dataset
+            clients_csv_file.write(bytes(CLIENTS_CSV, encoding="ascii"))
+            calls_csv_file.write(bytes(CALLS_CSV, encoding="ascii"))
+            connections_csv_file.write(bytes(CONNECTIONS_CSV, encoding="ascii"))
+            clients_csv_file.flush()
+            calls_csv_file.flush()
+            connections_csv_file.flush()
+            ds_spec = {
+                "main_table": "clients",
+                "tables": {
+                    "clients": (clients_csv_file.name, ["id"]),
+                    "calls": (calls_csv_file.name, ["id", "call_id"]),
+                    "connections": (connections_csv_file.name, ["id", "call_id"]),
+                },
+                "relations": [
+                    ("clients", "calls", False),
+                    ("calls", "connections", False),
+                ],
+                "format": (",", True),
+            }
+
+            # Call the train_test_split_dataset function
+            train_ds_spec, test_ds_spec = train_test_split_dataset(
+                ds_spec, test_size=0.5, output_dir=tmp_dir
+            )
+            split_ds_specs = {"train": train_ds_spec, "test": test_ds_spec}
+
+            # Check the structure of the split datasets
+            self._assert_dataset_keeps_structure(ds_spec, train_ds_spec)
+            self._assert_dataset_keeps_structure(ds_spec, test_ds_spec)
+
+            # Check that the table specs are equivalent and the tables are correctly split
+            ref_split_tables = {
+                "train": {
+                    "clients": TRAIN_FILE_CLIENTS_CSV,
+                    "calls": TRAIN_FILE_CALLS_CSV,
+                    "connections": TRAIN_FILE_CONNECTIONS_CSV,
+                },
+                "test": {
+                    "clients": TEST_FILE_CLIENTS_CSV,
+                    "calls": TEST_FILE_CALLS_CSV,
+                    "connections": TEST_FILE_CONNECTIONS_CSV,
+                },
+            }
+            for split, split_ds_spec in split_ds_specs.items():
+                for table_name, _ in ds_spec["tables"].items():
+                    # Read the contents of the split table to a list of strings
+                    split_table_spec = split_ds_spec["tables"][table_name]
+                    split_table_file = exit_stack.enter_context(
+                        open(split_table_spec[0], encoding="ascii")
+                    )
+                    split_table = split_table_file.readlines()
+
+                    # Transform the reference table string to a list of strings
+                    ref_split_table = ref_split_tables[split][table_name].splitlines(
+                        keepends=True
+                    )
+
+                    # Check that the split table is equal to the reference
+                    self.assertEqual(split_table, ref_split_table)
+
+    def _assert_dataset_keeps_structure(self, ds_spec, 
ref_ds_spec):
+        """Asserts that the input dataset has the same structure as the reference
+
+        It does not check the contents of the tables.
+        """
+        # Check that the spec dictionary is the same excluding the tables
+        self.assertIn("main_table", ref_ds_spec)
+        self.assertIn("tables", ref_ds_spec)
+        self.assertIn("relations", ref_ds_spec)
+        self.assertEqual(ds_spec["main_table"], ref_ds_spec["main_table"])
+        self.assertEqual(ds_spec["relations"], ref_ds_spec["relations"])
+        self.assertEqual(ds_spec["tables"].keys(), ref_ds_spec["tables"].keys())
+        if "format" in ref_ds_spec:
+            self.assertIn("format", ds_spec)
+            self.assertEqual(ds_spec["format"], ref_ds_spec["format"])
+
+        # Check that the table keys are equal
+        for table_name, table_spec in ds_spec["tables"].items():
+            self.assertEqual(table_spec[1], ref_ds_spec["tables"][table_name][1])
+
+    def _assert_frame_equal(self, ref_df, out_df):
+        """Wrapper for the assert_frame_equal pandas function
+
+        In case of failure of assert_frame_equal we capture the AssertionError thrown by
+        it and make a unittest call to fail. This reports the error found by
+        assert_frame_equal while avoiding a double thrown exception.
+        """
+        failure_error = None
+        try:
+            pd.testing.assert_frame_equal(ref_df, out_df)
+        except AssertionError as error:
+            failure_error = error
+        if failure_error is not None:
+            self.fail(failure_error)
+
+    def _assert_series_equal(self, ref_series, out_series):
+        """Wrapper for the assert_series_equal pandas function
+
+        In case of failure of assert_series_equal we capture the AssertionError thrown
+        by it and make a unittest call to fail. This reports the error found by
+        assert_series_equal while avoiding a double thrown exception.
+        """
+        failure_error = None
+        try:
+            pd.testing.assert_series_equal(ref_series, out_series)
+        except AssertionError as error:
+            failure_error = error
+        if failure_error is not None:
+            self.fail(failure_error)
+
+
+# pylint: disable=line-too-long
+# fmt: off
+
+# Test data
+
+CLIENTS_CSV = """
+id,name,phone,email,address,numberrange,time,date,class
+1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024",1
+10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0
+13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0
+4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1
+7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1
+""".lstrip()
+
+CALLS_CSV = """
+id,call_id,duration
+1,1,38
+1,20,29
+10,2,7
+13,25,329
+13,3,1
+13,30,8
+4,14,48
+4,2,543
+7,4,339
+""".lstrip()
+
+CONNECTIONS_CSV = """
+id,call_id,connection_ip
+1,1,277.1.56.30
+1,1,147.43.67.35
+1,1,164.27.26.50
+1,20,199.44.70.12
+1,20,169.51.97.96
+10,2,170.05.79.41
+10,2,118.45.57.51
+13,25,193.23.02.67
+13,25,146.74.18.88
+13,25,118.41.87.47
+13,25,161.51.79.60
+13,3,115.45.02.58
+13,30,12.115.90.93
+4,14,16.56.66.16
+4,14,19.30.36.57
+4,14,15.16.40.67
+4,2,10.189.71.73
+4,2,10.6.76.93
+7,4,16.66.64.13
+7,4,15.13.69.18
+""".lstrip()
+
+UNSORTED_CLIENTS_CSV = """
+id,name,phone,email,address,numberrange,time,date,class
+13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0
+10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0
+1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. 
Rd.,2,3:02 PM,"May 1, 2024",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +""".lstrip() + +UNSORTED_CALLS_CSV = """ +id,call_id,duration +1,1,38 +10,2,7 +13,25,329 +4,2,543 +13,30,8 +13,3,1 +4,14,48 +1,20,29 +7,4,339 +""".lstrip() + +UNSORTED_CONNECTIONS_CSV = """ +id,call_id,connection_ip +13,25,193.23.02.67 +1,1,277.1.56.30 +4,14,16.56.66.16 +13,25,146.74.18.88 +13,25,118.41.87.47 +1,1,147.43.67.35 +4,14,19.30.36.57 +1,20,199.44.70.12 +10,2,170.05.79.41 +1,20,169.51.97.96 +10,2,118.45.57.51 +13,25,161.51.79.60 +13,3,115.45.02.58 +4,14,15.16.40.67 +1,1,164.27.26.50 +7,4,16.66.64.13 +13,30,12.115.90.93 +7,4,15.13.69.18 +4,2,10.189.71.73 +4,2,10.6.76.93 +""".lstrip() + +TRAIN_DF_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023" +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" +""".lstrip() + +TRAIN_DF_CALLS_CSV = """ +id,call_id,duration +7,4,339 +13,25,329 +13,3,1 +13,30,8 +""".lstrip() + +TRAIN_DF_TARGET_CSV = """ +class +1 +0 +""".lstrip() + +TRAIN_DF_CONNECTIONS_CSV = """ +id,call_id,connection_ip +7,4,16.66.64.13 +7,4,15.13.69.18 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +""".lstrip() + + +TEST_DF_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025" +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025" +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024" +""".lstrip() + +TEST_DF_TARGET_CSV = """ +class +1 +0 +1 +""".lstrip() + + +TEST_DF_CALLS_CSV = """ +id,call_id,duration +4,14,48 +4,2,543 +10,2,7 +1,1,38 +1,20,29 +""".lstrip() + +TEST_DF_CONNECTIONS_CSV = """ +id,call_id,connection_ip +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +10,2,170.05.79.41 +10,2,118.45.57.51 +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +""".lstrip() + +TRAIN_FILE_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0 +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +""".lstrip() + +TRAIN_FILE_CALLS_CSV = """ +id,call_id,duration +10,2,7 +13,25,329 +13,3,1 +13,30,8 +4,14,48 +4,2,543 +""".lstrip() + +TRAIN_FILE_CONNECTIONS_CSV = """ +id,call_id,connection_ip +10,2,170.05.79.41 +10,2,118.45.57.51 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +""".lstrip() + + +TEST_FILE_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. 
Rd.,2,3:02 PM,"May 1, 2024",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 +""".lstrip() + +TEST_FILE_CALLS_CSV = """ +id,call_id,duration +1,1,38 +1,20,29 +7,4,339 +""".lstrip() + +TEST_FILE_CONNECTIONS_CSV = """ +id,call_id,connection_ip +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +7,4,16.66.64.13 +7,4,15.13.69.18 +""".lstrip() diff --git a/tests/test_remote_access.py b/tests/test_remote_access.py index f3bfbc78..f43b15b4 100644 --- a/tests/test_remote_access.py +++ b/tests/test_remote_access.py @@ -132,7 +132,8 @@ def test_khiops_classifier_with_remote_access(self): iris_df = pd.read_csv(iris_data_file, sep="\t") iris_df.pop("Class") classifier.predict(iris_df) - self.assertTrue(fs.exists(fs.get_child_path(output_dir, "transformed.txt"))) + predict_path = fs.get_child_path(output_dir, "predict.txt") + self.assertTrue(fs.exists(predict_path), msg=f"Path: {predict_path}") # Cleanup for filename in fs.list_dir(output_dir): @@ -190,7 +191,7 @@ def test_train_predictor_fail_and_log_with_remote_access(self): log_file_path=log_file_path, ) # Check and remove log file - self.assertTrue(fs.exists(log_file_path)) + self.assertTrue(fs.exists(log_file_path), f"Path: {log_file_path}") fs.remove(log_file_path) diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 0024f1b3..f9ce3872 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -68,12 +68,6 @@ def assertEqualAdditionalDataTableNames( @classmethod def setUpClass(cls): """Prepare datasets for tests""" - # Disable file-path warnings - warnings.filterwarnings( - "ignore", - message="File-path dataset input is deprecated and will be removed", - ) - # Grab output_dir for subsequent deletion cls.output_dir = os.path.join( "resources", "tmp", "test_sklearn_parameter_transfer" @@ -1372,7 +1366,7 @@ def _check_dictionary_domain( expected_additional_data_table_names=(), ): """Check assertions on dictionary domains""" - self.assertIsInstance(dictionary_domain, kh.dictionary.DictionaryDomain) + self.assertIsInstance(dictionary_domain, kh.DictionaryDomain) if expected_n_dictionaries is not None: self.assertEqual( len(dictionary_domain.dictionaries), expected_n_dictionaries @@ -1625,7 +1619,7 @@ def _retrieve_data( ): return self.datasets[schema_type][source_type][estimation_process] - def _define_resources(self, dataset, estimator_type): + def _define_resources(self, dataset, estimator_type, estimator_method): # Set the resources directory for the arguments head_dir = os.path.join( KhiopsTestHelper.get_resources_dir(), "sklearn", "results" @@ -1655,7 +1649,18 @@ def _define_resources(self, dataset, estimator_type): report_path = os.path.join(ref_reports_dir, report_name) model_kdic_path = os.path.join(ref_models_dir, f"{kdic_name}.kdic") model_kdicj_path = os.path.join(ref_models_dir, f"{kdic_name}.kdicj") - prediction_table_path = os.path.join(ref_predictions_dir, "transformed.txt") + if estimator_type in (KhiopsCoclustering, KhiopsEncoder): + prediction_table_path = os.path.join(ref_predictions_dir, "transform.txt") + else: + if estimator_method == "predict": + prediction_table_path = os.path.join(ref_predictions_dir, "predict.txt") + elif estimator_method == "predict_proba": + prediction_table_path = os.path.join( + ref_predictions_dir, "predict_proba.txt" + ) + else: + assert estimator_method == "fit", f"Real: {estimator_method}" + prediction_table_path = "" # Buld the resources resources = { @@ -1780,7 +1785,7 @@ 
def _test_template( X_test_data = data["test"] dataset = self.dataset_of_schema_type[schema_type] - resources = self._define_resources(dataset, estimator_type) + resources = self._define_resources(dataset, estimator_type, estimator_method) estimator_type_key = ( KhiopsPredictor @@ -2266,20 +2271,37 @@ def test_sklearn_check_estimator(self): KhiopsEncoder(n_trees=0, transform_type_numerical="0-1_normalization"), ] - # Execute sklearn's estimator test battery - for khiops_estimator in khiops_estimators: - for estimator, check in check_estimator( - khiops_estimator, generate_only=True - ): - # Skip some checks for KhiopsEncoder as they yield "empty" - # deployed tables; they need to be implemented manually - check_name = check.func.__name__ - if check_name in [ - "check_fit_score_takes_y", - "check_fit_idempotent", - ] and isinstance(estimator, KhiopsEncoder): - continue - with self.subTest( - sklearn_check_name=check_name, sklearn_check_kwargs=check.keywords + # Ignore the "No informative variables" warnings + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"[\S\n\t\v ]+no informative variables" + ) + warnings.filterwarnings( + "ignore", message=r"[\S\n\t\v ]+No informative input variable" + ) + + # Execute sklearn's estimator test battery + print("") + for khiops_estimator in khiops_estimators: + for estimator, check in check_estimator( + khiops_estimator, generate_only=True ): - check(estimator) + # Skip some checks for KhiopsEncoder as they yield "empty" + # deployed tables; they need to be implemented manually + check_name = check.func.__name__ + if check_name in [ + "check_fit_score_takes_y", + "check_fit_idempotent", + ] and isinstance(estimator, KhiopsEncoder): + continue + print( + f">>> Executing {check_name} on " + f"{estimator.__class__.__name__}... ", + end="", + ) + with self.subTest( + sklearn_check_name=check_name, + sklearn_check_kwargs=check.keywords, + ): + check(estimator) + print("Done")