diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index beb7663b..2d3c9541 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,6 +16,13 @@ repos: hooks: - id: isort language_version: python3 + exclude: khiops/samples/samples.py|khiops/samples/samples_sklearn.py + - id: isort + alias: isort-samples + name: isort-samples + language_version: python3 + files: khiops/samples/samples.py|khiops/samples/samples_sklearn.py + args: [--no-sections] - repo: https://github.com/lyz-code/yamlfix/ rev: 1.16.0 hooks: @@ -25,13 +32,16 @@ repos: rev: 0.28.1 hooks: - id: check-github-workflows + name: gh-workflows args: [--verbose] - id: check-github-actions + name: gh-actions args: [--verbose] - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: 3.0.0 hooks: - id: shellcheck + name: shellcheck - repo: local hooks: - id: samples-generation diff --git a/doc/conf.py b/doc/conf.py index e4027f58..fbe9115d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -61,8 +61,13 @@ # List of patterns, relative to source directory, that match files and directories to # ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = ["_templates", "_build", "Thumbs.db", ".DS_Store"] - +exclude_patterns = [ + "_templates", + "_build", + "Thumbs.db", + ".DS_Store", + "**.ipynb_checkpoints", +] # HTML Theme # Theme colors and fonts come from https://brand.orange.com html_theme = "furo" @@ -70,6 +75,7 @@ "light_css_variables": { "color-brand-primary": "#FF7900", "color-brand-content": "#F16E00", + "color-brand-visited": "#FF7900", "color-sidebar-background": "#FFFFFF", "color-highlighted-background": "#FFD200", "color-admonition-title--note": "#FF7900", @@ -79,6 +85,7 @@ "dark_css_variables": { "color-brand-primary": "#FF7900", "color-brand-content": "#F16E00", + "color-brand-visited": "#FF7900", "color-sidebar-background": "#000000", "color-highlighted-background": "#FFD200", "color-admonition-title--note": "#FF7900", diff --git a/doc/convert_samples.py b/doc/convert_samples.py index ae204a8a..f1741cf3 100644 --- a/doc/convert_samples.py +++ b/doc/convert_samples.py @@ -8,51 +8,11 @@ import sys import textwrap - -def create_boilerplate_code(script_name): - if script_name == "samples": - boilerplate_code = [ - "import os\n", - "from math import sqrt\n", - "from os import path\n", - "\n", - "from khiops import core as kh\n", - "\n", - ] - elif script_name == "samples_sklearn": - boilerplate_code = [ - "import os\n", - "import pickle\n", - "from os import path\n", - "\n", - "import pandas as pd\n", - "from sklearn import metrics\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.experimental import enable_hist_gradient_boosting\n", - "from sklearn.ensemble import HistGradientBoostingClassifier\n", - "from sklearn.datasets import fetch_20newsgroups\n", - "from sklearn.feature_extraction.text import HashingVectorizer\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import OneHotEncoder\n", - "\n", - "from khiops import core as kh\n", - "from khiops.sklearn import (\n", - 
" KhiopsClassifier,\n", - " KhiopsCoclustering,\n", - " KhiopsEncoder,\n", - " KhiopsRegressor,\n", - ")\n", - ] - else: - raise ValueError(f"Invalid samples script name '{script_name}'") - return boilerplate_code +import black def create_header_cells(script_name): """Creates the header cells for the notebook""" - boilerplate_code = create_boilerplate_code(script_name) - # Create the boilerplate cells cells = [ { @@ -66,39 +26,41 @@ def create_header_cells(script_name): "[Khiops](https://khiops.org) before using this this notebook", ], }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {"collapsed": True}, - "outputs": [], - "source": boilerplate_code, - }, ] return cells -def create_sample_cell(sample_method): +def create_sample_cells(sample_method): """Creates a code cell and an execution cell for the specified method""" + # Create the code block + code, docstring = split_docstring(inspect.getsource(sample_method)) + code = textwrap.dedent(code) + code = black.format_str(code, mode=black.Mode()) + # Create the cell source as a list of lines - sample_method_source = inspect.getsource(sample_method) - sample_source_list = [line + "\n" for line in sample_method_source.split("\n")] - sample_source_list += ["#Run sample\n", sample_method.__name__ + "()"] + code_list = [line + "\n" for line in code.rstrip().split("\n")] + code_list[-1] = code_list[-1].rstrip() - sample_execution_cell = { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": sample_source_list, - } + sample_execution_cells = [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [f"### `{sample_method.__name__}()`\n\n", f"{docstring}\n"], + }, + { + "cell_type": "code", + "execution_count": None, + "metadata": {}, + "outputs": [], + "source": code_list, + }, + ] - return sample_execution_cell + return sample_execution_cells def create_rest_page_header(script_name): - boilerplate_code = "".join(create_boilerplate_code(script_name)) - 
indented_boilerplate_code = textwrap.indent(boilerplate_code, " ") subtitle = "The code snippets on this page demonstrate the basic use of the " if script_name == "samples": title = "Samples core" @@ -139,38 +101,37 @@ def create_rest_page_header(script_name): " from khiops.tools import download_datasets\n" " download_datasets()\n" "\n" - "Before copying any code snippet make sure to precede it with following\n" - "preamble:\n" - "\n" - ".. code-block:: python\n" - "\n" - f"{indented_boilerplate_code}" "\n" "Samples\n" "-------\n" ) -def remove_docstring(source): - docstring_open = source.find('"""') - if docstring_open == -1: +def split_docstring(source): + docstring_open_quote = source.find('"""') + if docstring_open_quote == -1: - source_without_docstring = sample_source + source_without_docstring = source + docstring = "" else: - docstring_close = source[docstring_open + 3 :].find('"""') - source_without_docstring = source[docstring_open + 3 + docstring_close + 4 :] - return source_without_docstring + docstring_close_quote = ( + docstring_open_quote + 3 + source[docstring_open_quote + 3 :].find('"""') + ) + source_without_docstring = source[docstring_close_quote + 4 :] + docstring = source[docstring_open_quote + 3 : docstring_close_quote] + return source_without_docstring, docstring def create_rest_page_section(sample_function): - code = f"def {sample_function.__name__}():\n" + remove_docstring( - inspect.getsource(sample_function) - ) - indented_code = textwrap.indent(code, " ") + code, _ = split_docstring(inspect.getsource(sample_function)) + code = textwrap.dedent(code) + code = black.format_str(code, mode=black.Mode()) + code = textwrap.indent(code, " ") + code = code.rstrip() return ( f".. autofunction:: {sample_function.__name__}\n" ".. 
code-block:: python\n" "\n" - f"{indented_code}" + f"{code}" ) @@ -184,6 +145,7 @@ def main(args): # Sanity check script_path = os.path.join(args.samples_dir, f"{script_name}.py") + print(f"Converting to format '{args.format}' samples script at {script_path}") if os.path.abspath(script_path) == os.path.abspath(args.output_path): print("error: input and output paths are the same") sys.exit(1) @@ -210,7 +172,7 @@ def main(args): notebook_objects = {} notebook_objects["cells"] = create_header_cells(script_name) for sample_method in samples.exported_samples: - notebook_objects["cells"].append(create_sample_cell(sample_method)) + notebook_objects["cells"].extend(create_sample_cells(sample_method)) notebook_objects["metadata"] = {} notebook_objects["nbformat"] = 4 notebook_objects["nbformat_minor"] = 2 diff --git a/doc/create-doc b/doc/create-doc index 17b2d363..1de0c1ac 100755 --- a/doc/create-doc +++ b/doc/create-doc @@ -90,21 +90,18 @@ fi # Create the coursework materials echo "Creating ZIP files" -(cd "$KHIOPS_TUTORIAL_REPO_DIR" && cp -r data helper_functions.py "../$tutorials_dir") cd "$tutorials_dir" mkdir -p exercises touch exercises/.dummy # Create a dummy so the "exercises" directory is created on unzip -zip "core_tutorials_solutions.zip" Core*.ipynb helper_functions.py data/*/* exercises/.dummy -zip "sklearn_tutorials_solutions.zip" Sklearn*.ipynb helper_functions.py data/*/* exercises/.dummy +zip "core_tutorials_solutions.zip" Core*.ipynb data/*/* exercises/.dummy +zip "sklearn_tutorials_solutions.zip" Sklearn*.ipynb data/*/* exercises/.dummy cd "$KHIOPS_TUTORIAL_REPO_DIR" python create-coursework.py cd coursework mkdir -p exercises touch exercises/.dummy # Create a dummy so the "exercises" directory is created on unzip -zip "../../$tutorials_dir/core_tutorials.zip" \ - Core*.ipynb helper_functions.py data/*/* exercises/.dummy -zip "../../$tutorials_dir/sklearn_tutorials.zip" \ - Sklearn*.ipynb helper_functions.py data/*/* exercises/.dummy +zip 
"../../$tutorials_dir/core_tutorials.zip" Core*.ipynb data/*/* exercises/.dummy +zip "../../$tutorials_dir/sklearn_tutorials.zip" Sklearn*.ipynb data/*/* exercises/.dummy cd "../.." # Create the documentation with Sphinx diff --git a/doc/samples/samples.rst b/doc/samples/samples.rst index ba1e3f48..5f35064b 100644 --- a/doc/samples/samples.rst +++ b/doc/samples/samples.rst @@ -30,17 +30,6 @@ If that doesn't work open a python console and execute: from khiops.tools import download_datasets download_datasets() -Before copying any code snippet make sure to precede it with following -preamble: - -.. code-block:: python - - import os - from math import sqrt - from os import path - - from khiops import core as kh - Samples ------- @@ -48,1357 +37,1452 @@ Samples .. autofunction:: get_khiops_version .. code-block:: python - def get_khiops_version(): - print(f"Khiops version: {kh.get_khiops_version()}") - + print(f"Khiops version: {kh.get_khiops_version()}") .. autofunction:: build_dictionary_from_data_table .. code-block:: python - def build_dictionary_from_data_table(): - # Set the file paths - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - dictionary_name = "AutoAdult" - dictionary_file_path = path.join( - "kh_samples", "build_dictionary_from_data_table", "AutoAdult.kdic" - ) - - # Create the dictionary from the data table - kh.build_dictionary_from_data_table( - data_table_path, dictionary_name, dictionary_file_path - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + dictionary_name = "AutoAdult" + dictionary_file_path = os.path.join( + "kh_samples", "build_dictionary_from_data_table", "AutoAdult.kdic" + ) + + # Create the dictionary from the data table + kh.build_dictionary_from_data_table( + data_table_path, dictionary_name, dictionary_file_path + ) .. autofunction:: detect_data_table_format .. 
code-block:: python - def detect_data_table_format(): - # Set the file paths - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - results_dir = path.join("kh_samples", "detect_data_table_format") - transformed_data_table_path = path.join(results_dir, "AdultWithAnotherFormat.txt") - - # Create the output directory - if not path.isdir(results_dir): - os.mkdir(results_dir) - - # Detect the format of the table - format_spec = kh.detect_data_table_format(data_table_path) - print("Format specification (header_line, field_separator)") - print("Format detected on original table:", format_spec) - - # Make a deployment to change the format of the data table - kh.deploy_model( - dictionary_file_path, - "Adult", - data_table_path, - transformed_data_table_path, - output_header_line=False, - output_field_separator=",", - ) - - # Detect the new format of the table without a dictionary file - format_spec = kh.detect_data_table_format(transformed_data_table_path) - print("Format detected on reformatted table:", format_spec) - - # Detect the new format of the table with a dictionary file - format_spec = kh.detect_data_table_format( - transformed_data_table_path, - dictionary_file_path_or_domain=dictionary_file_path, - dictionary_name="Adult", - ) - print("Format detected (with dictionary file) on reformatted table:", format_spec) + # Imports + import os + from khiops import core as kh + # Set the file paths + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + results_dir = os.path.join("kh_samples", "detect_data_table_format") + transformed_data_table_path = os.path.join(results_dir, "AdultWithAnotherFormat.txt") + + # Create the output directory + if not os.path.isdir(results_dir): + os.mkdir(results_dir) + + # Detect the format of the table + format_spec = 
kh.detect_data_table_format(data_table_path) + print("Format specification (header_line, field_separator)") + print("Format detected on original table:", format_spec) + + # Make a deployment to change the format of the data table + kh.deploy_model( + dictionary_file_path, + "Adult", + data_table_path, + transformed_data_table_path, + output_header_line=False, + output_field_separator=",", + ) + + # Detect the new format of the table without a dictionary file + format_spec = kh.detect_data_table_format(transformed_data_table_path) + print("Format detected on reformatted table:", format_spec) + + # Detect the new format of the table with a dictionary file + format_spec = kh.detect_data_table_format( + transformed_data_table_path, + dictionary_file_path_or_domain=dictionary_file_path, + dictionary_name="Adult", + ) + print("Format detected (with dictionary file) on reformatted table:", format_spec) .. autofunction:: check_database .. code-block:: python - def check_database(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - log_file = path.join("kh_samples", "check_database", "check_database.log") - - # Check the database - kh.check_database( - dictionary_file_path, - "Adult", - data_table_path, - log_file_path=log_file, - max_messages=50, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + log_file = os.path.join("kh_samples", "check_database", "check_database.log") + + # Check the database + kh.check_database( + dictionary_file_path, + "Adult", + data_table_path, + log_file_path=log_file, + max_messages=50, + ) .. autofunction:: export_dictionary_files .. 
code-block:: python - def export_dictionary_files(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - results_dir = path.join("kh_samples", "export_dictionary_file") - output_dictionary_file_path = path.join(results_dir, "ModifiedAdult.kdic") - output_dictionary_json_path = path.join(results_dir, "ModifiedAdult.kdicj") - alt_output_dictionary_json_path = path.join(results_dir, "AltModifiedAdult.kdicj") - - # Load the dictionary domain from initial dictionary file - # Then obtain the "Adult" dictionary within - domain = kh.read_dictionary_file(dictionary_file_path) - dictionary = domain.get_dictionary("Adult") - - # Set some of its variables to unused - fnlwgt_variable = dictionary.get_variable("fnlwgt") - fnlwgt_variable.used = False - label_variable = dictionary.get_variable("Label") - label_variable.used = False - - # Create output directory if necessary - if not path.exists("kh_samples"): - os.mkdir("kh_samples") + # Imports + import os + from khiops import core as kh + + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + results_dir = os.path.join("kh_samples", "export_dictionary_file") + output_dictionary_file_path = os.path.join(results_dir, "ModifiedAdult.kdic") + output_dictionary_json_path = os.path.join(results_dir, "ModifiedAdult.kdicj") + alt_output_dictionary_json_path = os.path.join(results_dir, "AltModifiedAdult.kdicj") + + # Load the dictionary domain from initial dictionary file + # Then obtain the "Adult" dictionary within + domain = kh.read_dictionary_file(dictionary_file_path) + dictionary = domain.get_dictionary("Adult") + + # Set some of its variables to unused + fnlwgt_variable = dictionary.get_variable("fnlwgt") + fnlwgt_variable.used = False + label_variable = dictionary.get_variable("Label") + label_variable.used = False + + # Create output directory if necessary + if not os.path.exists("kh_samples"): + os.mkdir("kh_samples") + 
os.mkdir(results_dir) + else: + if not os.path.exists(results_dir): os.mkdir(results_dir) - else: - if not path.exists(results_dir): - os.mkdir(results_dir) - - # Export to kdic - domain.export_khiops_dictionary_file(output_dictionary_file_path) - - # Export to kdicj either from the domain or from a kdic file - # Requires a Khiops execution, that's why it is not a method of DictionaryDomain - kh.export_dictionary_as_json(domain, output_dictionary_json_path) - kh.export_dictionary_as_json( - output_dictionary_file_path, alt_output_dictionary_json_path - ) + # Export to kdic + domain.export_khiops_dictionary_file(output_dictionary_file_path) + + # Export to kdicj either from the domain or from a kdic file + # Requires a Khiops execution, that's why it is not a method of DictionaryDomain + kh.export_dictionary_as_json(domain, output_dictionary_json_path) + kh.export_dictionary_as_json( + output_dictionary_file_path, alt_output_dictionary_json_path + ) .. autofunction:: train_predictor .. code-block:: python - def train_predictor(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor") - - # Train the predictor - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - max_trees=0, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor") + + # Train the predictor + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_trees=0, + ) .. autofunction:: train_predictor_file_paths .. 
code-block:: python - def train_predictor_file_paths(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_file_paths") - - # Train the predictor - report_file_path, modeling_dictionary_file_path = kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - max_trees=0, - ) - print("Reports file available at " + report_file_path) - print("Modeling dictionary file available at " + modeling_dictionary_file_path) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_file_paths") + + # Train the predictor + report_file_path, modeling_dictionary_file_path = kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_trees=0, + ) + print("Reports file available at " + report_file_path) + print("Modeling dictionary file available at " + modeling_dictionary_file_path) .. autofunction:: train_predictor_error_handling .. 
code-block:: python - def train_predictor_error_handling(): - # Set the file paths with a nonexistent dictionary file - dictionary_file_path = "NONEXISTENT_DICTIONARY_FILE.kdic" - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_error_handling") - log_file_path = path.join(results_dir, "khiops.log") - scenario_path = path.join(results_dir, "scenario._kh") - - # Train the predictor and handle the error - try: - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - trace=True, - log_file_path=log_file_path, - output_scenario_path=scenario_path, - ) - except kh.KhiopsRuntimeError as error: - print("Khiops training failed! Below the KhiopsRuntimeError message:") - print(error) - - print("\nFull log contents:") - print("------------------") - with open(log_file_path) as log_file: - for line in log_file: - print(line, end="") - - print("\nExecuted scenario") - print("-----------------") - with open(scenario_path) as scenario_file: - for line in scenario_file: - print(line, end="") + # Imports + import os + from khiops import core as kh -.. autofunction:: train_predictor_mt -.. 
code-block:: python + # Set the file paths with a nonexistent dictionary file + dictionary_file_path = "NONEXISTENT_DICTIONARY_FILE.kdic" + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_error_handling") + log_file_path = os.path.join(results_dir, "khiops.log") + scenario_path = os.path.join(results_dir, "scenario._kh") - def train_predictor_mt(): - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", "train_predictor_mt") - - # Train the predictor. Besides the mandatory parameters, we specify: - # - A python dictionary linking data paths to file paths for non-root tables - # - To not construct any decision tree - # The default number of automatic features is 100 + # Train the predictor and handle the error + try: kh.train_predictor( dictionary_file_path, - "Accident", - accidents_table_path, - "Gravity", + "Adult", + data_table_path, + "class", results_dir, - additional_data_tables={"Accident`Vehicles": vehicles_table_path}, - max_trees=0, + trace=True, + log_file_path=log_file_path, + output_scenario_path=scenario_path, ) + except kh.KhiopsRuntimeError as error: + print("Khiops training failed! Below the KhiopsRuntimeError message:") + print(error) + + print("\nFull log contents:") + print("------------------") + with open(log_file_path) as log_file: + for line in log_file: + print(line, end="") + + print("\nExecuted scenario") + print("-----------------") + with open(scenario_path) as scenario_file: + for line in scenario_file: + print(line, end="") +.. autofunction:: train_predictor_mt +.. 
code-block:: python + # Imports + import os + from khiops import core as kh + + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "train_predictor_mt") + + # Train the predictor. Besides the mandatory parameters, we specify: + # - A python dictionary linking data paths to file paths for non-root tables + # - To not construct any decision tree + # The default number of automatic features is 100 + kh.train_predictor( + dictionary_file_path, + "Accident", + accidents_table_path, + "Gravity", + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, + max_trees=0, + ) .. autofunction:: train_predictor_mt_with_specific_rules .. code-block:: python - def train_predictor_mt_with_specific_rules(): - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", "train_predictor_mt_with_specific_rules") - - # Train the predictor. 
Besides the mandatory parameters, it is specified: - # - A python dictionary linking data paths to file paths for non-root tables - # - The maximum number of aggregate variables to construct (1000) - # - The construction rules allowed to automatically create aggregates - # - To not construct any decision tree - kh.train_predictor( - dictionary_file_path, - "Accident", - accidents_table_path, - "Gravity", - results_dir, - additional_data_tables={"Accident`Vehicles": vehicles_table_path}, - max_constructed_variables=1000, - construction_rules=["TableMode", "TableSelection"], - max_trees=0, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "train_predictor_mt_with_specific_rules") + + # Train the predictor. Besides the mandatory parameters, it is specified: + # - A python dictionary linking data paths to file paths for non-root tables + # - The maximum number of aggregate variables to construct (1000) + # - The construction rules allowed to automatically create aggregates + # - To not construct any decision tree + kh.train_predictor( + dictionary_file_path, + "Accident", + accidents_table_path, + "Gravity", + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, + max_constructed_variables=1000, + construction_rules=["TableMode", "TableSelection"], + max_trees=0, + ) .. autofunction:: train_predictor_mt_snowflake .. 
code-block:: python - def train_predictor_mt_snowflake(): - - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "Accidents") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - users_table_path = path.join(accidents_dir, "Users.txt") - places_table_path = path.join(accidents_dir, "Places.txt") - results_dir = path.join("kh_samples", "train_predictor_mt_snowflake") - - # Train the predictor. Besides the mandatory parameters, we specify: - # - A python dictionary linking data paths to file paths for non-root tables - # - To not construct any decision tree - # The default number of automatic features is 100 - kh.train_predictor( - dictionary_file_path, - "Accident", - accidents_table_path, - "Gravity", - results_dir, - additional_data_tables={ - "Accident`Vehicles": vehicles_table_path, - "Accident`Vehicles`Users": users_table_path, - "Accident`Place": places_table_path, - }, - max_trees=0, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "train_predictor_mt_snowflake") + + # Train the predictor. 
Besides the mandatory parameters, we specify: + # - A python dictionary linking data paths to file paths for non-root tables + # - To not construct any decision tree + # The default number of automatic features is 100 + kh.train_predictor( + dictionary_file_path, + "Accident", + accidents_table_path, + "Gravity", + results_dir, + additional_data_tables={ + "Accident`Vehicles": vehicles_table_path, + "Accident`Vehicles`Users": users_table_path, + "Accident`Place": places_table_path, + }, + max_trees=0, + ) .. autofunction:: train_predictor_with_train_percentage .. code-block:: python - def train_predictor_with_train_percentage(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_train_percentage") - - # Train the predictor. Besides the mandatory parameters, it is specified: - # - A 90% sampling rate for the training dataset - # - Set the test dataset as the complement of the training dataset (10%) - # - No trees - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - sample_percentage=90, - use_complement_as_test=True, - max_trees=0, - results_prefix="P90_", - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_train_percentage") + + # Train the predictor. 
Besides the mandatory parameters, it is specified: + # - A 90% sampling rate for the training dataset + # - Set the test dataset as the complement of the training dataset (10%) + # - No trees + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + sample_percentage=90, + use_complement_as_test=True, + max_trees=0, + results_prefix="P90_", + ) .. autofunction:: train_predictor_with_trees .. code-block:: python - def train_predictor_with_trees(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Letter", "Letter.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Letter", "Letter.txt") - results_dir = path.join("kh_samples", "train_predictor_with_trees") - - # Train the predictor with at most 15 trees (default 10) - kh.train_predictor( - dictionary_file_path, - "Letter", - data_table_path, - "lettr", - results_dir, - sample_percentage=80, - use_complement_as_test=True, - results_prefix="P80_", - max_trees=15, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_trees") + + # Train the predictor with at most 15 trees (default 10) + kh.train_predictor( + dictionary_file_path, + "Letter", + data_table_path, + "lettr", + results_dir, + sample_percentage=80, + use_complement_as_test=True, + results_prefix="P80_", + max_trees=15, + ) .. autofunction:: train_predictor_with_pairs .. 
code-block:: python - def train_predictor_with_pairs(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_pairs") - - # Train the predictor with at most 10 pairs as follows: - # - Include pairs age-race and capital_gain-capital_loss - # - Include all possible pairs having relationship as component - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - use_complement_as_test=True, - max_trees=0, - max_pairs=10, - specific_pairs=[ - ("age", "race"), - ("capital_gain", "capital_loss"), - ("relationship", ""), - ], - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_pairs") + + # Train the predictor with at most 10 pairs as follows: + # - Include pairs age-race and capital_gain-capital_loss + # - Include all possible pairs having relationship as component + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + use_complement_as_test=True, + max_trees=0, + max_pairs=10, + specific_pairs=[ + ("age", "race"), + ("capital_gain", "capital_loss"), + ("relationship", ""), + ], + ) .. autofunction:: train_predictor_with_multiple_parameters .. 
code-block:: python - def train_predictor_with_multiple_parameters(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_multiple_parameters") - output_script_path = path.join(results_dir, "output_scenario._kh") - log_path = path.join(results_dir, "log.txt") - - # Set memory limit to 1000 Mb and train with Khiops - kh.get_runner().max_memory_mb = 1000 - - # Train the predictor. Besides the mandatory parameters, we specify: - # - The value "more" as main target value - # - The output Khiops script file location (generic) - # - The log file location (generic) - # - To show the debug trace (generic) - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - main_target_value="more", - output_scenario_path=output_script_path, - log_file_path=log_path, - trace=True, - ) - - # Reset memory limit to default Khiops tool value - kh.get_runner().max_memory_mb = 0 + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_multiple_parameters") + output_script_path = os.path.join(results_dir, "output_scenario._kh") + log_path = os.path.join(results_dir, "log.txt") + + # Set memory limit to 1000 Mb and train with Khiops + kh.get_runner().max_memory_mb = 1000 + + # Train the predictor. 
Besides the mandatory parameters, we specify: + # - The value "more" as main target value + # - The output Khiops script file location (generic) + # - The log file location (generic) + # - To show the debug trace (generic) + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + main_target_value="more", + output_scenario_path=output_script_path, + log_file_path=log_path, + trace=True, + ) + + # Reset memory limit to default Khiops tool value + kh.get_runner().max_memory_mb = 0 .. autofunction:: train_predictor_detect_format .. code-block:: python - def train_predictor_detect_format(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - results_dir = path.join("kh_samples", "train_predictor_detect_format") - transformed_data_table_path = path.join(results_dir, "TransformedIris.txt") - - # Transform the database format from header_line=True and field_separator=TAB - # to header_line=False and field_separator="," - # See the deploy_model examples below for more details - kh.deploy_model( - dictionary_file_path, - "Iris", - data_table_path, - transformed_data_table_path, - output_header_line=False, - output_field_separator=",", - ) + # Imports + import os + from khiops import core as kh - # Try to learn with the old format - try: - kh.train_predictor( - dictionary_file_path, - "Iris", - transformed_data_table_path, - "Class", - results_dir, - header_line=True, - field_separator="", - ) - except kh.KhiopsRuntimeError as error: - print( - "This failed because of a bad data table format spec.
" - + "Below the KhiopsRuntimeError message" - ) - print(error) - - # Train without specifyng the format (detect_format is True by default) + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") + results_dir = os.path.join("kh_samples", "train_predictor_detect_format") + transformed_data_table_path = os.path.join(results_dir, "TransformedIris.txt") + + # Transform the database format from header_line=True and field_separator=TAB + # to header_line=False and field_separator="," + # See the deploy_model examples below for more details + kh.deploy_model( + dictionary_file_path, + "Iris", + data_table_path, + transformed_data_table_path, + output_header_line=False, + output_field_separator=",", + ) + + # Try to learn with the old format + try: kh.train_predictor( dictionary_file_path, "Iris", transformed_data_table_path, "Class", results_dir, + header_line=True, + field_separator="", ) - + except kh.KhiopsRuntimeError as error: + print( + "This failed because of a bad data table format spec. " + + "Below the KhiopsRuntimeError message" + ) + print(error) + + # Train without specifying the format (detect_format is True by default) + kh.train_predictor( + dictionary_file_path, + "Iris", + transformed_data_table_path, + "Class", + results_dir, + ) .. autofunction:: train_predictor_with_cross_validation .. 
code-block:: python - def train_predictor_with_cross_validation(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_cross_validation") - fold_dictionary_file_path = path.join(results_dir, "AdultWithFolding.kdic") - - # Create the output directory - if not path.isdir(results_dir): - os.mkdir(results_dir) - - # Load the learning dictionary object - domain = kh.read_dictionary_file(dictionary_file_path) - dictionary = domain.get_dictionary("Adult") - - # Add a random fold index variable to the learning dictionary - fold_number = 5 - fold_index_variable = kh.Variable() - fold_index_variable.name = "FoldIndex" - fold_index_variable.type = "Numerical" - fold_index_variable.used = False - fold_index_variable.rule = "Ceil(Product(" + str(fold_number) + ", Random()))" - dictionary.add_variable(fold_index_variable) - - # Add variables that indicate if the instance is in the train dataset: - for fold_index in range(1, fold_number + 1): - is_in_train_dataset_variable = kh.Variable() - is_in_train_dataset_variable.name = "IsInTrainDataset" + str(fold_index) - is_in_train_dataset_variable.type = "Numerical" - is_in_train_dataset_variable.used = False - is_in_train_dataset_variable.rule = "NEQ(FoldIndex, " + str(fold_index) + ")" - dictionary.add_variable(is_in_train_dataset_variable) - - # Print dictionary with fold variables - print("Dictionary file with fold variables") - domain.export_khiops_dictionary_file(fold_dictionary_file_path) - with open(fold_dictionary_file_path) as fold_dictionary_file: - for line in fold_dictionary_file: - print(line, end="") - - # For each fold k: - print("Training Adult with " + str(fold_number) + " folds") - print("\tfold\ttrain auc\ttest auc") - train_aucs = [] - test_aucs = [] - for fold_index in range(1, fold_number + 1): - # Train a model from the 
sub-dataset where IsInTrainDataset is 1 - train_reports_path, modeling_dictionary_file_path = kh.train_predictor( - domain, - "Adult", - data_table_path, - "class", - results_dir, - sample_percentage=100, - selection_variable="IsInTrainDataset" + str(fold_index), - selection_value=1, - max_trees=0, - results_prefix="Fold" + str(fold_index), - ) - - # Evaluate the resulting model in the subsets where IsInTrainDataset is 0 - test_evaluation_report_path = kh.evaluate_predictor( - modeling_dictionary_file_path, - "Adult", - data_table_path, - results_dir, - sample_percentage=100, - selection_variable="IsInTrainDataset" + str(fold_index), - selection_value=0, - results_prefix="Fold" + str(fold_index), - ) - - # Obtain the train AUC from the train report and the test AUC from the - # evaluation report and print them - train_results = kh.read_analysis_results_file(train_reports_path) - test_evaluation_results = kh.read_analysis_results_file( - test_evaluation_report_path - ) - train_auc = train_results.train_evaluation_report.get_snb_performance().auc - test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc - print("\t" + str(fold_index) + "\t" + str(train_auc) + "\t" + str(test_auc)) - - # Store the train and test AUCs in arrays - train_aucs.append(train_auc) - test_aucs.append(test_auc) - - # Print the mean +- error aucs for both train and test - mean_train_auc = sum(train_aucs) / fold_number - squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs] - sd_train_auc = sqrt(sum(squared_error_train_aucs) / (fold_number - 1)) - - mean_test_auc = sum(test_aucs) / fold_number - squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs] - sd_test_auc = sqrt(sum(squared_error_test_aucs) / (fold_number - 1)) - - print("final auc") - print("train auc: " + str(mean_train_auc) + " +- " + str(sd_train_auc)) - print("test auc: " + str(mean_test_auc) + " +- " + str(sd_test_auc)) - -.. 
autofunction:: multiple_train_predictor -.. code-block:: python + # Imports + import math + import os + from khiops import core as kh - def multiple_train_predictor(): - - def display_test_results(json_result_file_path): - """Display some of the training results""" - results = kh.read_analysis_results_file(json_result_file_path) - train_performance = results.train_evaluation_report.get_snb_performance() - test_performance = results.test_evaluation_report.get_snb_performance() - print( - "\t" - + str(len(results.preparation_report.variables_statistics)) - + "\t" - + str(train_performance.auc) - + "\t" - + str(test_performance.auc) - ) - - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "multiple_train_predictor") - - # Read the dictionary file to obtain an instance of class Dictionary - dictionary_domain = kh.read_dictionary_file(dictionary_file_path) - dictionary = dictionary_domain.get_dictionary("Adult") - - # Train a SNB model using all the variables - print("\t#vars\ttrain auc\ttest auc") - json_result_file_path, _ = kh.train_predictor( - dictionary_file_path, + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_cross_validation") + fold_dictionary_file_path = os.path.join(results_dir, "AdultWithFolding.kdic") + + # Create the output directory + if not os.path.isdir(results_dir): + os.mkdir(results_dir) + + # Load the learning dictionary object + domain = kh.read_dictionary_file(dictionary_file_path) + dictionary = domain.get_dictionary("Adult") + + # Add a random fold index variable to the learning dictionary + fold_number = 5 + fold_index_variable = kh.Variable() + fold_index_variable.name = "FoldIndex" 
+ fold_index_variable.type = "Numerical" + fold_index_variable.used = False + fold_index_variable.rule = "Ceil(Product(" + str(fold_number) + ", Random()))" + dictionary.add_variable(fold_index_variable) + + # Add variables that indicate if the instance is in the train dataset: + for fold_index in range(1, fold_number + 1): + is_in_train_dataset_variable = kh.Variable() + is_in_train_dataset_variable.name = "IsInTrainDataset" + str(fold_index) + is_in_train_dataset_variable.type = "Numerical" + is_in_train_dataset_variable.used = False + is_in_train_dataset_variable.rule = "NEQ(FoldIndex, " + str(fold_index) + ")" + dictionary.add_variable(is_in_train_dataset_variable) + + # Print dictionary with fold variables + print("Dictionary file with fold variables") + domain.export_khiops_dictionary_file(fold_dictionary_file_path) + with open(fold_dictionary_file_path) as fold_dictionary_file: + for line in fold_dictionary_file: + print(line, end="") + + # For each fold k: + print("Training Adult with " + str(fold_number) + " folds") + print("\tfold\ttrain auc\ttest auc") + train_aucs = [] + test_aucs = [] + for fold_index in range(1, fold_number + 1): + # Train a model from the sub-dataset where IsInTrainDataset is 1 + train_reports_path, modeling_dictionary_file_path = kh.train_predictor( + domain, "Adult", data_table_path, "class", results_dir, - sample_percentage=70, - use_complement_as_test=True, + sample_percentage=100, + selection_variable="IsInTrainDataset" + str(fold_index), + selection_value=1, max_trees=0, + results_prefix="Fold" + str(fold_index), ) - display_test_results(json_result_file_path) - # Read results to obtain the variables sorted by decreasing Level - analysis_results = kh.read_analysis_results_file(json_result_file_path) - preparation_results = analysis_results.preparation_report - - # Train a sequence of models with a decreasing number of variables - # We disable variables one-by-one in increasing level (predictive power) order - variable_number = 
len(preparation_results.variables_statistics) - for i in reversed(range(variable_number)): - # Search the next variable - variable = preparation_results.variables_statistics[i] - - # Disable this variable and save the dictionary with the Khiops format - dictionary.get_variable(variable.name).used = False - - # Train the model with this dictionary domain object - prefix = f"V{variable_number - 1 - i}_" - json_result_file_path, _ = kh.train_predictor( - dictionary_domain, - "Adult", - data_table_path, - "class", - results_dir, - sample_percentage=70, - use_complement_as_test=True, - results_prefix=prefix, - max_trees=0, - ) - - # Show a preview of the results - display_test_results(json_result_file_path) - -.. autofunction:: evaluate_predictor -.. code-block:: python - - def evaluate_predictor(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "evaluate_predictor") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - - # Train the predictor - kh.train_predictor( - dictionary_file_path, + # Evaluate the resulting model in the subsets where IsInTrainDataset is 0 + test_evaluation_report_path = kh.evaluate_predictor( + modeling_dictionary_file_path, "Adult", data_table_path, - "class", results_dir, - max_trees=0, - ) - - # Evaluate the predictor - report_file_path = kh.evaluate_predictor( - model_dictionary_file_path, "Adult", data_table_path, results_dir + sample_percentage=100, + selection_variable="IsInTrainDataset" + str(fold_index), + selection_value=0, + results_prefix="Fold" + str(fold_index), ) - print("Evaluation report available at " + report_file_path) -.. 
autofunction:: access_predictor_evaluation_report + # Obtain the train AUC from the train report and the test AUC from the + # evaluation report and print them + train_results = kh.read_analysis_results_file(train_reports_path) + test_evaluation_results = kh.read_analysis_results_file(test_evaluation_report_path) + train_auc = train_results.train_evaluation_report.get_snb_performance().auc + test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc + print("\t" + str(fold_index) + "\t" + str(train_auc) + "\t" + str(test_auc)) + + # Store the train and test AUCs in arrays + train_aucs.append(train_auc) + test_aucs.append(test_auc) + + # Print the mean +- error aucs for both train and test + mean_train_auc = sum(train_aucs) / fold_number + squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs] + sd_train_auc = math.sqrt(sum(squared_error_train_aucs) / (fold_number - 1)) + + mean_test_auc = sum(test_aucs) / fold_number + squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs] + sd_test_auc = math.sqrt(sum(squared_error_test_aucs) / (fold_number - 1)) + + print("final auc") + print("train auc: " + str(mean_train_auc) + " +- " + str(sd_train_auc)) + print("test auc: " + str(mean_test_auc) + " +- " + str(sd_test_auc)) +.. autofunction:: multiple_train_predictor .. 
code-block:: python - def access_predictor_evaluation_report(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "access_predictor_evaluation_report") - evaluation_report_path = path.join(results_dir, "AllReports.khj") + # Imports + import os + from khiops import core as kh - # Train the SNB predictor and some univariate predictors - # Note: Evaluation in test is 30% by default - kh.train_predictor( - dictionary_file_path, + + def display_test_results(json_result_file_path): + """Display some of the training results""" + results = kh.read_analysis_results_file(json_result_file_path) + train_performance = results.train_evaluation_report.get_snb_performance() + test_performance = results.test_evaluation_report.get_snb_performance() + print( + "\t" + + str(len(results.preparation_report.variables_statistics)) + + "\t" + + str(train_performance.auc) + + "\t" + + str(test_performance.auc) + ) + + + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "multiple_train_predictor") + + # Read the dictionary file to obtain an instance of class Dictionary + dictionary_domain = kh.read_dictionary_file(dictionary_file_path) + dictionary = dictionary_domain.get_dictionary("Adult") + + # Train a SNB model using all the variables + print("\t#vars\ttrain auc\ttest auc") + json_result_file_path, _ = kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + sample_percentage=70, + use_complement_as_test=True, + max_trees=0, + ) + display_test_results(json_result_file_path) + + # Read results to obtain the variables sorted by decreasing Level + analysis_results = 
kh.read_analysis_results_file(json_result_file_path) + preparation_results = analysis_results.preparation_report + + # Train a sequence of models with a decreasing number of variables + # We disable variables one-by-one in increasing level (predictive power) order + variable_number = len(preparation_results.variables_statistics) + for i in reversed(range(variable_number)): + # Search the next variable + variable = preparation_results.variables_statistics[i] + + # Disable this variable and save the dictionary with the Khiops format + dictionary.get_variable(variable.name).used = False + + # Train the model with this dictionary domain object + prefix = f"V{variable_number - 1 - i}_" + json_result_file_path, _ = kh.train_predictor( + dictionary_domain, "Adult", data_table_path, "class", results_dir, + sample_percentage=70, + use_complement_as_test=True, + results_prefix=prefix, max_trees=0, - univariate_predictor_number=4, ) - # Obtain the evaluation results - results = kh.read_analysis_results_file(evaluation_report_path) - evaluation_report = results.test_evaluation_report - snb_performance = evaluation_report.get_snb_performance() + # Show a preview of the results + display_test_results(json_result_file_path) +.. autofunction:: evaluate_predictor +.. 
code-block:: python + + # Imports + import os + from khiops import core as kh - # Print univariate metrics for the SNB - print("\nperformance metrics for " + snb_performance.name) - for metric_name in snb_performance.get_metric_names(): - print(metric_name + ": " + str(snb_performance.get_metric(metric_name))) + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "evaluate_predictor") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + + # Train the predictor + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_trees=0, + ) + + # Evaluate the predictor + report_file_path = kh.evaluate_predictor( + model_dictionary_file_path, "Adult", data_table_path, results_dir + ) + print("Evaluation report available at " + report_file_path) +.. autofunction:: access_predictor_evaluation_report +.. 
code-block:: python - # Print the confusion matrix - print("\nconfusion matrix:") - confusion_matrix = snb_performance.confusion_matrix + # Imports + import os + from khiops import core as kh - for target_value in confusion_matrix.values: - print("\t" + target_value, end="") + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "access_predictor_evaluation_report") + evaluation_report_path = os.path.join(results_dir, "AllReports.khj") + + # Train the SNB predictor and some univariate predictors + # Note: Evaluation in test is 30% by default + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_trees=0, + univariate_predictor_number=4, + ) + + # Obtain the evaluation results + results = kh.read_analysis_results_file(evaluation_report_path) + evaluation_report = results.test_evaluation_report + snb_performance = evaluation_report.get_snb_performance() + + # Print univariate metrics for the SNB + print("\nperformance metrics for " + snb_performance.name) + for metric_name in snb_performance.get_metric_names(): + print(metric_name + ": " + str(snb_performance.get_metric(metric_name))) + + # Print the confusion matrix + print("\nconfusion matrix:") + confusion_matrix = snb_performance.confusion_matrix + + for target_value in confusion_matrix.values: + print("\t" + target_value, end="") + print("") + + for i, target_value in enumerate(confusion_matrix.values): + observed_frequencies = confusion_matrix.matrix[i] + print(target_value, end="") + for frequency in observed_frequencies: + print("\t" + str(frequency), end="") print("") - for i, target_value in enumerate(confusion_matrix.values): - observed_frequencies = confusion_matrix.matrix[i] - print(target_value, end="") - for frequency in observed_frequencies: - print("\t" + str(frequency), end="") - 
print("") - - # Print the head of the lift curves for the 'more' modality - print("\nfirst five values of the lift curves for 'more'") - - snb_lift_curve = evaluation_report.get_snb_lift_curve("more") - optimal_lift_curve = evaluation_report.get_classifier_lift_curve("Optimal", "more") - random_lift_curve = evaluation_report.get_classifier_lift_curve("Random", "more") - - for i in range(5): - print( - str(snb_lift_curve.values[i]) - + "\t" - + str(optimal_lift_curve.values[i]) - + "\t" - + str(random_lift_curve.values[i]) - ) - - # Print univariate metrics for an univariate predictor - predictor_performance = evaluation_report.get_predictor_performance( - "Univariate relationship" + # Print the head of the lift curves for the 'more' modality + print("\nfirst five values of the lift curves for 'more'") + + snb_lift_curve = evaluation_report.get_snb_lift_curve("more") + optimal_lift_curve = evaluation_report.get_classifier_lift_curve("Optimal", "more") + random_lift_curve = evaluation_report.get_classifier_lift_curve("Random", "more") + + for i in range(5): + print( + str(snb_lift_curve.values[i]) + + "\t" + + str(optimal_lift_curve.values[i]) + + "\t" + + str(random_lift_curve.values[i]) ) - print("\n\nperformance metrics for " + predictor_performance.name) - for metric_name in predictor_performance.get_metric_names(): - print(metric_name + ": " + str(predictor_performance.get_metric(metric_name))) + # Print univariate metrics for a univariate predictor + predictor_performance = evaluation_report.get_predictor_performance( + "Univariate relationship" + ) + print("\n\nperformance metrics for " + predictor_performance.name) + for metric_name in predictor_performance.get_metric_names(): + print(metric_name + ": " + str(predictor_performance.get_metric(metric_name))) .. autofunction:: train_recoder .. 
code-block:: python - def train_recoder(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_recoder") + # Imports + import os + from khiops import core as kh - # Train the recoder model - kh.train_recoder( - dictionary_file_path, "Adult", data_table_path, "class", results_dir - ) + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_recoder") + # Train the recoder model + kh.train_recoder(dictionary_file_path, "Adult", data_table_path, "class", results_dir) .. autofunction:: train_recoder_with_multiple_parameters .. code-block:: python - def train_recoder_with_multiple_parameters(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_recoder_with_multiple_parameters") - - # Train the recoder model - kh.train_recoder( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - max_pairs=10, - categorical_recoding_method="part label", - numerical_recoding_method="part label", - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_recoder_with_multiple_parameters") + + # Train the recoder model + kh.train_recoder( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_pairs=10, + categorical_recoding_method="part label", + numerical_recoding_method="part label", + ) 
.. autofunction:: train_recoder_mt_flatten .. code-block:: python - def train_recoder_mt_flatten(): - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", "train_recoder_mt_flatten") - - # Train the recoder. Besides the mandatory parameters, it is specified: - # - A python dictionary linking data paths to file paths for non-root tables - # - The maximum number of aggregate variables to construct (1000) - # - To keep all the created variables independently of their informativeness (level) - # - To not recode the variables values - kh.train_recoder( - dictionary_file_path, - "Accident", - accidents_table_path, - "Gravity", - results_dir, - additional_data_tables={"Accident`Vehicles": vehicles_table_path}, - max_constructed_variables=1000, - informative_variables_only=False, - categorical_recoding_method="none", - numerical_recoding_method="none", - keep_initial_categorical_variables=True, - keep_initial_numerical_variables=True, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "train_recoder_mt_flatten") + + # Train the recoder. 
Besides the mandatory parameters, it is specified: + # - A python dictionary linking data paths to file paths for non-root tables + # - The maximum number of aggregate variables to construct (1000) + # - To keep all the created variables independently of their informativeness (level) + # - To not recode the variables values + kh.train_recoder( + dictionary_file_path, + "Accident", + accidents_table_path, + "Gravity", + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, + max_constructed_variables=1000, + informative_variables_only=False, + categorical_recoding_method="none", + numerical_recoding_method="none", + keep_initial_categorical_variables=True, + keep_initial_numerical_variables=True, + ) .. autofunction:: deploy_model .. code-block:: python - def deploy_model(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_model") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "ScoresAdult.txt") - - # Train the predictor - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - max_trees=0, - ) - - # Deploy the model on the database - # It will score it according to the trained predictor - kh.deploy_model( - model_dictionary_file_path, "SNB_Adult", data_table_path, output_data_table_path - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_model") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") + + # Train the 
predictor + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_trees=0, + ) + + # Deploy the model on the database + # It will score it according to the trained predictor + kh.deploy_model( + model_dictionary_file_path, "SNB_Adult", data_table_path, output_data_table_path + ) .. autofunction:: deploy_model_mt .. code-block:: python - def deploy_model_mt(): - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", "deploy_model_mt") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "TransferredAccidents.txt") - - # Train the predictor (see train_predictor_mt for details) - kh.train_predictor( - dictionary_file_path, - "Accident", - accidents_table_path, - "Gravity", - results_dir, - additional_data_tables={"Accident`Vehicles": vehicles_table_path}, - max_trees=0, - ) - - # Deploy the model on the database - # Besides the mandatory parameters, it is specified: - # - A python dictionary linking data paths to file paths for non-root tables - kh.deploy_model( - model_dictionary_file_path, - "SNB_Accident", - accidents_table_path, - output_data_table_path, - additional_data_tables={"SNB_Accident`Vehicles": vehicles_table_path}, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "deploy_model_mt") + model_dictionary_file_path = 
os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt") + + # Train the predictor (see train_predictor_mt for details) + kh.train_predictor( + dictionary_file_path, + "Accident", + accidents_table_path, + "Gravity", + results_dir, + additional_data_tables={"Accident`Vehicles": vehicles_table_path}, + max_trees=0, + ) + + # Deploy the model on the database + # Besides the mandatory parameters, it is specified: + # - A python dictionary linking data paths to file paths for non-root tables + kh.deploy_model( + model_dictionary_file_path, + "SNB_Accident", + accidents_table_path, + output_data_table_path, + additional_data_tables={"SNB_Accident`Vehicles": vehicles_table_path}, + ) .. autofunction:: deploy_model_mt_snowflake .. code-block:: python - def deploy_model_mt_snowflake(): - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "Accidents") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - users_table_path = path.join(accidents_dir, "Users.txt") - places_table_path = path.join(accidents_dir, "Places.txt") - results_dir = path.join("kh_samples", "deploy_model_mt_snowflake") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "TransferredAccidents.txt") - - # Train the predictor. 
Besides the mandatory parameters, we specify: - # - A python dictionary linking data paths to file paths for non-root tables - # - To not construct any decision tree - # The default number of automatic features is 100 - kh.train_predictor( - dictionary_file_path, - "Accident", - accidents_table_path, - "Gravity", - results_dir, - additional_data_tables={ - "Accident`Vehicles": vehicles_table_path, - "Accident`Vehicles`Users": users_table_path, - "Accident`Place": places_table_path, - }, - max_trees=0, - ) - - # Deploy the model on the database - # Besides the mandatory parameters, it is specified: - # - A python dictionary linking data paths to file paths for non-root tables - kh.deploy_model( - model_dictionary_file_path, - "SNB_Accident", - accidents_table_path, - output_data_table_path, - additional_data_tables={ - "SNB_Accident`Vehicles": vehicles_table_path, - "SNB_Accident`Vehicles`Users": users_table_path, - "SNB_Accident`Place": places_table_path, - }, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "deploy_model_mt_snowflake") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt") + + # Train the predictor. 
Besides the mandatory parameters, we specify: + # - A python dictionary linking data paths to file paths for non-root tables + # - To not construct any decision tree + # The default number of automatic features is 100 + kh.train_predictor( + dictionary_file_path, + "Accident", + accidents_table_path, + "Gravity", + results_dir, + additional_data_tables={ + "Accident`Vehicles": vehicles_table_path, + "Accident`Vehicles`Users": users_table_path, + "Accident`Place": places_table_path, + }, + max_trees=0, + ) + + # Deploy the model on the database + # Besides the mandatory parameters, it is specified: + # - A python dictionary linking data paths to file paths for non-root tables + kh.deploy_model( + model_dictionary_file_path, + "SNB_Accident", + accidents_table_path, + output_data_table_path, + additional_data_tables={ + "SNB_Accident`Vehicles": vehicles_table_path, + "SNB_Accident`Vehicles`Users": users_table_path, + "SNB_Accident`Place": places_table_path, + }, + ) .. autofunction:: deploy_model_expert .. 
code-block:: python - def deploy_model_expert(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_model_expert") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "ScoresAdult.txt") - - # Train the predictor - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - max_trees=0, - ) - - # Read the dictionary file to obtain an instance of class Dictionary - model_domain = kh.read_dictionary_file(model_dictionary_file_path) - snb_dictionary = model_domain.get_dictionary("SNB_Adult") - - # Select Label (identifier) - snb_dictionary.get_variable("Label").used = True - - # Select the variables containing the probabilities for each class - for variable in snb_dictionary.variables: - # The variable must have a meta data with key that start with "target_prob" - for key in variable.meta_data.keys: - if key.startswith("TargetProb"): - variable.used = True - - # Deploy the model. 
Besides the mandatory parameters, it is specified: - # - A DictionaryDomain object to use instead of the mandatory dictionary file - kh.deploy_model(model_domain, "SNB_Adult", data_table_path, output_data_table_path) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_model_expert") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") + + # Train the predictor + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_trees=0, + ) + + # Read the dictionary file to obtain an instance of class Dictionary + model_domain = kh.read_dictionary_file(model_dictionary_file_path) + snb_dictionary = model_domain.get_dictionary("SNB_Adult") + + # Select Label (identifier) + snb_dictionary.get_variable("Label").used = True + + # Select the variables containing the probabilities for each class + for variable in snb_dictionary.variables: + # The variable must have a meta data key that starts with "TargetProb" + for key in variable.meta_data.keys: + if key.startswith("TargetProb"): + variable.used = True + + # Deploy the model. Besides the mandatory parameters, it is specified: + # - A DictionaryDomain object to use instead of the mandatory dictionary file + kh.deploy_model(model_domain, "SNB_Adult", data_table_path, output_data_table_path) .. autofunction:: deploy_classifier_for_metrics .. 
code-block:: python - def deploy_classifier_for_metrics(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_classifier_for_metrics") - output_data_table_path = path.join(results_dir, "ScoresAdult.txt") - - # Train the classifier for the target "class" - _, modeling_dictionary_file_path = kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - max_trees=0, - ) - - # Obtain the scores of the SNB on the test dataset to calculate the PR curve - kh.deploy_predictor_for_metrics( - modeling_dictionary_file_path, - "SNB_Adult", - data_table_path, - output_data_table_path, - sampling_mode="Exclude sample", - output_header_line=False, - ) - - # We estimate the precision/recall for the class "more" and increasing thresholds - # Note: Normally one would do this with a package (eg. 
sklearn.metrics) - thresholds = [0.1, 0.3, 0.5, 0.7, 0.9] - true_positives = {thres: 0 for thres in thresholds} - false_positives = {thres: 0 for thres in thresholds} - false_negatives = {thres: 0 for thres in thresholds} - with open(output_data_table_path) as output_data_table: - for line in output_data_table: - fields = line.split("\t") - true_target = fields[0] - proba_more = float(fields[3]) - for thres in thresholds: - if true_target == "more" and proba_more >= thres: - true_positives[thres] += 1 - elif true_target == "more" and proba_more < thres: - false_negatives[thres] += 1 - elif true_target == "less" and proba_more >= thres: - false_positives[thres] += 1 - - precision = { - thres: true_positives[thres] / (true_positives[thres] + false_positives[thres]) - for thres in thresholds - } - recall = { - thres: true_positives[thres] / (true_positives[thres] + false_negatives[thres]) - for thres in thresholds - } - - # Print the curve at the selected points - print("Precision and Recall for class 'more'") - print("threshold\trecall\tprecision") - thresholds.reverse() - for thres in thresholds: - print(str(thres) + "\t" + str(recall[thres]) + "\t" + str(precision[thres])) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_classifier_for_metrics") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") + + # Train the classifier for the target "class" + _, modeling_dictionary_file_path = kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + results_dir, + max_trees=0, + ) + + # Obtain the scores of the SNB on the test dataset to calculate the PR curve + kh.deploy_predictor_for_metrics( + modeling_dictionary_file_path, + "SNB_Adult", + data_table_path, + output_data_table_path, + 
sampling_mode="Exclude sample", + output_header_line=False, + ) + + # We estimate the precision/recall for the class "more" and increasing thresholds + # Note: Normally one would do this with a package (eg. sklearn.metrics) + thresholds = [0.1, 0.3, 0.5, 0.7, 0.9] + true_positives = {thres: 0 for thres in thresholds} + false_positives = {thres: 0 for thres in thresholds} + false_negatives = {thres: 0 for thres in thresholds} + with open(output_data_table_path) as output_data_table: + for line in output_data_table: + fields = line.split("\t") + true_target = fields[0] + proba_more = float(fields[3]) + for thres in thresholds: + if true_target == "more" and proba_more >= thres: + true_positives[thres] += 1 + elif true_target == "more" and proba_more < thres: + false_negatives[thres] += 1 + elif true_target == "less" and proba_more >= thres: + false_positives[thres] += 1 + + precision = { + thres: true_positives[thres] / (true_positives[thres] + false_positives[thres]) + for thres in thresholds + } + recall = { + thres: true_positives[thres] / (true_positives[thres] + false_negatives[thres]) + for thres in thresholds + } + + # Print the curve at the selected points + print("Precision and Recall for class 'more'") + print("threshold\trecall\tprecision") + thresholds.reverse() + for thres in thresholds: + print(str(thres) + "\t" + str(recall[thres]) + "\t" + str(precision[thres])) .. autofunction:: deploy_regressor_for_metrics .. 
code-block:: python - def deploy_regressor_for_metrics(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_regressor_for_metrics") - output_data_table_path = path.join(results_dir, "TrueAndPredictedAges.txt") - - # Train the regressor for the target "age" (with 20% train to be quick) - _, modeling_dictionary_file_path = kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "age", - results_dir, - sample_percentage=20, - max_trees=0, - ) - - # Obtain the predicted regression values of the SNB on the test dataset estimate R2 - kh.deploy_predictor_for_metrics( - modeling_dictionary_file_path, - "SNB_Adult", - data_table_path, - output_data_table_path, - sample_percentage=20, - sampling_mode="Exclude sample", - output_header_line=False, - ) - # Estimate R2 - # Note: Normally one would do this with a package (eg. 
sklearn.metrics) - # First pass to estimate sums of residuals and the mean - ss_res = 0 - mean = 0 - n_instances = 0 - with open(output_data_table_path) as output_data_table: - for line in output_data_table: - fields = line.split("\t") - true_target = float(fields[0]) - predicted_target = float(fields[1]) - ss_res += (true_target - predicted_target) ** 2 - mean += true_target - n_instances += 1 - mean /= n_instances - - # Second pass to estimate the total sums of squares and finish the R2 estimation - ss_tot = 0 - with open(output_data_table_path) as output_data_table: - for line in output_data_table: - fields = line.split("\t") - true_target = float(fields[0]) - ss_tot += (true_target - mean) ** 2 - r2_score = 1 - ss_res / ss_tot - - # Print results - print("Adult 'age' regression (30% train)") - print(f"R2 (explained variance) = {r2_score}") + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_regressor_for_metrics") + output_data_table_path = os.path.join(results_dir, "TrueAndPredictedAges.txt") + + # Train the regressor for the target "age" (with 20% train to be quick) + _, modeling_dictionary_file_path = kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "age", + results_dir, + sample_percentage=20, + max_trees=0, + ) + + # Obtain the predicted regression values of the SNB on the test dataset estimate R2 + kh.deploy_predictor_for_metrics( + modeling_dictionary_file_path, + "SNB_Adult", + data_table_path, + output_data_table_path, + sample_percentage=20, + sampling_mode="Exclude sample", + output_header_line=False, + ) + # Estimate R2 + # Note: Normally one would do this with a package (eg. 
sklearn.metrics) + # First pass to estimate sums of residuals and the mean + ss_res = 0 + mean = 0 + n_instances = 0 + with open(output_data_table_path) as output_data_table: + for line in output_data_table: + fields = line.split("\t") + true_target = float(fields[0]) + predicted_target = float(fields[1]) + ss_res += (true_target - predicted_target) ** 2 + mean += true_target + n_instances += 1 + mean /= n_instances + + # Second pass to estimate the total sums of squares and finish the R2 estimation + ss_tot = 0 + with open(output_data_table_path) as output_data_table: + for line in output_data_table: + fields = line.split("\t") + true_target = float(fields[0]) + ss_tot += (true_target - mean) ** 2 + r2_score = 1 - ss_res / ss_tot + + # Print results + print("Adult 'age' regression (30% train)") + print(f"R2 (explained variance) = {r2_score}") .. autofunction:: sort_data_table .. code-block:: python - def sort_data_table(): - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - output_data_table_path = path.join( - "kh_samples", - "sort_data_table", - "SortedAccidents.txt", - ) - - # Sort table - kh.sort_data_table( - dictionary_file_path, "Accident", accidents_table_path, output_data_table_path - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + output_data_table_path = os.path.join( + "kh_samples", + "sort_data_table", + "SortedAccidents.txt", + ) + + # Sort table + kh.sort_data_table( + dictionary_file_path, "Accident", accidents_table_path, output_data_table_path + ) .. autofunction:: sort_data_table_expert .. 
code-block:: python - def sort_data_table_expert(): - # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - output_data_table_path = path.join( - "kh_samples", "sort_data_table_expert", "SortedVehicles.txt" - ) + # Imports + import os + from khiops import core as kh - # Sort table. Besides the mandatory parameters, it is specified: - # - A list containing the sorting fields - kh.sort_data_table( - dictionary_file_path, - "Vehicle", - vehicles_table_path, - output_data_table_path, - sort_variables=["AccidentId", "VehicleId"], - ) + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + output_data_table_path = os.path.join( + "kh_samples", "sort_data_table_expert", "SortedVehicles.txt" + ) + + # Sort table. Besides the mandatory parameters, it is specified: + # - A list containing the sorting fields + kh.sort_data_table( + dictionary_file_path, + "Vehicle", + vehicles_table_path, + output_data_table_path, + sort_variables=["AccidentId", "VehicleId"], + ) +.. autofunction:: sort_data_tables_mt +.. 
code-block:: python + # Imports + import os + from khiops import core as kh + from khiops.utils.helpers import sort_dataset + + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "sort_data_tables_mt") + + # Build the dataset spec + ds_spec = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_table_path, "AccidentId"), + "Vehicles": (vehicles_table_path, ["AccidentId", "VehicleId"]), + "Users": (users_table_path, ["AccidentId", "VehicleId"]), + "Places": (places_table_path, "AccidentId"), + }, + } + + # Sort the dataset + sort_dataset(ds_spec, output_dir=results_dir) .. autofunction:: extract_keys_from_data_table .. code-block:: python - def extract_keys_from_data_table(): - # Set the file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - output_data_table_path = path.join( - "kh_samples", - "extract_keys_from_data_table", - "KeysSpliceJunction.txt", - ) - - # Extract keys from table "SpliceJunctionDNA" to the output table - kh.extract_keys_from_data_table( - dictionary_file_path, - "SpliceJunctionDNA", - data_table_path, - output_data_table_path, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + output_data_table_path = os.path.join( + "kh_samples", + "extract_keys_from_data_table", + "KeysSpliceJunction.txt", + ) + + # Extract keys from 
table "SpliceJunctionDNA" to the output table + kh.extract_keys_from_data_table( + dictionary_file_path, + "SpliceJunctionDNA", + data_table_path, + output_data_table_path, + ) .. autofunction:: train_coclustering .. code-block:: python - def train_coclustering(): - # Set the file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "train_coclustering") - - # Train a coclustering model for variables "SampleId" and "Char" - coclustering_file_path = kh.train_coclustering( - dictionary_file_path, - "SpliceJunctionDNA", - data_table_path, - ["SampleId", "Char"], - results_dir, - ) - print("Coclustering file available at " + coclustering_file_path) + # Imports + import os + from khiops import core as kh + # Set the file paths + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "train_coclustering") + + # Train a coclustering model for variables "SampleId" and "Char" + coclustering_file_path = kh.train_coclustering( + dictionary_file_path, + "SpliceJunctionDNA", + data_table_path, + ["SampleId", "Char"], + results_dir, + ) + print("Coclustering file available at " + coclustering_file_path) .. autofunction:: simplify_coclustering .. 
code-block:: python - def simplify_coclustering(): - # Set the file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "simplify_coclustering") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") - simplified_coclustering_file_name = "simplified_coclustering.khc" - - # Train coclustering model for variables "SampleId" and "Char" - kh.train_coclustering( - dictionary_file_path, - "SpliceJunctionDNA", - data_table_path, - ["SampleId", "Char"], - results_dir, - ) - - # Simplify the trained coclustering with the constraints - # - maximum information preserved: 80% - # - maximum total parts number: 4 - kh.simplify_coclustering( - coclustering_file_path, - simplified_coclustering_file_name, - results_dir, - max_preserved_information=80, - max_total_parts=4, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "simplify_coclustering") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") + simplified_coclustering_file_name = "simplified_coclustering.khc" + + # Train coclustering model for variables "SampleId" and "Char" + kh.train_coclustering( + dictionary_file_path, + "SpliceJunctionDNA", + data_table_path, + ["SampleId", "Char"], + results_dir, + ) + + # Simplify the trained coclustering with the constraints + # - maximum information preserved: 80% + # - maximum total parts number: 4 + kh.simplify_coclustering( + coclustering_file_path, + simplified_coclustering_file_name, + results_dir, + max_preserved_information=80, + max_total_parts=4, + ) .. 
autofunction:: extract_clusters .. code-block:: python - def extract_clusters(): - # Set the file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "extract_clusters") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") - clusters_file_path = path.join(results_dir, "extracted_clusters.txt") - - # Train a coclustering model for variables "SampleId" and "Char" - kh.train_coclustering( - dictionary_file_path, - "SpliceJunctionDNA", - data_table_path, - ["SampleId", "Char"], - results_dir, - ) - - # Extract clusters - kh.extract_clusters(coclustering_file_path, "Char", clusters_file_path) - + # Imports + import os + from khiops import core as kh + + # Set the file paths + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "extract_clusters") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") + clusters_file_path = os.path.join(results_dir, "extracted_clusters.txt") + + # Train a coclustering model for variables "SampleId" and "Char" + kh.train_coclustering( + dictionary_file_path, + "SpliceJunctionDNA", + data_table_path, + ["SampleId", "Char"], + results_dir, + ) + + # Extract clusters + kh.extract_clusters(coclustering_file_path, "Char", clusters_file_path) .. autofunction:: deploy_coclustering .. 
code-block:: python - def deploy_coclustering(): - # Set the initial file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - results_dir = path.join("kh_samples", "deploy_coclustering") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") - - # Train a coclustering model for variables "SampleId" and "Char" - kh.train_coclustering( - dictionary_file_path, - "SpliceJunctionDNA", - data_table_path, - ["SampleId", "Char"], - results_dir, - ) - - # Deploy "Char" clusters in the training database - kh.deploy_coclustering( - dictionary_file_path, - "SpliceJunctionDNA", - data_table_path, - coclustering_file_path, - ["SampleId"], - "Char", - results_dir, - header_line=True, - ) + # Imports + import os + from khiops import core as kh + # Set the initial file paths + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + results_dir = os.path.join("kh_samples", "deploy_coclustering") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") + + # Train a coclustering model for variables "SampleId" and "Char" + kh.train_coclustering( + dictionary_file_path, + "SpliceJunctionDNA", + data_table_path, + ["SampleId", "Char"], + results_dir, + ) + + # Deploy "Char" clusters in the training database + kh.deploy_coclustering( + dictionary_file_path, + "SpliceJunctionDNA", + data_table_path, + coclustering_file_path, + ["SampleId"], + "Char", + results_dir, + header_line=True, + ) .. autofunction:: deploy_coclustering_expert .. 
code-block:: python - def deploy_coclustering_expert(): - # Set the initial file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunction.txt") - secondary_data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "deploy_coclustering_expert") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") - - # Train a coclustering model for variables "SampleId" and "Char" - print("train coclustering on SpliceJunctionDNA") - kh.train_coclustering( - dictionary_file_path, - "SpliceJunctionDNA", - secondary_data_table_path, - ["SampleId", "Char"], - results_dir, - ) - - print("prepare_coclustering_deployment") - # The input dictionary is extended with new coclustering based variables - kh.prepare_coclustering_deployment( - dictionary_file_path, - "SpliceJunction", - coclustering_file_path, - "DNA", - "SampleId", - results_dir, - ) - augmented_dictionary_file_path = path.join(results_dir, "Coclustering.kdic") - - print("prepare_coclustering_deployment with at most two clusters") - # Extend the already extended dictionary with the new variables from a simplified CC - kh.prepare_coclustering_deployment( - augmented_dictionary_file_path, - "SpliceJunction", - coclustering_file_path, - "DNA", - "SampleId", - results_dir, - results_prefix="Reaugmented", - variables_prefix="C2_", - max_part_numbers={"SampleId": 2}, - ) - - reaugmented_dictionary_file_path = path.join( - results_dir, "ReaugmentedCoclustering.kdic" - ) - output_data_table_path = path.join(results_dir, "TransferredSpliceJunction.txt") - - # Deploy the coclustering with the extended dictionary - print("deploy_model with the new coclustering based variables") - kh.deploy_model( - reaugmented_dictionary_file_path, - "SpliceJunction", - data_table_path, - output_data_table_path, - 
additional_data_tables={"SpliceJunction`DNA": secondary_data_table_path}, - ) - - deployed_dictionary_file_path = path.join( - results_dir, "Transferred_Coclustering.kdic" - ) - print("build_deployed_dictionary to get the new dictionary") - kh.build_deployed_dictionary( - reaugmented_dictionary_file_path, - "SpliceJunction", - deployed_dictionary_file_path, - ) + # Imports + import os + from khiops import core as kh + # Set the initial file paths + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunction.txt") + secondary_data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "deploy_coclustering_expert") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") + + # Train a coclustering model for variables "SampleId" and "Char" + print("train coclustering on SpliceJunctionDNA") + kh.train_coclustering( + dictionary_file_path, + "SpliceJunctionDNA", + secondary_data_table_path, + ["SampleId", "Char"], + results_dir, + ) + + print("prepare_coclustering_deployment") + # The input dictionary is extended with new coclustering based variables + kh.prepare_coclustering_deployment( + dictionary_file_path, + "SpliceJunction", + coclustering_file_path, + "DNA", + "SampleId", + results_dir, + ) + augmented_dictionary_file_path = os.path.join(results_dir, "Coclustering.kdic") + + print("prepare_coclustering_deployment with at most two clusters") + # Extend the already extended dictionary with the new variables from a simplified CC + kh.prepare_coclustering_deployment( + augmented_dictionary_file_path, + "SpliceJunction", + coclustering_file_path, + "DNA", + "SampleId", + results_dir, + results_prefix="Reaugmented", + variables_prefix="C2_", + max_part_numbers={"SampleId": 2}, + ) + + reaugmented_dictionary_file_path = os.path.join( + results_dir, 
"ReaugmentedCoclustering.kdic" + ) + output_data_table_path = os.path.join(results_dir, "TransferredSpliceJunction.txt") + + # Deploy the coclustering with the extended dictionary + print("deploy_model with the new coclustering based variables") + kh.deploy_model( + reaugmented_dictionary_file_path, + "SpliceJunction", + data_table_path, + output_data_table_path, + additional_data_tables={"SpliceJunction`DNA": secondary_data_table_path}, + ) + + deployed_dictionary_file_path = os.path.join( + results_dir, "Transferred_Coclustering.kdic" + ) + print("build_deployed_dictionary to get the new dictionary") + kh.build_deployed_dictionary( + reaugmented_dictionary_file_path, + "SpliceJunction", + deployed_dictionary_file_path, + ) .. autofunction:: scenario_prologue .. code-block:: python - def scenario_prologue(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "scenario_prologue") - - # Set the maximum memory "by hand" with an scenario prologue - kh.get_runner().scenario_prologue = """ - // Max memory 2000 mb - AnalysisSpec.SystemParameters.MemoryLimit 2000 - """ - - # Train the predictor - kh.train_predictor( - dictionary_file_path, - "Adult", - data_table_path, - "class", - results_dir, - max_trees=0, - ) + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "scenario_prologue") + + # Set the maximum memory "by hand" with an scenario prologue + kh.get_runner().scenario_prologue = """ + // Max memory 2000 mb + AnalysisSpec.SystemParameters.MemoryLimit 2000 + """ + + # Train the predictor + kh.train_predictor( + dictionary_file_path, + "Adult", + data_table_path, + "class", + 
results_dir, + max_trees=0, + ) .. autofunction:: build_deployed_dictionary .. code-block:: python - def build_deployed_dictionary(): - # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - results_dir = path.join("kh_samples", "build_deployed_dictionary") - deployed_dictionary_file_path = path.join(results_dir, "SNB_Iris_deployed.kdic") - - # Train the predictor - _, modeling_dictionary_file_path = kh.train_predictor( - dictionary_file_path, - "Iris", - data_table_path, - "Class", - results_dir, - max_trees=0, - ) - - # Build the dictionary to read the output of the predictor dictionary file - # It will contain the columns of the table generated by deploying the model - kh.build_deployed_dictionary( - modeling_dictionary_file_path, - "SNB_Iris", - deployed_dictionary_file_path, - ) - - # Print the deployed dictionary - with open(deployed_dictionary_file_path) as deployed_dictionary_file: - for line in deployed_dictionary_file: - print(line, end="") + # Imports + import os + from khiops import core as kh + # Set the file paths + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") + results_dir = os.path.join("kh_samples", "build_deployed_dictionary") + deployed_dictionary_file_path = os.path.join(results_dir, "SNB_Iris_deployed.kdic") + + # Train the predictor + _, modeling_dictionary_file_path = kh.train_predictor( + dictionary_file_path, + "Iris", + data_table_path, + "Class", + results_dir, + max_trees=0, + ) + + # Build the dictionary to read the output of the predictor dictionary file + # It will contain the columns of the table generated by deploying the model + kh.build_deployed_dictionary( + modeling_dictionary_file_path, + "SNB_Iris", + deployed_dictionary_file_path, + ) + + # Print the deployed dictionary + with 
open(deployed_dictionary_file_path) as deployed_dictionary_file: + for line in deployed_dictionary_file: + print(line, end="") diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 5d9c4ee1..12f08ee2 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -30,775 +30,749 @@ If that doesn't work open a python console and execute: from khiops.tools import download_datasets download_datasets() -Before copying any code snippet make sure to precede it with following -preamble: +Samples +------- + +.. autofunction:: khiops_classifier .. code-block:: python + # Imports import os - import pickle - from os import path - import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier from sklearn import metrics - from sklearn.compose import ColumnTransformer - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import HistGradientBoostingClassifier - from sklearn.datasets import fetch_20newsgroups - from sklearn.feature_extraction.text import HashingVectorizer from sklearn.model_selection import train_test_split - from sklearn.pipeline import Pipeline - from sklearn.preprocessing import OneHotEncoder - - from khiops import core as kh - from khiops.sklearn import ( - KhiopsClassifier, - KhiopsCoclustering, - KhiopsEncoder, - KhiopsRegressor, - ) - -Samples -------- -.. autofunction:: khiops_classifier -.. 
code-block:: python + # Load the dataset into a pandas dataframe + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_path, sep="\t") - def khiops_classifier(): - # Load the dataset into a pandas dataframe - adult_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - adult_df = pd.read_csv(adult_path, sep="\t") - - # Split the whole dataframe into train and test (70%-30%) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.3, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("class" column) - X_train = adult_train_df.drop("class", axis=1) - X_test = adult_test_df.drop("class", axis=1) - y_train = adult_train_df["class"] - y_test = adult_test_df["class"] - - # Create the classifier object - khc = KhiopsClassifier() - - # Train the classifier - khc.fit(X_train, y_train) - - # Predict the classes on the test dataset - y_test_pred = khc.predict(X_test) - print("Predicted classes (first 10):") - print(y_test_pred[0:10]) - print("---") - - # Predict the class probabilities on the test dataset - y_test_probas = khc.predict_proba(X_test) - print(f"Class order: {khc.classes_}") - print("Predicted class probabilities (first 10):") - print(y_test_probas[0:10]) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred) - test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") + # Split the whole dataframe into train and test (70%-30%) + adult_train_df, adult_test_df = train_test_split( + adult_df, test_size=0.3, random_state=1 + ) + # Split the dataset into: + # - the X feature table + # - the y target vector ("class" column) + X_train = adult_train_df.drop("class", axis=1) + X_test = adult_test_df.drop("class", axis=1) + y_train = adult_train_df["class"] + y_test = 
adult_test_df["class"] + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_sparse .. code-block:: python - def khiops_classifier_sparse(): - # Load 3 classes of the 20newsgroups dataset - categories = ["comp.graphics", "sci.space", "misc.forsale", "alt.atheism"] - data_train, y_train = fetch_20newsgroups( - subset="train", - categories=categories, - return_X_y=True, - ) - data_test, y_test = fetch_20newsgroups( - subset="test", - categories=categories, - return_X_y=True, - ) - - # Extract features from the train and test data using a sparse vectorizer - vectorizer = HashingVectorizer(n_features=2048, stop_words="english") - X_train = vectorizer.fit_transform(data_train) - X_test = vectorizer.transform(data_test) - - # Print density of the intermediary datasets - print(f"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}") - print(f"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}") - print("---") - - # Create the classifier object (no trees) - khc = KhiopsClassifier(n_trees=0) - - # Train the classifier - khc.fit(X_train, y_train) - - # Predict the classes on the test dataset - y_test_pred = khc.predict(X_test) - print("Predicted classes (first 10):") - print(y_test_pred[0:10]) 
- print("---") - - # Predict the class probabilities on the test dataset - y_test_probas = khc.predict_proba(X_test) - print(f"Class order: {khc.classes_}") - print("Predicted class probabilities (first 10):") - print(y_test_probas[0:10]) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred) - test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr") - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") + # Imports + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.datasets import fetch_20newsgroups + from sklearn.feature_extraction.text import HashingVectorizer + # Load 4 classes of the 20newsgroups dataset + categories = ["comp.graphics", "sci.space", "misc.forsale", "alt.atheism"] + data_train, y_train = fetch_20newsgroups( + subset="train", + categories=categories, + return_X_y=True, + ) + data_test, y_test = fetch_20newsgroups( + subset="test", + categories=categories, + return_X_y=True, + ) + + # Extract features from the train and test data using a sparse vectorizer + vectorizer = HashingVectorizer(n_features=2048, stop_words="english") + X_train = vectorizer.fit_transform(data_train) + X_test = vectorizer.transform(data_test) + + # Print density of the intermediary datasets + print(f"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}") + print(f"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}") + print("---") + + # Create the classifier object (no trees) + khc = KhiopsClassifier(n_trees=0) + + # Train the classifier + khc.fit(X_train, y_train) + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted 
class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr") + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_multiclass .. code-block:: python - def khiops_classifier_multiclass(): - # Load the dataset into a pandas dataframe - iris_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Split the whole dataframe into train and test (70%-30%) - iris_train_df, iris_test_df = train_test_split( - iris_df, test_size=0.3, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("Class" column) - X_train = iris_train_df.drop("Class", axis=1) - X_test = iris_test_df.drop("Class", axis=1) - y_train = iris_train_df["Class"] - y_test = iris_test_df["Class"] - - # Create the classifier object - khc = KhiopsClassifier() - - # Train the classifier - khc.fit(X_train, y_train) - - # Predict the classes on the test dataset - y_test_pred = khc.predict(X_test) - print("Predicted classes (first 10):") - print(y_test_pred[:10]) - print("---") - - # Predict the class probabilities on the test datasets - y_test_probas = khc.predict_proba(X_test) - print(f"Class order: {khc.classes_}") - print("Predicted class probabilities (first 10):") - print(y_test_probas[:10]) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred) - test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr") - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from 
sklearn.model_selection import train_test_split + # Load the dataset into a pandas dataframe + iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") + iris_df = pd.read_csv(iris_path, sep="\t") + + # Split the whole dataframe into train and test (70%-30%) + iris_train_df, iris_test_df = train_test_split(iris_df, test_size=0.3, random_state=1) + + # Split the dataset into: + # - the X feature table + # - the y target vector ("Class" column) + X_train = iris_train_df.drop("Class", axis=1) + X_test = iris_test_df.drop("Class", axis=1) + y_train = iris_train_df["Class"] + y_test = iris_test_df["Class"] + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probabilities on the test datasets + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr") + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_multitable_star .. 
code-block:: python - def khiops_classifier_multitable_star(): - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_train = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main, "AccidentId"), - "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), - }, - } - X_test = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_test_main, "AccidentId"), - "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), - }, - } - - # Train the classifier (by default it analyzes 100 multi-table features) - khc = KhiopsClassifier() - khc.fit(X_train, y_train) - - # Predict the class on the test dataset - y_test_pred = khc.predict(X_test) - print("Predicted classes (first 
10):") - print(y_test_pred[:10]) - print("---") - - # Predict the class probability on the test dataset - y_test_probas = khc.predict_proba(X_test) - print(f"Class order: {khc.classes_}") - print("Predicted class probabilities (first 10):") - print(y_test_probas[:10]) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred) - test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics + + # Load the dataset into pandas dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_data_dir, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") + + # Create the dataset spec and the target + X = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), + }, + } + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split_dataset( + X, y, test_size=0.3, random_state=1 + ) + + # Train the classifier (by default it analyzes 100 multi-table features) + khc = KhiopsClassifier() + khc.fit(X_train, y_train) + + # Predict the class on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + 
print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") +.. autofunction:: khiops_classifier_multitable_star_file +.. code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics + + # Create output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file") + if not os.path.exists("kh_samples"): + os.mkdir("kh_samples") + os.mkdir(results_dir) + else: + if not os.path.exists(results_dir): + os.mkdir(results_dir) + + # Create the dataset spec + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + X = { + "main_table": "Accidents", + "tables": { + "Accidents": ( + os.path.join(accidents_data_dir, "Accidents.txt"), + "AccidentId", + ), + "Vehicles": ( + os.path.join(accidents_data_dir, "Vehicles.txt"), + ["AccidentId", "VehicleId"], + ), + }, + "format": ("\t", True), + } + # Split the dataset into train and test + X_train, X_test = train_test_split_dataset( + X, output_dir=os.path.join(results_dir, "split"), test_size=0.3 + ) + + # Create the classifier and fit it + khc = KhiopsClassifier(output_dir=results_dir) + khc.fit(X_train, y="Gravity") + + # Predict the class in addition to the class probabilities on the test dataset + y_test_pred_path = khc.predict(X_test) + y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") + print("Predicted classes (first 10):") + print(y_test_pred["PredictedGravity"].head(10)) + print("---") + + y_test_probas_path = khc.predict_proba(X_test) + y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") + proba_columns = [col for col in y_test_probas if 
col.startswith("Prob")] + print("Predicted class probabilities (first 10):") + print(y_test_probas[proba_columns].head(10)) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + # Note: For roc_auc_score we have to use the "greatest" label which is "NonLethal" + y_test = pd.read_csv( + X_test["tables"]["Accidents"][0], + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ) + test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) + test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityNonLethal"]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_multitable_snowflake .. code-block:: python - def khiops_classifier_multitable_snowflake(): - # Load the dataset tables into dataframes - accidents_dataset_path = path.join(kh.get_samples_dir(), "Accidents") - accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - users_df = pd.read_csv( - path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" - ) - vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t", encoding="latin1" - ) - places_df = pd.read_csv( - path.join(accidents_dataset_path, "Places.txt"), sep="\t", encoding="latin1" - ) - # Build the multitable input X - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column - X = { - "main_table": "Accidents", - "tables": { - "Accidents": (accidents_df, "AccidentId"), - "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), - "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), - "Places": (places_df, ["AccidentId"]), - }, - "relations": [ - ("Accidents", "Vehicles"), - ("Vehicles", "Users"), - ("Accidents", "Places", True), - ], - } - - # Load the target variable from the AccidentsSummary dataset - y = pd.read_csv( - path.join(kh.get_samples_dir(), 
"AccidentsSummary", "Accidents.txt"), - sep="\t", - encoding="latin1", - )["Gravity"] - - # Train the classifier (by default it creates 1000 multi-table features) - khc = KhiopsClassifier(n_trees=0) - khc.fit(X, y) - - # Predict the class on the test dataset - y_pred = khc.predict(X) - print("Predicted classes (first 10):") - print(y_pred[:10]) - print("---") - - # Predict the class probability on the train dataset - y_probas = khc.predict_proba(X) - print(f"Class order: {khc.classes_}") - print("Predicted class probabilities (first 10):") - print(y_probas[:10]) - print("---") - - # Evaluate accuracy and auc metrics on the train dataset - train_accuracy = metrics.accuracy_score(y_pred, y) - train_auc = metrics.roc_auc_score(y, y_probas[:, 1]) - print(f"Train accuracy = {train_accuracy}") - print(f"Train auc = {train_auc}") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics + + # Load the dataset tables into dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_df = pd.read_csv( + os.path.join(accidents_data_dir, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + users_df = pd.read_csv( + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" + ) + vehicles_df = pd.read_csv( + os.path.join(accidents_data_dir, "Vehicles.txt"), + sep="\t", + encoding="latin1", + ) + places_df = pd.read_csv( + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" + ) + # Create the dataset spec + # Note: We discard the "Gravity" column from the "Users" table to avoid a target + # leak. This is because the column was used to build the target. 
+ X = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_df, "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), + "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), + "Places": (places_df, ["AccidentId"]), + }, + "relations": [ + ("Accidents", "Vehicles"), + ("Vehicles", "Users"), + ("Accidents", "Places", True), + ], + } + + # Load the target variable "Gravity" from the "AccidentsSummary" dataset + y = pd.read_csv( + os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ).squeeze("columns") + + # Split into train and test datasets + X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) + + # Train the classifier (by default it creates 1000 multi-table features) + khc = KhiopsClassifier(n_trees=0) + khc.fit(X_train, y_train) + + # Predict the class on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test_pred, y_test) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_classifier_pickle .. 
code-block:: python - def khiops_classifier_pickle(): - # Load the dataset into a pandas dataframe - iris_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop(["Class"], axis=1) - y = iris_df["Class"] - khc = KhiopsClassifier() - khc.fit(X, y) - - # Create/clean the output directory - results_dir = path.join("kh_samples", "khiops_classifier_pickle") - khc_pickle_path = path.join(results_dir, "khiops_classifier.pkl") - if path.exists(khc_pickle_path): - os.remove(khc_pickle_path) - else: - os.makedirs(results_dir, exist_ok=True) - - # Pickle its content to a file - with open(khc_pickle_path, "wb") as khc_pickle_write_file: - pickle.dump(khc, khc_pickle_write_file) - - # Unpickle it - with open(khc_pickle_path, "rb") as khc_pickle_file: - new_khc = pickle.load(khc_pickle_file) - - # Make some predictions on the training dataset with the unpickled classifier - new_khc.predict(X) - y_predicted = new_khc.predict(X) - print("Predicted classes (first 10):") - print(y_predicted[:10]) - print("---") - + # Imports + import os + import pickle + from khiops.sklearn import KhiopsClassifier + from sklearn.datasets import load_iris + + # Create/clean the output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_pickle") + khc_pickle_path = os.path.join(results_dir, "khiops_classifier.pkl") + if os.path.exists(khc_pickle_path): + os.remove(khc_pickle_path) + else: + os.makedirs(results_dir, exist_ok=True) + + # Train the model with the Iris dataset + X, y = load_iris(return_X_y=True) + khc = KhiopsClassifier() + khc.fit(X, y) + + # Pickle its content to a file + with open(khc_pickle_path, "wb") as khc_pickle_output_file: + pickle.dump(khc, khc_pickle_output_file) + + # Unpickle it + with open(khc_pickle_path, "rb") as khc_pickle_file: + new_khc = pickle.load(khc_pickle_file) + + # Make some predictions on the training dataset with the unpickled 
classifier + new_khc.predict(X) + y_predicted = new_khc.predict(X) + print("Predicted classes (first 10):") + print(y_predicted[:10]) + print("---") .. autofunction:: khiops_regressor .. code-block:: python - def khiops_regressor(): - # Load the dataset into a pandas dataframe - adult_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - adult_df = pd.read_csv(adult_path, sep="\t") - - # Split the whole dataframe into train and test (40%-60% for speed) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.6, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("age" column) - X_train = adult_train_df.drop("age", axis=1) - X_test = adult_test_df.drop("age", axis=1) - y_train = adult_train_df["age"] - y_test = adult_test_df["age"] - - # Create the regressor object - khr = KhiopsRegressor() - - # Train the regressor - khr.fit(X_train, y_train) - - # Predict the values on the test dataset - y_test_pred = khr.predict(X_test) - print("Predicted values for 'age' (first 10):") - print(y_test_pred[:10]) - print("---") - - # Evaluate R2 and MAE metrics on the test dataset - test_r2 = metrics.r2_score(y_test, y_test_pred) - test_mae = metrics.mean_absolute_error(y_test, y_test_pred) - print(f"Test R2 = {test_r2}") - print(f"Test MAE = {test_mae}") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsRegressor + from sklearn import metrics + from sklearn.model_selection import train_test_split -.. autofunction:: khiops_encoder -.. 
code-block:: python + # Load the "Adult" dataset and set the target to the "age" column + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("age", axis=1) + y = adult_df["age"] - def khiops_encoder(): - # Load the dataset into a pandas dataframe - iris_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") + # Split the whole dataframe into train and test (90%-10%) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1) - # Train the model with the whole dataset - X = iris_df.drop("Class", axis=1) - y = iris_df["Class"] + # Create the regressor object + khr = KhiopsRegressor() - # Create the encoder object - khe = KhiopsEncoder() - khe.fit(X, y) + # Train the regressor + khr.fit(X_train, y_train) - # Transform the training dataset - X_transformed = khe.transform(X) + # Predict the values on the test dataset + y_test_pred = khr.predict(X_test) + print("Predicted values for 'age' (first 10):") + print(y_test_pred[:10]) + print("---") - # Print both the original and transformed features - print("Original:") - print(X.head(10)) - print("---") - print("Encoded feature names:") - print(khe.feature_names_out_) - print("Encoded data:") - print(X_transformed[:10]) - print("---") + # Evaluate R2 and MAE metrics on the test dataset + test_r2 = metrics.r2_score(y_test, y_test_pred) + test_mae = metrics.mean_absolute_error(y_test, y_test_pred) + print(f"Test R2 = {test_r2}") + print(f"Test MAE = {test_mae}") +.. autofunction:: khiops_encoder +.. 
code-block:: python + # Imports + from khiops.sklearn import KhiopsEncoder + from sklearn.datasets import load_iris + + # Load the dataset + X, y = load_iris(return_X_y=True) + + # Create the encoder object + khe = KhiopsEncoder(transform_type_numerical="part_label") + khe.fit(X, y) + + # Transform the training dataset + X_transformed = khe.transform(X) + + # Print both the original and transformed features + print("Original:") + print(X[:10]) + print("---") + print("Encoded feature names:") + print(khe.feature_names_out_) + print("Encoded data:") + print(X_transformed[:10]) + print("---") .. autofunction:: khiops_encoder_multitable_star .. code-block:: python - def khiops_encoder_multitable_star(): - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), - }, - } - - # Create the KhiopsEncoder with 10 additional multitable features and fit it - khe = KhiopsEncoder(n_features=10) - khe.fit(X_dataset, y) - - # Transform the train dataset - print("Encoded feature names:") - print(khe.feature_names_out_) - print("Encoded data:") - print(khe.transform(X_dataset)[:10]) - + # Imports + import os + import pandas as pd + from khiops import core as kh + 
from khiops.sklearn import KhiopsEncoder + + # Load the dataset tables into dataframe + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_data_dir, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") + + # Build the multi-table spec and the target + X = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), + }, + } + y = accidents_df["Gravity"] + + # Create the KhiopsEncoder with 10 multitable features and fit it + khe = KhiopsEncoder(n_features=10) + khe.fit(X, y) + + # Transform the train dataset + print("Encoded feature names:") + print(khe.feature_names_out_) + print("Encoded data:") + print(khe.transform(X)[:10]) .. autofunction:: khiops_encoder_multitable_snowflake .. code-block:: python - def khiops_encoder_multitable_snowflake(): - # Load the tables into dataframes - accidents_dataset_path = path.join(kh.get_samples_dir(), "Accidents") - accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), sep="\t", encoding="latin1" - ) - users_df = pd.read_csv( - path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" - ) - vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t", encoding="latin1" - ) - - # Build the multitable input X - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column - X = { - "main_table": "Accidents", - "tables": { - "Accidents": (accidents_df, "AccidentId"), - "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), - "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), - }, - "relations": [ - ("Accidents", "Vehicles"), - ("Vehicles", "Users"), - ], - } - - # Load the target variable from the AccidentsSummary 
dataset - y = pd.read_csv( - path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), - sep="\t", - encoding="latin1", - )["Gravity"] - - # Create the KhiopsEncoder with 10 additional multitable features and fit it - khe = KhiopsEncoder(n_features=10) - khe.fit(X, y) - - # Transform the train dataset - print("Encoded feature names:") - print(khe.feature_names_out_) - print("Encoded data:") - print(khe.transform(X)[:10]) + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsEncoder + + # Load the tables into dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_df = pd.read_csv( + os.path.join(accidents_data_dir, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + places_df = pd.read_csv( + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" + ) + users_df = pd.read_csv( + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" + ) + vehicles_df = pd.read_csv( + os.path.join(accidents_data_dir, "Vehicles.txt"), + sep="\t", + encoding="latin1", + ) + # Build the multi-table spec + # Note: We discard the "Gravity" field from the "Users" table as it was used to + # build the target column + X = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_df, "AccidentId"), + "Places": (places_df, "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), + "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), + }, + "relations": [ + ("Accidents", "Vehicles"), + ("Accidents", "Places", True), + ("Vehicles", "Users"), + ], + } + + # Load the target variable from the AccidentsSummary dataset + y = pd.read_csv( + os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ).squeeze("columns") + + # Create the KhiopsEncoder with 10 additional multitable features and fit it + khe = 
KhiopsEncoder(n_features=10) + khe.fit(X, y) + + # Transform the train dataset + print("Encoded feature names:") + print(khe.feature_names_out_) + print("Encoded data:") + print(khe.transform(X)[:10]) .. autofunction:: khiops_encoder_pipeline_with_hgbc .. code-block:: python - def khiops_encoder_pipeline_with_hgbc(): - # Load the dataset into a pandas dataframe - adult_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - adult_df = pd.read_csv(adult_path, sep="\t") - - # Split the whole dataframe into train and test (70%-30%) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.3, random_state=1 - ) - - # Split the dataset into: - # - the X feature table - # - the y target vector ("class" column) - X_train = adult_train_df.drop("class", axis=1) - X_test = adult_test_df.drop("class", axis=1) - y_train = adult_train_df["class"] - y_test = adult_test_df["class"] - - # Create the pipeline and fit it. Steps: - # - The khiops supervised column encoder, generates a full-categorical table - # - One hot encoder in all columns - # - Train the HGB classifier - pipe_steps = [ - ("khiops_enc", KhiopsEncoder()), - ( - "onehot_enc", - ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)), - # For sklearn < 1.2, use - # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)), - ), - ("hgb_clf", HistGradientBoostingClassifier()), - ] - pipe = Pipeline(pipe_steps) - pipe.fit(X_train, y_train) - - # Predict the classes on the test dataset - y_test_pred = pipe.predict(X_test) - print("Predicted classes (first 10):") - print(y_test_pred[:10]) - print("---") - - # Predict the class probabilities on the test dataset - y_test_probas = pipe.predict_proba(X_test) - print("Predicted class probabilities (first 10):") - print(y_test_probas[:10]) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred) - test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) 
- print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsEncoder + from sklearn import metrics + from sklearn.compose import ColumnTransformer + from sklearn.ensemble import HistGradientBoostingClassifier + from sklearn.model_selection import train_test_split + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + # Load the dataset into dataframes + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] + + # Split the dataset into train and test (70%-30%) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) + + # Create the pipeline and fit it. Steps: + # - The khiops supervised column encoder, generates a full-categorical table + # - One hot encoder in all columns + # - Train the HGB classifier + pipe_steps = [ + ("khiops_enc", KhiopsEncoder()), + ( + "onehot_enc", + ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)), + ), + ("hgb_clf", HistGradientBoostingClassifier()), + ] + pipe = Pipeline(pipe_steps) + pipe.fit(X_train, y_train) + + # Predict the classes on the test dataset + y_test_pred = pipe.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = pipe.predict_proba(X_test) + print("Predicted class probabilities (first 10):") + print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_coclustering .. 
code-block:: python - def khiops_coclustering(): - # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = path.join(kh.get_samples_dir(), "SpliceJunction") - splice_dna_X = pd.read_csv( - path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" - ) + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsCoclustering + from sklearn.model_selection import train_test_split - # Train with only 70% of data (for speed in this example) - X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) + # Load the secondary table of the dataset into a pandas dataframe + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_dna_df = pd.read_csv( + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" + ) - # Create the KhiopsCoclustering instance - khcc = KhiopsCoclustering() + # Train with only 70% of data (for speed in this example) + X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1) - # Train the model with the whole dataset - khcc.fit(X, id_column="SampleId") + # Create the KhiopsCoclustering instance + khcc = KhiopsCoclustering() - # Predict the clusters in some instances - X_clusters = khcc.predict(X) - print("Predicted clusters (first 10)") - print(X_clusters[:10]) - print("---") + # Train the model with the whole dataset + khcc.fit(X, id_column="SampleId") + # Predict the clusters in some instances + X_clusters = khcc.predict(X) + print("Predicted clusters (first 10)") + print(X_clusters[:10]) + print("---") .. autofunction:: khiops_coclustering_simplify .. 
code-block:: python - def khiops_coclustering_simplify(): - # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = path.join(kh.get_samples_dir(), "SpliceJunction") - splice_dna_X = pd.read_csv( - path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" - ) + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsCoclustering + from sklearn.model_selection import train_test_split - # Train with only 70% of data (for speed in this example) - X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) + # Load the secondary table of the dataset into a pandas dataframe + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_dna_X = pd.read_csv( + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" + ) - # Create the KhiopsCoclustering instance - khcc = KhiopsCoclustering() + # Train with only 70% of data (for speed in this example) + X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) - # Train the model with the whole dataset - khcc.fit(X, id_column="SampleId") + # Create the KhiopsCoclustering instance + khcc = KhiopsCoclustering() - # Simplify coclustering along the individual ID dimension - simplified_khcc = khcc.simplify(max_part_numbers={"SampleId": 3}) + # Train the model with the whole dataset + khcc.fit(X, id_column="SampleId") - # Predict the clusters using the simplified model - X_clusters = simplified_khcc.predict(X) - print("Predicted clusters (only three at most)") - print(X_clusters) - print("---") + # Simplify coclustering along the individual ID dimension + simplified_khcc = khcc.simplify(max_part_numbers={"SampleId": 3}) + # Predict the clusters using the simplified model + X_clusters = simplified_khcc.predict(X) + print("Predicted clusters (only three at most)") + print(X_clusters) + print("---") .. autofunction:: khiops_classifier_multitable_list .. 
code-block:: python - def khiops_classifier_multitable_list(): - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Create the classifier specifying the key column name - khc = KhiopsClassifier(key="AccidentId") - - # Train the classifier - khc.fit([X_train_main, X_train_secondary], y_train) - - # Predict the class on the test dataset - y_test_pred = khc.predict([X_test_main, X_test_secondary]) - print("Predicted classes (first 10):") - print(y_test_pred[:10]) - print("---") - - # Predict the class probability on the test dataset - y_test_probas = khc.predict_proba([X_test_main, X_test_secondary]) - print("Predicted class probabilities (first 10):") - print(y_test_probas[:10]) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred) - test_auc = 
metrics.roc_auc_score(y_test, y_test_probas[:, 1]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") - -.. autofunction:: khiops_classifier_multitable_star_file -.. code-block:: python - - def khiops_classifier_multitable_star_file(): - # Create output directory - results_dir = path.join("kh_samples", "khiops_classifier_multitable_file") - if not path.exists("kh_samples"): - os.mkdir("kh_samples") - os.mkdir(results_dir) - else: - if not path.exists(results_dir): - os.mkdir(results_dir) - - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - X_train_main, X_test_main = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Write the train and test dataset sets to disk - # For the test file we remove the target column from the main table - X_train_main_path = path.join(results_dir, "X_train_main.txt") - X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False) - X_train_secondary_path = path.join(results_dir, "X_train_secondary.txt") - X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False) - X_test_main_path = path.join(results_dir, "X_test_main.txt") - y_test = X_test_main.sort_values("AccidentId")["Gravity"] - 
X_test_main.drop(columns="Gravity").to_csv( - X_test_main_path, sep="\t", header=True, index=False - ) - X_test_secondary_path = path.join(results_dir, "X_test_secondary.txt") - X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False) - - # Define the dictionary of train - X_train_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main_path, "AccidentId"), - "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - X_test_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_test_main_path, "AccidentId"), - "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - - # Create the classifier and fit it - khc = KhiopsClassifier(output_dir=results_dir) - khc.fit(X_train_dataset, y="Gravity") - - # Predict the class in addition to the class probabilities on the test dataset - y_test_pred_path = khc.predict(X_test_dataset) - y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") - print("Predicted classes (first 10):") - print(y_test_pred["PredictedGravity"].head(10)) - print("---") - - y_test_probas_path = khc.predict_proba(X_test_dataset) - y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") - proba_columns = [col for col in y_test_probas if col.startswith("Prob")] - print("Predicted class probabilities (first 10):") - print(y_test_probas[proba_columns].head(10)) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) - test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + # Load the root table of 
the dataset into a pandas dataframe + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_data_dir, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + X = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) + + # Load the secondary table of the dataset into a pandas dataframe + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") + + # Split the secondary dataframe with the keys of the split root dataframe + X_train_ids = X_train["AccidentId"].to_frame() + X_test_ids = X_test["AccidentId"].to_frame() + X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") + X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") + + # Create the classifier specifying the key column name + khc = KhiopsClassifier(key="AccidentId") + + # Train the classifier + khc.fit([X_train, X_train_secondary], y_train) + + # Predict the class on the test dataset + y_test_pred = khc.predict([X_test, X_test_secondary]) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba([X_test, X_test_secondary]) + print("Predicted class probabilities (first 10):") + print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") diff --git a/doc/tools/index.rst b/doc/tools/index.rst index 2bfa53d0..f45e3a07 100644 --- a/doc/tools/index.rst +++ b/doc/tools/index.rst @@ -1,10 +1,11 @@ Tools ===== -These are auxiliary tools to the ``khiops-python`` installation. 
+These are auxiliary functions to the ``khiops-python`` installation. .. currentmodule:: khiops .. autosummary:: :toctree: generated :nosignatures: + khiops.utils.helpers khiops.tools diff --git a/khiops/core/internals/runner.py b/khiops/core/internals/runner.py index 9700f78b..ba6a980c 100644 --- a/khiops/core/internals/runner.py +++ b/khiops/core/internals/runner.py @@ -72,7 +72,7 @@ def get_dir_status(a_dir): return status -def check_samples_dir(samples_dir): +def _check_samples_dir(samples_dir): # Warn if there are problems with the samples_dir samples_dir_status = get_dir_status(samples_dir) download_msg = ( @@ -1523,13 +1523,13 @@ def _tool_path(self, tool_name): def _set_samples_dir(self, samples_dir): """Checks and sets the samples directory""" - check_samples_dir(samples_dir) + _check_samples_dir(samples_dir) super()._set_samples_dir(samples_dir) def _get_samples_dir(self): # Check the samples dir once (the check emmits only warnings) if not self._samples_dir_checked: - check_samples_dir(self._samples_dir) + _check_samples_dir(self._samples_dir) self._samples_dir_checked = True return self._samples_dir diff --git a/khiops/samples/samples.ipynb b/khiops/samples/samples.ipynb index 2b244b0e..bccde760 100644 --- a/khiops/samples/samples.ipynb +++ b/khiops/samples/samples.ipynb @@ -9,20 +9,29 @@ "[Khiops](https://khiops.org) before using this this notebook" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `get_khiops_version()`\n\n", + "Shows the Khiops version\n" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "import os\n", - "from math import sqrt\n", - "from os import path\n", - "\n", - "from khiops import core as kh\n", - "\n" + "print(f\"Khiops version: {kh.get_khiops_version()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `build_dictionary_from_data_table()`\n\n", + "Automatically creates a 
dictionary file from a data table\n" ] }, { @@ -31,12 +40,29 @@ "metadata": {}, "outputs": [], "source": [ - "def get_khiops_version():\n", - " \"\"\"Shows the Khiops version\"\"\"\n", - " print(f\"Khiops version: {kh.get_khiops_version()}\")\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "get_khiops_version()" + "# Set the file paths\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "dictionary_name = \"AutoAdult\"\n", + "dictionary_file_path = os.path.join(\n", + " \"kh_samples\", \"build_dictionary_from_data_table\", \"AutoAdult.kdic\"\n", + ")\n", + "\n", + "# Create the dictionary from the data table\n", + "kh.build_dictionary_from_data_table(\n", + " data_table_path, dictionary_name, dictionary_file_path\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `detect_data_table_format()`\n\n", + "Detects the format of a data table with and without a dictionary file\n\n The user may provide a dictionary file or dictionary domain object specifying the\n table schema. 
The detection heuristic is more accurate with this information.\n \n" ] }, { @@ -45,22 +71,54 @@ "metadata": {}, "outputs": [], "source": [ - "def build_dictionary_from_data_table():\n", - " \"\"\"Automatically creates a dictionary file from a data table\"\"\"\n", - " # Set the file paths\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " dictionary_name = \"AutoAdult\"\n", - " dictionary_file_path = path.join(\n", - " \"kh_samples\", \"build_dictionary_from_data_table\", \"AutoAdult.kdic\"\n", - " )\n", - "\n", - " # Create the dictionary from the data table\n", - " kh.build_dictionary_from_data_table(\n", - " data_table_path, dictionary_name, dictionary_file_path\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "build_dictionary_from_data_table()" + "# Set the file paths\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "results_dir = os.path.join(\"kh_samples\", \"detect_data_table_format\")\n", + "transformed_data_table_path = os.path.join(results_dir, \"AdultWithAnotherFormat.txt\")\n", + "\n", + "# Create the output directory\n", + "if not os.path.isdir(results_dir):\n", + " os.mkdir(results_dir)\n", + "\n", + "# Detect the format of the table\n", + "format_spec = kh.detect_data_table_format(data_table_path)\n", + "print(\"Format specification (header_line, field_separator)\")\n", + "print(\"Format detected on original table:\", format_spec)\n", + "\n", + "# Make a deployment to change the format of the data table\n", + "kh.deploy_model(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " transformed_data_table_path,\n", + " output_header_line=False,\n", + " output_field_separator=\",\",\n", + ")\n", + "\n", + "# Detect the new format of the table without a dictionary file\n", + "format_spec = 
kh.detect_data_table_format(transformed_data_table_path)\n", + "print(\"Format detected on reformatted table:\", format_spec)\n", + "\n", + "# Detect the new format of the table with a dictionary file\n", + "format_spec = kh.detect_data_table_format(\n", + " transformed_data_table_path,\n", + " dictionary_file_path_or_domain=dictionary_file_path,\n", + " dictionary_name=\"Adult\",\n", + ")\n", + "print(\"Format detected (with dictionary file) on reformatted table:\", format_spec)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `check_database()`\n\n", + "Runs an integrity check of a database\n\n The results are stored in the specified log file with at most 50 error messages.\n \n" ] }, { @@ -69,51 +127,31 @@ "metadata": {}, "outputs": [], "source": [ - "def detect_data_table_format():\n", - " \"\"\"Detects the format of a data table with and without a dictionary file\n", - "\n", - " The user may provide a dictionary file or dictionary domain object specifying the\n", - " table schema. 
The detection heuristic is more accurate with this information.\n", - " \"\"\"\n", - " # Set the file paths\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " results_dir = path.join(\"kh_samples\", \"detect_data_table_format\")\n", - " transformed_data_table_path = path.join(results_dir, \"AdultWithAnotherFormat.txt\")\n", - "\n", - " # Create the output directory\n", - " if not path.isdir(results_dir):\n", - " os.mkdir(results_dir)\n", - "\n", - " # Detect the format of the table\n", - " format_spec = kh.detect_data_table_format(data_table_path)\n", - " print(\"Format specification (header_line, field_separator)\")\n", - " print(\"Format detected on original table:\", format_spec)\n", - "\n", - " # Make a deployment to change the format of the data table\n", - " kh.deploy_model(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " transformed_data_table_path,\n", - " output_header_line=False,\n", - " output_field_separator=\",\",\n", - " )\n", - "\n", - " # Detect the new format of the table without a dictionary file\n", - " format_spec = kh.detect_data_table_format(transformed_data_table_path)\n", - " print(\"Format detected on reformatted table:\", format_spec)\n", - "\n", - " # Detect the new format of the table with a dictionary file\n", - " format_spec = kh.detect_data_table_format(\n", - " transformed_data_table_path,\n", - " dictionary_file_path_or_domain=dictionary_file_path,\n", - " dictionary_name=\"Adult\",\n", - " )\n", - " print(\"Format detected (with dictionary file) on reformatted table:\", format_spec)\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "detect_data_table_format()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = 
os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "log_file = os.path.join(\"kh_samples\", \"check_database\", \"check_database.log\")\n", + "\n", + "# Check the database\n", + "kh.check_database(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " log_file_path=log_file,\n", + " max_messages=50,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `export_dictionary_files()`\n\n", + "Exports a customized dictionary to \".kdic\" and to \".kdicj\" (JSON)\n" ] }, { @@ -122,27 +160,53 @@ "metadata": {}, "outputs": [], "source": [ - "def check_database():\n", - " \"\"\"Runs an integrity check of a database\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " The results are stored in the specified log file with at most 50 error messages.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " log_file = path.join(\"kh_samples\", \"check_database\", \"check_database.log\")\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "results_dir = os.path.join(\"kh_samples\", \"export_dictionary_file\")\n", + "output_dictionary_file_path = os.path.join(results_dir, \"ModifiedAdult.kdic\")\n", + "output_dictionary_json_path = os.path.join(results_dir, \"ModifiedAdult.kdicj\")\n", + "alt_output_dictionary_json_path = os.path.join(results_dir, \"AltModifiedAdult.kdicj\")\n", + "\n", + "# Load the dictionary domain from initial dictionary file\n", + "# Then obtain the \"Adult\" dictionary within\n", + "domain = kh.read_dictionary_file(dictionary_file_path)\n", + "dictionary = domain.get_dictionary(\"Adult\")\n", + "\n", + "# Set some of its variables to unused\n", + "fnlwgt_variable = dictionary.get_variable(\"fnlwgt\")\n", + 
"fnlwgt_variable.used = False\n", + "label_variable = dictionary.get_variable(\"Label\")\n", + "label_variable.used = False\n", + "\n", + "# Create output directory if necessary\n", + "if not os.path.exists(\"kh_samples\"):\n", + " os.mkdir(\"kh_samples\")\n", + " os.mkdir(results_dir)\n", + "else:\n", + " if not os.path.exists(results_dir):\n", + " os.mkdir(results_dir)\n", "\n", - " # Check the database\n", - " kh.check_database(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " log_file_path=log_file,\n", - " max_messages=50,\n", - " )\n", + "# Export to kdic\n", + "domain.export_khiops_dictionary_file(output_dictionary_file_path)\n", "\n", - "#Run sample\n", - "check_database()" + "# Export to kdicj either from the domain or from a kdic file\n", + "# Requires a Khiops execution, that's why it is not a method of DictionaryDomain\n", + "kh.export_dictionary_as_json(domain, output_dictionary_json_path)\n", + "kh.export_dictionary_as_json(\n", + " output_dictionary_file_path, alt_output_dictionary_json_path\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor()`\n\n", + "Trains a predictor with a minimal setup\n" ] }, { @@ -151,46 +215,32 @@ "metadata": {}, "outputs": [], "source": [ - "def export_dictionary_files():\n", - " \"\"\"Exports a customized dictionary to \".kdic\" and to \".kdicj\" (JSON)\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " results_dir = path.join(\"kh_samples\", \"export_dictionary_file\")\n", - " output_dictionary_file_path = path.join(results_dir, \"ModifiedAdult.kdic\")\n", - " output_dictionary_json_path = path.join(results_dir, \"ModifiedAdult.kdicj\")\n", - " alt_output_dictionary_json_path = path.join(results_dir, \"AltModifiedAdult.kdicj\")\n", - "\n", - " # Load the dictionary domain from initial dictionary file\n", - " # Then obtain the \"Adult\" dictionary 
within\n", - " domain = kh.read_dictionary_file(dictionary_file_path)\n", - " dictionary = domain.get_dictionary(\"Adult\")\n", - "\n", - " # Set some of its variables to unused\n", - " fnlwgt_variable = dictionary.get_variable(\"fnlwgt\")\n", - " fnlwgt_variable.used = False\n", - " label_variable = dictionary.get_variable(\"Label\")\n", - " label_variable.used = False\n", - "\n", - " # Create output directory if necessary\n", - " if not path.exists(\"kh_samples\"):\n", - " os.mkdir(\"kh_samples\")\n", - " os.mkdir(results_dir)\n", - " else:\n", - " if not path.exists(results_dir):\n", - " os.mkdir(results_dir)\n", - "\n", - " # Export to kdic\n", - " domain.export_khiops_dictionary_file(output_dictionary_file_path)\n", - "\n", - " # Export to kdicj either from the domain or from a kdic file\n", - " # Requires a Khiops execution, that's why it is not a method of DictionaryDomain\n", - " kh.export_dictionary_as_json(domain, output_dictionary_json_path)\n", - " kh.export_dictionary_as_json(\n", - " output_dictionary_file_path, alt_output_dictionary_json_path\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "export_dictionary_files()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor\")\n", + "\n", + "# Train the predictor\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_file_paths()`\n\n", + "Trains a predictor and stores the return value of the function\n" ] }, { @@ -199,25 +249,34 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor():\n", - " \"\"\"Trains 
a predictor with a minimal setup\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor\")\n", - "\n", - " # Train the predictor\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " max_trees=0,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_file_paths\")\n", + "\n", + "# Train the predictor\n", + "report_file_path, modeling_dictionary_file_path = kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")\n", + "print(\"Reports file available at \" + report_file_path)\n", + "print(\"Modeling dictionary file available at \" + modeling_dictionary_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_error_handling()`\n\n", + "Shows how to handle errors when training a predictor\n\n Trains the predictor and handles the errors by printing a custom message. When the\n Khiops application fails the Khiops Python library will raise a\n KhiopsRuntimeError reporting the errors encountered by Khiops.\n\n If the latter information is not enough to diagnose the problem, it is possible to\n save the temporary log file by activating the \"trace\" flag in the call to\n `~.api.train_predictor`. 
The path of the log file will be printed to the standard\n output, as well as that of the dictionary and scenario files (note that the \"trace\"\n keyword argument is available in all functions of the `khiops.core.api`\n submodule).\n \n" ] }, { @@ -226,86 +285,52 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_file_paths():\n", - " \"\"\"Trains a predictor and stores the return value of the function\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_file_paths\")\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " # Train the predictor\n", - " report_file_path, modeling_dictionary_file_path = kh.train_predictor(\n", + "# Set the file paths with a nonexistent dictionary file\n", + "dictionary_file_path = \"NONEXISTENT_DICTIONARY_FILE.kdic\"\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_error_handling\")\n", + "log_file_path = os.path.join(results_dir, \"khiops.log\")\n", + "scenario_path = os.path.join(results_dir, \"scenario._kh\")\n", + "\n", + "# Train the predictor and handle the error\n", + "try:\n", + " kh.train_predictor(\n", " dictionary_file_path,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", " results_dir,\n", - " max_trees=0,\n", + " trace=True,\n", + " log_file_path=log_file_path,\n", + " output_scenario_path=scenario_path,\n", " )\n", - " print(\"Reports file available at \" + report_file_path)\n", - " print(\"Modeling dictionary file available at \" + modeling_dictionary_file_path)\n", - "\n", - "#Run sample\n", - "train_predictor_file_paths()" + "except kh.KhiopsRuntimeError as error:\n", + " print(\"Khiops training failed! 
Below the KhiopsRuntimeError message:\")\n", + " print(error)\n", + "\n", + "print(\"\\nFull log contents:\")\n", + "print(\"------------------\")\n", + "with open(log_file_path) as log_file:\n", + " for line in log_file:\n", + " print(line, end=\"\")\n", + "\n", + "print(\"\\nExecuted scenario\")\n", + "print(\"-----------------\")\n", + "with open(scenario_path) as scenario_file:\n", + " for line in scenario_file:\n", + " print(line, end=\"\")" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def train_predictor_error_handling():\n", - " \"\"\"Shows how to handle errors when training a predictor\n", - "\n", - " Trains the predictor and handles the errors by printing a custom message. When the\n", - " Khiops application fails the Khiops Python library will raise a\n", - " KhiopsRuntimeError reporting the errors encountered by Khiops.\n", - "\n", - " If the latter information is not enough to diagnose the problem, it is possible to\n", - " save the temporary log file by activating the \"trace\" flag in the call to\n", - " `~.api.train_predictor`. 
The path of the log file will be printed to the standard\n", - " output, as well as that of the dictionary and scenario files (note that the \"trace\"\n", - " keyword argument is available in all functions of the `khiops.core.api`\n", - " submodule).\n", - " \"\"\"\n", - " # Set the file paths with a nonexistent dictionary file\n", - " dictionary_file_path = \"NONEXISTENT_DICTIONARY_FILE.kdic\"\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_error_handling\")\n", - " log_file_path = path.join(results_dir, \"khiops.log\")\n", - " scenario_path = path.join(results_dir, \"scenario._kh\")\n", - "\n", - " # Train the predictor and handle the error\n", - " try:\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " trace=True,\n", - " log_file_path=log_file_path,\n", - " output_scenario_path=scenario_path,\n", - " )\n", - " except kh.KhiopsRuntimeError as error:\n", - " print(\"Khiops training failed! 
Below the KhiopsRuntimeError message:\")\n", - " print(error)\n", - "\n", - " print(\"\\nFull log contents:\")\n", - " print(\"------------------\")\n", - " with open(log_file_path) as log_file:\n", - " for line in log_file:\n", - " print(line, end=\"\")\n", - "\n", - " print(\"\\nExecuted scenario\")\n", - " print(\"-----------------\")\n", - " with open(scenario_path) as scenario_file:\n", - " for line in scenario_file:\n", - " print(line, end=\"\")\n", - "\n", - "#Run sample\n", - "train_predictor_error_handling()" + "### `train_predictor_mt()`\n\n", + "Trains a multi-table predictor in the simplest way possible\n\n It is a call to `~.api.train_predictor` with additional parameters to handle\n multi-table learning\n \n" ] }, { @@ -314,35 +339,38 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_mt():\n", - " \"\"\"Trains a multi-table predictor in the simplest way possible\n", - "\n", - " It is a call to `~.api.train_predictor` with additional parameters to handle\n", - " multi-table learning\n", - " \"\"\"\n", - " # Set the file paths\n", - " accidents_dir = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " accidents_table_path = path.join(accidents_dir, \"Accidents.txt\")\n", - " vehicles_table_path = path.join(accidents_dir, \"Vehicles.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_mt\")\n", - "\n", - " # Train the predictor. 
Besides the mandatory parameters, we specify:\n", - " # - A python dictionary linking data paths to file paths for non-root tables\n", - " # - To not construct any decision tree\n", - " # The default number of automatic features is 100\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Accident\",\n", - " accidents_table_path,\n", - " \"Gravity\",\n", - " results_dir,\n", - " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", - " max_trees=0,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor_mt()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_mt\")\n", + "\n", + "# Train the predictor. Besides the mandatory parameters, we specify:\n", + "# - A python dictionary linking data paths to file paths for non-root tables\n", + "# - To not construct any decision tree\n", + "# The default number of automatic features is 100\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Accident\",\n", + " accidents_table_path,\n", + " \"Gravity\",\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", + " max_trees=0,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_mt_with_specific_rules()`\n\n", + "Trains a multi-table predictor with specific construction rules\n\n It is the same as `.train_predictor_mt` but with the specification of the allowed\n variable construction rules. 
The list of available rules is found in the field\n ``kh.all_construction_rules``\n \n" ] }, { @@ -351,39 +379,41 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_mt_with_specific_rules():\n", - " \"\"\"Trains a multi-table predictor with specific construction rules\n", - "\n", - " It is the same as `.train_predictor_mt` but with the specification of the allowed\n", - " variable construction rules. The list of available rules is found in the field\n", - " ``kh.all_construction_rules``\n", - " \"\"\"\n", - " # Set the file paths\n", - " accidents_dir = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " accidents_table_path = path.join(accidents_dir, \"Accidents.txt\")\n", - " vehicles_table_path = path.join(accidents_dir, \"Vehicles.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_mt_with_specific_rules\")\n", - "\n", - " # Train the predictor. Besides the mandatory parameters, it is specified:\n", - " # - A python dictionary linking data paths to file paths for non-root tables\n", - " # - The maximum number of aggregate variables to construct (1000)\n", - " # - The construction rules allowed to automatically create aggregates\n", - " # - To not construct any decision tree\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Accident\",\n", - " accidents_table_path,\n", - " \"Gravity\",\n", - " results_dir,\n", - " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", - " max_constructed_variables=1000,\n", - " construction_rules=[\"TableMode\", \"TableSelection\"],\n", - " max_trees=0,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor_mt_with_specific_rules()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "dictionary_file_path = os.path.join(accidents_dir, 
\"Accidents.kdic\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_mt_with_specific_rules\")\n", + "\n", + "# Train the predictor. Besides the mandatory parameters, it is specified:\n", + "# - A python dictionary linking data paths to file paths for non-root tables\n", + "# - The maximum number of aggregate variables to construct (1000)\n", + "# - The construction rules allowed to automatically create aggregates\n", + "# - To not construct any decision tree\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Accident\",\n", + " accidents_table_path,\n", + " \"Gravity\",\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", + " max_constructed_variables=1000,\n", + " construction_rules=[\"TableMode\", \"TableSelection\"],\n", + " max_trees=0,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_mt_snowflake()`\n\n", + "Trains a multi-table predictor for a dataset with a snowflake schema\n" ] }, { @@ -392,38 +422,44 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_mt_snowflake():\n", - " \"\"\"Trains a multi-table predictor for a dataset with a snowflake schema\"\"\"\n", - "\n", - " # Set the file paths\n", - " accidents_dir = path.join(kh.get_samples_dir(), \"Accidents\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " accidents_table_path = path.join(accidents_dir, \"Accidents.txt\")\n", - " vehicles_table_path = path.join(accidents_dir, \"Vehicles.txt\")\n", - " users_table_path = path.join(accidents_dir, \"Users.txt\")\n", - " places_table_path = path.join(accidents_dir, \"Places.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_mt_snowflake\")\n", - "\n", - " # Train the predictor. 
Besides the mandatory parameters, we specify:\n", - " # - A python dictionary linking data paths to file paths for non-root tables\n", - " # - To not construct any decision tree\n", - " # The default number of automatic features is 100\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Accident\",\n", - " accidents_table_path,\n", - " \"Gravity\",\n", - " results_dir,\n", - " additional_data_tables={\n", - " \"Accident`Vehicles\": vehicles_table_path,\n", - " \"Accident`Vehicles`Users\": users_table_path,\n", - " \"Accident`Place\": places_table_path,\n", - " },\n", - " max_trees=0,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor_mt_snowflake()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "users_table_path = os.path.join(accidents_dir, \"Users.txt\")\n", + "places_table_path = os.path.join(accidents_dir, \"Places.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_mt_snowflake\")\n", + "\n", + "# Train the predictor. 
Besides the mandatory parameters, we specify:\n", + "# - A python dictionary linking data paths to file paths for non-root tables\n", + "# - To not construct any decision tree\n", + "# The default number of automatic features is 100\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Accident\",\n", + " accidents_table_path,\n", + " \"Gravity\",\n", + " results_dir,\n", + " additional_data_tables={\n", + " \"Accident`Vehicles\": vehicles_table_path,\n", + " \"Accident`Vehicles`Users\": users_table_path,\n", + " \"Accident`Place\": places_table_path,\n", + " },\n", + " max_trees=0,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_with_train_percentage()`\n\n", + "Trains a predictor with a 90%-10% train-test split\n\n Note: The default is a 70%-30% split\n \n" ] }, { @@ -432,34 +468,38 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_with_train_percentage():\n", - " \"\"\"Trains a predictor with a 90%-10% train-test split\n", - "\n", - " Note: The default is a 70%-30% split\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_with_train_percentage\")\n", - "\n", - " # Train the predictor. 
Besides the mandatory parameters, it is specified:\n", - " # - A 90% sampling rate for the training dataset\n", - " # - Set the test dataset as the complement of the training dataset (10%)\n", - " # - No trees\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " sample_percentage=90,\n", - " use_complement_as_test=True,\n", - " max_trees=0,\n", - " results_prefix=\"P90_\",\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor_with_train_percentage()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_train_percentage\")\n", + "\n", + "# Train the predictor. Besides the mandatory parameters, it is specified:\n", + "# - A 90% sampling rate for the training dataset\n", + "# - Set the test dataset as the complement of the training dataset (10%)\n", + "# - No trees\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " sample_percentage=90,\n", + " use_complement_as_test=True,\n", + " max_trees=0,\n", + " results_prefix=\"P90_\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_with_trees()`\n\n", + "Trains a predictor based on 15 trees with a 80%-20% train-test split\n" ] }, { @@ -468,28 +508,35 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_with_trees():\n", - " \"\"\"Trains a predictor based on 15 trees with a 80%-20% train-test split\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Letter\", \"Letter.kdic\")\n", - " data_table_path = 
path.join(kh.get_samples_dir(), \"Letter\", \"Letter.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_with_trees\")\n", - "\n", - " # Train the predictor with at most 15 trees (default 10)\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Letter\",\n", - " data_table_path,\n", - " \"lettr\",\n", - " results_dir,\n", - " sample_percentage=80,\n", - " use_complement_as_test=True,\n", - " results_prefix=\"P80_\",\n", - " max_trees=15,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor_with_trees()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Letter\", \"Letter.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Letter\", \"Letter.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_trees\")\n", + "\n", + "# Train the predictor with at most 15 trees (default 10)\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Letter\",\n", + " data_table_path,\n", + " \"lettr\",\n", + " results_dir,\n", + " sample_percentage=80,\n", + " use_complement_as_test=True,\n", + " results_prefix=\"P80_\",\n", + " max_trees=15,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_with_pairs()`\n\n", + "Trains a predictor with user specified pairs of variables\n" ] }, { @@ -498,34 +545,41 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_with_pairs():\n", - " \"\"\"Trains a predictor with user specified pairs of variables\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_with_pairs\")\n", - "\n", - " # Train the predictor with at most 10 pairs as follows:\n", - " # - Include 
pairs age-race and capital_gain-capital_loss\n", - " # - Include all possible pairs having relationship as component\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " use_complement_as_test=True,\n", - " max_trees=0,\n", - " max_pairs=10,\n", - " specific_pairs=[\n", - " (\"age\", \"race\"),\n", - " (\"capital_gain\", \"capital_loss\"),\n", - " (\"relationship\", \"\"),\n", - " ],\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor_with_pairs()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_pairs\")\n", + "\n", + "# Train the predictor with at most 10 pairs as follows:\n", + "# - Include pairs age-race and capital_gain-capital_loss\n", + "# - Include all possible pairs having relationship as component\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " use_complement_as_test=True,\n", + " max_trees=0,\n", + " max_pairs=10,\n", + " specific_pairs=[\n", + " (\"age\", \"race\"),\n", + " (\"capital_gain\", \"capital_loss\"),\n", + " (\"relationship\", \"\"),\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_with_multiple_parameters()`\n\n", + "Trains a predictor with various additional parameters\n\n Some of these parameters are specific to `~.api.train_predictor` and others generic\n to any Khiops execution.\n\n In this example, we specify the following parameters in the call:\n - A main target value\n - The path where to store the \"Khiops scenario\" script\n - The path where to store the log of the 
process\n - The flag to show the execution trace (generic to any `khiops.core.api`\n function)\n\n Additionally the Khiops runner is set such that the learning is executed with only\n 1000 MB of memory.\n \n" ] }, { @@ -534,54 +588,47 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_with_multiple_parameters():\n", - " \"\"\"Trains a predictor with various additional parameters\n", - "\n", - " Some of these parameters are specific to `~.api.train_predictor` and others generic\n", - " to any Khiops execution.\n", - "\n", - " In this example, we specify the following parameters in the call:\n", - " - A main target value\n", - " - The path where to store the \"Khiops scenario\" script\n", - " - The path where to store the log of the process\n", - " - The flag to show the execution trace (generic to any `khiops.core.api`\n", - " function)\n", - "\n", - " Additionally the Khiops runner is set such that the learning is executed with only\n", - " 1000 MB of memory.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_with_multiple_parameters\")\n", - " output_script_path = path.join(results_dir, \"output_scenario._kh\")\n", - " log_path = path.join(results_dir, \"log.txt\")\n", - "\n", - " # Set memory limit to 1000 Mb and train with Khiops\n", - " kh.get_runner().max_memory_mb = 1000\n", - "\n", - " # Train the predictor. 
Besides the mandatory parameters, we specify:\n", - " # - The value \"more\" as main target value\n", - " # - The output Khiops script file location (generic)\n", - " # - The log file location (generic)\n", - " # - To show the debug trace (generic)\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " main_target_value=\"more\",\n", - " output_scenario_path=output_script_path,\n", - " log_file_path=log_path,\n", - " trace=True,\n", - " )\n", - "\n", - " # Reset memory limit to default Khiops tool value\n", - " kh.get_runner().max_memory_mb = 0\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_predictor_with_multiple_parameters()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_multiple_parameters\")\n", + "output_script_path = os.path.join(results_dir, \"output_scenario._kh\")\n", + "log_path = os.path.join(results_dir, \"log.txt\")\n", + "\n", + "# Set memory limit to 1000 Mb and train with Khiops\n", + "kh.get_runner().max_memory_mb = 1000\n", + "\n", + "# Train the predictor. 
Besides the mandatory parameters, we specify:\n", + "# - The value \"more\" as main target value\n", + "# - The output Khiops script file location (generic)\n", + "# - The log file location (generic)\n", + "# - To show the debug trace (generic)\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " main_target_value=\"more\",\n", + " output_scenario_path=output_script_path,\n", + " log_file_path=log_path,\n", + " trace=True,\n", + ")\n", + "\n", + "# Reset memory limit to default Khiops tool value\n", + "kh.get_runner().max_memory_mb = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_predictor_detect_format()`\n\n", + "Trains a predictor without specifying the table format\n" ] }, { @@ -590,165 +637,62 @@ "metadata": {}, "outputs": [], "source": [ - "def train_predictor_detect_format():\n", - " \"\"\"Trains a predictor without specifying the table format\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Iris\", \"Iris.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_detect_format\")\n", - " transformed_data_table_path = path.join(results_dir, \"TransformedIris.txt\")\n", - "\n", - " # Transform the database format from header_line=True and field_separator=TAB\n", - " # to header_line=False and field_separator=\",\"\n", - " # See the deploy_model examples below for more details\n", - " kh.deploy_model(\n", - " dictionary_file_path,\n", - " \"Iris\",\n", - " data_table_path,\n", - " transformed_data_table_path,\n", - " output_header_line=False,\n", - " output_field_separator=\",\",\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " # Try to learn with the old format\n", - " try:\n", - " kh.train_predictor(\n", - " 
dictionary_file_path,\n", - " \"Iris\",\n", - " transformed_data_table_path,\n", - " \"Class\",\n", - " results_dir,\n", - " header_line=True,\n", - " field_separator=\"\",\n", - " )\n", - " except kh.KhiopsRuntimeError as error:\n", - " print(\n", - " \"This failed because of a bad data table format spec. \"\n", - " + \"Below the KhiopsRuntimeError message\"\n", - " )\n", - " print(error)\n", - "\n", - " # Train without specifyng the format (detect_format is True by default)\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_detect_format\")\n", + "transformed_data_table_path = os.path.join(results_dir, \"TransformedIris.txt\")\n", + "\n", + "# Transform the database format from header_line=True and field_separator=TAB\n", + "# to header_line=False and field_separator=\",\"\n", + "# See the deploy_model examples below for more details\n", + "kh.deploy_model(\n", + " dictionary_file_path,\n", + " \"Iris\",\n", + " data_table_path,\n", + " transformed_data_table_path,\n", + " output_header_line=False,\n", + " output_field_separator=\",\",\n", + ")\n", + "\n", + "# Try to learn with the old format\n", + "try:\n", " kh.train_predictor(\n", " dictionary_file_path,\n", " \"Iris\",\n", " transformed_data_table_path,\n", " \"Class\",\n", " results_dir,\n", + " header_line=True,\n", + " field_separator=\"\",\n", " )\n", - "\n", - "#Run sample\n", - "train_predictor_detect_format()" + "except kh.KhiopsRuntimeError as error:\n", + " print(\n", + " \"This failed because of a bad data table format spec. 
\"\n", + " + \"Below the KhiopsRuntimeError message\"\n", + " )\n", + " print(error)\n", + "\n", + "# Train without specifyng the format (detect_format is True by default)\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Iris\",\n", + " transformed_data_table_path,\n", + " \"Class\",\n", + " results_dir,\n", + ")" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def train_predictor_with_cross_validation():\n", - " \"\"\"Trains a predictor with a 5-fold cross-validation\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_predictor_with_cross_validation\")\n", - " fold_dictionary_file_path = path.join(results_dir, \"AdultWithFolding.kdic\")\n", - "\n", - " # Create the output directory\n", - " if not path.isdir(results_dir):\n", - " os.mkdir(results_dir)\n", - "\n", - " # Load the learning dictionary object\n", - " domain = kh.read_dictionary_file(dictionary_file_path)\n", - " dictionary = domain.get_dictionary(\"Adult\")\n", - "\n", - " # Add a random fold index variable to the learning dictionary\n", - " fold_number = 5\n", - " fold_index_variable = kh.Variable()\n", - " fold_index_variable.name = \"FoldIndex\"\n", - " fold_index_variable.type = \"Numerical\"\n", - " fold_index_variable.used = False\n", - " fold_index_variable.rule = \"Ceil(Product(\" + str(fold_number) + \", Random()))\"\n", - " dictionary.add_variable(fold_index_variable)\n", - "\n", - " # Add variables that indicate if the instance is in the train dataset:\n", - " for fold_index in range(1, fold_number + 1):\n", - " is_in_train_dataset_variable = kh.Variable()\n", - " is_in_train_dataset_variable.name = \"IsInTrainDataset\" + str(fold_index)\n", - " is_in_train_dataset_variable.type = 
\"Numerical\"\n", - " is_in_train_dataset_variable.used = False\n", - " is_in_train_dataset_variable.rule = \"NEQ(FoldIndex, \" + str(fold_index) + \")\"\n", - " dictionary.add_variable(is_in_train_dataset_variable)\n", - "\n", - " # Print dictionary with fold variables\n", - " print(\"Dictionary file with fold variables\")\n", - " domain.export_khiops_dictionary_file(fold_dictionary_file_path)\n", - " with open(fold_dictionary_file_path) as fold_dictionary_file:\n", - " for line in fold_dictionary_file:\n", - " print(line, end=\"\")\n", - "\n", - " # For each fold k:\n", - " print(\"Training Adult with \" + str(fold_number) + \" folds\")\n", - " print(\"\\tfold\\ttrain auc\\ttest auc\")\n", - " train_aucs = []\n", - " test_aucs = []\n", - " for fold_index in range(1, fold_number + 1):\n", - " # Train a model from the sub-dataset where IsInTrainDataset is 1\n", - " train_reports_path, modeling_dictionary_file_path = kh.train_predictor(\n", - " domain,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " sample_percentage=100,\n", - " selection_variable=\"IsInTrainDataset\" + str(fold_index),\n", - " selection_value=1,\n", - " max_trees=0,\n", - " results_prefix=\"Fold\" + str(fold_index),\n", - " )\n", - "\n", - " # Evaluate the resulting model in the subsets where IsInTrainDataset is 0\n", - " test_evaluation_report_path = kh.evaluate_predictor(\n", - " modeling_dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " results_dir,\n", - " sample_percentage=100,\n", - " selection_variable=\"IsInTrainDataset\" + str(fold_index),\n", - " selection_value=0,\n", - " results_prefix=\"Fold\" + str(fold_index),\n", - " )\n", - "\n", - " # Obtain the train AUC from the train report and the test AUC from the\n", - " # evaluation report and print them\n", - " train_results = kh.read_analysis_results_file(train_reports_path)\n", - " test_evaluation_results = kh.read_analysis_results_file(\n", - " 
test_evaluation_report_path\n", - " )\n", - " train_auc = train_results.train_evaluation_report.get_snb_performance().auc\n", - " test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc\n", - " print(\"\\t\" + str(fold_index) + \"\\t\" + str(train_auc) + \"\\t\" + str(test_auc))\n", - "\n", - " # Store the train and test AUCs in arrays\n", - " train_aucs.append(train_auc)\n", - " test_aucs.append(test_auc)\n", - "\n", - " # Print the mean +- error aucs for both train and test\n", - " mean_train_auc = sum(train_aucs) / fold_number\n", - " squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs]\n", - " sd_train_auc = sqrt(sum(squared_error_train_aucs) / (fold_number - 1))\n", - "\n", - " mean_test_auc = sum(test_aucs) / fold_number\n", - " squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs]\n", - " sd_test_auc = sqrt(sum(squared_error_test_aucs) / (fold_number - 1))\n", - "\n", - " print(\"final auc\")\n", - " print(\"train auc: \" + str(mean_train_auc) + \" +- \" + str(sd_train_auc))\n", - " print(\"test auc: \" + str(mean_test_auc) + \" +- \" + str(sd_test_auc))\n", - "\n", - "#Run sample\n", - "train_predictor_with_cross_validation()" + "### `train_predictor_with_cross_validation()`\n\n", + "Trains a predictor with a 5-fold cross-validation\n" ] }, { @@ -757,84 +701,114 @@ "metadata": {}, "outputs": [], "source": [ - "def multiple_train_predictor():\n", - " \"\"\"Trains a sequence of models with a decreasing number of variables\n", - "\n", - " This example illustrates the use of the khiops classes `.DictionaryDomain` (for\n", - " reading dictionary files) and `.AnalysisResults` (for reading training/evaluation\n", - " results from JSON)\n", - " \"\"\"\n", + "# Imports\n", + "import math\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " def display_test_results(json_result_file_path):\n", - " \"\"\"Display some of the training results\"\"\"\n", - " results = 
kh.read_analysis_results_file(json_result_file_path)\n", - " train_performance = results.train_evaluation_report.get_snb_performance()\n", - " test_performance = results.test_evaluation_report.get_snb_performance()\n", - " print(\n", - " \"\\t\"\n", - " + str(len(results.preparation_report.variables_statistics))\n", - " + \"\\t\"\n", - " + str(train_performance.auc)\n", - " + \"\\t\"\n", - " + str(test_performance.auc)\n", - " )\n", - "\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"multiple_train_predictor\")\n", - "\n", - " # Read the dictionary file to obtain an instance of class Dictionary\n", - " dictionary_domain = kh.read_dictionary_file(dictionary_file_path)\n", - " dictionary = dictionary_domain.get_dictionary(\"Adult\")\n", - "\n", - " # Train a SNB model using all the variables\n", - " print(\"\\t#vars\\ttrain auc\\ttest auc\")\n", - " json_result_file_path, _ = kh.train_predictor(\n", - " dictionary_file_path,\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_predictor_with_cross_validation\")\n", + "fold_dictionary_file_path = os.path.join(results_dir, \"AdultWithFolding.kdic\")\n", + "\n", + "# Create the output directory\n", + "if not os.path.isdir(results_dir):\n", + " os.mkdir(results_dir)\n", + "\n", + "# Load the learning dictionary object\n", + "domain = kh.read_dictionary_file(dictionary_file_path)\n", + "dictionary = domain.get_dictionary(\"Adult\")\n", + "\n", + "# Add a random fold index variable to the learning dictionary\n", + "fold_number = 5\n", + "fold_index_variable = kh.Variable()\n", + "fold_index_variable.name = 
\"FoldIndex\"\n", + "fold_index_variable.type = \"Numerical\"\n", + "fold_index_variable.used = False\n", + "fold_index_variable.rule = \"Ceil(Product(\" + str(fold_number) + \", Random()))\"\n", + "dictionary.add_variable(fold_index_variable)\n", + "\n", + "# Add variables that indicate if the instance is in the train dataset:\n", + "for fold_index in range(1, fold_number + 1):\n", + " is_in_train_dataset_variable = kh.Variable()\n", + " is_in_train_dataset_variable.name = \"IsInTrainDataset\" + str(fold_index)\n", + " is_in_train_dataset_variable.type = \"Numerical\"\n", + " is_in_train_dataset_variable.used = False\n", + " is_in_train_dataset_variable.rule = \"NEQ(FoldIndex, \" + str(fold_index) + \")\"\n", + " dictionary.add_variable(is_in_train_dataset_variable)\n", + "\n", + "# Print dictionary with fold variables\n", + "print(\"Dictionary file with fold variables\")\n", + "domain.export_khiops_dictionary_file(fold_dictionary_file_path)\n", + "with open(fold_dictionary_file_path) as fold_dictionary_file:\n", + " for line in fold_dictionary_file:\n", + " print(line, end=\"\")\n", + "\n", + "# For each fold k:\n", + "print(\"Training Adult with \" + str(fold_number) + \" folds\")\n", + "print(\"\\tfold\\ttrain auc\\ttest auc\")\n", + "train_aucs = []\n", + "test_aucs = []\n", + "for fold_index in range(1, fold_number + 1):\n", + " # Train a model from the sub-dataset where IsInTrainDataset is 1\n", + " train_reports_path, modeling_dictionary_file_path = kh.train_predictor(\n", + " domain,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", " results_dir,\n", - " sample_percentage=70,\n", - " use_complement_as_test=True,\n", + " sample_percentage=100,\n", + " selection_variable=\"IsInTrainDataset\" + str(fold_index),\n", + " selection_value=1,\n", " max_trees=0,\n", + " results_prefix=\"Fold\" + str(fold_index),\n", " )\n", - " display_test_results(json_result_file_path)\n", - "\n", - " # Read results to obtain the variables sorted by decreasing 
Level\n", - " analysis_results = kh.read_analysis_results_file(json_result_file_path)\n", - " preparation_results = analysis_results.preparation_report\n", - "\n", - " # Train a sequence of models with a decreasing number of variables\n", - " # We disable variables one-by-one in increasing level (predictive power) order\n", - " variable_number = len(preparation_results.variables_statistics)\n", - " for i in reversed(range(variable_number)):\n", - " # Search the next variable\n", - " variable = preparation_results.variables_statistics[i]\n", - "\n", - " # Disable this variable and save the dictionary with the Khiops format\n", - " dictionary.get_variable(variable.name).used = False\n", "\n", - " # Train the model with this dictionary domain object\n", - " prefix = f\"V{variable_number - 1 - i}_\"\n", - " json_result_file_path, _ = kh.train_predictor(\n", - " dictionary_domain,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " sample_percentage=70,\n", - " use_complement_as_test=True,\n", - " results_prefix=prefix,\n", - " max_trees=0,\n", - " )\n", - "\n", - " # Show a preview of the results\n", - " display_test_results(json_result_file_path)\n", + " # Evaluate the resulting model in the subsets where IsInTrainDataset is 0\n", + " test_evaluation_report_path = kh.evaluate_predictor(\n", + " modeling_dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " results_dir,\n", + " sample_percentage=100,\n", + " selection_variable=\"IsInTrainDataset\" + str(fold_index),\n", + " selection_value=0,\n", + " results_prefix=\"Fold\" + str(fold_index),\n", + " )\n", "\n", - "#Run sample\n", - "multiple_train_predictor()" + " # Obtain the train AUC from the train report and the test AUC from the\n", + " # evaluation report and print them\n", + " train_results = kh.read_analysis_results_file(train_reports_path)\n", + " test_evaluation_results = kh.read_analysis_results_file(test_evaluation_report_path)\n", + " train_auc = 
train_results.train_evaluation_report.get_snb_performance().auc\n", + " test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc\n", + " print(\"\\t\" + str(fold_index) + \"\\t\" + str(train_auc) + \"\\t\" + str(test_auc))\n", + "\n", + " # Store the train and test AUCs in arrays\n", + " train_aucs.append(train_auc)\n", + " test_aucs.append(test_auc)\n", + "\n", + "# Print the mean +- error aucs for both train and test\n", + "mean_train_auc = sum(train_aucs) / fold_number\n", + "squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs]\n", + "sd_train_auc = math.sqrt(sum(squared_error_train_aucs) / (fold_number - 1))\n", + "\n", + "mean_test_auc = sum(test_aucs) / fold_number\n", + "squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs]\n", + "sd_test_auc = math.sqrt(sum(squared_error_test_aucs) / (fold_number - 1))\n", + "\n", + "print(\"final auc\")\n", + "print(\"train auc: \" + str(mean_train_auc) + \" +- \" + str(sd_train_auc))\n", + "print(\"test auc: \" + str(mean_test_auc) + \" +- \" + str(sd_test_auc))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `multiple_train_predictor()`\n\n", + "Trains a sequence of models with a decreasing number of variables\n\n This example illustrates the use of the khiops classes `.DictionaryDomain` (for\n reading dictionary files) and `.AnalysisResults` (for reading training/evaluation\n results from JSON)\n \n" ] }, { @@ -843,35 +817,87 @@ "metadata": {}, "outputs": [], "source": [ - "def evaluate_predictor():\n", - " \"\"\"Evaluates a predictor in the simplest way possible\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " It calls `~.api.evaluate_predictor` with only its mandatory parameters.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", 
\"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"evaluate_predictor\")\n", - " model_dictionary_file_path = path.join(results_dir, \"Modeling.kdic\")\n", "\n", - " # Train the predictor\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", + "def display_test_results(json_result_file_path):\n", + " \"\"\"Display some of the training results\"\"\"\n", + " results = kh.read_analysis_results_file(json_result_file_path)\n", + " train_performance = results.train_evaluation_report.get_snb_performance()\n", + " test_performance = results.test_evaluation_report.get_snb_performance()\n", + " print(\n", + " \"\\t\"\n", + " + str(len(results.preparation_report.variables_statistics))\n", + " + \"\\t\"\n", + " + str(train_performance.auc)\n", + " + \"\\t\"\n", + " + str(test_performance.auc)\n", + " )\n", + "\n", + "\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"multiple_train_predictor\")\n", + "\n", + "# Read the dictionary file to obtain an instance of class Dictionary\n", + "dictionary_domain = kh.read_dictionary_file(dictionary_file_path)\n", + "dictionary = dictionary_domain.get_dictionary(\"Adult\")\n", + "\n", + "# Train a SNB model using all the variables\n", + "print(\"\\t#vars\\ttrain auc\\ttest auc\")\n", + "json_result_file_path, _ = kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " sample_percentage=70,\n", + " use_complement_as_test=True,\n", + " max_trees=0,\n", + ")\n", + "display_test_results(json_result_file_path)\n", + "\n", + "# Read results to obtain the variables sorted by decreasing Level\n", + "analysis_results = kh.read_analysis_results_file(json_result_file_path)\n", + "preparation_results = 
analysis_results.preparation_report\n", + "\n", + "# Train a sequence of models with a decreasing number of variables\n", + "# We disable variables one-by-one in increasing level (predictive power) order\n", + "variable_number = len(preparation_results.variables_statistics)\n", + "for i in reversed(range(variable_number)):\n", + " # Search the next variable\n", + " variable = preparation_results.variables_statistics[i]\n", + "\n", + " # Disable this variable and save the dictionary with the Khiops format\n", + " dictionary.get_variable(variable.name).used = False\n", + "\n", + " # Train the model with this dictionary domain object\n", + " prefix = f\"V{variable_number - 1 - i}_\"\n", + " json_result_file_path, _ = kh.train_predictor(\n", + " dictionary_domain,\n", " \"Adult\",\n", " data_table_path,\n", " \"class\",\n", " results_dir,\n", + " sample_percentage=70,\n", + " use_complement_as_test=True,\n", + " results_prefix=prefix,\n", " max_trees=0,\n", " )\n", "\n", - " # Evaluate the predictor\n", - " report_file_path = kh.evaluate_predictor(\n", - " model_dictionary_file_path, \"Adult\", data_table_path, results_dir\n", - " )\n", - " print(\"Evaluation report available at \" + report_file_path)\n", - "\n", - "#Run sample\n", - "evaluate_predictor()" + " # Show a preview of the results\n", + " display_test_results(json_result_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `evaluate_predictor()`\n\n", + "Evaluates a predictor in the simplest way possible\n\n It calls `~.api.evaluate_predictor` with only its mandatory parameters.\n \n" ] }, { @@ -880,81 +906,125 @@ "metadata": {}, "outputs": [], "source": [ - "def access_predictor_evaluation_report():\n", - " \"\"\"Shows the performance metrics of a predictor\n", - "\n", - " See `evaluate_predictor` or `train_predictor_with_train_percentage` to see examples\n", - " on how to evaluate a model.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = 
path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"access_predictor_evaluation_report\")\n", - " evaluation_report_path = path.join(results_dir, \"AllReports.khj\")\n", - "\n", - " # Train the SNB predictor and some univariate predictors\n", - " # Note: Evaluation in test is 30% by default\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " max_trees=0,\n", - " univariate_predictor_number=4,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " # Obtain the evaluation results\n", - " results = kh.read_analysis_results_file(evaluation_report_path)\n", - " evaluation_report = results.test_evaluation_report\n", - " snb_performance = evaluation_report.get_snb_performance()\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"evaluate_predictor\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "\n", + "# Train the predictor\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Evaluate the predictor\n", + "report_file_path = kh.evaluate_predictor(\n", + " model_dictionary_file_path, \"Adult\", data_table_path, results_dir\n", + ")\n", + "print(\"Evaluation report available at \" + report_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `access_predictor_evaluation_report()`\n\n", + "Shows the performance metrics of a predictor\n\n See `evaluate_predictor` or 
`train_predictor_with_train_percentage` to see examples\n on how to evaluate a model.\n \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " # Print univariate metrics for the SNB\n", - " print(\"\\nperformance metrics for \" + snb_performance.name)\n", - " for metric_name in snb_performance.get_metric_names():\n", - " print(metric_name + \": \" + str(snb_performance.get_metric(metric_name)))\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"access_predictor_evaluation_report\")\n", + "evaluation_report_path = os.path.join(results_dir, \"AllReports.khj\")\n", + "\n", + "# Train the SNB predictor and some univariate predictors\n", + "# Note: Evaluation in test is 30% by default\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + " univariate_predictor_number=4,\n", + ")\n", + "\n", + "# Obtain the evaluation results\n", + "results = kh.read_analysis_results_file(evaluation_report_path)\n", + "evaluation_report = results.test_evaluation_report\n", + "snb_performance = evaluation_report.get_snb_performance()\n", + "\n", + "# Print univariate metrics for the SNB\n", + "print(\"\\nperformance metrics for \" + snb_performance.name)\n", + "for metric_name in snb_performance.get_metric_names():\n", + " print(metric_name + \": \" + str(snb_performance.get_metric(metric_name)))\n", + "\n", + "# Print the confusion matrix\n", + "print(\"\\nconfusion matrix:\")\n", + "confusion_matrix = snb_performance.confusion_matrix\n", + "\n", + "for target_value in confusion_matrix.values:\n", + " print(\"\\t\" + target_value, 
end=\"\")\n", + "print(\"\")\n", + "\n", + "for i, target_value in enumerate(confusion_matrix.values):\n", + " observed_frequencies = confusion_matrix.matrix[i]\n", + " print(target_value, end=\"\")\n", + " for frequency in observed_frequencies:\n", + " print(\"\\t\" + str(frequency), end=\"\")\n", + " print(\"\")\n", "\n", - " # Print the confusion matrix\n", - " print(\"\\nconfusion matrix:\")\n", - " confusion_matrix = snb_performance.confusion_matrix\n", + "# Print the head of the lift curves for the 'more' modality\n", + "print(\"\\nfirst five values of the lift curves for 'more'\")\n", "\n", - " for target_value in confusion_matrix.values:\n", - " print(\"\\t\" + target_value, end=\"\")\n", - " print(\"\")\n", + "snb_lift_curve = evaluation_report.get_snb_lift_curve(\"more\")\n", + "optimal_lift_curve = evaluation_report.get_classifier_lift_curve(\"Optimal\", \"more\")\n", + "random_lift_curve = evaluation_report.get_classifier_lift_curve(\"Random\", \"more\")\n", "\n", - " for i, target_value in enumerate(confusion_matrix.values):\n", - " observed_frequencies = confusion_matrix.matrix[i]\n", - " print(target_value, end=\"\")\n", - " for frequency in observed_frequencies:\n", - " print(\"\\t\" + str(frequency), end=\"\")\n", - " print(\"\")\n", - "\n", - " # Print the head of the lift curves for the 'more' modality\n", - " print(\"\\nfirst five values of the lift curves for 'more'\")\n", - "\n", - " snb_lift_curve = evaluation_report.get_snb_lift_curve(\"more\")\n", - " optimal_lift_curve = evaluation_report.get_classifier_lift_curve(\"Optimal\", \"more\")\n", - " random_lift_curve = evaluation_report.get_classifier_lift_curve(\"Random\", \"more\")\n", - "\n", - " for i in range(5):\n", - " print(\n", - " str(snb_lift_curve.values[i])\n", - " + \"\\t\"\n", - " + str(optimal_lift_curve.values[i])\n", - " + \"\\t\"\n", - " + str(random_lift_curve.values[i])\n", - " )\n", - "\n", - " # Print univariate metrics for an univariate predictor\n", - " 
predictor_performance = evaluation_report.get_predictor_performance(\n", - " \"Univariate relationship\"\n", + "for i in range(5):\n", + " print(\n", + " str(snb_lift_curve.values[i])\n", + " + \"\\t\"\n", + " + str(optimal_lift_curve.values[i])\n", + " + \"\\t\"\n", + " + str(random_lift_curve.values[i])\n", " )\n", - " print(\"\\n\\nperformance metrics for \" + predictor_performance.name)\n", - " for metric_name in predictor_performance.get_metric_names():\n", - " print(metric_name + \": \" + str(predictor_performance.get_metric(metric_name)))\n", "\n", - "#Run sample\n", - "access_predictor_evaluation_report()" + "# Print univariate metrics for an univariate predictor\n", + "predictor_performance = evaluation_report.get_predictor_performance(\n", + " \"Univariate relationship\"\n", + ")\n", + "print(\"\\n\\nperformance metrics for \" + predictor_performance.name)\n", + "for metric_name in predictor_performance.get_metric_names():\n", + " print(metric_name + \": \" + str(predictor_performance.get_metric(metric_name)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_recoder()`\n\n", + "Train a database recoder in the simplest way possible\n\n It is a call to `~.api.train_recoder` with only its mandatory parameters.\n \n" ] }, { @@ -963,23 +1033,25 @@ "metadata": {}, "outputs": [], "source": [ - "def train_recoder():\n", - " \"\"\"Train a database recoder in the simplest way possible\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " It is a call to `~.api.train_recoder` with only its mandatory parameters.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_recoder\")\n", - "\n", - " # Train the recoder model\n", - " kh.train_recoder(\n", - " dictionary_file_path, \"Adult\", 
data_table_path, \"class\", results_dir\n", - " )\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_recoder\")\n", "\n", - "#Run sample\n", - "train_recoder()" + "# Train the recoder model\n", + "kh.train_recoder(dictionary_file_path, \"Adult\", data_table_path, \"class\", results_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_recoder_with_multiple_parameters()`\n\n", + "Trains a recoder that transforms variable values to their respective part labels\n\n It also creates 10 pair features.\n \n" ] }, { @@ -988,30 +1060,34 @@ "metadata": {}, "outputs": [], "source": [ - "def train_recoder_with_multiple_parameters():\n", - " \"\"\"Trains a recoder that transforms variable values to their respective part labels\n", - "\n", - " It also creates 10 pair features.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_recoder_with_multiple_parameters\")\n", - "\n", - " # Train the recoder model\n", - " kh.train_recoder(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " max_pairs=10,\n", - " categorical_recoding_method=\"part label\",\n", - " numerical_recoding_method=\"part label\",\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_recoder_with_multiple_parameters()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", 
\"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_recoder_with_multiple_parameters\")\n", + "\n", + "# Train the recoder model\n", + "kh.train_recoder(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_pairs=10,\n", + " categorical_recoding_method=\"part label\",\n", + " numerical_recoding_method=\"part label\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_recoder_mt_flatten()`\n\n", + "Trains a recoder that flattens a multi-table database into a single table\n\n The constructed variables are all kept and no recoding is performed on their values\n \n" ] }, { @@ -1020,40 +1096,44 @@ "metadata": {}, "outputs": [], "source": [ - "def train_recoder_mt_flatten():\n", - " \"\"\"Trains a recoder that flattens a multi-table database into a single table\n", - "\n", - " The constructed variables are all kept and no recoding is performed on their values\n", - " \"\"\"\n", - " # Set the file paths\n", - " accidents_dir = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " accidents_table_path = path.join(accidents_dir, \"Accidents.txt\")\n", - " vehicles_table_path = path.join(accidents_dir, \"Vehicles.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_recoder_mt_flatten\")\n", - "\n", - " # Train the recoder. 
Besides the mandatory parameters, it is specified:\n", - " # - A python dictionary linking data paths to file paths for non-root tables\n", - " # - The maximum number of aggregate variables to construct (1000)\n", - " # - To keep all the created variables independently of their informativeness (level)\n", - " # - To not recode the variables values\n", - " kh.train_recoder(\n", - " dictionary_file_path,\n", - " \"Accident\",\n", - " accidents_table_path,\n", - " \"Gravity\",\n", - " results_dir,\n", - " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", - " max_constructed_variables=1000,\n", - " informative_variables_only=False,\n", - " categorical_recoding_method=\"none\",\n", - " numerical_recoding_method=\"none\",\n", - " keep_initial_categorical_variables=True,\n", - " keep_initial_numerical_variables=True,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "train_recoder_mt_flatten()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_recoder_mt_flatten\")\n", + "\n", + "# Train the recoder. 
Besides the mandatory parameters, it is specified:\n", + "# - A python dictionary linking data paths to file paths for non-root tables\n", + "# - The maximum number of aggregate variables to construct (1000)\n", + "# - To keep all the created variables independently of their informativeness (level)\n", + "# - To not recode the variables values\n", + "kh.train_recoder(\n", + " dictionary_file_path,\n", + " \"Accident\",\n", + " accidents_table_path,\n", + " \"Gravity\",\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", + " max_constructed_variables=1000,\n", + " informative_variables_only=False,\n", + " categorical_recoding_method=\"none\",\n", + " numerical_recoding_method=\"none\",\n", + " keep_initial_categorical_variables=True,\n", + " keep_initial_numerical_variables=True,\n", + ")" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `deploy_model()`\n\n", + "Deploys a model in the simplest way possible\n\n It is a call to `~.api.deploy_model` with its mandatory parameters.\n\n In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n associated dictionary to the input database. The model predictions are written to\n the output database.\n \n" ] }, { @@ -1062,40 +1142,40 @@ "metadata": {}, "outputs": [], "source": [ - "def deploy_model():\n", - " \"\"\"Deploys a model in the simplest possible\n", - "\n", - " It is a call to `~.api.deploy_model` with its mandatory parameters.\n", - "\n", - " In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n", - " associated dictionary to the input database. 
The model predictions are written to\n", - " the output database.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_model\")\n", - " model_dictionary_file_path = path.join(results_dir, \"Modeling.kdic\")\n", - " output_data_table_path = path.join(results_dir, \"ScoresAdult.txt\")\n", - "\n", - " # Train the predictor\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " max_trees=0,\n", - " )\n", - "\n", - " # Deploy the model on the database\n", - " # It will score it according to the trained predictor\n", - " kh.deploy_model(\n", - " model_dictionary_file_path, \"SNB_Adult\", data_table_path, output_data_table_path\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "deploy_model()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"ScoresAdult.txt\")\n", + "\n", + "# Train the predictor\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Deploy the model on the database\n", + "# It will score it according to the trained predictor\n", + "kh.deploy_model(\n", + " model_dictionary_file_path, \"SNB_Adult\", data_table_path, output_data_table_path\n", + ")" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### `deploy_model_mt()`\n\n", + "Deploys a multi-table classifier in the simplest way possible\n\n It is a call to `~.api.deploy_model` with additional parameters to handle\n multi-table deployment.\n\n In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n associated dictionary to the input database. The model predictions are written to\n the output database.\n \n" ] }, { @@ -1104,49 +1184,48 @@ "metadata": {}, "outputs": [], "source": [ - "def deploy_model_mt():\n", - " \"\"\"Deploys a multi-table classifier in the simplest way possible\n", - "\n", - " It is a call to `~.api.deploy_model` with additional parameters to handle\n", - " multi-table deployment.\n", - "\n", - " In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n", - " associated dictionary to the input database. The model predictions are written to\n", - " the output database.\n", - " \"\"\"\n", - " # Set the file paths\n", - " accidents_dir = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " accidents_table_path = path.join(accidents_dir, \"Accidents.txt\")\n", - " vehicles_table_path = path.join(accidents_dir, \"Vehicles.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_model_mt\")\n", - " model_dictionary_file_path = path.join(results_dir, \"Modeling.kdic\")\n", - " output_data_table_path = path.join(results_dir, \"TransferredAccidents.txt\")\n", - "\n", - " # Train the predictor (see train_predictor_mt for details)\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Accident\",\n", - " accidents_table_path,\n", - " \"Gravity\",\n", - " results_dir,\n", - " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", - " max_trees=0,\n", - " )\n", - "\n", - " # Deploy the model on the database\n", - " # Besides the mandatory parameters, it is specified:\n", - " # - A python 
dictionary linking data paths to file paths for non-root tables\n", - " kh.deploy_model(\n", - " model_dictionary_file_path,\n", - " \"SNB_Accident\",\n", - " accidents_table_path,\n", - " output_data_table_path,\n", - " additional_data_tables={\"SNB_Accident`Vehicles\": vehicles_table_path},\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "deploy_model_mt()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model_mt\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"TransferredAccidents.txt\")\n", + "\n", + "# Train the predictor (see train_predictor_mt for details)\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Accident\",\n", + " accidents_table_path,\n", + " \"Gravity\",\n", + " results_dir,\n", + " additional_data_tables={\"Accident`Vehicles\": vehicles_table_path},\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Deploy the model on the database\n", + "# Besides the mandatory parameters, it is specified:\n", + "# - A python dictionary linking data paths to file paths for non-root tables\n", + "kh.deploy_model(\n", + " model_dictionary_file_path,\n", + " \"SNB_Accident\",\n", + " accidents_table_path,\n", + " output_data_table_path,\n", + " additional_data_tables={\"SNB_Accident`Vehicles\": vehicles_table_path},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `deploy_model_mt_snowflake()`\n\n", + "Deploys a classifier model on a dataset with a snowflake schema\n" ] }, { @@ -1155,54 +1234,61 @@ "metadata": {}, 
"outputs": [], "source": [ - "def deploy_model_mt_snowflake():\n", - " \"\"\"Deploys a classifier model on a dataset with a snowflake schema\"\"\"\n", - " # Set the file paths\n", - " accidents_dir = path.join(kh.get_samples_dir(), \"Accidents\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " accidents_table_path = path.join(accidents_dir, \"Accidents.txt\")\n", - " vehicles_table_path = path.join(accidents_dir, \"Vehicles.txt\")\n", - " users_table_path = path.join(accidents_dir, \"Users.txt\")\n", - " places_table_path = path.join(accidents_dir, \"Places.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_model_mt_snowflake\")\n", - " model_dictionary_file_path = path.join(results_dir, \"Modeling.kdic\")\n", - " output_data_table_path = path.join(results_dir, \"TransferredAccidents.txt\")\n", - "\n", - " # Train the predictor. Besides the mandatory parameters, we specify:\n", - " # - A python dictionary linking data paths to file paths for non-root tables\n", - " # - To not construct any decision tree\n", - " # The default number of automatic features is 100\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Accident\",\n", - " accidents_table_path,\n", - " \"Gravity\",\n", - " results_dir,\n", - " additional_data_tables={\n", - " \"Accident`Vehicles\": vehicles_table_path,\n", - " \"Accident`Vehicles`Users\": users_table_path,\n", - " \"Accident`Place\": places_table_path,\n", - " },\n", - " max_trees=0,\n", - " )\n", - "\n", - " # Deploy the model on the database\n", - " # Besides the mandatory parameters, it is specified:\n", - " # - A python dictionary linking data paths to file paths for non-root tables\n", - " kh.deploy_model(\n", - " model_dictionary_file_path,\n", - " \"SNB_Accident\",\n", - " accidents_table_path,\n", - " output_data_table_path,\n", - " additional_data_tables={\n", - " \"SNB_Accident`Vehicles\": vehicles_table_path,\n", - " \"SNB_Accident`Vehicles`Users\": 
users_table_path,\n", - " \"SNB_Accident`Place\": places_table_path,\n", - " },\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "deploy_model_mt_snowflake()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "users_table_path = os.path.join(accidents_dir, \"Users.txt\")\n", + "places_table_path = os.path.join(accidents_dir, \"Places.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model_mt_snowflake\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"TransferredAccidents.txt\")\n", + "\n", + "# Train the predictor. Besides the mandatory parameters, we specify:\n", + "# - A python dictionary linking data paths to file paths for non-root tables\n", + "# - To not construct any decision tree\n", + "# The default number of automatic features is 100\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Accident\",\n", + " accidents_table_path,\n", + " \"Gravity\",\n", + " results_dir,\n", + " additional_data_tables={\n", + " \"Accident`Vehicles\": vehicles_table_path,\n", + " \"Accident`Vehicles`Users\": users_table_path,\n", + " \"Accident`Place\": places_table_path,\n", + " },\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Deploy the model on the database\n", + "# Besides the mandatory parameters, it is specified:\n", + "# - A python dictionary linking data paths to file paths for non-root tables\n", + "kh.deploy_model(\n", + " model_dictionary_file_path,\n", + " \"SNB_Accident\",\n", + " accidents_table_path,\n", + " output_data_table_path,\n", + " additional_data_tables={\n", + " \"SNB_Accident`Vehicles\": 
vehicles_table_path,\n", + " \"SNB_Accident`Vehicles`Users\": users_table_path,\n", + " \"SNB_Accident`Place\": places_table_path,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `deploy_model_expert()`\n\n", + "Deploys a model with a specification of additional variables to be included\n\n In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n associated dictionary to the input database. Specifically, the output file contains:\n\n - The model predictions\n - The probabilities of all modalities of the target variable.\n\n The \"expert\" part of this example is the use of the khiops dictionary interface\n and the `.DictionaryDomain` class\n \n" ] }, { @@ -1211,55 +1297,52 @@ "metadata": {}, "outputs": [], "source": [ - "def deploy_model_expert():\n", - " \"\"\"Deploys a model with a specification of additional variables to be included\n", - "\n", - " In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n", - " associated dictionary to the input database. 
Specifically, the output file contains:\n", - "\n", - " - The model predictions\n", - " - The probabilities of all modalities of the target variable.\n", - "\n", - " The \"expert\" part of this example is the use of the khiops dictionary interface\n", - " and the `.DictionaryDomain` class\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_model_expert\")\n", - " model_dictionary_file_path = path.join(results_dir, \"Modeling.kdic\")\n", - " output_data_table_path = path.join(results_dir, \"ScoresAdult.txt\")\n", - "\n", - " # Train the predictor\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " max_trees=0,\n", - " )\n", - "\n", - " # Read the dictionary file to obtain an instance of class Dictionary\n", - " model_domain = kh.read_dictionary_file(model_dictionary_file_path)\n", - " snb_dictionary = model_domain.get_dictionary(\"SNB_Adult\")\n", - "\n", - " # Select Label (identifier)\n", - " snb_dictionary.get_variable(\"Label\").used = True\n", - "\n", - " # Select the variables containing the probabilities for each class\n", - " for variable in snb_dictionary.variables:\n", - " # The variable must have a meta data with key that start with \"target_prob\"\n", - " for key in variable.meta_data.keys:\n", - " if key.startswith(\"TargetProb\"):\n", - " variable.used = True\n", - "\n", - " # Deploy the model. 
Besides the mandatory parameters, it is specified:\n", - " # - A DictionaryDomain object to use instead of the mandatory dictionary file\n", - " kh.deploy_model(model_domain, \"SNB_Adult\", data_table_path, output_data_table_path)\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "deploy_model_expert()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_model_expert\")\n", + "model_dictionary_file_path = os.path.join(results_dir, \"Modeling.kdic\")\n", + "output_data_table_path = os.path.join(results_dir, \"ScoresAdult.txt\")\n", + "\n", + "# Train the predictor\n", + "kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Read the dictionary file to obtain an instance of class Dictionary\n", + "model_domain = kh.read_dictionary_file(model_dictionary_file_path)\n", + "snb_dictionary = model_domain.get_dictionary(\"SNB_Adult\")\n", + "\n", + "# Select Label (identifier)\n", + "snb_dictionary.get_variable(\"Label\").used = True\n", + "\n", + "# Select the variables containing the probabilities for each class\n", + "for variable in snb_dictionary.variables:\n", + " # The variable must have a meta data key that starts with \"TargetProb\"\n", + " for key in variable.meta_data.keys:\n", + " if key.startswith(\"TargetProb\"):\n", + " variable.used = True\n", + "\n", + "# Deploy the model. 
Besides the mandatory parameters, it is specified:\n", + "# - A DictionaryDomain object to use instead of the mandatory dictionary file\n", + "kh.deploy_model(model_domain, \"SNB_Adult\", data_table_path, output_data_table_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `deploy_classifier_for_metrics()`\n\n", + "Constructs a small precision-recall curve for a classifier\n" ] }, { @@ -1268,71 +1351,78 @@ "metadata": {}, "outputs": [], "source": [ - "def deploy_classifier_for_metrics():\n", - " \"\"\"Constructs a small precision-recall curve for a classifier\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_classifier_for_metrics\")\n", - " output_data_table_path = path.join(results_dir, \"ScoresAdult.txt\")\n", - "\n", - " # Train the classifier for the target \"class\"\n", - " _, modeling_dictionary_file_path = kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " max_trees=0,\n", - " )\n", - "\n", - " # Obtain the scores of the SNB on the test dataset to calculate the PR curve\n", - " kh.deploy_predictor_for_metrics(\n", - " modeling_dictionary_file_path,\n", - " \"SNB_Adult\",\n", - " data_table_path,\n", - " output_data_table_path,\n", - " sampling_mode=\"Exclude sample\",\n", - " output_header_line=False,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " # We estimate the precision/recall for the class \"more\" and increasing thresholds\n", - " # Note: Normally one would do this with a package (eg. 
sklearn.metrics)\n", - " thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]\n", - " true_positives = {thres: 0 for thres in thresholds}\n", - " false_positives = {thres: 0 for thres in thresholds}\n", - " false_negatives = {thres: 0 for thres in thresholds}\n", - " with open(output_data_table_path) as output_data_table:\n", - " for line in output_data_table:\n", - " fields = line.split(\"\\t\")\n", - " true_target = fields[0]\n", - " proba_more = float(fields[3])\n", - " for thres in thresholds:\n", - " if true_target == \"more\" and proba_more >= thres:\n", - " true_positives[thres] += 1\n", - " elif true_target == \"more\" and proba_more < thres:\n", - " false_negatives[thres] += 1\n", - " elif true_target == \"less\" and proba_more >= thres:\n", - " false_positives[thres] += 1\n", - "\n", - " precision = {\n", - " thres: true_positives[thres] / (true_positives[thres] + false_positives[thres])\n", - " for thres in thresholds\n", - " }\n", - " recall = {\n", - " thres: true_positives[thres] / (true_positives[thres] + false_negatives[thres])\n", - " for thres in thresholds\n", - " }\n", - "\n", - " # Print the curve at the selected points\n", - " print(\"Precision and Recall for class 'more'\")\n", - " print(\"threshold\\trecall\\tprecision\")\n", - " thresholds.reverse()\n", - " for thres in thresholds:\n", - " print(str(thres) + \"\\t\" + str(recall[thres]) + \"\\t\" + str(precision[thres]))\n", - "\n", - "#Run sample\n", - "deploy_classifier_for_metrics()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_classifier_for_metrics\")\n", + "output_data_table_path = os.path.join(results_dir, \"ScoresAdult.txt\")\n", + "\n", + "# Train the classifier for the target \"class\"\n", + "_, modeling_dictionary_file_path = kh.train_predictor(\n", + " dictionary_file_path,\n", + 
" \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Obtain the scores of the SNB on the test dataset to calculate the PR curve\n", + "kh.deploy_predictor_for_metrics(\n", + " modeling_dictionary_file_path,\n", + " \"SNB_Adult\",\n", + " data_table_path,\n", + " output_data_table_path,\n", + " sampling_mode=\"Exclude sample\",\n", + " output_header_line=False,\n", + ")\n", + "\n", + "# We estimate the precision/recall for the class \"more\" and increasing thresholds\n", + "# Note: Normally one would do this with a package (eg. sklearn.metrics)\n", + "thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]\n", + "true_positives = {thres: 0 for thres in thresholds}\n", + "false_positives = {thres: 0 for thres in thresholds}\n", + "false_negatives = {thres: 0 for thres in thresholds}\n", + "with open(output_data_table_path) as output_data_table:\n", + " for line in output_data_table:\n", + " fields = line.split(\"\\t\")\n", + " true_target = fields[0]\n", + " proba_more = float(fields[3])\n", + " for thres in thresholds:\n", + " if true_target == \"more\" and proba_more >= thres:\n", + " true_positives[thres] += 1\n", + " elif true_target == \"more\" and proba_more < thres:\n", + " false_negatives[thres] += 1\n", + " elif true_target == \"less\" and proba_more >= thres:\n", + " false_positives[thres] += 1\n", + "\n", + "precision = {\n", + " thres: true_positives[thres] / (true_positives[thres] + false_positives[thres])\n", + " for thres in thresholds\n", + "}\n", + "recall = {\n", + " thres: true_positives[thres] / (true_positives[thres] + false_negatives[thres])\n", + " for thres in thresholds\n", + "}\n", + "\n", + "# Print the curve at the selected points\n", + "print(\"Precision and Recall for class 'more'\")\n", + "print(\"threshold\\trecall\\tprecision\")\n", + "thresholds.reverse()\n", + "for thres in thresholds:\n", + " print(str(thres) + \"\\t\" + str(recall[thres]) + \"\\t\" + str(precision[thres]))" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `deploy_regressor_for_metrics()`\n\n", + "Estimates the R2 coefficient of a regressor\n" ] }, { @@ -1341,66 +1431,73 @@ "metadata": {}, "outputs": [], "source": [ - "def deploy_regressor_for_metrics():\n", - " \"\"\"Estimates the R2 coefficient of a regressor\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_regressor_for_metrics\")\n", - " output_data_table_path = path.join(results_dir, \"TrueAndPredictedAges.txt\")\n", - "\n", - " # Train the regressor for the target \"age\" (with 20% train to be quick)\n", - " _, modeling_dictionary_file_path = kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"age\",\n", - " results_dir,\n", - " sample_percentage=20,\n", - " max_trees=0,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " # Obtain the predicted regression values of the SNB on the test dataset estimate R2\n", - " kh.deploy_predictor_for_metrics(\n", - " modeling_dictionary_file_path,\n", - " \"SNB_Adult\",\n", - " data_table_path,\n", - " output_data_table_path,\n", - " sample_percentage=20,\n", - " sampling_mode=\"Exclude sample\",\n", - " output_header_line=False,\n", - " )\n", - " # Estimate R2\n", - " # Note: Normally one would do this with a package (eg. 
sklearn.metrics)\n", - " # First pass to estimate sums of residuals and the mean\n", - " ss_res = 0\n", - " mean = 0\n", - " n_instances = 0\n", - " with open(output_data_table_path) as output_data_table:\n", - " for line in output_data_table:\n", - " fields = line.split(\"\\t\")\n", - " true_target = float(fields[0])\n", - " predicted_target = float(fields[1])\n", - " ss_res += (true_target - predicted_target) ** 2\n", - " mean += true_target\n", - " n_instances += 1\n", - " mean /= n_instances\n", - "\n", - " # Second pass to estimate the total sums of squares and finish the R2 estimation\n", - " ss_tot = 0\n", - " with open(output_data_table_path) as output_data_table:\n", - " for line in output_data_table:\n", - " fields = line.split(\"\\t\")\n", - " true_target = float(fields[0])\n", - " ss_tot += (true_target - mean) ** 2\n", - " r2_score = 1 - ss_res / ss_tot\n", - "\n", - " # Print results\n", - " print(\"Adult 'age' regression (30% train)\")\n", - " print(f\"R2 (explained variance) = {r2_score}\")\n", - "\n", - "#Run sample\n", - "deploy_regressor_for_metrics()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_regressor_for_metrics\")\n", + "output_data_table_path = os.path.join(results_dir, \"TrueAndPredictedAges.txt\")\n", + "\n", + "# Train the regressor for the target \"age\" (with 20% train to be quick)\n", + "_, modeling_dictionary_file_path = kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"age\",\n", + " results_dir,\n", + " sample_percentage=20,\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Obtain the predicted regression values of the SNB on the test dataset to estimate R2\n", + "kh.deploy_predictor_for_metrics(\n", + " modeling_dictionary_file_path,\n", + " \"SNB_Adult\",\n", + " 
data_table_path,\n", + " output_data_table_path,\n", + " sample_percentage=20,\n", + " sampling_mode=\"Exclude sample\",\n", + " output_header_line=False,\n", + ")\n", + "# Estimate R2\n", + "# Note: Normally one would do this with a package (eg. sklearn.metrics)\n", + "# First pass to estimate sums of residuals and the mean\n", + "ss_res = 0\n", + "mean = 0\n", + "n_instances = 0\n", + "with open(output_data_table_path) as output_data_table:\n", + " for line in output_data_table:\n", + " fields = line.split(\"\\t\")\n", + " true_target = float(fields[0])\n", + " predicted_target = float(fields[1])\n", + " ss_res += (true_target - predicted_target) ** 2\n", + " mean += true_target\n", + " n_instances += 1\n", + " mean /= n_instances\n", + "\n", + "# Second pass to estimate the total sums of squares and finish the R2 estimation\n", + "ss_tot = 0\n", + "with open(output_data_table_path) as output_data_table:\n", + " for line in output_data_table:\n", + " fields = line.split(\"\\t\")\n", + " true_target = float(fields[0])\n", + " ss_tot += (true_target - mean) ** 2\n", + "r2_score = 1 - ss_res / ss_tot\n", + "\n", + "# Print results\n", + "print(\"Adult 'age' regression (30% train)\")\n", + "print(f\"R2 (explained variance) = {r2_score}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `sort_data_table()`\n\n", + "Sorts a database in the simplest way possible\n\n It is a call to `~.api.sort_data_table` with only its mandatory parameters. This\n sorts a data table by its default key variable (specified in the table's\n dictionary).\n \n" ] }, { @@ -1409,30 +1506,32 @@ "metadata": {}, "outputs": [], "source": [ - "def sort_data_table():\n", - " \"\"\"Sorts a database in the simplest way possible\n", - "\n", - " It is a call to `~.api.sort_data_table` with only its mandatory parameters. 
This\n", - " sorts a data table by its default key variable (specified in the table's\n", - " dictionary).\n", - " \"\"\"\n", - " # Set the file paths\n", - " accidents_dir = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " accidents_table_path = path.join(accidents_dir, \"Accidents.txt\")\n", - " output_data_table_path = path.join(\n", - " \"kh_samples\",\n", - " \"sort_data_table\",\n", - " \"SortedAccidents.txt\",\n", - " )\n", - "\n", - " # Sort table\n", - " kh.sort_data_table(\n", - " dictionary_file_path, \"Accident\", accidents_table_path, output_data_table_path\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "sort_data_table()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "output_data_table_path = os.path.join(\n", + " \"kh_samples\",\n", + " \"sort_data_table\",\n", + " \"SortedAccidents.txt\",\n", + ")\n", + "\n", + "# Sort table\n", + "kh.sort_data_table(\n", + " dictionary_file_path, \"Accident\", accidents_table_path, output_data_table_path\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `sort_data_table_expert()`\n\n", + "Sorts a database by a field other than the default table key\n\n It is a call to `~.api.sort_data_table` with additional parameters to specify the\n sorting fields.\n \n" ] }, { @@ -1441,32 +1540,35 @@ "metadata": {}, "outputs": [], "source": [ - "def sort_data_table_expert():\n", - " \"\"\"Sorts a database by a field other than the default table key\n", - "\n", - " It is a call to `~.api.sort_data_table` with additional parameters to specify the\n", - " sorting fields.\n", - " \"\"\"\n", - " # Set the file paths\n", - " 
accidents_dir = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " dictionary_file_path = path.join(accidents_dir, \"Accidents.kdic\")\n", - " vehicles_table_path = path.join(accidents_dir, \"Vehicles.txt\")\n", - " output_data_table_path = path.join(\n", - " \"kh_samples\", \"sort_data_table_expert\", \"SortedVehicles.txt\"\n", - " )\n", - "\n", - " # Sort table. Besides the mandatory parameters, it is specified:\n", - " # - A list containing the sorting fields\n", - " kh.sort_data_table(\n", - " dictionary_file_path,\n", - " \"Vehicle\",\n", - " vehicles_table_path,\n", - " output_data_table_path,\n", - " sort_variables=[\"AccidentId\", \"VehicleId\"],\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "sort_data_table_expert()" + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "dictionary_file_path = os.path.join(accidents_dir, \"Accidents.kdic\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "output_data_table_path = os.path.join(\n", + " \"kh_samples\", \"sort_data_table_expert\", \"SortedVehicles.txt\"\n", + ")\n", + "\n", + "# Sort table. 
Besides the mandatory parameters, it is specified:\n", + "# - A list containing the sorting fields\n", + "kh.sort_data_table(\n", + " dictionary_file_path,\n", + " \"Vehicle\",\n", + " vehicles_table_path,\n", + " output_data_table_path,\n", + " sort_variables=[\"AccidentId\", \"VehicleId\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `sort_data_tables_mt()`\n\n", + "Sorts with the dedicated helper a multi-table dataset by the default keys\n" ] }, { @@ -1475,34 +1577,39 @@ "metadata": {}, "outputs": [], "source": [ - "def extract_keys_from_data_table():\n", - " \"\"\"Extracts the keys from a database\n", - "\n", - " It is a call to `~.api.extract_keys_from_data_table` with only its mandatory\n", - " parameters.\n", - "\n", - " Pre-requisite: the database must be sorted by its key.\n", - " \"\"\"\n", - " # Set the file paths\n", - " splice_dir = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " dictionary_file_path = path.join(splice_dir, \"SpliceJunction.kdic\")\n", - " data_table_path = path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - " output_data_table_path = path.join(\n", - " \"kh_samples\",\n", - " \"extract_keys_from_data_table\",\n", - " \"KeysSpliceJunction.txt\",\n", - " )\n", - "\n", - " # Extract keys from table \"SpliceJunctionDNA\" to the output table\n", - " kh.extract_keys_from_data_table(\n", - " dictionary_file_path,\n", - " \"SpliceJunctionDNA\",\n", - " data_table_path,\n", - " output_data_table_path,\n", - " )\n", - "\n", - "#Run sample\n", - "extract_keys_from_data_table()" + "# Imports\n", + "import os\n", + "from khiops.utils.helpers import sort_dataset\n", + "\n", + "# Set the file paths\n", + "accidents_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_table_path = os.path.join(accidents_dir, \"Accidents.txt\")\n", + "vehicles_table_path = os.path.join(accidents_dir, \"Vehicles.txt\")\n", + "users_table_path = os.path.join(accidents_dir, \"Users.txt\")\n", 
+ "places_table_path = os.path.join(accidents_dir, \"Places.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"sort_data_tables_mt\")\n", + "\n", + "# Build the dataset spec\n", + "ds_spec = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (accidents_table_path, \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_table_path, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Users\": (users_table_path, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Places\": (places_table_path, \"AccidentId\"),\n", + " },\n", + "}\n", + "\n", + "# Sort the dataset\n", + "sort_dataset(ds_spec, output_dir=results_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `extract_keys_from_data_table()`\n\n", + "Extracts the keys from a database\n\n It is a call to `~.api.extract_keys_from_data_table` with only its mandatory\n parameters.\n\n Pre-requisite: the database must be sorted by its key.\n \n" ] }, { @@ -1511,29 +1618,35 @@ "metadata": {}, "outputs": [], "source": [ - "def train_coclustering():\n", - " \"\"\"Trains a coclustering model in the simplest way possible\n", - "\n", - " It is a call to `~.api.train_coclustering` with only its mandatory parameters.\n", - " \"\"\"\n", - " # Set the file paths\n", - " splice_dir = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " dictionary_file_path = path.join(splice_dir, \"SpliceJunction.kdic\")\n", - " data_table_path = path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"train_coclustering\")\n", - "\n", - " # Train a coclustering model for variables \"SampleId\" and \"Char\"\n", - " coclustering_file_path = kh.train_coclustering(\n", - " dictionary_file_path,\n", - " \"SpliceJunctionDNA\",\n", - " data_table_path,\n", - " [\"SampleId\", \"Char\"],\n", - " results_dir,\n", - " )\n", - " print(\"Coclustering file available at \" + coclustering_file_path)\n", + "# Imports\n", + "import os\n", + "from khiops import core 
as kh\n", "\n", - "#Run sample\n", - "train_coclustering()" + "# Set the file paths\n", + "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", + "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", + "output_data_table_path = os.path.join(\n", + " \"kh_samples\",\n", + " \"extract_keys_from_data_table\",\n", + " \"KeysSpliceJunction.txt\",\n", + ")\n", + "\n", + "# Extract keys from table \"SpliceJunctionDNA\" to the output table\n", + "kh.extract_keys_from_data_table(\n", + " dictionary_file_path,\n", + " \"SpliceJunctionDNA\",\n", + " data_table_path,\n", + " output_data_table_path,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `train_coclustering()`\n\n", + "Trains a coclustering model in the simplest way possible\n\n It is a call to `~.api.train_coclustering` with only its mandatory parameters.\n \n" ] }, { @@ -1542,38 +1655,33 @@ "metadata": {}, "outputs": [], "source": [ - "def simplify_coclustering():\n", - " \"\"\"Simplifies a coclustering model while preserving 80% of its information\"\"\"\n", - " # Set the file paths\n", - " splice_dir = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " dictionary_file_path = path.join(splice_dir, \"SpliceJunction.kdic\")\n", - " data_table_path = path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"simplify_coclustering\")\n", - " coclustering_file_path = path.join(results_dir, \"Coclustering.khc\")\n", - " simplified_coclustering_file_name = \"simplified_coclustering.khc\"\n", - "\n", - " # Train coclustering model for variables \"SampleId\" and \"Char\"\n", - " kh.train_coclustering(\n", - " dictionary_file_path,\n", - " \"SpliceJunctionDNA\",\n", - " data_table_path,\n", - " [\"SampleId\", \"Char\"],\n", - " results_dir,\n", - " )\n", - "\n", - " # Simplify the trained coclustering with the 
constraints\n", - " # - maximum information preserved: 80%\n", - " # - maximum total parts number: 4\n", - " kh.simplify_coclustering(\n", - " coclustering_file_path,\n", - " simplified_coclustering_file_name,\n", - " results_dir,\n", - " max_preserved_information=80,\n", - " max_total_parts=4,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "simplify_coclustering()" + "# Set the file paths\n", + "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", + "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"train_coclustering\")\n", + "\n", + "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", + "coclustering_file_path = kh.train_coclustering(\n", + " dictionary_file_path,\n", + " \"SpliceJunctionDNA\",\n", + " data_table_path,\n", + " [\"SampleId\", \"Char\"],\n", + " results_dir,\n", + ")\n", + "print(\"Coclustering file available at \" + coclustering_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `simplify_coclustering()`\n\n", + "Simplifies a coclustering model while preserving 80% of its information\n" ] }, { @@ -1582,30 +1690,45 @@ "metadata": {}, "outputs": [], "source": [ - "def extract_clusters():\n", - " \"\"\"Extract the clusters' id, members, frequencies and typicalities into a file\"\"\"\n", - " # Set the file paths\n", - " splice_dir = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " dictionary_file_path = path.join(splice_dir, \"SpliceJunction.kdic\")\n", - " data_table_path = path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"extract_clusters\")\n", - " coclustering_file_path = path.join(results_dir, \"Coclustering.khc\")\n", - " clusters_file_path = path.join(results_dir, 
\"extracted_clusters.txt\")\n", - "\n", - " # Train a coclustering model for variables \"SampleId\" and \"Char\"\n", - " kh.train_coclustering(\n", - " dictionary_file_path,\n", - " \"SpliceJunctionDNA\",\n", - " data_table_path,\n", - " [\"SampleId\", \"Char\"],\n", - " results_dir,\n", - " )\n", - "\n", - " # Extract clusters\n", - " kh.extract_clusters(coclustering_file_path, \"Char\", clusters_file_path)\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "extract_clusters()" + "# Set the file paths\n", + "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", + "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"simplify_coclustering\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", + "simplified_coclustering_file_name = \"simplified_coclustering.khc\"\n", + "\n", + "# Train coclustering model for variables \"SampleId\" and \"Char\"\n", + "kh.train_coclustering(\n", + " dictionary_file_path,\n", + " \"SpliceJunctionDNA\",\n", + " data_table_path,\n", + " [\"SampleId\", \"Char\"],\n", + " results_dir,\n", + ")\n", + "\n", + "# Simplify the trained coclustering with the constraints\n", + "# - maximum information preserved: 80%\n", + "# - maximum total parts number: 4\n", + "kh.simplify_coclustering(\n", + " coclustering_file_path,\n", + " simplified_coclustering_file_name,\n", + " results_dir,\n", + " max_preserved_information=80,\n", + " max_total_parts=4,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `extract_clusters()`\n\n", + "Extract the clusters' id, members, frequencies and typicalities into a file\n" ] }, { @@ -1614,38 +1737,33 @@ "metadata": {}, "outputs": [], "source": [ - "def deploy_coclustering():\n", - " \"\"\"Deploys an \"individual-variable\" 
coclustering\"\"\"\n", - " # Set the initial file paths\n", - " splice_dir = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " data_table_path = path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - " dictionary_file_path = path.join(splice_dir, \"SpliceJunction.kdic\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_coclustering\")\n", - " coclustering_file_path = path.join(results_dir, \"Coclustering.khc\")\n", - "\n", - " # Train a coclustering model for variables \"SampleId\" and \"Char\"\n", - " kh.train_coclustering(\n", - " dictionary_file_path,\n", - " \"SpliceJunctionDNA\",\n", - " data_table_path,\n", - " [\"SampleId\", \"Char\"],\n", - " results_dir,\n", - " )\n", - "\n", - " # Deploy \"Char\" clusters in the training database\n", - " kh.deploy_coclustering(\n", - " dictionary_file_path,\n", - " \"SpliceJunctionDNA\",\n", - " data_table_path,\n", - " coclustering_file_path,\n", - " [\"SampleId\"],\n", - " \"Char\",\n", - " results_dir,\n", - " header_line=True,\n", - " )\n", - "\n", - "#Run sample\n", - "deploy_coclustering()" + "# Set the file paths\n", + "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", + "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"extract_clusters\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", + "clusters_file_path = os.path.join(results_dir, \"extracted_clusters.txt\")\n", + "\n", + "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", + "kh.train_coclustering(\n", + " dictionary_file_path,\n", + " \"SpliceJunctionDNA\",\n", + " data_table_path,\n", + " [\"SampleId\", \"Char\"],\n", + " results_dir,\n", + ")\n", + "\n", + "# Extract clusters\n", + "kh.extract_clusters(coclustering_file_path, \"Char\", clusters_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "### `deploy_coclustering()`\n\n", + "Deploys an \"individual-variable\" coclustering\n" ] }, { @@ -1654,86 +1772,45 @@ "metadata": {}, "outputs": [], "source": [ - "def deploy_coclustering_expert():\n", - " \"\"\"Deploys an \"individual-variable\" coclustering step-by-step\n", - "\n", - " The `.api.prepare_coclustering_deployment` method is called twice to prepare the\n", - " deployment at two granularity levels. Then, the model is deployed and the respective\n", - " deployment dictionary is built.\n", - "\n", - " This is one of the most complex workflows of the Khiops suite.\n", - " \"\"\"\n", - " # Set the initial file paths\n", - " splice_dir = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " dictionary_file_path = path.join(splice_dir, \"SpliceJunction.kdic\")\n", - " data_table_path = path.join(splice_dir, \"SpliceJunction.txt\")\n", - " secondary_data_table_path = path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"deploy_coclustering_expert\")\n", - " coclustering_file_path = path.join(results_dir, \"Coclustering.khc\")\n", - "\n", - " # Train a coclustering model for variables \"SampleId\" and \"Char\"\n", - " print(\"train coclustering on SpliceJunctionDNA\")\n", - " kh.train_coclustering(\n", - " dictionary_file_path,\n", - " \"SpliceJunctionDNA\",\n", - " secondary_data_table_path,\n", - " [\"SampleId\", \"Char\"],\n", - " results_dir,\n", - " )\n", - "\n", - " print(\"prepare_coclustering_deployment\")\n", - " # The input dictionary is extended with new coclustering based variables\n", - " kh.prepare_coclustering_deployment(\n", - " dictionary_file_path,\n", - " \"SpliceJunction\",\n", - " coclustering_file_path,\n", - " \"DNA\",\n", - " \"SampleId\",\n", - " results_dir,\n", - " )\n", - " augmented_dictionary_file_path = path.join(results_dir, \"Coclustering.kdic\")\n", - "\n", - " print(\"prepare_coclustering_deployment with at most two clusters\")\n", - " # Extend the 
already extended dictionary with the new variables from a simplified CC\n", - " kh.prepare_coclustering_deployment(\n", - " augmented_dictionary_file_path,\n", - " \"SpliceJunction\",\n", - " coclustering_file_path,\n", - " \"DNA\",\n", - " \"SampleId\",\n", - " results_dir,\n", - " results_prefix=\"Reaugmented\",\n", - " variables_prefix=\"C2_\",\n", - " max_part_numbers={\"SampleId\": 2},\n", - " )\n", - "\n", - " reaugmented_dictionary_file_path = path.join(\n", - " results_dir, \"ReaugmentedCoclustering.kdic\"\n", - " )\n", - " output_data_table_path = path.join(results_dir, \"TransferredSpliceJunction.txt\")\n", - "\n", - " # Deploy the coclustering with the extended dictionary\n", - " print(\"deploy_model with the new coclustering based variables\")\n", - " kh.deploy_model(\n", - " reaugmented_dictionary_file_path,\n", - " \"SpliceJunction\",\n", - " data_table_path,\n", - " output_data_table_path,\n", - " additional_data_tables={\"SpliceJunction`DNA\": secondary_data_table_path},\n", - " )\n", - "\n", - " deployed_dictionary_file_path = path.join(\n", - " results_dir, \"Transferred_Coclustering.kdic\"\n", - " )\n", - " print(\"build_deployed_dictionary to get the new dictionary\")\n", - " kh.build_deployed_dictionary(\n", - " reaugmented_dictionary_file_path,\n", - " \"SpliceJunction\",\n", - " deployed_dictionary_file_path,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "deploy_coclustering_expert()" + "# Set the initial file paths\n", + "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", + "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_coclustering\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", + "\n", + "# Train a coclustering model for variables \"SampleId\" and 
\"Char\"\n", + "kh.train_coclustering(\n", + " dictionary_file_path,\n", + " \"SpliceJunctionDNA\",\n", + " data_table_path,\n", + " [\"SampleId\", \"Char\"],\n", + " results_dir,\n", + ")\n", + "\n", + "# Deploy \"Char\" clusters in the training database\n", + "kh.deploy_coclustering(\n", + " dictionary_file_path,\n", + " \"SpliceJunctionDNA\",\n", + " data_table_path,\n", + " coclustering_file_path,\n", + " [\"SampleId\"],\n", + " \"Char\",\n", + " results_dir,\n", + " header_line=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `deploy_coclustering_expert()`\n\n", + "Deploys an \"individual-variable\" coclustering step-by-step\n\n The `.api.prepare_coclustering_deployment` method is called twice to prepare the\n deployment at two granularity levels. Then, the model is deployed and the respective\n deployment dictionary is built.\n\n This is one of the most complex workflows of the Khiops suite.\n \n" ] }, { @@ -1742,35 +1819,86 @@ "metadata": {}, "outputs": [], "source": [ - "def scenario_prologue():\n", - " \"\"\"Trains a simple model with a prologue written in the Khiops scenario language\n", - "\n", - " .. 
note::\n", - " This is an **advanced** feature.\n", - " \"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"scenario_prologue\")\n", - "\n", - " # Set the maximum memory \"by hand\" with an scenario prologue\n", - " kh.get_runner().scenario_prologue = \"\"\"\n", - " // Max memory 2000 mb\n", - " AnalysisSpec.SystemParameters.MemoryLimit 2000\n", - " \"\"\"\n", - "\n", - " # Train the predictor\n", - " kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Adult\",\n", - " data_table_path,\n", - " \"class\",\n", - " results_dir,\n", - " max_trees=0,\n", - " )\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "scenario_prologue()" + "# Set the initial file paths\n", + "splice_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "dictionary_file_path = os.path.join(splice_dir, \"SpliceJunction.kdic\")\n", + "data_table_path = os.path.join(splice_dir, \"SpliceJunction.txt\")\n", + "secondary_data_table_path = os.path.join(splice_dir, \"SpliceJunctionDNA.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"deploy_coclustering_expert\")\n", + "coclustering_file_path = os.path.join(results_dir, \"Coclustering.khc\")\n", + "\n", + "# Train a coclustering model for variables \"SampleId\" and \"Char\"\n", + "print(\"train coclustering on SpliceJunctionDNA\")\n", + "kh.train_coclustering(\n", + " dictionary_file_path,\n", + " \"SpliceJunctionDNA\",\n", + " secondary_data_table_path,\n", + " [\"SampleId\", \"Char\"],\n", + " results_dir,\n", + ")\n", + "\n", + "print(\"prepare_coclustering_deployment\")\n", + "# The input dictionary is extended with new coclustering based variables\n", + "kh.prepare_coclustering_deployment(\n", + " dictionary_file_path,\n", + " \"SpliceJunction\",\n", + " 
coclustering_file_path,\n", + " \"DNA\",\n", + " \"SampleId\",\n", + " results_dir,\n", + ")\n", + "augmented_dictionary_file_path = os.path.join(results_dir, \"Coclustering.kdic\")\n", + "\n", + "print(\"prepare_coclustering_deployment with at most two clusters\")\n", + "# Extend the already extended dictionary with the new variables from a simplified CC\n", + "kh.prepare_coclustering_deployment(\n", + " augmented_dictionary_file_path,\n", + " \"SpliceJunction\",\n", + " coclustering_file_path,\n", + " \"DNA\",\n", + " \"SampleId\",\n", + " results_dir,\n", + " results_prefix=\"Reaugmented\",\n", + " variables_prefix=\"C2_\",\n", + " max_part_numbers={\"SampleId\": 2},\n", + ")\n", + "\n", + "reaugmented_dictionary_file_path = os.path.join(\n", + " results_dir, \"ReaugmentedCoclustering.kdic\"\n", + ")\n", + "output_data_table_path = os.path.join(results_dir, \"TransferredSpliceJunction.txt\")\n", + "\n", + "# Deploy the coclustering with the extended dictionary\n", + "print(\"deploy_model with the new coclustering based variables\")\n", + "kh.deploy_model(\n", + " reaugmented_dictionary_file_path,\n", + " \"SpliceJunction\",\n", + " data_table_path,\n", + " output_data_table_path,\n", + " additional_data_tables={\"SpliceJunction`DNA\": secondary_data_table_path},\n", + ")\n", + "\n", + "deployed_dictionary_file_path = os.path.join(\n", + " results_dir, \"Transferred_Coclustering.kdic\"\n", + ")\n", + "print(\"build_deployed_dictionary to get the new dictionary\")\n", + "kh.build_deployed_dictionary(\n", + " reaugmented_dictionary_file_path,\n", + " \"SpliceJunction\",\n", + " deployed_dictionary_file_path,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `scenario_prologue()`\n\n", + "Trains a simple model with a prologue written in the Khiops scenario language\n\n .. 
note::\n This is an **advanced** feature.\n \n" ] }, { @@ -1779,39 +1907,78 @@ "metadata": {}, "outputs": [], "source": [ - "def build_deployed_dictionary():\n", - " \"\"\"Builds a dictionary file to read the output table of a deployed model\"\"\"\n", - " # Set the file paths\n", - " dictionary_file_path = path.join(kh.get_samples_dir(), \"Iris\", \"Iris.kdic\")\n", - " data_table_path = path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - " results_dir = path.join(\"kh_samples\", \"build_deployed_dictionary\")\n", - " deployed_dictionary_file_path = path.join(results_dir, \"SNB_Iris_deployed.kdic\")\n", + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - " # Train the predictor\n", - " _, modeling_dictionary_file_path = kh.train_predictor(\n", - " dictionary_file_path,\n", - " \"Iris\",\n", - " data_table_path,\n", - " \"Class\",\n", - " results_dir,\n", - " max_trees=0,\n", - " )\n", + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"scenario_prologue\")\n", "\n", - " # Build the dictionary to read the output of the predictor dictionary file\n", - " # It will contain the columns of the table generated by deploying the model\n", - " kh.build_deployed_dictionary(\n", - " modeling_dictionary_file_path,\n", - " \"SNB_Iris\",\n", - " deployed_dictionary_file_path,\n", - " )\n", + "# Set the maximum memory \"by hand\" with an scenario prologue\n", + "kh.get_runner().scenario_prologue = \"\"\"\n", + " // Max memory 2000 mb\n", + " AnalysisSpec.SystemParameters.MemoryLimit 2000\n", + " \"\"\"\n", "\n", - " # Print the deployed dictionary\n", - " with open(deployed_dictionary_file_path) as deployed_dictionary_file:\n", - " for line in deployed_dictionary_file:\n", - " print(line, end=\"\")\n", + "# Train the predictor\n", + 
"kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Adult\",\n", + " data_table_path,\n", + " \"class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `build_deployed_dictionary()`\n\n", + "Builds a dictionary file to read the output table of a deployed model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "from khiops import core as kh\n", "\n", - "#Run sample\n", - "build_deployed_dictionary()" + "# Set the file paths\n", + "dictionary_file_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.kdic\")\n", + "data_table_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", + "results_dir = os.path.join(\"kh_samples\", \"build_deployed_dictionary\")\n", + "deployed_dictionary_file_path = os.path.join(results_dir, \"SNB_Iris_deployed.kdic\")\n", + "\n", + "# Train the predictor\n", + "_, modeling_dictionary_file_path = kh.train_predictor(\n", + " dictionary_file_path,\n", + " \"Iris\",\n", + " data_table_path,\n", + " \"Class\",\n", + " results_dir,\n", + " max_trees=0,\n", + ")\n", + "\n", + "# Build the dictionary to read the output of the predictor dictionary file\n", + "# It will contain the columns of the table generated by deploying the model\n", + "kh.build_deployed_dictionary(\n", + " modeling_dictionary_file_path,\n", + " \"SNB_Iris\",\n", + " deployed_dictionary_file_path,\n", + ")\n", + "\n", + "# Print the deployed dictionary\n", + "with open(deployed_dictionary_file_path) as deployed_dictionary_file:\n", + " for line in deployed_dictionary_file:\n", + " print(line, end=\"\")" ] } ], diff --git a/khiops/samples/samples.py b/khiops/samples/samples.py index 99deb9ad..60fe294e 100644 --- a/khiops/samples/samples.py +++ b/khiops/samples/samples.py @@ -11,16 +11,17 @@ older version some samples may fail. 
""" import argparse -import os -from math import sqrt -from os import path - import khiops +import os from khiops import core as kh # Disable open files without encoding because samples are simple code snippets # pylint: disable=unspecified-encoding +# For ease of use the functions in this module contain (repeated) import statements +# We disable all pylint warnings related to imports +# pylint: disable=import-outside-toplevel,redefined-outer-name,reimported + def get_khiops_version(): """Shows the Khiops version""" @@ -29,9 +30,13 @@ def get_khiops_version(): def export_dictionary_as_json(): """Exports a dictionary file ('.kdic') in JSON format ('.kdicj')""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - json_dictionary_file_path = path.join( + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + json_dictionary_file_path = os.path.join( "kh_samples", "export_dictionary_as_json", "Adult.kdicj", @@ -43,10 +48,14 @@ def export_dictionary_as_json(): def build_dictionary_from_data_table(): """Automatically creates a dictionary file from a data table""" + # Imports + import os + from khiops import core as kh + # Set the file paths - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") dictionary_name = "AutoAdult" - dictionary_file_path = path.join( + dictionary_file_path = os.path.join( "kh_samples", "build_dictionary_from_data_table", "AutoAdult.kdic" ) @@ -62,14 +71,20 @@ def detect_data_table_format(): The user may provide a dictionary file or dictionary domain object specifying the table schema. The detection heuristic is more accurate with this information. 
""" + # Imports + import os + from khiops import core as kh + # Set the file paths - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - results_dir = path.join("kh_samples", "detect_data_table_format") - transformed_data_table_path = path.join(results_dir, "AdultWithAnotherFormat.txt") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + results_dir = os.path.join("kh_samples", "detect_data_table_format") + transformed_data_table_path = os.path.join( + results_dir, "AdultWithAnotherFormat.txt" + ) # Create the output directory - if not path.isdir(results_dir): + if not os.path.isdir(results_dir): os.mkdir(results_dir) # Detect the format of the table @@ -105,10 +120,14 @@ def check_database(): The results are stored in the specified log file with at most 50 error messages. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - log_file = path.join("kh_samples", "check_database", "check_database.log") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + log_file = os.path.join("kh_samples", "check_database", "check_database.log") # Check the database kh.check_database( @@ -122,12 +141,18 @@ def check_database(): def export_dictionary_files(): """Exports a customized dictionary to ".kdic" and to ".kdicj" (JSON)""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - results_dir = path.join("kh_samples", "export_dictionary_file") - output_dictionary_file_path = 
path.join(results_dir, "ModifiedAdult.kdic") - output_dictionary_json_path = path.join(results_dir, "ModifiedAdult.kdicj") - alt_output_dictionary_json_path = path.join(results_dir, "AltModifiedAdult.kdicj") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + results_dir = os.path.join("kh_samples", "export_dictionary_file") + output_dictionary_file_path = os.path.join(results_dir, "ModifiedAdult.kdic") + output_dictionary_json_path = os.path.join(results_dir, "ModifiedAdult.kdicj") + alt_output_dictionary_json_path = os.path.join( + results_dir, "AltModifiedAdult.kdicj" + ) # Load the dictionary domain from initial dictionary file # Then obtain the "Adult" dictionary within @@ -141,11 +166,11 @@ def export_dictionary_files(): label_variable.used = False # Create output directory if necessary - if not path.exists("kh_samples"): + if not os.path.exists("kh_samples"): os.mkdir("kh_samples") os.mkdir(results_dir) else: - if not path.exists(results_dir): + if not os.path.exists(results_dir): os.mkdir(results_dir) # Export to kdic @@ -161,10 +186,14 @@ def export_dictionary_files(): def train_predictor(): """Trains a predictor with a minimal setup""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor") # Train the predictor kh.train_predictor( @@ -179,10 +208,14 @@ def train_predictor(): def train_predictor_file_paths(): """Trains a predictor and stores the return value of the function""" + # Imports + import os + from khiops import core as kh + # Set the file paths - 
dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_file_paths") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_file_paths") # Train the predictor report_file_path, modeling_dictionary_file_path = kh.train_predictor( @@ -211,12 +244,16 @@ def train_predictor_error_handling(): keyword argument is available in all functions of the `khiops.core.api` submodule). """ + # Imports + import os + from khiops import core as kh + # Set the file paths with a nonexistent dictionary file dictionary_file_path = "NONEXISTENT_DICTIONARY_FILE.kdic" - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_error_handling") - log_file_path = path.join(results_dir, "khiops.log") - scenario_path = path.join(results_dir, "scenario._kh") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_error_handling") + log_file_path = os.path.join(results_dir, "khiops.log") + scenario_path = os.path.join(results_dir, "scenario._kh") # Train the predictor and handle the error try: @@ -253,12 +290,16 @@ def train_predictor_mt(): It is a call to `~.api.train_predictor` with additional parameters to handle multi-table learning """ + # Imports + import os + from khiops import core as kh + # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", 
"train_predictor_mt") + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "train_predictor_mt") # Train the predictor. Besides the mandatory parameters, we specify: # - A python dictionary linking data paths to file paths for non-root tables @@ -282,12 +323,16 @@ def train_predictor_mt_with_specific_rules(): variable construction rules. The list of available rules is found in the field ``kh.all_construction_rules`` """ + # Imports + import os + from khiops import core as kh + # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", "train_predictor_mt_with_specific_rules") + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "train_predictor_mt_with_specific_rules") # Train the predictor. 
Besides the mandatory parameters, it is specified: # - A python dictionary linking data paths to file paths for non-root tables @@ -309,15 +354,18 @@ def train_predictor_mt_with_specific_rules(): def train_predictor_mt_snowflake(): """Trains a multi-table predictor for a dataset with a snowflake schema""" + # Imports + import os + from khiops import core as kh # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "Accidents") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - users_table_path = path.join(accidents_dir, "Users.txt") - places_table_path = path.join(accidents_dir, "Places.txt") - results_dir = path.join("kh_samples", "train_predictor_mt_snowflake") + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "train_predictor_mt_snowflake") # Train the predictor. 
Besides the mandatory parameters, we specify: # - A python dictionary linking data paths to file paths for non-root tables @@ -343,10 +391,14 @@ def train_predictor_with_train_percentage(): Note: The default is a 70%-30% split """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_train_percentage") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_train_percentage") # Train the predictor. Besides the mandatory parameters, it is specified: # - A 90% sampling rate for the training dataset @@ -367,10 +419,14 @@ def train_predictor_with_train_percentage(): def train_predictor_with_trees(): """Trains a predictor based on 15 trees with a 80%-20% train-test split""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Letter", "Letter.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Letter", "Letter.txt") - results_dir = path.join("kh_samples", "train_predictor_with_trees") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_trees") # Train the predictor with at most 15 trees (default 10) kh.train_predictor( @@ -388,10 +444,14 @@ def train_predictor_with_trees(): def train_predictor_with_pairs(): """Trains a predictor with user specified pairs of variables""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), 
"Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_pairs") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_pairs") # Train the predictor with at most 10 pairs as follows: # - Include pairs age-race and capital_gain-capital_loss @@ -429,12 +489,16 @@ def train_predictor_with_multiple_parameters(): Additionally the Khiops runner is set such that the learning is executed with only 1000 MB of memory. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_multiple_parameters") - output_script_path = path.join(results_dir, "output_scenario._kh") - log_path = path.join(results_dir, "log.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_multiple_parameters") + output_script_path = os.path.join(results_dir, "output_scenario._kh") + log_path = os.path.join(results_dir, "log.txt") # Set memory limit to 1000 Mb and train with Khiops kh.get_runner().max_memory_mb = 1000 @@ -462,11 +526,15 @@ def train_predictor_with_multiple_parameters(): def train_predictor_detect_format(): """Trains a predictor without specifying the table format""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - results_dir = 
path.join("kh_samples", "train_predictor_detect_format") - transformed_data_table_path = path.join(results_dir, "TransformedIris.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") + results_dir = os.path.join("kh_samples", "train_predictor_detect_format") + transformed_data_table_path = os.path.join(results_dir, "TransformedIris.txt") # Transform the database format from header_line=True and field_separator=TAB # to header_line=False and field_separator="," @@ -510,14 +578,19 @@ def train_predictor_detect_format(): def train_predictor_with_cross_validation(): """Trains a predictor with a 5-fold cross-validation""" + # Imports + import math + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_predictor_with_cross_validation") - fold_dictionary_file_path = path.join(results_dir, "AdultWithFolding.kdic") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_predictor_with_cross_validation") + fold_dictionary_file_path = os.path.join(results_dir, "AdultWithFolding.kdic") # Create the output directory - if not path.isdir(results_dir): + if not os.path.isdir(results_dir): os.mkdir(results_dir) # Load the learning dictionary object @@ -598,11 +671,11 @@ def train_predictor_with_cross_validation(): # Print the mean +- error aucs for both train and test mean_train_auc = sum(train_aucs) / fold_number squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs] - sd_train_auc = sqrt(sum(squared_error_train_aucs) / (fold_number - 1)) + sd_train_auc = 
math.sqrt(sum(squared_error_train_aucs) / (fold_number - 1)) mean_test_auc = sum(test_aucs) / fold_number squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs] - sd_test_auc = sqrt(sum(squared_error_test_aucs) / (fold_number - 1)) + sd_test_auc = math.sqrt(sum(squared_error_test_aucs) / (fold_number - 1)) print("final auc") print("train auc: " + str(mean_train_auc) + " +- " + str(sd_train_auc)) @@ -616,6 +689,9 @@ def multiple_train_predictor(): reading dictionary files) and `.AnalysisResults` (for reading training/evaluation results from JSON) """ + # Imports + import os + from khiops import core as kh def display_test_results(json_result_file_path): """Display some of the training results""" @@ -632,9 +708,9 @@ def display_test_results(json_result_file_path): ) # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "multiple_train_predictor") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "multiple_train_predictor") # Read the dictionary file to obtain an instance of class Dictionary dictionary_domain = kh.read_dictionary_file(dictionary_file_path) @@ -691,11 +767,15 @@ def evaluate_predictor(): It calls `~.api.evaluate_predictor` with only its mandatory parameters. 
""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "evaluate_predictor") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "evaluate_predictor") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") # Train the predictor kh.train_predictor( @@ -720,11 +800,15 @@ def access_predictor_evaluation_report(): See `evaluate_predictor` or `train_predictor_with_train_percentage` to see examples on how to evaluate a model. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "access_predictor_evaluation_report") - evaluation_report_path = path.join(results_dir, "AllReports.khj") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "access_predictor_evaluation_report") + evaluation_report_path = os.path.join(results_dir, "AllReports.khj") # Train the SNB predictor and some univariate predictors # Note: Evaluation in test is 30% by default @@ -793,10 +877,14 @@ def train_recoder(): It is a call to `~.api.train_recoder` with only its mandatory parameters. 
""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_recoder") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_recoder") # Train the recoder model kh.train_recoder( @@ -809,10 +897,14 @@ def train_recoder_with_multiple_parameters(): It also creates 10 pair features. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "train_recoder_with_multiple_parameters") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "train_recoder_with_multiple_parameters") # Train the recoder model kh.train_recoder( @@ -832,12 +924,16 @@ def train_recoder_mt_flatten(): The constructed variables are all kept and no recoding is performed on their values """ + # Imports + import os + from khiops import core as kh + # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", "train_recoder_mt_flatten") + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = 
os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "train_recoder_mt_flatten") # Train the recoder. Besides the mandatory parameters, it is specified: # - A python dictionary linking data paths to file paths for non-root tables @@ -869,12 +965,16 @@ def deploy_model(): associated dictionary to the input database. The model predictions are written to the output database. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_model") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "ScoresAdult.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_model") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") # Train the predictor kh.train_predictor( @@ -903,14 +1003,18 @@ def deploy_model_mt(): associated dictionary to the input database. The model predictions are written to the output database. 
""" + # Imports + import os + from khiops import core as kh + # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - results_dir = path.join("kh_samples", "deploy_model_mt") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "TransferredAccidents.txt") + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + results_dir = os.path.join("kh_samples", "deploy_model_mt") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt") # Train the predictor (see train_predictor_mt for details) kh.train_predictor( @@ -937,16 +1041,20 @@ def deploy_model_mt(): def deploy_model_mt_snowflake(): """Deploys a classifier model on a dataset with a snowflake schema""" + # Imports + import os + from khiops import core as kh + # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "Accidents") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - users_table_path = path.join(accidents_dir, "Users.txt") - places_table_path = path.join(accidents_dir, "Places.txt") - results_dir = path.join("kh_samples", "deploy_model_mt_snowflake") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "TransferredAccidents.txt") + accidents_dir = 
os.path.join(kh.get_samples_dir(), "Accidents") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "deploy_model_mt_snowflake") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt") # Train the predictor. Besides the mandatory parameters, we specify: # - A python dictionary linking data paths to file paths for non-root tables @@ -994,12 +1102,16 @@ def deploy_model_expert(): The "expert" part of this example is the use of the khiops dictionary interface and the `.DictionaryDomain` class """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_model_expert") - model_dictionary_file_path = path.join(results_dir, "Modeling.kdic") - output_data_table_path = path.join(results_dir, "ScoresAdult.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_model_expert") + model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") # Train the predictor kh.train_predictor( @@ -1032,11 +1144,15 @@ def deploy_model_expert(): def deploy_classifier_for_metrics(): """Constructs a small precision-recall curve for a classifier""" + # Imports + import os + from khiops import core as kh + # Set the file 
paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_classifier_for_metrics") - output_data_table_path = path.join(results_dir, "ScoresAdult.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_classifier_for_metrics") + output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt") # Train the classifier for the target "class" _, modeling_dictionary_file_path = kh.train_predictor( @@ -1096,11 +1212,15 @@ def deploy_classifier_for_metrics(): def deploy_regressor_for_metrics(): """Estimates the R2 coefficient of a regressor""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "deploy_regressor_for_metrics") - output_data_table_path = path.join(results_dir, "TrueAndPredictedAges.txt") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "deploy_regressor_for_metrics") + output_data_table_path = os.path.join(results_dir, "TrueAndPredictedAges.txt") # Train the regressor for the target "age" (with 20% train to be quick) _, modeling_dictionary_file_path = kh.train_predictor( @@ -1160,11 +1280,15 @@ def sort_data_table(): sorts a data table by its default key variable (specified in the table's dictionary). 
""" + # Imports + import os + from khiops import core as kh + # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - accidents_table_path = path.join(accidents_dir, "Accidents.txt") - output_data_table_path = path.join( + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + output_data_table_path = os.path.join( "kh_samples", "sort_data_table", "SortedAccidents.txt", @@ -1182,11 +1306,15 @@ def sort_data_table_expert(): It is a call to `~.api.sort_data_table` with additional parameters to specify the sorting fields. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - accidents_dir = path.join(kh.get_samples_dir(), "AccidentsSummary") - dictionary_file_path = path.join(accidents_dir, "Accidents.kdic") - vehicles_table_path = path.join(accidents_dir, "Vehicles.txt") - output_data_table_path = path.join( + accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + output_data_table_path = os.path.join( "kh_samples", "sort_data_table_expert", "SortedVehicles.txt" ) @@ -1201,6 +1329,35 @@ def sort_data_table_expert(): ) +def sort_data_tables_mt(): + """Sorts with the dedicated helper a multi-table dataset by the default keys""" + # Imports + import os + from khiops.utils.helpers import sort_dataset + + # Set the file paths + accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents") + accidents_table_path = os.path.join(accidents_dir, "Accidents.txt") + vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt") + users_table_path = os.path.join(accidents_dir, "Users.txt") + places_table_path = 
os.path.join(accidents_dir, "Places.txt") + results_dir = os.path.join("kh_samples", "sort_data_tables_mt") + + # Build the dataset spec + ds_spec = { + "main_table": "Accidents", + "tables": { + "Accidents": (accidents_table_path, "AccidentId"), + "Vehicles": (vehicles_table_path, ["AccidentId", "VehicleId"]), + "Users": (users_table_path, ["AccidentId", "VehicleId"]), + "Places": (places_table_path, "AccidentId"), + }, + } + + # Sort the dataset + sort_dataset(ds_spec, output_dir=results_dir) + + def extract_keys_from_data_table(): """Extracts the keys from a database @@ -1209,11 +1366,15 @@ def extract_keys_from_data_table(): Pre-requisite: the database must be sorted by its key. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - output_data_table_path = path.join( + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + output_data_table_path = os.path.join( "kh_samples", "extract_keys_from_data_table", "KeysSpliceJunction.txt", @@ -1233,11 +1394,15 @@ def train_coclustering(): It is a call to `~.api.train_coclustering` with only its mandatory parameters. 
""" + # Imports + import os + from khiops import core as kh + # Set the file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "train_coclustering") + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "train_coclustering") # Train a coclustering model for variables "SampleId" and "Char" coclustering_file_path = kh.train_coclustering( @@ -1252,12 +1417,16 @@ def train_coclustering(): def simplify_coclustering(): """Simplifies a coclustering model while preserving 80% of its information""" + # Imports + import os + from khiops import core as kh + # Set the file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "simplify_coclustering") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "simplify_coclustering") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") simplified_coclustering_file_name = "simplified_coclustering.khc" # Train coclustering model for variables "SampleId" and "Char" @@ -1284,12 +1453,12 @@ def simplify_coclustering(): def extract_clusters(): """Extract the clusters' id, members, frequencies and typicalities into a file""" # Set the file paths - splice_dir = 
path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "extract_clusters") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") - clusters_file_path = path.join(results_dir, "extracted_clusters.txt") + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "extract_clusters") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") + clusters_file_path = os.path.join(results_dir, "extracted_clusters.txt") # Train a coclustering model for variables "SampleId" and "Char" kh.train_coclustering( @@ -1306,12 +1475,16 @@ def extract_clusters(): def deploy_coclustering(): """Deploys an "individual-variable" coclustering""" + # Imports + import os + from khiops import core as kh + # Set the initial file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - results_dir = path.join("kh_samples", "deploy_coclustering") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + results_dir = os.path.join("kh_samples", "deploy_coclustering") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") # Train a coclustering model for variables "SampleId" and "Char" kh.train_coclustering( @@ -1344,13 +1517,17 @@ def deploy_coclustering_expert(): This is one of the most complex workflows of the Khiops 
suite. """ + # Imports + import os + from khiops import core as kh + # Set the initial file paths - splice_dir = path.join(kh.get_samples_dir(), "SpliceJunction") - dictionary_file_path = path.join(splice_dir, "SpliceJunction.kdic") - data_table_path = path.join(splice_dir, "SpliceJunction.txt") - secondary_data_table_path = path.join(splice_dir, "SpliceJunctionDNA.txt") - results_dir = path.join("kh_samples", "deploy_coclustering_expert") - coclustering_file_path = path.join(results_dir, "Coclustering.khc") + splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic") + data_table_path = os.path.join(splice_dir, "SpliceJunction.txt") + secondary_data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt") + results_dir = os.path.join("kh_samples", "deploy_coclustering_expert") + coclustering_file_path = os.path.join(results_dir, "Coclustering.khc") # Train a coclustering model for variables "SampleId" and "Char" print("train coclustering on SpliceJunctionDNA") @@ -1372,7 +1549,7 @@ def deploy_coclustering_expert(): "SampleId", results_dir, ) - augmented_dictionary_file_path = path.join(results_dir, "Coclustering.kdic") + augmented_dictionary_file_path = os.path.join(results_dir, "Coclustering.kdic") print("prepare_coclustering_deployment with at most two clusters") # Extend the already extended dictionary with the new variables from a simplified CC @@ -1388,10 +1565,10 @@ def deploy_coclustering_expert(): max_part_numbers={"SampleId": 2}, ) - reaugmented_dictionary_file_path = path.join( + reaugmented_dictionary_file_path = os.path.join( results_dir, "ReaugmentedCoclustering.kdic" ) - output_data_table_path = path.join(results_dir, "TransferredSpliceJunction.txt") + output_data_table_path = os.path.join(results_dir, "TransferredSpliceJunction.txt") # Deploy the coclustering with the extended dictionary print("deploy_model with the new coclustering based variables") @@ -1403,7 
+1580,7 @@ def deploy_coclustering_expert(): additional_data_tables={"SpliceJunction`DNA": secondary_data_table_path}, ) - deployed_dictionary_file_path = path.join( + deployed_dictionary_file_path = os.path.join( results_dir, "Transferred_Coclustering.kdic" ) print("build_deployed_dictionary to get the new dictionary") @@ -1420,10 +1597,14 @@ def scenario_prologue(): .. note:: This is an **advanced** feature. """ + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") - results_dir = path.join("kh_samples", "scenario_prologue") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + results_dir = os.path.join("kh_samples", "scenario_prologue") # Set the maximum memory "by hand" with an scenario prologue kh.get_runner().scenario_prologue = """ @@ -1444,11 +1625,15 @@ def scenario_prologue(): def build_deployed_dictionary(): """Builds a dictionary file to read the output table of a deployed model""" + # Imports + import os + from khiops import core as kh + # Set the file paths - dictionary_file_path = path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") - data_table_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - results_dir = path.join("kh_samples", "build_deployed_dictionary") - deployed_dictionary_file_path = path.join(results_dir, "SNB_Iris_deployed.kdic") + dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic") + data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") + results_dir = os.path.join("kh_samples", "build_deployed_dictionary") + deployed_dictionary_file_path = os.path.join(results_dir, "SNB_Iris_deployed.kdic") # Train the predictor _, modeling_dictionary_file_path = kh.train_predictor( @@ -1506,6 +1691,7 @@ def 
build_deployed_dictionary(): deploy_regressor_for_metrics, sort_data_table, sort_data_table_expert, + sort_data_tables_mt, extract_keys_from_data_table, train_coclustering, simplify_coclustering, @@ -1520,7 +1706,7 @@ def build_deployed_dictionary(): def execute_samples(args): """Executes all non-interactive samples""" # Create the results directory if it does not exist - if not path.isdir("./kh_samples"): + if not os.path.isdir("./kh_samples"): os.mkdir("./kh_samples") # Set the user-defined samples dir if any diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index b87c1f50..e07f3be1 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -9,90 +9,77 @@ "[Khiops](https://khiops.org) before using this this notebook" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier()`\n\n", + "Trains a `.KhiopsClassifier` on a monotable dataframe\n" + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ + "# Imports\n", "import os\n", - "import pickle\n", - "from os import path\n", - "\n", "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", "from sklearn import metrics\n", - "from sklearn.compose import ColumnTransformer\n", - "from sklearn.experimental import enable_hist_gradient_boosting\n", - "from sklearn.ensemble import HistGradientBoostingClassifier\n", - "from sklearn.datasets import fetch_20newsgroups\n", - "from sklearn.feature_extraction.text import HashingVectorizer\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import OneHotEncoder\n", "\n", - "from khiops import core as kh\n", - "from khiops.sklearn import (\n", - " KhiopsClassifier,\n", - " KhiopsCoclustering,\n", - " KhiopsEncoder,\n", - " KhiopsRegressor,\n", - 
")\n" + "# Load the dataset into a pandas dataframe\n", + "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "\n", + "# Split the whole dataframe into train and test (70%-30%)\n", + "adult_train_df, adult_test_df = train_test_split(\n", + " adult_df, test_size=0.3, random_state=1\n", + ")\n", + "\n", + "# Split the dataset into:\n", + "# - the X feature table\n", + "# - the y target vector (\"class\" column)\n", + "X_train = adult_train_df.drop(\"class\", axis=1)\n", + "X_test = adult_test_df.drop(\"class\", axis=1)\n", + "y_train = adult_train_df[\"class\"]\n", + "y_test = adult_test_df[\"class\"]\n", + "\n", + "# Create the classifier object\n", + "khc = KhiopsClassifier()\n", + "\n", + "# Train the classifier\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Predict the classes on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[0:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probabilities on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[0:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def khiops_classifier():\n", - " \"\"\"Trains a `.KhiopsClassifier` on a monotable dataframe\"\"\"\n", - " # Load the dataset into a pandas dataframe\n", - " adult_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " adult_df 
= pd.read_csv(adult_path, sep=\"\\t\")\n", - "\n", - " # Split the whole dataframe into train and test (70%-30%)\n", - " adult_train_df, adult_test_df = train_test_split(\n", - " adult_df, test_size=0.3, random_state=1\n", - " )\n", - "\n", - " # Split the dataset into:\n", - " # - the X feature table\n", - " # - the y target vector (\"class\" column)\n", - " X_train = adult_train_df.drop(\"class\", axis=1)\n", - " X_test = adult_test_df.drop(\"class\", axis=1)\n", - " y_train = adult_train_df[\"class\"]\n", - " y_test = adult_test_df[\"class\"]\n", - "\n", - " # Create the classifier object\n", - " khc = KhiopsClassifier()\n", - "\n", - " # Train the classifier\n", - " khc.fit(X_train, y_train)\n", - "\n", - " # Predict the classes on the test dataset\n", - " y_test_pred = khc.predict(X_test)\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_test_pred[0:10])\n", - " print(\"---\")\n", - "\n", - " # Predict the class probabilities on the test dataset\n", - " y_test_probas = khc.predict_proba(X_test)\n", - " print(f\"Class order: {khc.classes_}\")\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_test_probas[0:10])\n", - " print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the test dataset\n", - " test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", - " test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", - " print(f\"Test accuracy = {test_accuracy}\")\n", - " print(f\"Test auc = {test_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_classifier()" + "### `khiops_classifier_sparse()`\n\n", + "Trains a `.KhiopsClassifier` on a monotable sparse matrix\n\n .. note::\n No intermediary dense data is used by Khiops because it supports sparse data\n natively.\n\n \n" ] }, { @@ -101,64 +88,67 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_classifier_sparse():\n", - " \"\"\"Trains a `.KhiopsClassifier` on a monotable sparse matrix\n", - "\n", - " .. 
note::\n", - " No intermediary dense data is used by Khiops because it supports sparse data\n", - " natively.\n", - "\n", - " \"\"\"\n", - " # Load 3 classes of the 20newsgroups dataset\n", - " categories = [\"comp.graphics\", \"sci.space\", \"misc.forsale\", \"alt.atheism\"]\n", - " data_train, y_train = fetch_20newsgroups(\n", - " subset=\"train\",\n", - " categories=categories,\n", - " return_X_y=True,\n", - " )\n", - " data_test, y_test = fetch_20newsgroups(\n", - " subset=\"test\",\n", - " categories=categories,\n", - " return_X_y=True,\n", - " )\n", - "\n", - " # Extract features from the train and test data using a sparse vectorizer\n", - " vectorizer = HashingVectorizer(n_features=2048, stop_words=\"english\")\n", - " X_train = vectorizer.fit_transform(data_train)\n", - " X_test = vectorizer.transform(data_test)\n", - "\n", - " # Print density of the intermediary datasets\n", - " print(f\"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}\")\n", - " print(f\"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}\")\n", - " print(\"---\")\n", - "\n", - " # Create the classifier object (no trees)\n", - " khc = KhiopsClassifier(n_trees=0)\n", - "\n", - " # Train the classifier\n", - " khc.fit(X_train, y_train)\n", - "\n", - " # Predict the classes on the test dataset\n", - " y_test_pred = khc.predict(X_test)\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_test_pred[0:10])\n", - " print(\"---\")\n", - "\n", - " # Predict the class probabilities on the test dataset\n", - " y_test_probas = khc.predict_proba(X_test)\n", - " print(f\"Class order: {khc.classes_}\")\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_test_probas[0:10])\n", - " print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the test dataset\n", - " test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", - " test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class=\"ovr\")\n", - " 
print(f\"Test accuracy = {test_accuracy}\")\n", - " print(f\"Test auc = {test_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_classifier_sparse()" + "# Imports\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn import metrics\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import HashingVectorizer\n", + "\n", + "# Load 4 classes of the 20newsgroups dataset\n", + "categories = [\"comp.graphics\", \"sci.space\", \"misc.forsale\", \"alt.atheism\"]\n", + "data_train, y_train = fetch_20newsgroups(\n", + " subset=\"train\",\n", + " categories=categories,\n", + " return_X_y=True,\n", + ")\n", + "data_test, y_test = fetch_20newsgroups(\n", + " subset=\"test\",\n", + " categories=categories,\n", + " return_X_y=True,\n", + ")\n", + "\n", + "# Extract features from the train and test data using a sparse vectorizer\n", + "vectorizer = HashingVectorizer(n_features=2048, stop_words=\"english\")\n", + "X_train = vectorizer.fit_transform(data_train)\n", + "X_test = vectorizer.transform(data_test)\n", + "\n", + "# Print density of the intermediary datasets\n", + "print(f\"X_train density: {X_train.size/(X_train.shape[0]*X_train.shape[1])}\")\n", + "print(f\"X_test density : {X_test.size/(X_test.shape[0]*X_test.shape[1])}\")\n", + "print(\"---\")\n", + "\n", + "# Create the classifier object (no trees)\n", + "khc = KhiopsClassifier(n_trees=0)\n", + "\n", + "# Train the classifier\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Predict the classes on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[0:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probabilities on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[0:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate 
accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class=\"ovr\")\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multiclass()`\n\n", + "Trains a multiclass `.KhiopsClassifier` on a monotable dataframe\n" ] }, { @@ -167,52 +157,61 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_classifier_multiclass():\n", - " \"\"\"Trains a multiclass `.KhiopsClassifier` on a monotable dataframe\"\"\"\n", - " # Load the dataset into a pandas dataframe\n", - " iris_path = path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - " iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", - "\n", - " # Split the whole dataframe into train and test (70%-30%)\n", - " iris_train_df, iris_test_df = train_test_split(\n", - " iris_df, test_size=0.3, random_state=1\n", - " )\n", - "\n", - " # Split the dataset into:\n", - " # - the X feature table\n", - " # - the y target vector (\"Class\" column)\n", - " X_train = iris_train_df.drop(\"Class\", axis=1)\n", - " X_test = iris_test_df.drop(\"Class\", axis=1)\n", - " y_train = iris_train_df[\"Class\"]\n", - " y_test = iris_test_df[\"Class\"]\n", - "\n", - " # Create the classifier object\n", - " khc = KhiopsClassifier()\n", - "\n", - " # Train the classifier\n", - " khc.fit(X_train, y_train)\n", - "\n", - " # Predict the classes on the test dataset\n", - " y_test_pred = khc.predict(X_test)\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_test_pred[:10])\n", - " print(\"---\")\n", - "\n", - " # Predict the class probabilities on the test datasets\n", - " y_test_probas = khc.predict_proba(X_test)\n", - " print(f\"Class order: {khc.classes_}\")\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_test_probas[:10])\n", - " 
print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the test dataset\n", - " test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", - " test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class=\"ovr\")\n", - " print(f\"Test accuracy = {test_accuracy}\")\n", - " print(f\"Test auc = {test_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_classifier_multiclass()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the dataset into a pandas dataframe\n", + "iris_path = os.path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", + "iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", + "\n", + "# Split the whole dataframe into train and test (70%-30%)\n", + "iris_train_df, iris_test_df = train_test_split(iris_df, test_size=0.3, random_state=1)\n", + "\n", + "# Split the dataset into:\n", + "# - the X feature table\n", + "# - the y target vector (\"Class\" column)\n", + "X_train = iris_train_df.drop(\"Class\", axis=1)\n", + "X_test = iris_test_df.drop(\"Class\", axis=1)\n", + "y_train = iris_train_df[\"Class\"]\n", + "y_test = iris_test_df[\"Class\"]\n", + "\n", + "# Create the classifier object\n", + "khc = KhiopsClassifier()\n", + "\n", + "# Train the classifier\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Predict the classes on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probabilities on the test datasets\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test 
dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class=\"ovr\")\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_star()`\n\n", + "Trains a `.KhiopsClassifier` on a star multi-table dataset\n" ] }, { @@ -221,80 +220,68 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_classifier_multitable_star():\n", - " \"\"\"Trains a `.KhiopsClassifier` on a star multi-table dataset\"\"\"\n", - " # Load the root table of the dataset into a pandas dataframe\n", - " accidents_dataset_path = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " accidents_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - " )\n", - "\n", - " # Split the root dataframe into train and test\n", - " accidents_train_df, accidents_test_df = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - " )\n", - "\n", - " # Obtain the main X feature table and the y target vector (\"Class\" column)\n", - " y_train = accidents_train_df[\"Gravity\"]\n", - " y_test = accidents_test_df[\"Gravity\"]\n", - " X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n", - " X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n", - "\n", - " # Load the secondary table of the dataset into a pandas dataframe\n", - " vehicles_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - " )\n", - "\n", - " # Split the secondary dataframe with the keys of the splitted root dataframe\n", - " X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - " X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", - " X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", - " X_test_secondary = 
X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "\n", - " # Create the dataset multitable specification for the train/test split\n", - " # We specify each table with a name and a tuple (dataframe, key_columns)\n", - " X_train = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_train_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_train_secondary, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " }\n", - " X_test = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_test_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_test_secondary, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " }\n", - "\n", - " # Train the classifier (by default it analyzes 100 multi-table features)\n", - " khc = KhiopsClassifier()\n", - " khc.fit(X_train, y_train)\n", - "\n", - " # Predict the class on the test dataset\n", - " y_test_pred = khc.predict(X_test)\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_test_pred[:10])\n", - " print(\"---\")\n", - "\n", - " # Predict the class probability on the test dataset\n", - " y_test_probas = khc.predict_proba(X_test)\n", - " print(f\"Class order: {khc.classes_}\")\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_test_probas[:10])\n", - " print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the test dataset\n", - " test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", - " test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", - " print(f\"Test accuracy = {test_accuracy}\")\n", - " print(f\"Test auc = {test_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_classifier_multitable_star()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", + "from sklearn import metrics\n", + "\n", + "# Load the dataset into pandas 
dataframes\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", + "\n", + "# Create the dataset spec and the target\n", + "X = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + "}\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split_dataset(\n", + " X, y, test_size=0.3, random_state=1\n", + ")\n", + "\n", + "# Train the classifier (by default it analyzes 100 multi-table features)\n", + "khc = KhiopsClassifier()\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Predict the class on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probability on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_star_file()`\n\n", + "Trains a `.KhiopsClassifier` with a file dataset\n" ] }, { @@ -303,80 +290,83 
@@ "metadata": {}, "outputs": [], "source": [ - "def khiops_classifier_multitable_snowflake():\n", - " \"\"\"Trains a `.KhiopsClassifier` on a snowflake multi-table dataset\n", - "\n", - " .. note::\n", - " For simplicity we train from the whole dataset. To assess the performance one\n", - " usually splits the dataset into train and test subsets.\n", - "\n", - " \"\"\"\n", - " # Load the dataset tables into dataframes\n", - " accidents_dataset_path = path.join(kh.get_samples_dir(), \"Accidents\")\n", - " accidents_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - " )\n", - " users_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - " )\n", - " vehicles_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - " )\n", - " places_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - " )\n", - " # Build the multitable input X\n", - " # Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", - " # build the target column\n", - " X = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (accidents_df, \"AccidentId\"),\n", - " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", - " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", - " \"Places\": (places_df, [\"AccidentId\"]),\n", - " },\n", - " \"relations\": [\n", - " (\"Accidents\", \"Vehicles\"),\n", - " (\"Vehicles\", \"Users\"),\n", - " (\"Accidents\", \"Places\", True),\n", - " ],\n", - " }\n", - "\n", - " # Load the target variable from the AccidentsSummary dataset\n", - " y = pd.read_csv(\n", - " path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - " 
)[\"Gravity\"]\n", - "\n", - " # Train the classifier (by default it creates 1000 multi-table features)\n", - " khc = KhiopsClassifier(n_trees=0)\n", - " khc.fit(X, y)\n", - "\n", - " # Predict the class on the test dataset\n", - " y_pred = khc.predict(X)\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_pred[:10])\n", - " print(\"---\")\n", - "\n", - " # Predict the class probability on the train dataset\n", - " y_probas = khc.predict_proba(X)\n", - " print(f\"Class order: {khc.classes_}\")\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_probas[:10])\n", - " print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the train dataset\n", - " train_accuracy = metrics.accuracy_score(y_pred, y)\n", - " train_auc = metrics.roc_auc_score(y, y_probas[:, 1])\n", - " print(f\"Train accuracy = {train_accuracy}\")\n", - " print(f\"Train auc = {train_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_classifier_multitable_snowflake()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", + "from sklearn import metrics\n", + "\n", + "# Create output directory\n", + "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_multitable_star_file\")\n", + "if not os.path.exists(\"kh_samples\"):\n", + " os.mkdir(\"kh_samples\")\n", + " os.mkdir(results_dir)\n", + "else:\n", + " if not os.path.exists(results_dir):\n", + " os.mkdir(results_dir)\n", + "\n", + "# Create the dataset spec\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "X = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " \"AccidentId\",\n", + " ),\n", + " \"Vehicles\": (\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", + " [\"AccidentId\", 
\"VehicleId\"],\n", + " ),\n", + " },\n", + " \"format\": (\"\\t\", True),\n", + "}\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test = train_test_split_dataset(\n", + " X, output_dir=os.path.join(results_dir, \"split\"), test_size=0.3\n", + ")\n", + "\n", + "# Create the classifier and fit it\n", + "khc = KhiopsClassifier(output_dir=results_dir)\n", + "khc.fit(X_train, y=\"Gravity\")\n", + "\n", + "# Predict the class in addition to the class probabilities on the test dataset\n", + "y_test_pred_path = khc.predict(X_test)\n", + "y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[\"PredictedGravity\"].head(10))\n", + "print(\"---\")\n", + "\n", + "y_test_probas_path = khc.predict_proba(X_test)\n", + "y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", + "proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[proba_columns].head(10))\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "# Note: For roc_auc_score we have to use the \"greatest\" label which is \"NonLethal\"\n", + "y_test = pd.read_csv(\n", + " X_test[\"tables\"][\"Accidents\"][0],\n", + " usecols=[\"Gravity\"],\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityNonLethal\"])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_snowflake()`\n\n", + "Trains a `.KhiopsClassifier` on a snowflake multi-table dataset\n" ] }, { @@ -385,43 +375,92 @@ "metadata": {}, "outputs": [], "source": [ - "def 
khiops_classifier_pickle():\n", - " \"\"\"Shows the serialization and deserialization of a `.KhiopsClassifier`\"\"\"\n", - " # Load the dataset into a pandas dataframe\n", - " iris_path = path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - " iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", - "\n", - " # Train the model with the whole dataset\n", - " X = iris_df.drop([\"Class\"], axis=1)\n", - " y = iris_df[\"Class\"]\n", - " khc = KhiopsClassifier()\n", - " khc.fit(X, y)\n", - "\n", - " # Create/clean the output directory\n", - " results_dir = path.join(\"kh_samples\", \"khiops_classifier_pickle\")\n", - " khc_pickle_path = path.join(results_dir, \"khiops_classifier.pkl\")\n", - " if path.exists(khc_pickle_path):\n", - " os.remove(khc_pickle_path)\n", - " else:\n", - " os.makedirs(results_dir, exist_ok=True)\n", - "\n", - " # Pickle its content to a file\n", - " with open(khc_pickle_path, \"wb\") as khc_pickle_write_file:\n", - " pickle.dump(khc, khc_pickle_write_file)\n", - "\n", - " # Unpickle it\n", - " with open(khc_pickle_path, \"rb\") as khc_pickle_file:\n", - " new_khc = pickle.load(khc_pickle_file)\n", - "\n", - " # Make some predictions on the training dataset with the unpickled classifier\n", - " new_khc.predict(X)\n", - " y_predicted = new_khc.predict(X)\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_predicted[:10])\n", - " print(\"---\")\n", - "\n", - "#Run sample\n", - "khiops_classifier_pickle()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from khiops.utils.helpers import train_test_split_dataset\n", + "from sklearn import metrics\n", + "\n", + "# Load the dataset tables into dataframes\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + 
")\n", + "users_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + ")\n", + "vehicles_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "places_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + ")\n", + "\n", + "# Create the dataset spec\n", + "# Note: We discard the \"Gravity\" column from the \"Users\" table to avoid a target\n", + "# leak. This is because the column was used to build the target.\n", + "X = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (accidents_df, \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", + " \"Places\": (places_df, [\"AccidentId\"]),\n", + " },\n", + " \"relations\": [\n", + " (\"Accidents\", \"Vehicles\"),\n", + " (\"Vehicles\", \"Users\"),\n", + " (\"Accidents\", \"Places\", True),\n", + " ],\n", + "}\n", + "\n", + "# Load the target variable \"Gravity\" from the \"AccidentsSummary\" dataset\n", + "y = pd.read_csv(\n", + " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", + " usecols=[\"Gravity\"],\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ").squeeze(\"columns\")\n", + "\n", + "# Split into train and test datasets\n", + "X_train, X_test, y_train, y_test = train_test_split_dataset(X, y)\n", + "\n", + "# Train the classifier (by default it creates 1000 multi-table features)\n", + "khc = KhiopsClassifier(n_trees=0)\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Predict the class on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probability on the test 
dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test_pred, y_test)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_pickle()`\n\n", + "Shows the serialization and deserialization of a `.KhiopsClassifier`\n" ] }, { @@ -430,45 +469,47 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_regressor():\n", - " \"\"\"Trains a `.KhiopsRegressor` on a monotable dataframe\"\"\"\n", - " # Load the dataset into a pandas dataframe\n", - " adult_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", - "\n", - " # Split the whole dataframe into train and test (40%-60% for speed)\n", - " adult_train_df, adult_test_df = train_test_split(\n", - " adult_df, test_size=0.6, random_state=1\n", - " )\n", - "\n", - " # Split the dataset into:\n", - " # - the X feature table\n", - " # - the y target vector (\"age\" column)\n", - " X_train = adult_train_df.drop(\"age\", axis=1)\n", - " X_test = adult_test_df.drop(\"age\", axis=1)\n", - " y_train = adult_train_df[\"age\"]\n", - " y_test = adult_test_df[\"age\"]\n", - "\n", - " # Create the regressor object\n", - " khr = KhiopsRegressor()\n", - "\n", - " # Train the regressor\n", - " khr.fit(X_train, y_train)\n", - "\n", - " # Predict the values on the test dataset\n", - " y_test_pred = khr.predict(X_test)\n", - " print(\"Predicted values for 'age' (first 10):\")\n", - " print(y_test_pred[:10])\n", - " print(\"---\")\n", - "\n", - " # Evaluate R2 and MAE metrics on the test 
dataset\n", - " test_r2 = metrics.r2_score(y_test, y_test_pred)\n", - " test_mae = metrics.mean_absolute_error(y_test, y_test_pred)\n", - " print(f\"Test R2 = {test_r2}\")\n", - " print(f\"Test MAE = {test_mae}\")\n", - "\n", - "#Run sample\n", - "khiops_regressor()" + "# Imports\n", + "import os\n", + "import pickle\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn.datasets import load_iris\n", + "\n", + "# Create/clean the output directory\n", + "results_dir = os.path.join(\"kh_samples\", \"khiops_classifier_pickle\")\n", + "khc_pickle_path = os.path.join(results_dir, \"khiops_classifier.pkl\")\n", + "if os.path.exists(khc_pickle_path):\n", + " os.remove(khc_pickle_path)\n", + "else:\n", + " os.makedirs(results_dir, exist_ok=True)\n", + "\n", + "# Train the model with the Iris dataset\n", + "X, y = load_iris(return_X_y=True)\n", + "khc = KhiopsClassifier()\n", + "khc.fit(X, y)\n", + "\n", + "# Pickle its content to a file\n", + "with open(khc_pickle_path, \"wb\") as khc_pickle_output_file:\n", + " pickle.dump(khc, khc_pickle_output_file)\n", + "\n", + "# Unpickle it\n", + "with open(khc_pickle_path, \"rb\") as khc_pickle_file:\n", + " new_khc = pickle.load(khc_pickle_file)\n", + "\n", + "# Make some predictions on the training dataset with the unpickled classifier\n", + "new_khc.predict(X)\n", + "y_predicted = new_khc.predict(X)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_predicted[:10])\n", + "print(\"---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_regressor()`\n\n", + "Trains a `.KhiopsRegressor` on a monotable dataframe\n" ] }, { @@ -477,44 +518,48 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_encoder():\n", - " \"\"\"Trains a `.KhiopsEncoder` on a monotable dataframe\n", - "\n", - " The Khiops encoder is a supervised feature encoder. 
It discretizes numerical\n", - " features and groups categorical features in a way that the resulting interval/groups\n", - " have the highest class-purity.\n", - "\n", - " .. note::\n", - " For simplicity we train from the whole dataset. To assess the performance one\n", - " usually splits the dataset into train and test subsets.\n", - " \"\"\"\n", - " # Load the dataset into a pandas dataframe\n", - " iris_path = path.join(kh.get_samples_dir(), \"Iris\", \"Iris.txt\")\n", - " iris_df = pd.read_csv(iris_path, sep=\"\\t\")\n", - "\n", - " # Train the model with the whole dataset\n", - " X = iris_df.drop(\"Class\", axis=1)\n", - " y = iris_df[\"Class\"]\n", - "\n", - " # Create the encoder object\n", - " khe = KhiopsEncoder()\n", - " khe.fit(X, y)\n", - "\n", - " # Transform the training dataset\n", - " X_transformed = khe.transform(X)\n", - "\n", - " # Print both the original and transformed features\n", - " print(\"Original:\")\n", - " print(X.head(10))\n", - " print(\"---\")\n", - " print(\"Encoded feature names:\")\n", - " print(khe.feature_names_out_)\n", - " print(\"Encoded data:\")\n", - " print(X_transformed[:10])\n", - " print(\"---\")\n", - "\n", - "#Run sample\n", - "khiops_encoder()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsRegressor\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the \"Adult\" dataset and set the target to the \"age\" column\n", + "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "X = adult_df.drop(\"age\", axis=1)\n", + "y = adult_df[\"age\"]\n", + "\n", + "# Split the whole dataframe into train and test (40%-60% for speed)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=1)\n", + "\n", + "# Create the regressor object\n", + "khr = 
KhiopsRegressor()\n", + "\n", + "# Train the regressor\n", + "khr.fit(X_train, y_train)\n", + "\n", + "# Predict the values on the test dataset\n", + "y_test_pred = khr.predict(X_test)\n", + "print(\"Predicted values for 'age' (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate R2 and MAE metrics on the test dataset\n", + "test_r2 = metrics.r2_score(y_test, y_test_pred)\n", + "test_mae = metrics.mean_absolute_error(y_test, y_test_pred)\n", + "print(f\"Test R2 = {test_r2}\")\n", + "print(f\"Test MAE = {test_mae}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_encoder()`\n\n", + "Trains a `.KhiopsEncoder` on a monotable dataframe\n\n The Khiops encoder is a supervised feature encoder. It discretizes numerical\n features and groups categorical features in a way that the resulting interval/groups\n have the highest class-purity.\n\n .. note::\n For simplicity we train from the whole dataset. To assess the performance one\n usually splits the dataset into train and test subsets.\n \n" ] }, { @@ -523,47 +568,37 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_encoder_multitable_star():\n", - " \"\"\"Trains a `.KhiopsEncoder` on a star multi-table dataset\"\"\"\n", - " # Load the root table of the dataset into a pandas dataframe\n", - " accidents_dataset_path = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " accidents_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - " )\n", - "\n", - " # Obtain the root X feature table and the y target vector (\"Class\" column)\n", - " X_main = accidents_df.drop(\"Gravity\", axis=1)\n", - " y = accidents_df[\"Gravity\"]\n", - "\n", - " # Load the secondary table of the dataset into a pandas dataframe\n", - " X_secondary = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - " )\n", - "\n", - " # Create the 
dataset multitable specification for the train/test split\n", - " # We specify each table with a name and a tuple (dataframe, key_columns)\n", - " X_dataset = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_secondary, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " }\n", - "\n", - " # Create the KhiopsEncoder with 10 additional multitable features and fit it\n", - " khe = KhiopsEncoder(n_features=10)\n", - " khe.fit(X_dataset, y)\n", - "\n", - " # Transform the train dataset\n", - " print(\"Encoded feature names:\")\n", - " print(khe.feature_names_out_)\n", - " print(\"Encoded data:\")\n", - " print(khe.transform(X_dataset)[:10])\n", - "\n", - "#Run sample\n", - "khiops_encoder_multitable_star()" + "# Imports\n", + "from khiops.sklearn import KhiopsEncoder\n", + "from sklearn.datasets import load_iris\n", + "\n", + "# Load the dataset\n", + "X, y = load_iris(return_X_y=True)\n", + "\n", + "# Create the encoder object\n", + "khe = KhiopsEncoder(transform_type_numerical=\"part_label\")\n", + "khe.fit(X, y)\n", + "\n", + "# Transform the training dataset\n", + "X_transformed = khe.transform(X)\n", + "\n", + "# Print both the original and transformed features\n", + "print(\"Original:\")\n", + "print(X[:10])\n", + "print(\"---\")\n", + "print(\"Encoded feature names:\")\n", + "print(khe.feature_names_out_)\n", + "print(\"Encoded data:\")\n", + "print(X_transformed[:10])\n", + "print(\"---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_encoder_multitable_star()`\n\n", + "Trains a `.KhiopsEncoder` on a star multi-table dataset\n" ] }, { @@ -572,60 +607,48 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_encoder_multitable_snowflake():\n", - " \"\"\"Trains a `.KhiopsEncoder` on a snowflake multi-table dataset\n", - "\n", - " .. note::\n", - " For simplicity we train from the whole dataset. 
To assess the performance\n", - " one usually splits the dataset into train and test subsets.\n", - " \"\"\"\n", - " # Load the tables into dataframes\n", - " accidents_dataset_path = path.join(kh.get_samples_dir(), \"Accidents\")\n", - " accidents_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Accidents.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - " )\n", - " users_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - " )\n", - " vehicles_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - " )\n", - "\n", - " # Build the multitable input X\n", - " # Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", - " # build the target column\n", - " X = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (accidents_df, \"AccidentId\"),\n", - " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", - " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " \"relations\": [\n", - " (\"Accidents\", \"Vehicles\"),\n", - " (\"Vehicles\", \"Users\"),\n", - " ],\n", - " }\n", - "\n", - " # Load the target variable from the AccidentsSummary dataset\n", - " y = pd.read_csv(\n", - " path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - " )[\"Gravity\"]\n", - "\n", - " # Create the KhiopsEncoder with 10 additional multitable features and fit it\n", - " khe = KhiopsEncoder(n_features=10)\n", - " khe.fit(X, y)\n", - "\n", - " # Transform the train dataset\n", - " print(\"Encoded feature names:\")\n", - " print(khe.feature_names_out_)\n", - " print(\"Encoded data:\")\n", - " print(khe.transform(X)[:10])\n", - "\n", - "#Run sample\n", - "khiops_encoder_multitable_snowflake()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops 
import core as kh\n", + "from khiops.sklearn import KhiopsEncoder\n", + "\n", + "# Load the dataset tables into dataframe\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", + "\n", + "# Build the multi-table spec and the target\n", + "X = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + "}\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", + "# Create the KhiopsEncoder with 5 multitable features and fit it\n", + "khe = KhiopsEncoder(n_features=10)\n", + "khe.fit(X, y)\n", + "\n", + "# Transform the train dataset\n", + "print(\"Encoded feature names:\")\n", + "print(khe.feature_names_out_)\n", + "print(\"Encoded data:\")\n", + "print(khe.transform(X)[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_encoder_multitable_snowflake()`\n\n", + "Trains a `.KhiopsEncoder` on a snowflake multi-table dataset\n" ] }, { @@ -634,62 +657,74 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_encoder_pipeline_with_hgbc():\n", - " \"\"\"Chains a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`\"\"\"\n", - " # Load the dataset into a pandas dataframe\n", - " adult_path = path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", - " adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", - "\n", - " # Split the whole dataframe into train and test (70%-30%)\n", - " adult_train_df, adult_test_df = train_test_split(\n", - " adult_df, test_size=0.3, random_state=1\n", - " )\n", - "\n", - " # Split the dataset into:\n", - " # - the X feature 
table\n", - " # - the y target vector (\"class\" column)\n", - " X_train = adult_train_df.drop(\"class\", axis=1)\n", - " X_test = adult_test_df.drop(\"class\", axis=1)\n", - " y_train = adult_train_df[\"class\"]\n", - " y_test = adult_test_df[\"class\"]\n", - "\n", - " # Create the pipeline and fit it. Steps:\n", - " # - The khiops supervised column encoder, generates a full-categorical table\n", - " # - One hot encoder in all columns\n", - " # - Train the HGB classifier\n", - " pipe_steps = [\n", - " (\"khiops_enc\", KhiopsEncoder()),\n", - " (\n", - " \"onehot_enc\",\n", - " ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)),\n", - " # For sklearn < 1.2, use\n", - " # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)),\n", - " ),\n", - " (\"hgb_clf\", HistGradientBoostingClassifier()),\n", - " ]\n", - " pipe = Pipeline(pipe_steps)\n", - " pipe.fit(X_train, y_train)\n", - "\n", - " # Predict the classes on the test dataset\n", - " y_test_pred = pipe.predict(X_test)\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_test_pred[:10])\n", - " print(\"---\")\n", - "\n", - " # Predict the class probabilities on the test dataset\n", - " y_test_probas = pipe.predict_proba(X_test)\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_test_probas[:10])\n", - " print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the test dataset\n", - " test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", - " test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", - " print(f\"Test accuracy = {test_accuracy}\")\n", - " print(f\"Test auc = {test_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_encoder_pipeline_with_hgbc()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsEncoder\n", + "\n", + "# Load the tables into dataframes\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", 
+ "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "places_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + ")\n", + "users_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", + ")\n", + "vehicles_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "\n", + "# Build the multi-table spec\n", + "# Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", + "# build the target column\n", + "X = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (accidents_df, \"AccidentId\"),\n", + " \"Places\": (places_df, \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + " \"relations\": [\n", + " (\"Accidents\", \"Vehicles\"),\n", + " (\"Accidents\", \"Places\", True),\n", + " (\"Vehicles\", \"Users\"),\n", + " ],\n", + "}\n", + "\n", + "# Load the target variable from the AccidentsSummary dataset\n", + "y = pd.read_csv(\n", + " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", + " usecols=[\"Gravity\"],\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ").squeeze(\"columns\")\n", + "\n", + "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", + "khe = KhiopsEncoder(n_features=10)\n", + "khe.fit(X, y)\n", + "\n", + "# Transform the train dataset\n", + "print(\"Encoded feature names:\")\n", + "print(khe.feature_names_out_)\n", + "print(\"Encoded data:\")\n", + "print(khe.transform(X)[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
`khiops_encoder_pipeline_with_hgbc()`\n\n", + "Uses a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`\n" ] }, { @@ -698,31 +733,67 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_coclustering():\n", - " \"\"\"Trains a `.KhiopsCoclustering` on a dataframe\"\"\"\n", - " # Load the secondary table of the dataset into a pandas dataframe\n", - " splice_dataset_path = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " splice_dna_X = pd.read_csv(\n", - " path.join(splice_dataset_path, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", - " )\n", - "\n", - " # Train with only 70% of data (for speed in this example)\n", - " X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1)\n", - "\n", - " # Create the KhiopsCoclustering instance\n", - " khcc = KhiopsCoclustering()\n", - "\n", - " # Train the model with the whole dataset\n", - " khcc.fit(X, id_column=\"SampleId\")\n", - "\n", - " # Predict the clusters in some instances\n", - " X_clusters = khcc.predict(X)\n", - " print(\"Predicted clusters (first 10)\")\n", - " print(X_clusters[:10])\n", - " print(\"---\")\n", - "\n", - "#Run sample\n", - "khiops_coclustering()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsEncoder\n", + "from sklearn import metrics\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.ensemble import HistGradientBoostingClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "# Load the dataset into dataframes\n", + "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "X = adult_df.drop(\"class\", axis=1)\n", + "y = adult_df[\"class\"]\n", + "\n", + "# Split the dataset into train and test (70%-30%)\n", + "X_train, X_test, y_train, 
y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", + "\n", + "# Create the pipeline and fit it. Steps:\n", + "# - The khiops supervised column encoder, generates a full-categorical table\n", + "# - One hot encoder in all columns\n", + "# - Train the HGB classifier\n", + "pipe_steps = [\n", + " (\"khiops_enc\", KhiopsEncoder()),\n", + " (\n", + " \"onehot_enc\",\n", + " ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)),\n", + " ),\n", + " (\"hgb_clf\", HistGradientBoostingClassifier()),\n", + "]\n", + "pipe = Pipeline(pipe_steps)\n", + "pipe.fit(X_train, y_train)\n", + "\n", + "# Predict the classes on the test dataset\n", + "y_test_pred = pipe.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probabilities on the test dataset\n", + "y_test_probas = pipe.predict_proba(X_test)\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_coclustering()`\n\n", + "Trains a `.KhiopsCoclustering` on a dataframe\n" ] }, { @@ -731,34 +802,41 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_coclustering_simplify():\n", - " \"\"\"Simplifies a `.KhiopsCoclustering` already trained on a dataframe\"\"\"\n", - " # Load the secondary table of the dataset into a pandas dataframe\n", - " splice_dataset_path = path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", - " splice_dna_X = pd.read_csv(\n", - " path.join(splice_dataset_path, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", - " )\n", - "\n", - " # Train 
with only 70% of data (for speed in this example)\n", - " X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1)\n", - "\n", - " # Create the KhiopsCoclustering instance\n", - " khcc = KhiopsCoclustering()\n", - "\n", - " # Train the model with the whole dataset\n", - " khcc.fit(X, id_column=\"SampleId\")\n", - "\n", - " # Simplify coclustering along the individual ID dimension\n", - " simplified_khcc = khcc.simplify(max_part_numbers={\"SampleId\": 3})\n", - "\n", - " # Predict the clusters using the simplified model\n", - " X_clusters = simplified_khcc.predict(X)\n", - " print(\"Predicted clusters (only three at most)\")\n", - " print(X_clusters)\n", - " print(\"---\")\n", - "\n", - "#Run sample\n", - "khiops_coclustering_simplify()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsCoclustering\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the secondary table of the dataset into a pandas dataframe\n", + "splice_data_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "splice_dna_df = pd.read_csv(\n", + " os.path.join(splice_data_dir, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", + ")\n", + "\n", + "# Train with only 70% of data (for speed in this example)\n", + "X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1)\n", + "\n", + "# Create the KhiopsCoclustering instance\n", + "khcc = KhiopsCoclustering()\n", + "\n", + "# Train the model with the whole dataset\n", + "khcc.fit(X, id_column=\"SampleId\")\n", + "\n", + "# Predict the clusters in some instances\n", + "X_clusters = khcc.predict(X)\n", + "print(\"Predicted clusters (first 10)\")\n", + "print(X_clusters[:10])\n", + "print(\"---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_coclustering_simplify()`\n\n", + "Simplifies a `.KhiopsCoclustering` already trained on a dataframe\n" ] }, { @@ -767,68 +845,44 
@@ "metadata": {}, "outputs": [], "source": [ - "def khiops_classifier_multitable_list():\n", - " \"\"\"Trains a KhiopsClassifier using a list dataset specification\n", - "\n", - " .. warning::\n", - " This dataset input method is **Deprecated** and will be removed in Khiops 11.\n", - " \"\"\"\n", - " # Load the root table of the dataset into a pandas dataframe\n", - " accidents_dataset_path = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " accidents_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - " )\n", - "\n", - " # Split the root dataframe into train and test\n", - " accidents_train_df, accidents_test_df = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - " )\n", - "\n", - " # Obtain the main X feature table and the y target vector (\"Class\" column)\n", - " y_train = accidents_train_df[\"Gravity\"]\n", - " y_test = accidents_test_df[\"Gravity\"]\n", - " X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n", - " X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n", - "\n", - " # Load the secondary table of the dataset into a pandas dataframe\n", - " vehicles_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - " )\n", - "\n", - " # Split the secondary dataframe with the keys of the splitted root dataframe\n", - " X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - " X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", - " X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", - " X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "\n", - " # Create the classifier specifying the key column name\n", - " khc = KhiopsClassifier(key=\"AccidentId\")\n", - "\n", - " # Train the classifier\n", - " khc.fit([X_train_main, X_train_secondary], y_train)\n", - "\n", - " # Predict the class on the test dataset\n", - " y_test_pred = 
khc.predict([X_test_main, X_test_secondary])\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_test_pred[:10])\n", - " print(\"---\")\n", - "\n", - " # Predict the class probability on the test dataset\n", - " y_test_probas = khc.predict_proba([X_test_main, X_test_secondary])\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_test_probas[:10])\n", - " print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the test dataset\n", - " test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", - " test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", - " print(f\"Test accuracy = {test_accuracy}\")\n", - " print(f\"Test auc = {test_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_classifier_multitable_list()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsCoclustering\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the secondary table of the dataset into a pandas dataframe\n", + "splice_data_dir = os.path.join(kh.get_samples_dir(), \"SpliceJunction\")\n", + "splice_dna_X = pd.read_csv(\n", + " os.path.join(splice_data_dir, \"SpliceJunctionDNA.txt\"), sep=\"\\t\"\n", + ")\n", + "\n", + "# Train with only 70% of data (for speed in this example)\n", + "X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1)\n", + "\n", + "# Create the KhiopsCoclustering instance\n", + "khcc = KhiopsCoclustering()\n", + "\n", + "# Train the model with the whole dataset\n", + "khcc.fit(X, id_column=\"SampleId\")\n", + "\n", + "# Simplify coclustering along the individual ID dimension\n", + "simplified_khcc = khcc.simplify(max_part_numbers={\"SampleId\": 3})\n", + "\n", + "# Predict the clusters using the simplified model\n", + "X_clusters = simplified_khcc.predict(X)\n", + "print(\"Predicted clusters (only three at most)\")\n", + "print(X_clusters)\n", + "print(\"---\")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_multitable_list()`\n\n", + "Trains a KhiopsClassifier using a list dataset specification\n\n .. warning::\n This dataset input method is **Deprecated** and will be removed in Khiops 11.\n \n" ] }, { @@ -837,106 +891,59 @@ "metadata": {}, "outputs": [], "source": [ - "def khiops_classifier_multitable_star_file():\n", - " \"\"\"Trains a `.KhiopsClassifier` with a file path based dataset\n", - "\n", - " .. warning::\n", - " This dataset input method is **Deprecated** and will be removed in Khiops 11.\n", - " If you need to handle large datasets that do not easily fit into memory then you\n", - " may use the `~.khiops.core` API directly, which allows to specify file paths\n", - " directly.\n", - " \"\"\"\n", - " # Create output directory\n", - " results_dir = path.join(\"kh_samples\", \"khiops_classifier_multitable_file\")\n", - " if not path.exists(\"kh_samples\"):\n", - " os.mkdir(\"kh_samples\")\n", - " os.mkdir(results_dir)\n", - " else:\n", - " if not path.exists(results_dir):\n", - " os.mkdir(results_dir)\n", - "\n", - " # Load the root table of the dataset into a pandas dataframe\n", - " accidents_dataset_path = path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - " accidents_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - " )\n", - "\n", - " # Split the root dataframe into train and test\n", - " X_train_main, X_test_main = train_test_split(\n", - " accidents_df, test_size=0.3, random_state=1\n", - " )\n", - "\n", - " # Load the secondary table of the dataset into a pandas dataframe\n", - " vehicles_df = pd.read_csv(\n", - " path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - " )\n", - "\n", - " # Split the secondary dataframe with the keys of the splitted root dataframe\n", - " X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n", - " X_test_ids = 
X_test_main[\"AccidentId\"].to_frame()\n", - " X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", - " X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", - "\n", - " # Write the train and test dataset sets to disk\n", - " # For the test file we remove the target column from the main table\n", - " X_train_main_path = path.join(results_dir, \"X_train_main.txt\")\n", - " X_train_main.to_csv(X_train_main_path, sep=\"\\t\", header=True, index=False)\n", - " X_train_secondary_path = path.join(results_dir, \"X_train_secondary.txt\")\n", - " X_train_secondary.to_csv(X_train_secondary_path, sep=\"\\t\", header=True, index=False)\n", - " X_test_main_path = path.join(results_dir, \"X_test_main.txt\")\n", - " y_test = X_test_main.sort_values(\"AccidentId\")[\"Gravity\"]\n", - " X_test_main.drop(columns=\"Gravity\").to_csv(\n", - " X_test_main_path, sep=\"\\t\", header=True, index=False\n", - " )\n", - " X_test_secondary_path = path.join(results_dir, \"X_test_secondary.txt\")\n", - " X_test_secondary.to_csv(X_test_secondary_path, sep=\"\\t\", header=True, index=False)\n", - "\n", - " # Define the dictionary of train\n", - " X_train_dataset = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_train_main_path, \"AccidentId\"),\n", - " \"Vehicles\": (X_train_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " \"format\": (\"\\t\", True),\n", - " }\n", - " X_test_dataset = {\n", - " \"main_table\": \"Accidents\",\n", - " \"tables\": {\n", - " \"Accidents\": (X_test_main_path, \"AccidentId\"),\n", - " \"Vehicles\": (X_test_secondary_path, [\"AccidentId\", \"VehicleId\"]),\n", - " },\n", - " \"format\": (\"\\t\", True),\n", - " }\n", - "\n", - " # Create the classifier and fit it\n", - " khc = KhiopsClassifier(output_dir=results_dir)\n", - " khc.fit(X_train_dataset, y=\"Gravity\")\n", - "\n", - " # Predict the class in addition to the class probabilities on the test dataset\n", - " 
y_test_pred_path = khc.predict(X_test_dataset)\n", - " y_test_pred = pd.read_csv(y_test_pred_path, sep=\"\\t\")\n", - " print(\"Predicted classes (first 10):\")\n", - " print(y_test_pred[\"PredictedGravity\"].head(10))\n", - " print(\"---\")\n", - "\n", - " y_test_probas_path = khc.predict_proba(X_test_dataset)\n", - " y_test_probas = pd.read_csv(y_test_probas_path, sep=\"\\t\")\n", - " proba_columns = [col for col in y_test_probas if col.startswith(\"Prob\")]\n", - " print(\"Predicted class probabilities (first 10):\")\n", - " print(y_test_probas[proba_columns].head(10))\n", - " print(\"---\")\n", - "\n", - " # Evaluate accuracy and auc metrics on the test dataset\n", - " test_accuracy = metrics.accuracy_score(y_test, y_test_pred[\"PredictedGravity\"])\n", - " test_auc = metrics.roc_auc_score(y_test, y_test_probas[\"ProbGravityLethal\"])\n", - " print(f\"Test accuracy = {test_accuracy}\")\n", - " print(f\"Test auc = {test_auc}\")\n", - "\n", - "#Run sample\n", - "khiops_classifier_multitable_star_file()" + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the root table of the dataset into a pandas dataframe\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "X = accidents_df.drop(\"Gravity\", axis=1)\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", + "# Split the dataset into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n", + "\n", + "# Load the secondary table of the dataset into a pandas dataframe\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", + 
"\n", + "# Split the secondary dataframe with the keys of the splitted root dataframe\n", + "X_train_ids = X_train[\"AccidentId\"].to_frame()\n", + "X_test_ids = X_test[\"AccidentId\"].to_frame()\n", + "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "\n", + "# Create the classifier specifying the key column name\n", + "khc = KhiopsClassifier(key=\"AccidentId\")\n", + "\n", + "# Train the classifier\n", + "khc.fit([X_train, X_train_secondary], y_train)\n", + "\n", + "# Predict the class on the test dataset\n", + "y_test_pred = khc.predict([X_test, X_test_secondary])\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probability on the test dataset\n", + "y_test_probas = khc.predict_proba([X_test, X_test_secondary])\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" ] } ], diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 7089a165..67c5391f 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -5,51 +5,37 @@ # see the "LICENSE.md" file for more details. # ###################################################################################### """Khiops Python sklearn submodule samples + The functions in this script demonstrate the basic use of the sklearn submodule of the Khiops Python library. 
""" import argparse -import os -import pickle -from os import path - -import pandas as pd -from sklearn import metrics -from sklearn.compose import ColumnTransformer - -# isort: off -# pylint: disable=unused-import -from sklearn.experimental import enable_hist_gradient_boosting -from sklearn.ensemble import HistGradientBoostingClassifier - -# isort: on -from sklearn.datasets import fetch_20newsgroups -from sklearn.feature_extraction.text import HashingVectorizer - -# pylint: enable=unused-import -from sklearn.model_selection import train_test_split -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder - import khiops +import os from khiops import core as kh -from khiops.sklearn import ( - KhiopsClassifier, - KhiopsCoclustering, - KhiopsEncoder, - KhiopsRegressor, -) # Disable PEP8 variable names because of scikit-learn X,y conventions # To capture invalid-names other than X,y run: -# pylint --disable=all --enable=invalid-names estimators.py +# pylint --disable=all --enable=invalid-names samples_sklearn.py # pylint: disable=invalid-name +# For ease of use the functions in this module contain (repeated) import statements +# We disable all pylint warnings related to imports +# pylint: disable=import-outside-toplevel,redefined-outer-name,reimported + def khiops_classifier(): """Trains a `.KhiopsClassifier` on a monotable dataframe""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + # Load the dataset into a pandas dataframe - adult_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") # Split the whole dataframe into train and test (70%-30%) @@ -99,7 +85,13 @@ def khiops_classifier_sparse(): natively. 
""" - # Load 3 classes of the 20newsgroups dataset + # Imports + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.datasets import fetch_20newsgroups + from sklearn.feature_extraction.text import HashingVectorizer + + # Load 4 classes of the 20newsgroups dataset categories = ["comp.graphics", "sci.space", "misc.forsale", "alt.atheism"] data_train, y_train = fetch_20newsgroups( subset="train", @@ -150,8 +142,16 @@ def khiops_classifier_sparse(): def khiops_classifier_multiclass(): """Trains a multiclass `.KhiopsClassifier` on a monotable dataframe""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + # Load the dataset into a pandas dataframe - iris_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") + iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt") iris_df = pd.read_csv(iris_path, sep="\t") # Split the whole dataframe into train and test (70%-30%) @@ -195,52 +195,39 @@ def khiops_classifier_multiclass(): def khiops_classifier_multitable_star(): """Trains a `.KhiopsClassifier` on a star multi-table dataset""" - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics + + # Load the dataset into pandas dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) - - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = 
train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - - # Load the secondary table of the dataset into a pandas dataframe vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_train = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main, "AccidentId"), - "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), - }, - } - X_test = { + # Create the dataset spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_test_main, "AccidentId"), - "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] + + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split_dataset( + X, y, test_size=0.3, random_state=1 + ) # Train the classifier (by default it analyzes 100 multi-table features) khc = KhiopsClassifier() @@ -266,33 +253,111 @@ def khiops_classifier_multitable_star(): print(f"Test auc = {test_auc}") -def khiops_classifier_multitable_snowflake(): - """Trains a 
`.KhiopsClassifier` on a snowflake multi-table dataset +def khiops_classifier_multitable_star_file(): + """Trains a `.KhiopsClassifier` with a file dataset""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics - .. note:: - For simplicity we train from the whole dataset. To assess the performance one - usually splits the dataset into train and test subsets. + # Create output directory + results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_star_file") + if not os.path.exists("kh_samples"): + os.mkdir("kh_samples") + os.mkdir(results_dir) + else: + if not os.path.exists(results_dir): + os.mkdir(results_dir) + + # Create the dataset spec + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + X = { + "main_table": "Accidents", + "tables": { + "Accidents": ( + os.path.join(accidents_data_dir, "Accidents.txt"), + "AccidentId", + ), + "Vehicles": ( + os.path.join(accidents_data_dir, "Vehicles.txt"), + ["AccidentId", "VehicleId"], + ), + }, + "format": ("\t", True), + } + + # Split the dataset into train and test + X_train, X_test = train_test_split_dataset( + X, output_dir=os.path.join(results_dir, "split"), test_size=0.3 + ) + + # Create the classifier and fit it + khc = KhiopsClassifier(output_dir=results_dir) + khc.fit(X_train, y="Gravity") + + # Predict the class in addition to the class probabilities on the test dataset + y_test_pred_path = khc.predict(X_test) + y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") + print("Predicted classes (first 10):") + print(y_test_pred["PredictedGravity"].head(10)) + print("---") + + y_test_probas_path = khc.predict_proba(X_test) + y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") + proba_columns = [col for col in y_test_probas if col.startswith("Prob")] + print("Predicted class probabilities (first 10):") + 
print(y_test_probas[proba_columns].head(10)) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + # Note: For roc_auc_score we have to use the "greatest" label which is "NonLethal" + y_test = pd.read_csv( + X_test["tables"]["Accidents"][0], + usecols=["Gravity"], + sep="\t", + encoding="latin1", + ) + test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) + test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityNonLethal"]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") + + +def khiops_classifier_multitable_snowflake(): + """Trains a `.KhiopsClassifier` on a snowflake multi-table dataset""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from khiops.utils.helpers import train_test_split_dataset + from sklearn import metrics - """ # Load the dataset tables into dataframes - accidents_dataset_path = path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) users_df = pd.read_csv( - path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Vehicles.txt"), + sep="\t", + encoding="latin1", ) places_df = pd.read_csv( - path.join(accidents_dataset_path, "Places.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" ) - # Build the multitable input X - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column + + # Create the dataset spec + # 
Note: We discard the "Gravity" column from the "Users" table to avoid a target + # leak. This is because the column was used to build the target. X = { "main_table": "Accidents", "tables": { @@ -308,60 +373,65 @@ def khiops_classifier_multitable_snowflake(): ], } - # Load the target variable from the AccidentsSummary dataset + # Load the target variable "Gravity" from the "AccidentsSummary" dataset y = pd.read_csv( - path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze("columns") + + # Split into train and test datasets + X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) # Train the classifier (by default it creates 1000 multi-table features) khc = KhiopsClassifier(n_trees=0) - khc.fit(X, y) + khc.fit(X_train, y_train) # Predict the class on the test dataset - y_pred = khc.predict(X) + y_test_pred = khc.predict(X_test) print("Predicted classes (first 10):") - print(y_pred[:10]) + print(y_test_pred[:10]) print("---") - # Predict the class probability on the train dataset - y_probas = khc.predict_proba(X) + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) print(f"Class order: {khc.classes_}") print("Predicted class probabilities (first 10):") - print(y_probas[:10]) + print(y_test_probas[:10]) print("---") - # Evaluate accuracy and auc metrics on the train dataset - train_accuracy = metrics.accuracy_score(y_pred, y) - train_auc = metrics.roc_auc_score(y, y_probas[:, 1]) - print(f"Train accuracy = {train_accuracy}") - print(f"Train auc = {train_auc}") + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test_pred, y_test) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") def khiops_classifier_pickle(): 
"""Shows the serialization and deserialization of a `.KhiopsClassifier`""" - # Load the dataset into a pandas dataframe - iris_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") - - # Train the model with the whole dataset - X = iris_df.drop(["Class"], axis=1) - y = iris_df["Class"] - khc = KhiopsClassifier() - khc.fit(X, y) + # Imports + import os + import pickle + from khiops.sklearn import KhiopsClassifier + from sklearn.datasets import load_iris # Create/clean the output directory - results_dir = path.join("kh_samples", "khiops_classifier_pickle") - khc_pickle_path = path.join(results_dir, "khiops_classifier.pkl") - if path.exists(khc_pickle_path): + results_dir = os.path.join("kh_samples", "khiops_classifier_pickle") + khc_pickle_path = os.path.join(results_dir, "khiops_classifier.pkl") + if os.path.exists(khc_pickle_path): os.remove(khc_pickle_path) else: os.makedirs(results_dir, exist_ok=True) + # Train the model with the Iris dataset + X, y = load_iris(return_X_y=True) + khc = KhiopsClassifier() + khc.fit(X, y) + # Pickle its content to a file - with open(khc_pickle_path, "wb") as khc_pickle_write_file: - pickle.dump(khc, khc_pickle_write_file) + with open(khc_pickle_path, "wb") as khc_pickle_output_file: + pickle.dump(khc, khc_pickle_output_file) # Unpickle it with open(khc_pickle_path, "rb") as khc_pickle_file: @@ -377,23 +447,25 @@ def khiops_classifier_pickle(): def khiops_regressor(): """Trains a `.KhiopsRegressor` on a monotable dataframe""" - # Load the dataset into a pandas dataframe - adult_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsRegressor + from sklearn import metrics + from sklearn.model_selection import train_test_split + + # Load the "Adult" dataset and set the target to the "age" column + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") 
adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("age", axis=1) + y = adult_df["age"] # Split the whole dataframe into train and test (40%-60% for speed) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.6, random_state=1 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.1, random_state=1 ) - # Split the dataset into: - # - the X feature table - # - the y target vector ("age" column) - X_train = adult_train_df.drop("age", axis=1) - X_test = adult_test_df.drop("age", axis=1) - y_train = adult_train_df["age"] - y_test = adult_test_df["age"] - # Create the regressor object khr = KhiopsRegressor() @@ -424,16 +496,15 @@ def khiops_encoder(): For simplicity we train from the whole dataset. To assess the performance one usually splits the dataset into train and test subsets. """ - # Load the dataset into a pandas dataframe - iris_path = path.join(kh.get_samples_dir(), "Iris", "Iris.txt") - iris_df = pd.read_csv(iris_path, sep="\t") + # Imports + from khiops.sklearn import KhiopsEncoder + from sklearn.datasets import load_iris - # Train the model with the whole dataset - X = iris_df.drop("Class", axis=1) - y = iris_df["Class"] + # Load the dataset + X, y = load_iris(return_X_y=True) # Create the encoder object - khe = KhiopsEncoder() + khe = KhiopsEncoder(transform_type_numerical="part_label") khe.fit(X, y) # Transform the training dataset @@ -441,7 +512,7 @@ def khiops_encoder(): # Print both the original and transformed features print("Original:") - print(X.head(10)) + print(X[:10]) print("---") print("Encoded feature names:") print(khe.feature_names_out_) @@ -452,85 +523,96 @@ def khiops_encoder(): def khiops_encoder_multitable_star(): """Trains a `.KhiopsEncoder` on a star multi-table dataset""" - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") + # Imports + import os + import pandas as pd + from khiops import 
core as kh + from khiops.sklearn import KhiopsEncoder + + # Load the dataset tables into dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) - - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + vehicles_df = pd.read_csv( + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { + # Build the multi-table spec and the target + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + y = accidents_df["Gravity"] - # Create the KhiopsEncoder with 10 additional multitable features and fit it + # Create the KhiopsEncoder with 10 multitable features and fit it khe = KhiopsEncoder(n_features=10) - khe.fit(X_dataset, y) + khe.fit(X, y) # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) print("Encoded data:") - print(khe.transform(X_dataset)[:10]) + print(khe.transform(X)[:10]) def khiops_encoder_multitable_snowflake(): - """Trains a `.KhiopsEncoder` on a snowflake multi-table dataset + """Trains a `.KhiopsEncoder` on a snowflake multi-table dataset""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsEncoder - ..
note:: - For simplicity we train from the whole dataset. To assess the performance - one usually splits the dataset into train and test subsets. - """ # Load the tables into dataframes - accidents_dataset_path = path.join(kh.get_samples_dir(), "Accidents") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + places_df = pd.read_csv( + os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" ) users_df = pd.read_csv( - path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" ) vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Vehicles.txt"), + sep="\t", + encoding="latin1", ) - # Build the multitable input X + # Build the multi-table spec # Note: We discard the "Gravity" field from the "Users" table as it was used to # build the target column X = { "main_table": "Accidents", "tables": { "Accidents": (accidents_df, "AccidentId"), + "Places": (places_df, "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), }, "relations": [ ("Accidents", "Vehicles"), + ("Accidents", "Places", True), ("Vehicles", "Users"), ], } # Load the target variable from the AccidentsSummary dataset y = pd.read_csv( - path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), + usecols=["Gravity"], sep="\t", encoding="latin1", - )["Gravity"] + ).squeeze("columns") # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder(n_features=10) @@ -546,24 +628,30 @@ def 
khiops_encoder_multitable_snowflake(): # Disable line too long just to have a title linking the sklearn documentation # pylint: disable=line-too-long def khiops_encoder_pipeline_with_hgbc(): - """Chains a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`""" - # Load the dataset into a pandas dataframe - adult_path = path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + """Uses a `.KhiopsEncoder` with a `~sklearn.ensemble.HistGradientBoostingClassifier`""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsEncoder + from sklearn import metrics + from sklearn.compose import ColumnTransformer + from sklearn.ensemble import HistGradientBoostingClassifier + from sklearn.model_selection import train_test_split + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + # Load the dataset into dataframes + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") adult_df = pd.read_csv(adult_path, sep="\t") + X = adult_df.drop("class", axis=1) + y = adult_df["class"] - # Split the whole dataframe into train and test (70%-30%) - adult_train_df, adult_test_df = train_test_split( - adult_df, test_size=0.3, random_state=1 + # Split the dataset into train and test (70%-30%) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=1 ) - # Split the dataset into: - # - the X feature table - # - the y target vector ("class" column) - X_train = adult_train_df.drop("class", axis=1) - X_test = adult_test_df.drop("class", axis=1) - y_train = adult_train_df["class"] - y_test = adult_test_df["class"] - # Create the pipeline and fit it. 
Steps: # - The khiops supervised column encoder, generates a full-categorical table # - One hot encoder in all columns @@ -573,8 +661,6 @@ def khiops_encoder_pipeline_with_hgbc(): ( "onehot_enc", ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)), - # For sklearn < 1.2, use - # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)), ), ("hgb_clf", HistGradientBoostingClassifier()), ] @@ -605,14 +691,21 @@ def khiops_encoder_pipeline_with_hgbc(): def khiops_coclustering(): """Trains a `.KhiopsCoclustering` on a dataframe""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsCoclustering + from sklearn.model_selection import train_test_split + # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = path.join(kh.get_samples_dir(), "SpliceJunction") - splice_dna_X = pd.read_csv( - path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") + splice_dna_df = pd.read_csv( + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) - X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1) + X, _ = train_test_split(splice_dna_df, test_size=0.3, random_state=1) # Create the KhiopsCoclustering instance khcc = KhiopsCoclustering() @@ -629,10 +722,17 @@ def khiops_coclustering(): def khiops_coclustering_simplify(): """Simplifies a `.KhiopsCoclustering` already trained on a dataframe""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsCoclustering + from sklearn.model_selection import train_test_split + # Load the secondary table of the dataset into a pandas dataframe - splice_dataset_path = path.join(kh.get_samples_dir(), "SpliceJunction") + splice_data_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction") splice_dna_X = pd.read_csv( - 
path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t" + os.path.join(splice_data_dir, "SpliceJunctionDNA.txt"), sep="\t" ) # Train with only 70% of data (for speed in this example) @@ -665,33 +765,37 @@ def khiops_classifier_multitable_list(): .. warning:: This dataset input method is **Deprecated** and will be removed in Khiops 11. """ + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", encoding="latin1", ) + X = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] - # Split the root dataframe into train and test - accidents_train_df, accidents_test_df = train_test_split( - accidents_df, test_size=0.3, random_state=1 + # Split the dataset into train and test + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=1 ) - # Obtain the main X feature table and the y target vector ("Class" column) - y_train = accidents_train_df["Gravity"] - y_test = accidents_test_df["Gravity"] - X_train_main = accidents_train_df.drop("Gravity", axis=1) - X_test_main = accidents_test_df.drop("Gravity", axis=1) - # Load the secondary table of the dataset into a pandas dataframe vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() + 
X_train_ids = X_train["AccidentId"].to_frame() + X_test_ids = X_test["AccidentId"].to_frame() X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") @@ -699,16 +803,16 @@ def khiops_classifier_multitable_list(): khc = KhiopsClassifier(key="AccidentId") # Train the classifier - khc.fit([X_train_main, X_train_secondary], y_train) + khc.fit([X_train, X_train_secondary], y_train) # Predict the class on the test dataset - y_test_pred = khc.predict([X_test_main, X_test_secondary]) + y_test_pred = khc.predict([X_test, X_test_secondary]) print("Predicted classes (first 10):") print(y_test_pred[:10]) print("---") # Predict the class probability on the test dataset - y_test_probas = khc.predict_proba([X_test_main, X_test_secondary]) + y_test_probas = khc.predict_proba([X_test, X_test_secondary]) print("Predicted class probabilities (first 10):") print(y_test_probas[:10]) print("---") @@ -720,110 +824,12 @@ def khiops_classifier_multitable_list(): print(f"Test auc = {test_auc}") -def khiops_classifier_multitable_star_file(): - """Trains a `.KhiopsClassifier` with a file path based dataset - - .. warning:: - This dataset input method is **Deprecated** and will be removed in Khiops 11. - If you need to handle large datasets that do not easily fit into memory then you - may use the `~.khiops.core` API directly, which allows to specify file paths - directly. 
- """ - # Create output directory - results_dir = path.join("kh_samples", "khiops_classifier_multitable_file") - if not path.exists("kh_samples"): - os.mkdir("kh_samples") - os.mkdir(results_dir) - else: - if not path.exists(results_dir): - os.mkdir(results_dir) - - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Split the root dataframe into train and test - X_train_main, X_test_main = train_test_split( - accidents_df, test_size=0.3, random_state=1 - ) - - # Load the secondary table of the dataset into a pandas dataframe - vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) - - # Split the secondary dataframe with the keys of the splitted root dataframe - X_train_ids = X_train_main["AccidentId"].to_frame() - X_test_ids = X_test_main["AccidentId"].to_frame() - X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") - X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") - - # Write the train and test dataset sets to disk - # For the test file we remove the target column from the main table - X_train_main_path = path.join(results_dir, "X_train_main.txt") - X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False) - X_train_secondary_path = path.join(results_dir, "X_train_secondary.txt") - X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False) - X_test_main_path = path.join(results_dir, "X_test_main.txt") - y_test = X_test_main.sort_values("AccidentId")["Gravity"] - X_test_main.drop(columns="Gravity").to_csv( - X_test_main_path, sep="\t", header=True, index=False - ) - X_test_secondary_path = path.join(results_dir, "X_test_secondary.txt") - X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False) - - # Define the 
dictionary of train - X_train_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_train_main_path, "AccidentId"), - "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - X_test_dataset = { - "main_table": "Accidents", - "tables": { - "Accidents": (X_test_main_path, "AccidentId"), - "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]), - }, - "format": ("\t", True), - } - - # Create the classifier and fit it - khc = KhiopsClassifier(output_dir=results_dir) - khc.fit(X_train_dataset, y="Gravity") - - # Predict the class in addition to the class probabilities on the test dataset - y_test_pred_path = khc.predict(X_test_dataset) - y_test_pred = pd.read_csv(y_test_pred_path, sep="\t") - print("Predicted classes (first 10):") - print(y_test_pred["PredictedGravity"].head(10)) - print("---") - - y_test_probas_path = khc.predict_proba(X_test_dataset) - y_test_probas = pd.read_csv(y_test_probas_path, sep="\t") - proba_columns = [col for col in y_test_probas if col.startswith("Prob")] - print("Predicted class probabilities (first 10):") - print(y_test_probas[proba_columns].head(10)) - print("---") - - # Evaluate accuracy and auc metrics on the test dataset - test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"]) - test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"]) - print(f"Test accuracy = {test_accuracy}") - print(f"Test auc = {test_auc}") - - exported_samples = [ khiops_classifier, khiops_classifier_sparse, khiops_classifier_multiclass, khiops_classifier_multitable_star, + khiops_classifier_multitable_star_file, khiops_classifier_multitable_snowflake, khiops_classifier_pickle, khiops_regressor, @@ -834,14 +840,13 @@ def khiops_classifier_multitable_star_file(): khiops_coclustering, khiops_coclustering_simplify, khiops_classifier_multitable_list, - khiops_classifier_multitable_star_file, ] def execute_samples(args): """Executes all 
non-interactive samples""" # Create the results directory if it does not exist - if not path.isdir("./kh_samples"): + if not os.path.isdir("./kh_samples"): os.mkdir("./kh_samples") # Set the user-defined samples dir if any @@ -863,7 +868,7 @@ def execute_samples(args): print(f"{len(execution_samples)} sample(s) to execute\n") for sample in execution_samples: - print(">>> Executing samples_sklearn." + sample.__name__) + print(f">>> Executing samples_sklearn.{sample.__name__}") sample.__call__() print("> Done\n") diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 93825be7..5ca541de 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -479,12 +479,14 @@ def _transform_deploy_model( X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter root_name : str Name of root table in trained Khiops model .kdic @@ -745,12 +747,14 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). 
- *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. id_column : str The column that contains the id of the instance. @@ -1180,21 +1184,20 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. Returns ------- `numpy.ndarray` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. - - *Deprecated return values* (will be removed in khiops-python 11): str for - file based dataset specification. 
""" # Create temporary directory computation_dir = self._create_computation_dir("predict") @@ -1392,19 +1395,22 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values. + y : :external:term:`array-like` of shape (n_samples,) + The target values. - **Deprecated input modes** (will be removed in khiops-python 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + *Deprecated input modes* (will be removed in khiops-python 11): + + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -1843,19 +1849,21 @@ def fit(self, X, y, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). 
- - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + y : :external:term:`array-like` of shape (n_samples,) + The target values. - **Deprecated input modes** (will be removed in khiops-python 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + *Deprecated input modes* (will be removed in khiops-python 11): + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -1936,12 +1944,14 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. 
The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. Returns ------- @@ -1993,12 +2003,14 @@ def predict_proba(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. Returns ------- @@ -2184,19 +2196,21 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. 
+ *Deprecated input modes* (will be removed in khiops-python 11): - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. - **Deprecated input modes** (will be removed in khiops-python 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + y : :external:term:`array-like` of shape (n_samples,) + The target values. + + *Deprecated input modes* (will be removed in khiops-python 11): + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- self : `KhiopsRegressor` @@ -2254,23 +2268,27 @@ def predict(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. Returns ------- - `numpy.ndarray` + `numpy.ndarray` or str + An array containing the encoded columns. 
A first column containing key column ids is added in multi-table mode. The key columns are added for - multi-table tasks. - - *Deprecated return values* (will be removed in khiops-python 11): str for - file based dataset specification. + multi-table tasks. The array is in the form of: + - `numpy.ndarray` if X is :external:term:`array-like`, or dataset spec + containing `pandas.DataFrame` table. + - str (a path for the file containing the array) if X is a dataset spec + containing file-path tables. """ # Call the parent's method y_pred = super().predict(X) @@ -2499,19 +2517,21 @@ def fit(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): - y : :external:term:`array-like` of shape (n_samples,) or - a `pandas.Dataframe` of shape (n_samples, 1) containing the target values + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. - **Deprecated input modes** (will be removed in khiops-python 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + y : :external:term:`array-like` of shape (n_samples,) + The target values. 
+ + *Deprecated input modes* (will be removed in khiops-python 11): + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- @@ -2572,21 +2592,20 @@ def transform(self, X): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. Returns ------- `numpy.ndarray` An array containing the encoded columns. A first column containing key column ids is added in multi-table mode. - - *Deprecated return values* (will be removed in khiops-python 11): str for - file based dataset specification. """ # Create temporary directory computation_dir = self._create_computation_dir("transform") @@ -2634,19 +2653,21 @@ def fit_transform(self, X, y=None, **kwargs): X : :external:term:`array-like` of shape (n_samples, n_features_in) or dict Training dataset. Either an :external:term:`array-like` or a ``dict`` specification for multi-table datasets (see :doc:`/multi_table_primer`). - *Deprecated input modes* (will be removed in khiops-python 11): - - tuple: A pair (``path_to_file``, ``separator``). - - list: A sequence of dataframes or paths, or pairs path-separator. 
The - first element of the list is the main table and the following are - secondary ones joined to the main table using ``key`` estimator parameter. + *Deprecated input modes* (will be removed in khiops-python 11): + + - tuple: A pair (``path_to_file``, ``separator``). + - list: A sequence of dataframes or paths, or pairs path-separator. The + first element of the list is the main table and the following are + secondary ones joined to the main table using ``key`` estimator + parameter. y : :external:term:`array-like` of shape (n_samples,) - :external:term:`array-like` object containing the target values. + The target values. - **Deprecated input modes** (will be removed in khiops-python 11): - - str: A path to a data table file for file-based ``dict`` dataset - specifications. + *Deprecated input modes* (will be removed in khiops-python 11): + - str: A path to a data table file for file-based ``dict`` dataset + specifications. Returns ------- diff --git a/khiops/utils/dataset.py b/khiops/utils/dataset.py index 348ab709..f3a34af1 100644 --- a/khiops/utils/dataset.py +++ b/khiops/utils/dataset.py @@ -37,6 +37,21 @@ def check_dataset_spec(ds_spec): + """Checks that a dataset spec is valid + + Parameters + ---------- + ds_spec : dict + A specification of a multi-table dataset (see :doc:`/multi_table_primer`). + + Raises + ------ + TypeError + If there are objects of the spec with invalid type. + ValueError + If there are objects of the spec with invalid values. 
+ """ + # Check the "tables" field if "tables" not in ds_spec: raise ValueError("'tables' entry missing from dataset dict spec") @@ -47,18 +62,18 @@ def check_dataset_spec(ds_spec): if len(ds_spec["tables"]) == 0: raise ValueError("'tables' dictionary cannot be empty") for table_name, table_entry in ds_spec["tables"].items(): - check_table_entry(table_name, table_entry) + _check_table_entry(table_name, table_entry) # Multi-table specific table checks if len(ds_spec["tables"]) > 1: - check_multitable_spec(ds_spec) + _check_multitable_spec(ds_spec) # Check the 'format' field if "format" in ds_spec: - check_format_entry(ds_spec["format"]) + _check_format_entry(ds_spec["format"]) -def check_table_entry(table_name, table_spec): +def _check_table_entry(table_name, table_spec): if not isinstance(table_spec, tuple): raise TypeError( type_error_message(f"'{table_name}' table entry", table_spec, tuple) @@ -80,10 +95,10 @@ def check_table_entry(table_name, table_spec): str, ) ) - check_table_key(table_name, key) + _check_table_key(table_name, key) -def check_table_key(table_name, key): +def _check_table_key(table_name, key): if key is not None and not is_list_like(key) and not isinstance(key, str): raise TypeError( type_error_message(f"'{table_name}' table's key", key, str, Sequence) @@ -102,7 +117,7 @@ def check_table_key(table_name, key): ) -def check_multitable_spec(ds_spec): +def _check_multitable_spec(ds_spec): assert len(ds_spec) > 1 # Check the main table if "main_table" not in ds_spec: @@ -138,10 +153,10 @@ def check_multitable_spec(ds_spec): for table in ds_spec["tables"].keys() if table != ds_spec["main_table"] ] - check_relations_entry(ds_spec["main_table"], ds_spec["tables"], relations_spec) + _check_relations_entry(ds_spec["main_table"], ds_spec["tables"], relations_spec) -def check_relations_entry(main_table_name, tables_spec, relations_spec): +def _check_relations_entry(main_table_name, tables_spec, relations_spec): # Check the types and size of the relation 
entries if not is_list_like(relations_spec): raise TypeError( @@ -194,7 +209,7 @@ def check_relations_entry(main_table_name, tables_spec, relations_spec): ) # Check hierachical keys - check_hierarchical_keys( + _check_hierarchical_keys( i, parent_table, tables_spec[parent_table][1], @@ -203,10 +218,10 @@ def check_relations_entry(main_table_name, tables_spec, relations_spec): ) # Check there are no cycles - check_no_cycles(relations_spec, main_table_name) + _check_no_cycles(relations_spec, main_table_name) -def check_hierarchical_keys( +def _check_hierarchical_keys( relation_id, parent_table, parent_table_key, child_table, child_table_key ): """Check that the parent table's key is contained in the child table's key""" @@ -236,7 +251,7 @@ def check_hierarchical_keys( ) -def check_no_cycles(relations_spec, main_table_name): +def _check_no_cycles(relations_spec, main_table_name): """Check that there are no cycles in the 'relations' entry""" tables_to_visit = [main_table_name] tables_visited = set() @@ -254,7 +269,7 @@ def check_no_cycles(relations_spec, main_table_name): ) -def check_format_entry(format_spec): +def _check_format_entry(format_spec): if not isinstance(format_spec, tuple): raise TypeError(type_error_message("'format' entry", format_spec, tuple)) if len(format_spec) != 2: @@ -551,7 +566,7 @@ def _check_input_sequence(self, X, key=None): ) # Check the key for the main_table (it is the same for the others) - check_table_key("main_table", key) + _check_table_key("main_table", key) def _init_tables_from_mapping(self, X): """Initializes the table spec from a dict-like 'X'""" diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py index 3551bc99..16107f45 100644 --- a/khiops/utils/helpers.py +++ b/khiops/utils/helpers.py @@ -1,5 +1,6 @@ """General helper functions""" +import itertools import os from sklearn.model_selection import train_test_split @@ -18,10 +19,15 @@ def sort_dataset(ds_spec, output_dir=None): Parameters ---------- ds_spec: dict - The 
dataset dictionary specification. The tables must be either - `pandas.DataFrame` or file path references. + A dataset spec. The tables must be either `pandas.DataFrame` or file path + references. output_dir: str, optional _Only for file datasets:_ The output directory for the sorted files. + + Examples + -------- + See the following functions of the ``samples.py`` documentation script: + - `samples.sort_data_tables_mt()` """ # Check the types if not is_dict_like(ds_spec): @@ -82,9 +88,38 @@ def _sort_file_table(table, sep, header, output_dir): return out_data_source +# Note: We build the splits with lists and itertools.chain to avoid pylint warning about +# unbalanced-tuple-unpacking. See issue https://github.com/pylint-dev/pylint/issues/5671 + + def train_test_split_dataset( ds_spec, target_column=None, test_size=0.25, output_dir=None, **kwargs ): + """Splits a dataset spec into train and test + + Parameters + ---------- + ds_spec : ``dict`` + A dataset spec. The tables must be either `pandas.DataFrame` or file path + references. + target_column : :external:term:`array-like`, optional + The target values. + test_size : float, default 0.25 + The proportion of the dataset (between 0.0 and 1.0) to be included in the test + split. + output_dir : str, optional + *Only for file datasets:* The output directory for the split files. + ...
: + Other optional parameters for `sklearn.model_selection.train_test_split` + + + Examples + -------- + See the following functions of the ``samples_sklearn.py`` documentation script: + - `samples_sklearn.khiops_classifier_multitable_star` + - `samples_sklearn.khiops_classifier_multitable_star_file` + - `samples_sklearn.khiops_classifier_multitable_snowflake` + """ # Check the types if not is_dict_like(ds_spec): raise TypeError(type_error_message("ds_spec", ds_spec, "dict-like")) @@ -129,20 +164,12 @@ def train_test_split_dataset( train_ds, test_ds = _train_test_split_file_dataset(ds, test_size, output_dir) # Create the return tuple - # Note: We use `tuple` to avoid pylint warning about unbalanced-tuple-unpacking - if target_column is None: - split = tuple([train_ds.to_spec(), test_ds.to_spec()]) - else: - split = tuple( - [ - train_ds.to_spec(), - test_ds.to_spec(), - train_target_column, - test_target_column, - ] - ) + split_ds_specs = [train_ds.to_spec(), test_ds.to_spec()] + split_target_columns = [] + if target_column is not None: + split_target_columns = [train_target_column, test_target_column] - return split + return itertools.chain(split_ds_specs, split_target_columns) def _train_test_split_in_memory_dataset( @@ -194,15 +221,12 @@ def _train_test_split_in_memory_dataset( ) # Build the return value - # Note: We use `tuple` to avoid pylint warning about unbalanced-tuple-unpacking - if target_column is None: - return_tuple = tuple([train_ds, test_ds]) - else: - return_tuple = tuple( - [train_ds, test_ds, train_target_column, test_target_column] - ) + split_dss = [train_ds, test_ds] + split_targets = [] + if target_column is not None: + split_targets = [train_target_column, test_target_column] - return return_tuple + return itertools.chain(split_dss, split_targets) def _train_test_split_file_dataset(ds, test_size, output_dir): @@ -264,5 +288,4 @@ def _train_test_split_file_dataset(ds, test_size, output_dir): sampling_mode="Exclude sample", ) - # Note: We use 
`tuple` to avoid pylint warning about unbalanced-tuple-unpacking - return tuple([split_dss["train"], split_dss["test"]]) + return itertools.chain(split_dss.values())