diff --git a/khiops/utils/helpers.py b/khiops/utils/helpers.py index 39fb6a28..3551bc99 100644 --- a/khiops/utils/helpers.py +++ b/khiops/utils/helpers.py @@ -2,6 +2,8 @@ import os +from sklearn.model_selection import train_test_split + from khiops import core as kh from khiops.core.internals.common import is_dict_like, type_error_message from khiops.utils.dataset import Dataset, FileTable, PandasTable @@ -62,7 +64,6 @@ def _sort_df_table(table): def _sort_file_table(table, sep, header, output_dir): assert isinstance(table, FileTable), type_error_message("table", table, FileTable) - domain = kh.DictionaryDomain() dictionary = table.create_khiops_dictionary() domain.add_dictionary(dictionary) @@ -79,3 +80,189 @@ def _sort_file_table(table, sep, header, output_dir): ) return out_data_source + + +def train_test_split_dataset( + ds_spec, target_column=None, test_size=0.25, output_dir=None, **kwargs +): + # Check the types + if not is_dict_like(ds_spec): + raise TypeError(type_error_message("ds_spec", ds_spec, "dict-like")) + + # Build the dataset for the feature table + ds = Dataset(ds_spec) + + # Check the parameter coherence + if not ds.is_in_memory(): + if target_column is not None: + raise ValueError("'target_column' cannot be used with file path datasets") + if output_dir is None: + raise ValueError("'output_dir' must be specified for file path datasets") + if not isinstance(output_dir, str): + raise TypeError(type_error_message("output_dir", output_dir, str)) + + # Perform the split for each type of dataset + if ds.is_in_memory(): + # Obtain the keys for the other test_train_split function + sklearn_split_params = {} + for param in ("train_size", "random_state", "shuffle", "stratify"): + if param in kwargs: + sklearn_split_params[param] = kwargs[param] + + if target_column is None: + train_ds, test_ds = _train_test_split_in_memory_dataset( + ds, + target_column, + test_size=test_size, + split_params=sklearn_split_params, + ) + else: + train_ds, test_ds, train_target_column, test_target_column = ( + _train_test_split_in_memory_dataset( + ds, + target_column, + test_size=test_size, + split_params=sklearn_split_params, + ) + ) + else: + train_ds, test_ds = _train_test_split_file_dataset(ds, test_size, output_dir) + + # Create the return tuple + # Note: We use `tuple` to avoid pylint warning about unbalanced-tuple-unpacking + if target_column is None: + split = tuple([train_ds.to_spec(), test_ds.to_spec()]) + else: + split = tuple( + [ + train_ds.to_spec(), + test_ds.to_spec(), + train_target_column, + test_target_column, + ] + ) + + return split + + +def _train_test_split_in_memory_dataset( + ds, target_column, test_size, split_params=None +): + # Create shallow copies of the feature dataset + train_ds = ds.copy() + test_ds = ds.copy() + + # Split the main table and the target (if any) + if target_column is None: + train_ds.main_table.data_source, test_ds.main_table.data_source = ( + train_test_split( + ds.main_table.data_source, test_size=test_size, **split_params + ) + ) + else: + ( + train_ds.main_table.data_source, + test_ds.main_table.data_source, + train_target_column, + test_target_column, + ) = train_test_split( + ds.main_table.data_source, + target_column, + test_size=test_size, + **split_params, + ) + + # Split the secondary tables tables + # Note: The tables are traversed in BFS + todo_relations = [ + relation for relation in ds.relations if relation[0] == ds.main_table.name + ] + while todo_relations: + current_parent_table_name, current_child_table_name, _ = 
todo_relations.pop(0) + for relation in ds.relations: + parent_table_name, _, _ = relation + if parent_table_name == current_child_table_name: + todo_relations.append(relation) + + for new_ds in (train_ds, test_ds): + origin_child_table = ds.get_table(current_child_table_name) + new_child_table = new_ds.get_table(current_child_table_name) + new_parent_table = new_ds.get_table(current_parent_table_name) + new_parent_key_cols_df = new_parent_table.data_source[new_parent_table.key] + new_child_table.data_source = new_parent_key_cols_df.merge( + origin_child_table.data_source, on=new_parent_table.key + ) + + # Build the return value + # Note: We use `tuple` to avoid pylint warning about unbalanced-tuple-unpacking + if target_column is None: + return_tuple = tuple([train_ds, test_ds]) + else: + return_tuple = tuple( + [train_ds, test_ds, train_target_column, test_target_column] + ) + + return return_tuple + + +def _train_test_split_file_dataset(ds, test_size, output_dir): + domain = ds.create_khiops_dictionary_domain() + secondary_data_paths = domain.extract_data_paths(ds.main_table.name) + additional_data_tables = {} + output_additional_data_tables = { + "train": {}, + "test": {}, + } + # Initialize the split datasets as copies of the original one + split_dss = { + "train": ds.copy(), + "test": ds.copy(), + } + for split, split_ds in split_dss.items(): + split_ds.main_table.data_source = os.path.join( + output_dir, split, f"{split_ds.main_table.name}.txt" + ) + + for data_path in secondary_data_paths: + dictionary = domain.get_dictionary_at_data_path(data_path) + table = ds.get_table(dictionary.name) + additional_data_tables[data_path] = table.data_source + for ( + split, + split_output_additional_data_tables, + ) in output_additional_data_tables.items(): + data_table_path = os.path.join(output_dir, split, f"{table.name}.txt") + split_output_additional_data_tables[data_path] = data_table_path + split_dss[split].get_table(table.name).data_source = data_table_path + + kh.deploy_model( + domain, + ds.main_table.name, + ds.main_table.data_source, + split_dss["train"].main_table.data_source, + additional_data_tables=additional_data_tables, + output_additional_data_tables=output_additional_data_tables["train"], + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, + sample_percentage=100.0 * (1 - test_size), + sampling_mode="Include sample", + ) + kh.deploy_model( + domain, + ds.main_table.name, + ds.main_table.data_source, + split_dss["test"].main_table.data_source, + additional_data_tables=additional_data_tables, + output_additional_data_tables=output_additional_data_tables["test"], + header_line=ds.header, + field_separator=ds.sep, + output_header_line=ds.header, + output_field_separator=ds.sep, + sample_percentage=100.0 * (1 - test_size), + sampling_mode="Exclude sample", + ) + + # Note: We use `tuple` to avoid pylint warning about unbalanced-tuple-unpacking + return tuple([split_dss["train"], split_dss["test"]]) diff --git a/tests/test_helper_functions.py b/tests/test_helper_functions.py index b69d8331..233d720f 100644 --- a/tests/test_helper_functions.py +++ b/tests/test_helper_functions.py @@ -14,7 +14,7 @@ from khiops.core.dictionary import DictionaryDomain from khiops.core.helpers import build_multi_table_dictionary_domain -from khiops.utils.helpers import sort_dataset +from khiops.utils.helpers import sort_dataset, train_test_split_dataset class KhiopsHelperFunctions(unittest.TestCase): @@ -101,26 +101,29 @@ def 
test_build_multi_table_dictionary_domain(self): def test_sort_dataset_dataframe(self): """Tests that the sort_dataset function works for dataframe datasets""" # Create the fixture dataset - clients_df = pd.read_csv(io.StringIO(UNSORTED_TEST_CLIENTS_CSV)) - calls_df = pd.read_csv(io.StringIO(UNSORTED_TEST_CALLS_CSV)) + clients_df = pd.read_csv(io.StringIO(UNSORTED_CLIENTS_CSV)) + calls_df = pd.read_csv(io.StringIO(UNSORTED_CALLS_CSV)) + connections_df = pd.read_csv(io.StringIO(UNSORTED_CONNECTIONS_CSV)) ds_spec = { "main_table": "clients", "tables": { "clients": (clients_df, ["id"]), "calls": (calls_df, ["id", "call_id"]), + "connections": (connections_df, ["id", "call_id"]), }, - "relations": [("clients", "calls", False)], + "relations": [("clients", "calls", False), ("calls", "connections", False)], } # Call the sort_dataset function sorted_ds_spec = sort_dataset(ds_spec) ref_sorted_table_dfs = { - "clients": pd.read_csv(io.StringIO(TEST_CLIENTS_CSV)), - "calls": pd.read_csv(io.StringIO(TEST_CALLS_CSV)), + "clients": pd.read_csv(io.StringIO(CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(CALLS_CSV)), + "connections": pd.read_csv(io.StringIO(CONNECTIONS_CSV)), } # Check that the structure of the sorted dataset - self._assert_sorted_dataset_keeps_structure(ds_spec, sorted_ds_spec) + self._assert_dataset_keeps_structure(ds_spec, sorted_ds_spec) # Check that the table specs are the equivalent and the tables are sorted for table_name in ds_spec["tables"]: @@ -132,25 +135,36 @@ def test_sort_dataset_dataframe(self): def test_sort_dataset_file(self): """Tests that the sort_dataset function works for file datasets""" - # Create a execution context with temporary files and directories + # Create a execution context for temporary files and directories with contextlib.ExitStack() as exit_stack: # Create temporary files and a temporary directory clients_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) calls_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + connections_csv_file = exit_stack.enter_context( + tempfile.NamedTemporaryFile() + ) tmp_dir = exit_stack.enter_context(tempfile.TemporaryDirectory()) # Create the fixture dataset - clients_csv_file.write(bytes(UNSORTED_TEST_CLIENTS_CSV, encoding="utf8")) - calls_csv_file.write(bytes(UNSORTED_TEST_CALLS_CSV, encoding="utf8")) + clients_csv_file.write(bytes(UNSORTED_CLIENTS_CSV, encoding="ascii")) + calls_csv_file.write(bytes(UNSORTED_CALLS_CSV, encoding="ascii")) + connections_csv_file.write( + bytes(UNSORTED_CONNECTIONS_CSV, encoding="ascii") + ) clients_csv_file.flush() calls_csv_file.flush() + connections_csv_file.flush() ds_spec = { "main_table": "clients", "tables": { "clients": (clients_csv_file.name, ["id"]), "calls": (calls_csv_file.name, ["id", "call_id"]), + "connections": (connections_csv_file.name, ["id", "call_id"]), }, - "relations": [("clients", "calls", False)], + "relations": [ + ("clients", "calls", False), + ("calls", "connections", False), + ], "format": (",", True), } @@ -158,10 +172,14 @@ def test_sort_dataset_file(self): sorted_ds_spec = sort_dataset(ds_spec, output_dir=tmp_dir) # Check that the structure of the sorted dataset - self._assert_sorted_dataset_keeps_structure(ds_spec, sorted_ds_spec) + self._assert_dataset_keeps_structure(ds_spec, sorted_ds_spec) # Check that the table specs are the equivalent and the tables are sorted - ref_sorted_tables = {"clients": TEST_CLIENTS_CSV, "calls": TEST_CALLS_CSV} + ref_sorted_tables = { + "clients": CLIENTS_CSV, + "calls": CALLS_CSV, + 
"connections": CONNECTIONS_CSV, + } for table_name, _ in ds_spec["tables"].items(): # Read the contents of the sorted table to a list of strings sorted_table_spec = sorted_ds_spec["tables"][table_name] @@ -178,22 +196,157 @@ def test_sort_dataset_file(self): # Check that the sorted table is equal to the reference self.assertEqual(ref_sorted_table, sorted_table) - def _assert_sorted_dataset_keeps_structure(self, ds_spec, sorted_ds_spec): - """Asserts that the sorted dataset keeps the structure of the input dataset + def test_traint_test_split_dataset_dataframe(self): + """Tests that the train_test_split_dataset function works for df datasets""" + # Create the fixture dataset + clients_df = pd.read_csv(io.StringIO(CLIENTS_CSV)) + calls_df = pd.read_csv(io.StringIO(CALLS_CSV)) + connections_df = pd.read_csv(io.StringIO(CONNECTIONS_CSV)) + ds_spec = { + "main_table": "clients", + "tables": { + "clients": (clients_df.drop("class", axis=1), ["id"]), + "calls": (calls_df, ["id", "call_id"]), + "connections": (connections_df, ["id", "call_id"]), + }, + "relations": [("clients", "calls", False), ("calls", "connections", False)], + } + y = clients_df["class"] + + # Execute the train/test split function + ds_spec_train, ds_spec_test, y_train, y_test = train_test_split_dataset( + ds_spec, y, test_size=0.5, random_state=31614 + ) + + # Check that the target are the same as the reference + ref_y_train = pd.read_csv(io.StringIO(TRAIN_DF_TARGET_CSV))["class"] + ref_y_test = pd.read_csv(io.StringIO(TEST_DF_TARGET_CSV))["class"] + self._assert_series_equal(ref_y_train, y_train.reset_index()["class"]) + self._assert_series_equal(ref_y_test, y_test.reset_index()["class"]) + + # Check that the dataset spec structure is the same + self._assert_dataset_keeps_structure(ds_spec_train, ds_spec) + self._assert_dataset_keeps_structure(ds_spec_test, ds_spec) + + # Check that the table contents match those of the references + split_ds_specs = { + "train": ds_spec_train, + "test": ds_spec_test, + } + ref_table_dfs = { + "train": { + "clients": pd.read_csv(io.StringIO(TRAIN_DF_CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(TRAIN_DF_CALLS_CSV)), + "connections": pd.read_csv(io.StringIO(TRAIN_DF_CONNECTIONS_CSV)), + }, + "test": { + "clients": pd.read_csv(io.StringIO(TEST_DF_CLIENTS_CSV)), + "calls": pd.read_csv(io.StringIO(TEST_DF_CALLS_CSV)), + "connections": pd.read_csv(io.StringIO(TEST_DF_CONNECTIONS_CSV)), + }, + } + for split, ref_tables in ref_table_dfs.items(): + for table_name in ds_spec["tables"]: + with self.subTest(split=split, table_name=table_name): + self._assert_frame_equal( + split_ds_specs[split]["tables"][table_name][0].reset_index( + drop=True + ), + ref_tables[table_name].reset_index(drop=True), + ) + + def test_train_test_split_dataset_file(self): + """Tests that the train_test_split_dataset function works for file datasets""" + # Create a execution context for temporary files and directories + with contextlib.ExitStack() as exit_stack: + # Create temporary files and a temporary directory + clients_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + calls_csv_file = exit_stack.enter_context(tempfile.NamedTemporaryFile()) + connections_csv_file = exit_stack.enter_context( + tempfile.NamedTemporaryFile() + ) + tmp_dir = exit_stack.enter_context(tempfile.TemporaryDirectory()) + + # Create the fixture dataset + clients_csv_file.write(bytes(CLIENTS_CSV, encoding="ascii")) + calls_csv_file.write(bytes(CALLS_CSV, encoding="ascii")) + connections_csv_file.write(bytes(CONNECTIONS_CSV, 
encoding="ascii"))
+            clients_csv_file.flush()
+            calls_csv_file.flush()
+            connections_csv_file.flush()
+            ds_spec = {
+                "main_table": "clients",
+                "tables": {
+                    "clients": (clients_csv_file.name, ["id"]),
+                    "calls": (calls_csv_file.name, ["id", "call_id"]),
+                    "connections": (connections_csv_file.name, ["id", "call_id"]),
+                },
+                "relations": [
+                    ("clients", "calls", False),
+                    ("calls", "connections", False),
+                ],
+                "format": (",", True),
+            }
+
+            # Call the train_test_split_dataset function
+            train_ds_spec, test_ds_spec = train_test_split_dataset(
+                ds_spec, test_size=0.5, output_dir=tmp_dir
+            )
+            split_ds_specs = {"train": train_ds_spec, "test": test_ds_spec}
+
+            # Check the structure of the split datasets
+            self._assert_dataset_keeps_structure(ds_spec, train_ds_spec)
+            self._assert_dataset_keeps_structure(ds_spec, test_ds_spec)
+
+            # Check that the split tables are equal to the references
+            ref_split_tables = {
+                "train": {
+                    "clients": TRAIN_FILE_CLIENTS_CSV,
+                    "calls": TRAIN_FILE_CALLS_CSV,
+                    "connections": TRAIN_FILE_CONNECTIONS_CSV,
+                },
+                "test": {
+                    "clients": TEST_FILE_CLIENTS_CSV,
+                    "calls": TEST_FILE_CALLS_CSV,
+                    "connections": TEST_FILE_CONNECTIONS_CSV,
+                },
+            }
+            for split, split_ds_spec in split_ds_specs.items():
+                for table_name, _ in ds_spec["tables"].items():
+                    # Read the contents of the split table to a list of strings
+                    split_table_spec = split_ds_spec["tables"][table_name]
+                    split_table_file = exit_stack.enter_context(
+                        open(split_table_spec[0], encoding="ascii")
+                    )
+                    split_table = split_table_file.readlines()
+
+                    # Transform the reference table string to a list of strings
+                    ref_split_table = ref_split_tables[split][table_name].splitlines(
+                        keepends=True
+                    )
+
+                    # Check that the split table is equal to the reference
+                    self.assertEqual(split_table, ref_split_table)
+
+    def _assert_dataset_keeps_structure(self, ds_spec, ref_ds_spec):
+        """Asserts that the input dataset has the same structure as the reference

         It does not check the contents of the tables.
         """
         # Check that the spec dictionary is the same excluding the tables
-        self.assertIn("main_table", sorted_ds_spec)
-        self.assertIn("tables", sorted_ds_spec)
-        self.assertIn("relations", sorted_ds_spec)
-        self.assertEqual(ds_spec["main_table"], sorted_ds_spec["main_table"])
-        self.assertEqual(ds_spec["relations"], sorted_ds_spec["relations"])
-        self.assertEqual(ds_spec["tables"].keys(), sorted_ds_spec["tables"].keys())
+        self.assertIn("main_table", ref_ds_spec)
+        self.assertIn("tables", ref_ds_spec)
+        self.assertIn("relations", ref_ds_spec)
+        self.assertEqual(ds_spec["main_table"], ref_ds_spec["main_table"])
+        self.assertEqual(ds_spec["relations"], ref_ds_spec["relations"])
+        self.assertEqual(ds_spec["tables"].keys(), ref_ds_spec["tables"].keys())
+        if "format" in ref_ds_spec:
+            self.assertIn("format", ds_spec)
+            self.assertEqual(ds_spec["format"], ref_ds_spec["format"])

         # Check that the table keys are equal
         for table_name, table_spec in ds_spec["tables"].items():
-            self.assertEqual(table_spec[1], sorted_ds_spec["tables"][table_name][1])
+            self.assertEqual(table_spec[1], ref_ds_spec["tables"][table_name][1])

     def _assert_frame_equal(self, ref_df, out_df):
         """Wrapper for the assert_frame_equal pandas function
@@ -210,19 +363,37 @@ def _assert_frame_equal(self, ref_df, out_df):
         if failure_error is not None:
             self.fail(failure_error)

+    def _assert_series_equal(self, ref_series, out_series):
+        """Wrapper for the assert_series_equal pandas function
+
+        In case of failure of assert_series_equal we capture the AssertionError
+        thrown by it and make a unittest call to fail. This reports the error found
+        by assert_series_equal while avoiding a double thrown exception.
+        """
+        failure_error = None
+        try:
+            pd.testing.assert_series_equal(ref_series, out_series)
+        except AssertionError as error:
+            failure_error = error
+        if failure_error is not None:
+            self.fail(failure_error)
+
 # pylint: disable=line-too-long
 # fmt: off
-TEST_CLIENTS_CSV = """
-id,name,phone,email,address,numberrange,time,date
-1,Hakeem Wilkinson,1-352-535-7028,at.pede@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024"
-10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025"
-13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024"
-4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025"
-7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023"
+
+# Test data
+
+CLIENTS_CSV = """
+id,name,phone,email,address,numberrange,time,date,class
+1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit.
Rd.,2,3:02 PM,"May 1, 2024",1 +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0 +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 """.lstrip() -TEST_CALLS_CSV = """ +CALLS_CSV = """ id,call_id,duration 1,1,38 1,20,29 @@ -235,24 +406,204 @@ def _assert_frame_equal(self, ref_df, out_df): 7,4,339 """.lstrip() -UNSORTED_TEST_CLIENTS_CSV = """ +CONNECTIONS_CSV = """ +id,call_id,connection_ip +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +10,2,170.05.79.41 +10,2,118.45.57.51 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +7,4,16.66.64.13 +7,4,15.13.69.18 +""".lstrip() + +UNSORTED_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0 +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0 +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +""".lstrip() + +UNSORTED_CALLS_CSV = """ +id,call_id,duration +1,1,38 +10,2,7 +13,25,329 +4,2,543 +13,30,8 +13,3,1 +4,14,48 +1,20,29 +7,4,339 +""".lstrip() + +UNSORTED_CONNECTIONS_CSV = """ +id,call_id,connection_ip +13,25,193.23.02.67 +1,1,277.1.56.30 +4,14,16.56.66.16 +13,25,146.74.18.88 +13,25,118.41.87.47 +1,1,147.43.67.35 +4,14,19.30.36.57 +1,20,199.44.70.12 +10,2,170.05.79.41 +1,20,169.51.97.96 +10,2,118.45.57.51 +13,25,161.51.79.60 +13,3,115.45.02.58 +4,14,15.16.40.67 +1,1,164.27.26.50 +7,4,16.66.64.13 +13,30,12.115.90.93 +7,4,15.13.69.18 +4,2,10.189.71.73 +4,2,10.6.76.93 +""".lstrip() + +TRAIN_DF_CLIENTS_CSV = """ id,name,phone,email,address,numberrange,time,date -1,Hakeem Wilkinson,1-352-535-7028,at.pede@outlook.org,247-2921 Elit. 
Rd.,2,3:02 PM,"May 1, 2024" -13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" 7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023" +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024" +""".lstrip() + +TRAIN_DF_CALLS_CSV = """ +id,call_id,duration +7,4,339 +13,25,329 +13,3,1 +13,30,8 +""".lstrip() + +TRAIN_DF_TARGET_CSV = """ +class +1 +0 +""".lstrip() + +TRAIN_DF_CONNECTIONS_CSV = """ +id,call_id,connection_ip +7,4,16.66.64.13 +7,4,15.13.69.18 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +""".lstrip() + + +TEST_DF_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date 4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025" 10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025" +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024" """.lstrip() -UNSORTED_TEST_CALLS_CSV = """ +TEST_DF_TARGET_CSV = """ +class +1 +0 +1 +""".lstrip() + + +TEST_DF_CALLS_CSV = """ id,call_id,duration +4,14,48 +4,2,543 +10,2,7 1,1,38 +1,20,29 +""".lstrip() + +TEST_DF_CONNECTIONS_CSV = """ +id,call_id,connection_ip +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +10,2,170.05.79.41 +10,2,118.45.57.51 +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +""".lstrip() + +TRAIN_FILE_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +10,Axel Holman,1-340-743-8860,est@google.com,Ap #737-7185 Donec St.,9,1:17 PM,"Jan 8, 2025",0 +13,Armando Cleveland,(520) 285-3188,amet.consectetuer@icloud.edu,Ap #167-1519 Tempus Avenue,8,1:50 PM,"Jul 24, 2024",0 +4,Edward Miles,(959) 886-5744,in.nec@outlook.edu,2184 Gravida Road,6,10:02 PM,"Mar 30, 2025",1 +""".lstrip() + +TRAIN_FILE_CALLS_CSV = """ +id,call_id,duration 10,2,7 13,25,329 -4,2,543 -13,30,8 13,3,1 +13,30,8 4,14,48 +4,2,543 +""".lstrip() + +TRAIN_FILE_CONNECTIONS_CSV = """ +id,call_id,connection_ip +10,2,170.05.79.41 +10,2,118.45.57.51 +13,25,193.23.02.67 +13,25,146.74.18.88 +13,25,118.41.87.47 +13,25,161.51.79.60 +13,3,115.45.02.58 +13,30,12.115.90.93 +4,14,16.56.66.16 +4,14,19.30.36.57 +4,14,15.16.40.67 +4,2,10.189.71.73 +4,2,10.6.76.93 +""".lstrip() + + +TEST_FILE_CLIENTS_CSV = """ +id,name,phone,email,address,numberrange,time,date,class +1,Hakeem Wilkinson,1-352-535-7028,at.pete@outlook.org,247-2921 Elit. Rd.,2,3:02 PM,"May 1, 2024",1 +7,Aurora Valentine,1-838-806-6257,etiam.gravida.molestie@yahoo.com,Ap #923-3118 Ante Ave,8,4:02 AM,"Dec 12, 2023",1 +""".lstrip() + +TEST_FILE_CALLS_CSV = """ +id,call_id,duration +1,1,38 1,20,29 7,4,339 """.lstrip() + +TEST_FILE_CONNECTIONS_CSV = """ +id,call_id,connection_ip +1,1,277.1.56.30 +1,1,147.43.67.35 +1,1,164.27.26.50 +1,20,199.44.70.12 +1,20,169.51.97.96 +7,4,16.66.64.13 +7,4,15.13.69.18 +""".lstrip()
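Usage note: the tests above exercise the new helper end to end; as a quick orientation, here is a minimal sketch of how train_test_split_dataset can be called on a dataframe dataset. The table contents, column names and parameter values below are illustrative only; the spec format follows the one exercised in the tests.

    import pandas as pd

    from khiops.utils.helpers import train_test_split_dataset

    # Toy multi-table data following the clients/calls layout used in the tests above
    clients_df = pd.DataFrame(
        {"id": [1, 2, 3, 4], "name": ["a", "b", "c", "d"], "class": [0, 1, 0, 1]}
    )
    calls_df = pd.DataFrame(
        {"id": [1, 1, 2, 3, 4], "call_id": [1, 2, 1, 1, 1], "duration": [10, 5, 7, 3, 8]}
    )

    # Dataset spec in the dict format accepted by the khiops dataset helpers
    ds_spec = {
        "main_table": "clients",
        "tables": {
            "clients": (clients_df.drop("class", axis=1), ["id"]),
            "calls": (calls_df, ["id", "call_id"]),
        },
        "relations": [("clients", "calls", False)],
    }
    y = clients_df["class"]

    # The main table is split with sklearn's train_test_split (extra keyword
    # arguments such as random_state are forwarded to it); the secondary tables
    # are then filtered down to the keys present in each split's main table
    train_ds_spec, test_ds_spec, y_train, y_test = train_test_split_dataset(
        ds_spec, y, test_size=0.25, random_state=42
    )

For file path datasets, target_column is not supported and output_dir must be given; in that case the call returns only the train and test dataset specs, whose tables point to the files written under output_dir.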