diff --git a/data/dicts/mexico_abbreviations.csv b/data/dicts/mexico_abbreviations.csv index 0a0cbf31..4bd38473 100644 --- a/data/dicts/mexico_abbreviations.csv +++ b/data/dicts/mexico_abbreviations.csv @@ -8,4 +8,100 @@ BLVD,BOULEVARD LT,LOTE MZ,MANZANA CDMX,Ciudad de México -DF,Distrito Federal \ No newline at end of file +DF,Distrito Federal +AGU,Aguascalientes +BCN,Baja California +BCS,Baja California Sur +CAM,Campeche +CHP,Chiapas +CHH,Chihuahua +COA,Coahuila +CL,Colima +DUR,Durango +MEX,Estado de México +GTO,Guanajuato +GRO,Guerrero +HGO,Hidalgo +JAL,Jalisco +MIC,Michoacán +MOR,Morelos +NAY,Nayarit +NLE,Nuevo León +OAX,Oaxaca +PUE,Puebla +QRO,Querétaro +QR,Quintana Roo +SLP,San Luis Potosí +SIN,Sinaloa +SON,Sonora +TAB,Tabasco +TAM,Tamaulipas +TLAX,Tlaxcala +VER,Veracruz +YUC,Yucatán +ZAC,Zacatecas +CDMX,Ciudad de México +ARS,Aguascalientes +AG,Aguascalientes +B.C,Baja California +BC,Baja California +B.C.S,Baja California Sur +BCS,Baja California Sur +Camp,Campeche +CM,Campeche +Chis,Chiapas +CS,Chiapas +Chih,Chihuahua +CH,Chihuahua +Coah,Coahuila +CO,Coahuila +Col,Colima +CL,Colima +CDMX,Ciudad de México +DF,Ciudad de México +Dgo,Durango +DG,Durango +Gto,Guanajuato +GT,Guanajuato +Gro,Guerrero +GR,Guerrero +Hgo,Hidalgo +HG,Hidalgo +Jal,Jalisco +JA,Jalisco +Edomex,Mexico +MEX,Mexico +Mich,Michoacán +MI,Michoacán +Mor,Morelos +MO,Morelos +Nay,Nayarit +NA,Nayarit +N.L,Nuevo León +NL,Nuevo León +Oax,Oaxaca +OA,Oaxaca +Pue,Puebla +PU,Puebla +Qro,Querétaro +QT,Querétaro +Q.R,Quintana Roo +QR,Quintana Roo +S.L.P,San Luis Potosí +SL,San Luis Potosí +Sin,Sinaloa +SI,Sinaloa +Son,Sonora +SO,Sonora +Tab,Tabasco +TB,Tabasco +Tamps,Tamaulipas +TM,Tamaulipas +Tlax,Tlaxcala +TL,Tlaxcala +Ver,Veracruz +VE,Veracruz +Yuc,Yucatán +YU,Yucatán +Zac,Zacatecas +ZA,Zacatecas diff --git a/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl index 73da2d97..2b582ac3 100644 Binary files a/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl and b/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl new file mode 100644 index 00000000..9f734744 Binary files /dev/null and b/data/models/RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl new file mode 100644 index 00000000..0a4c278a Binary files /dev/null and b/data/models/RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl new file mode 100644 index 00000000..719cacf1 Binary files /dev/null and b/data/models/RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl index 85585477..e7e421ae 100644 Binary files 
a/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl and b/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/normalization_absolutes_rforest_ksmash_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_regex_normal.csv new file mode 100644 index 00000000..8d3908d4 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy,feature_ks_repeated_bigram_ratio,feature_ks_unique_char_ratio +15.03125,30.0,30.0,30.0,4.735393717824877,1.9491525423728815,2.0 diff --git a/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal.csv new file mode 100644 index 00000000..239a4e65 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy,feature_ks_repeated_bigram_ratio,feature_ks_unique_char_ratio +15.03125,30.0,30.0,30.0,4.680689288944333,1.9491525423728815,2.0 diff --git a/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_wembedding_regex_normal.csv new file mode 100644 index 00000000..b955f54d --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy,feature_ks_repeated_bigram_ratio +15.03125,30.0,30.0,30.0,4.680689288944333,1.9491525423728815 diff --git a/data/models/normalization_absolutes_rforest_ksmash_shannon_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_shannon_wembedding_regex_normal.csv new file mode 100644 index 00000000..ef125b19 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_shannon_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy +15.03125,30.0,30.0,30.0,4.779780045430954 diff --git a/data/models/normalization_absolutes_rforest_ksmash_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_wembedding_regex_normal.csv new file mode 100644 index 00000000..073ac606 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_repeated_bigram_ratio,feature_ks_unique_char_ratio +15.03125,30.0,30.0,30.0,1.9491525423728815,2.0 diff --git a/examples/MEXICO_predict_sets.ipynb b/examples/MEXICO_predict_sets.ipynb 
new file mode 100644 index 00000000..44af215e --- /dev/null +++ b/examples/MEXICO_predict_sets.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Illustrative Examples of Library Function Usage\n", + "\n", + "This notebook is designed for Hygia library users and provides examples of utilizing the main functions in the library. It is one of the resources offered by the Hygia community to support new users. For further information, please visit our documentation at https://hygia-org.github.io/hygia/.\n", + "\n", + "The example pipeline demonstrated in this notebook covers the following steps: importing dependencies, loading the model, pre-processing the data (e.g., concatenating and creating new columns), using the prediction and model functions, and finally saving the model results.\n", + "\n", + "## Imports and class instantiations\n", + "\n", + "To take advantage of the library's functions and proceed with the pipeline, you will first need to import the Pandas and Hygia libraries.\n", + "\n", + "As a starting point, when first using the library, it is recommended to initialize the pre-processing and feature engineering classes. This will set the foundation for selecting the desired model stored in the .pkl format in the folder (/data/models/).\n", + "\n", + "Before utilizing the library functions, it is important to familiarize yourself with the pre-processing and feature engineering classes, which play a crucial role in the data preparation process. Once you have a clear understanding of these classes, you can select the model that best fits your needs from the available options stored in the folder (/data/models/). With the right model selected, you can execute the pipeline and achieve the desired results."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning feature engineering with configs below...\u001b[37m\n", + "\u001b[1mlanguage -> \u001b[22mes\n", + "\u001b[1mdimensions -> \u001b[22m25\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import hygia as hg\n", + "\n", + "# Chose your model based on the configs sets below\n", + "set_0 = {\n", + " 'set_name': 'rforest_ksmash_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': True,\n", + " 'model_output': 'RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n", + "}\n", + "set_1 = {\n", + " 'set_name': 'rforest_ksmash_regex_normal',\n", + " 'ignore_word_embedding': True,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "set_2 = {\n", + " 'set_name': 'rforest_ksmash_shannon_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_3 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': True,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_4 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "chosen_set = set_3\n", + "\n", + "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n", + "augment_data = hg.AugmentData(country=\"MEXICO\")\n", + "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\",\n", + " ignore_word_embedding=chosen_set.get('ignore_word_embedding'),\n", + " ignore_shannon_entropy=chosen_set.get('ignore_shannon_entropy'),\n", + " ignore_repeated_bigram_ratio=chosen_set.get('ignore_repeated_bigram_ratio'),\n", + " ignore_unique_char_ratio=chosen_set.get('ignore_unique_char_ratio'),\n", + " )\n", + "rf_model = hg.RandomForestModel(f\"../data/models/{chosen_set['model_output']}\",\n", + " normalization_absolutes_file=f\"../data/models/normalization_absolutes_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n", + "\n", + "To showcase the capabilities of the Hygia library, we have provided a small sample of context-free data. However, the library is designed to handle a wide range of data types and can be customized to meet the unique needs of different datasets.\n", + "\n", + "We have leveraged the pandas library to read in the sample data, which is stored in a .csv file format. The following code block provides an example of how to import the pandas library and read in the sample data file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n", + "df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Augment Data with context validations" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = augment_data.augment_data(df, zipcode_column_name='ZIP_CODE_L')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add new columns\n", + "\n", + "The Hygia library is designed to meet the needs of data scientists, and as such, it generates new columns in the data provided to better facilitate the data analysis process. This helps users keep track of the pre-processing steps taken on the data and the features generated. Two distinct types of columns are generated:\n", + "\n", + "1. Concatenate address\n", + "2. All features columns:\n", + " - Key Smash\n", + " - Regex\n", + " - Word Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n", + "handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n", + "extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + } + ], + "source": [ + "concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n", + "df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n", + "df = feature_engineering.extract_features(df, concatened_column_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check new columns names" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_9_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_10_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_11_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_12_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_13_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_14_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_15_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 
'feature_we_16_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_17_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_18_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_19_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_20_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_21_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_empty_concat_STREET_ADDRESS_1_STREET_ADDRESS_2']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n", + "model_features_columns = all_features_columns\n", + "model_features_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predict using pre-trained model\n", + "\n", + "This section shows how to run predictions on your data with a pre-trained model, using pandas to inspect the results. It illustrates how the Hygia library can generate new information from the data at hand, and highlights that it can be used in combination with other libraries such as pandas." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning model...\u001b[37m\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0 2512460\n", + "1.0 7836\n", + "Name: prediction_is_key_smash, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['prediction_is_key_smash'] = rf_model.predict(df[model_features_columns], concatened_column_name)\n", + "df['prediction_is_key_smash'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save predicted data\n", + "\n", + "Finally, an example of how to save the data and the model results stored in the prediction field."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'prediction_is_key_smash']] \\\n", + " .drop_duplicates(subset=[concatened_column_name]) \\\n", + " .to_csv(f\"../data/tmp/prediction_{chosen_set['set_name']}.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "acd904f7927719ac3bd428a31e6feadbc6c298bbba280a82d6227cca902ecf8e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/MEXICO_retrain_predict_example.ipynb b/examples/MEXICO_retrain_predict_example.ipynb index 71b8a78a..6ef08e15 100644 --- a/examples/MEXICO_retrain_predict_example.ipynb +++ b/examples/MEXICO_retrain_predict_example.ipynb @@ -307,13 +307,15 @@ ], "source": [ "key_smash_thresholds = {\n", - " 'count_sequence_squared_vowels': 1.00,\n", - " 'count_sequence_squared_consonants': 1.999,\n", - " 'count_sequence_squared_special_characters': 2.2499,\n", - " # 'ratio_of_numeric_digits_squared': 2.9,\n", - " 'average_of_char_count_squared': 2.78,\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " # 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 2.0],\n", "}\n", "\n", + "\n", "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", "df.drop_duplicates(subset=[concatened_column_name])['target'].value_counts()" ] diff --git a/examples/MEXICO_retrain_predict_example_no_embedding.ipynb b/examples/MEXICO_retrain_predict_example_no_embedding.ipynb index 6263dde5..d7be5ab4 100644 --- a/examples/MEXICO_retrain_predict_example_no_embedding.ipynb +++ b/examples/MEXICO_retrain_predict_example_no_embedding.ipynb @@ -270,11 +270,12 @@ ], "source": [ "key_smash_thresholds = {\n", - " 'count_sequence_squared_vowels': 1.00,\n", - " 'count_sequence_squared_consonants': 1.999,\n", - " 'count_sequence_squared_special_characters': 2.2499,\n", - " # 'ratio_of_numeric_digits_squared': 2.9,\n", - " 'average_of_char_count_squared': 2.78,\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " # 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 2.0],\n", "}\n", "\n", "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", diff --git a/examples/MEXICO_retrain_sets copy.ipynb b/examples/MEXICO_retrain_sets copy.ipynb new file mode 100644 index 00000000..26286304 --- /dev/null +++ b/examples/MEXICO_retrain_sets copy.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exemplo de uso para treinar o modelo\n", + "\n", + "Welcome to the Hygia Boilerplate! 
This resource is designed to help data scientists understand and utilize the full capabilities of the Hygia library. The Hygia library provides a comprehensive suite of tools for pre-processing, feature engineering, model training, and prediction. By using this boilerplate, you will gain a deeper understanding of how to effectively use the library to perform various tasks in the data science pipeline.\n", + "\n", + "Starting with pre-processing, the Hygia library provides functions for cleaning and transforming your data. This is an important step in preparing your data for analysis and modeling. The library also includes functions for feature engineering, allowing you to create new features and extract insights from your data." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import hygia as hg\n", + "import time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chose your model based on the configs sets below" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "set_0 = {\n", + " 'set_name': 'rforest_ksmash_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': True,\n", + " 'model_output': 'RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n", + "}\n", + "set_1 = {\n", + " 'set_name': 'rforest_ksmash_regex_normal',\n", + " 'ignore_word_embedding': True,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "set_2 = {\n", + " 'set_name': 'rforest_ksmash_shannon_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_3 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': True,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_4 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "chosen_set = set_0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classes instanciations\n", + "\n", + "As a starting point, when first using the library, it is recommended to initialize the pre-processing, feature engineering, annotate data, and new random forest classes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning feature engineering with configs below...\u001b[37m\n", + "\u001b[1mlanguage -> \u001b[22mes\n", + "\u001b[1mdimensions -> \u001b[22m25\n" + ] + } + ], + "source": [ + "\n", + "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n", + "augment_data = hg.AugmentData(country=\"MEXICO\")\n", + "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\",\n", + " ignore_word_embedding=chosen_set.get('ignore_word_embedding'),\n", + " ignore_shannon_entropy=chosen_set.get('ignore_shannon_entropy'),\n", + " ignore_repeated_bigram_ratio=chosen_set.get('ignore_repeated_bigram_ratio'),\n", + " ignore_unique_char_ratio=chosen_set.get('ignore_unique_char_ratio'),\n", + " )\n", + "annotate_data = hg.AnnotateData()\n", + "new_rf_model = hg.RandomForestModel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n", + "\n", + "To showcase the capabilities of the Hygia library, we have provided a small sample of context-free data. However, the library is designed to handle a wide range of data types and can be customized to meet the unique needs of different datasets.\n", + "\n", + "We have leveraged the pandas library to read in the sample data, which is stored in a .csv file format. The following code block provides an example of how to import the pandas library and read in the sample data file.\n", + "\n", + "NOTE: Please check if the file_path matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n", + "df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add new columns\n", + "\n", + "The Hygia library is designed to meet the needs of data scientists, and as such, it generates new columns in the data provided to better facilitate the data analysis process. This helps users keep track of the pre-processing steps taken on the data and the features generated. Two distinct types of columns are generated:\n", + "\n", + "1. Concatenate address\n", + "2. 
All features columns:\n", + " - Key Smash\n", + " - Regex\n", + " - Word Embedding\n", + "\n", + "NOTE: Please check if the columns names matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n", + "handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n", + "extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + } + ], + "source": [ + "concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n", + "df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n", + "df = feature_engineering.extract_features(df, concatened_column_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check new columns names" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_repeated_bigram_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_unique_char_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_9_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_10_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_11_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_12_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_13_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_14_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_15_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_16_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_17_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_18_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_19_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_20_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_21_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 
'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_empty_concat_STREET_ADDRESS_1_STREET_ADDRESS_2']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n", + "all_features_columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Select Features\n", + "- remove word embeddings\n", + "- remove key smash feature: ratio_of_numeric_digits_squared" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "selected_features = all_features_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Annotate data\n", + "\n", + "The Hygia library has a dedicated class to assist in the process of annotating data using keyboard smashing threshold. This information can then be used to improve the performance of machine learning models by providing more relevant training data. The use of the Hygia library's annotation functions is a key step in ensuring that your data is ready for analysis and can lead to more accurate and reliable results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning annotate data with configs below...\u001b[37m\n", + "\u001b[1mthresholds -> \u001b[22m{'count_sequence_squared_vowels': ['above', 1.0], 'count_sequence_squared_consonants': ['above', 1.999], 'count_sequence_squared_special_characters': ['above', 2.2499], 'ratio_of_numeric_digits_squared': ['above', 2.9], 'average_of_char_count_squared': ['above', 2.78], 'shannon_entropy': ['below', 1.0], 'repeated_bigram_ratio': ['above', 1.7058], 'unique_char_ratio': ['below', 1.15789]}\n", + "column -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + }, + { + "data": { + "text/plain": [ + "valid 1337828\n", + "key_smash 645\n", + "contains_email 567\n", + "contains_exactly_the_word_test 177\n", + "only_special_characters 144\n", + "contains_context_invalid_words 128\n", + "contains_exactly_the_word_dell 125\n", + "only_numbers 106\n", + "only_one_char 14\n", + "contains_exactly_invalid_words 10\n", + "is_substring_of_column_name 3\n", + "contains_date 1\n", + "empty 1\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "key_smash_thresholds = {\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 1.0],\n", + " 'repeated_bigram_ratio' : ['above', 1.7058],\n", + " 'unique_char_ratio' : ['below', 1.15789],\n", + "}\n", + "\n", + "\n", + "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", + "df.drop_duplicates(subset=[concatened_column_name])['target'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "valid 2511552\n", + "contains_context_invalid_words 3079\n", + "key_smash 1472\n", + "only_special_characters 1291\n", + "contains_email 1045\n", + "contains_exactly_the_word_test 667\n", + "contains_exactly_the_word_dell 553\n", + "only_one_char 287\n", + "only_numbers 239\n", + "empty 71\n", + "contains_exactly_invalid_words 26\n", + "is_substring_of_column_name 12\n", + "contains_date 2\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['target'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiment: retrain model\n", + "\n", + "In addition to pre-processing and feature engineering, the Hygia library provides tools for training and retraining models. You can use the available models, or train your own using the functions provided. Once you have trained your model, you can use the prediction function to make predictions based on your data. Finally, the library includes functions for saving your model, so that you can use it again in the future." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mtranning model...\u001b[37m\n", + "\u001b[32mdone\u001b[37m\n", + "\u001b[33mget model score...\u001b[37m\n", + "\u001b[1maccuracy -> \u001b[22m0.9857142857142858\n", + "\u001b[1mprecision -> \u001b[22m0.967741935483871\n", + "\u001b[1mrecall -> \u001b[22m0.972972972972973\n", + "\u001b[1mf1 -> \u001b[22m0.9703504043126685\n" + ] + } + ], + "source": [ + "scores = new_rf_model.train_and_get_scores(df, concatened_column_name, selected_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predict using pre-trained model\n", + "\n", + "After retraining the model you can make the prediction and save the results." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning model...\u001b[37m\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0 1337014\n", + "1.0 2735\n", + "Name: prediction, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['prediction'] = new_rf_model.predict(df[selected_features], concatened_column_name)\n", + "df.drop_duplicates(subset=[concatened_column_name])['prediction'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Save model and predicted data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mexporting model and normalization absolutes...\u001b[37m\n" + ] + } + ], + "source": [ + "new_rf_model.export_model(f\"../data/models/{chosen_set['model_output']}\",\n", + " f\"../data/models/normalization_absolutes_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['prediction'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n", + " .drop_duplicates(subset=[concatened_column_name]) \\\n", + " .to_csv(f\"../data/tmp/{time.strftime('%Y%m%d-%H%M%S')}prediction_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We hope that this boilerplate provides you with a clear understanding of the capabilities of the Hygia library and inspires you to explore its full potential. With its comprehensive suite of tools, the Hygia library is a valuable resource for any data scientist looking to streamline their workflow and perform high-quality data analysis and modeling." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "acd904f7927719ac3bd428a31e6feadbc6c298bbba280a82d6227cca902ecf8e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/MEXICO_retrain_sets.ipynb b/examples/MEXICO_retrain_sets.ipynb new file mode 100644 index 00000000..bde5e47b --- /dev/null +++ b/examples/MEXICO_retrain_sets.ipynb @@ -0,0 +1,482 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exemplo de uso para treinar o modelo\n", + "\n", + "Welcome to the Hygia Boilerplate! This resource is designed to help data scientists understand and utilize the full capabilities of the Hygia library. The Hygia library provides a comprehensive suite of tools for pre-processing, feature engineering, model training, and prediction. By using this boilerplate, you will gain a deeper understanding of how to effectively use the library to perform various tasks in the data science pipeline.\n", + "\n", + "Starting with pre-processing, the Hygia library provides functions for cleaning and transforming your data. This is an important step in preparing your data for analysis and modeling. The library also includes functions for feature engineering, allowing you to create new features and extract insights from your data." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import hygia as hg\n", + "import time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chose your model based on the configs sets below" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "set_0 = {\n", + " 'set_name': 'rforest_ksmash_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': True,\n", + " 'model_output': 'RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n", + "}\n", + "set_1 = {\n", + " 'set_name': 'rforest_ksmash_regex_normal',\n", + " 'ignore_word_embedding': True,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "set_2 = {\n", + " 'set_name': 'rforest_ksmash_shannon_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_3 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': True,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_4 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 
'ignore_unique_char_ratio': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "chosen_set = set_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classes instanciations\n", + "\n", + "As a starting point, when first using the library, it is recommended to initialize the pre-processing, feature engineering, annotate data, and new random forest classes." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning feature engineering with configs below...\u001b[37m\n", + "\u001b[1mlanguage -> \u001b[22mes\n", + "\u001b[1mdimensions -> \u001b[22m25\n" + ] + } + ], + "source": [ + "\n", + "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n", + "augment_data = hg.AugmentData(country=\"MEXICO\")\n", + "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\",\n", + " ignore_word_embedding=chosen_set.get('ignore_word_embedding'),\n", + " ignore_shannon_entropy=chosen_set.get('ignore_shannon_entropy'),\n", + " ignore_repeated_bigram_ratio=chosen_set.get('ignore_repeated_bigram_ratio'),\n", + " ignore_unique_char_ratio=chosen_set.get('ignore_unique_char_ratio'),\n", + " )\n", + "annotate_data = hg.AnnotateData()\n", + "new_rf_model = hg.RandomForestModel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n", + "\n", + "To showcase the capabilities of the Hygia library, we have provided a small sample of context-free data. However, the library is designed to handle a wide range of data types and can be customized to meet the unique needs of different datasets.\n", + "\n", + "We have leveraged the pandas library to read in the sample data, which is stored in a .csv file format. The following code block provides an example of how to import the pandas library and read in the sample data file.\n", + "\n", + "NOTE: Please check if the file_path matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n", + "df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add new columns\n", + "\n", + "The Hygia library is designed to meet the needs of data scientists, and as such, it generates new columns in the data provided to better facilitate the data analysis process. This helps users keep track of the pre-processing steps taken on the data and the features generated. Two distinct types of columns are generated:\n", + "\n", + "1. Concatenate address\n", + "2. 
All features columns:\n", + " - Key Smash\n", + " - Regex\n", + " - Word Embedding\n", + "\n", + "NOTE: Please check if the columns names matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n", + "handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n", + "extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + } + ], + "source": [ + "concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n", + "df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n", + "df = feature_engineering.extract_features(df, concatened_column_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check new columns names" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_shannon_entropy_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_repeated_bigram_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_unique_char_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_empty_concat_STREET_ADDRESS_1_STREET_ADDRESS_2']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n", + "all_features_columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Select Features\n", + "- remove word embeddings\n", + "- remove key smash feature: ratio_of_numeric_digits_squared" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "selected_features = all_features_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "# Annotate data\n", + "\n", + "The Hygia library has a dedicated class to assist in the process of annotating data using keyboard smashing threshold. This information can then be used to improve the performance of machine learning models by providing more relevant training data. The use of the Hygia library's annotation functions is a key step in ensuring that your data is ready for analysis and can lead to more accurate and reliable results." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning annotate data with configs below...\u001b[37m\n", + "\u001b[1mthresholds -> \u001b[22m{'count_sequence_squared_vowels': ['above', 1.0], 'count_sequence_squared_consonants': ['above', 1.999], 'count_sequence_squared_special_characters': ['above', 2.2499], 'ratio_of_numeric_digits_squared': ['above', 2.9], 'average_of_char_count_squared': ['above', 2.78], 'shannon_entropy': ['below', 1.0], 'repeated_bigram_ratio': ['above', 1.7058], 'unique_char_ratio': ['below', 1.15789]}\n", + "column -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + }, + { + "data": { + "text/plain": [ + "valid 1337757\n", + "key_smash 716\n", + "contains_email 567\n", + "contains_exactly_the_word_test 177\n", + "only_special_characters 144\n", + "contains_context_invalid_words 128\n", + "contains_exactly_the_word_dell 125\n", + "only_numbers 106\n", + "only_one_char 14\n", + "contains_exactly_invalid_words 10\n", + "is_substring_of_column_name 3\n", + "contains_date 1\n", + "empty 1\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "key_smash_thresholds = {\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 1.0],\n", + " 'repeated_bigram_ratio' : ['above', 1.7058],\n", + " 'unique_char_ratio' : ['below', 1.15789],\n", + "}\n", + "\n", + "\n", + "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", + "df.drop_duplicates(subset=[concatened_column_name])['target'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "valid 2510903\n", + "contains_context_invalid_words 3079\n", + "key_smash 2121\n", + "only_special_characters 1291\n", + "contains_email 1045\n", + "contains_exactly_the_word_test 667\n", + "contains_exactly_the_word_dell 553\n", + "only_one_char 287\n", + "only_numbers 239\n", + "empty 71\n", + "contains_exactly_invalid_words 26\n", + "is_substring_of_column_name 12\n", + "contains_date 2\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['target'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiment: retrain model\n", + "\n", + "In addition to pre-processing and feature engineering, the Hygia library provides tools for training and retraining models. You can use the available models, or train your own using the functions provided. 
Once you have trained your model, you can use the prediction function to make predictions based on your data. Finally, the library includes functions for saving your model, so that you can use it again in the future." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mtranning model...\u001b[37m\n", + "\u001b[32mdone\u001b[37m\n", + "\u001b[33mget model score...\u001b[37m\n", + "\u001b[1maccuracy -> \u001b[22m0.991389913899139\n", + "\u001b[1mprecision -> \u001b[22m0.9774774774774775\n", + "\u001b[1mrecall -> \u001b[22m0.9908675799086758\n", + "\u001b[1mf1 -> \u001b[22m0.9841269841269841\n" + ] + } + ], + "source": [ + "scores = new_rf_model.train_and_get_scores(df, concatened_column_name, selected_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predict using pre-trained model\n", + "\n", + "After retraining the model you can make the prediction and save the results." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning model...\u001b[37m\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0 1338136\n", + "1.0 1613\n", + "Name: prediction, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['prediction'] = new_rf_model.predict(df[selected_features], concatened_column_name)\n", + "df.drop_duplicates(subset=[concatened_column_name])['prediction'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Save model and predicted data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mexporting model and normalization absolutes...\u001b[37m\n" + ] + } + ], + "source": [ + "new_rf_model.export_model(f\"../data/models/{chosen_set['model_output']}\",\n", + " f\"../data/models/normalization_absolutes_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['prediction'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n", + " .drop_duplicates(subset=[concatened_column_name]) \\\n", + " .to_csv(f\"../data/tmp/{time.strftime('%Y%m%d-%H%M%S')}prediction_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We hope that this boilerplate provides you with a clear understanding of the capabilities of the Hygia library and inspires you to explore its full potential. With its comprehensive suite of tools, the Hygia library is a valuable resource for any data scientist looking to streamline their workflow and perform high-quality data analysis and modeling." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "acd904f7927719ac3bd428a31e6feadbc6c298bbba280a82d6227cca902ecf8e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hygia/data_pipeline/annotate_data/annotate_data.py b/hygia/data_pipeline/annotate_data/annotate_data.py index 4234beee..bddc796a 100644 --- a/hygia/data_pipeline/annotate_data/annotate_data.py +++ b/hygia/data_pipeline/annotate_data/annotate_data.py @@ -13,11 +13,12 @@ class AnnotateData: annotate_data = hg.AnnotateData() key_smash_thresholds = { - 'count_sequence_squared_vowels': 1.00, - 'count_sequence_squared_consonants': 1.999, - 'count_sequence_squared_special_characters': 2.2499, - 'ratio_of_numeric_digits_squared': 2.9, - 'average_of_char_count_squared': 2.78, + 'count_sequence_squared_vowels': ['above', 1.00], + 'count_sequence_squared_consonants': ['above', 1.999], + 'count_sequence_squared_special_characters': ['above', 2.2499], + # 'ratio_of_numeric_digits_squared': ['above', 2.9], + 'average_of_char_count_squared': ['above', 2.78], + 'shannon_entropy' : ['below', 2.0] } df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds) @@ -42,13 +43,13 @@ def annotate_data(self, df, concatened_column_name, ks_thresholds): df['target'] = 'valid' - ks_colummns = [col for col in df if col.startswith('feature_ks')] + ks_colummns = [col for col in df if col.startswith('feature_ks') and col.endswith(concatened_column_name)] for ks_colummn in ks_colummns: - threshold = float("inf") - for th in ks_thresholds: - if th in ks_colummn: - threshold = ks_thresholds[th] - df['target'] = df.apply(lambda x: 'key_smash' if x[ks_colummn] >= threshold else x['target'], axis=1) + threshold = ks_thresholds[ks_colummn.replace('feature_ks_', '').replace(f'_{concatened_column_name}', '')] + if threshold[0] == 'above': + df['target'] = df.apply(lambda x: 'key_smash' if x[ks_colummn] >= threshold[1] else x['target'], axis=1) + elif threshold[0] == 'below': + df['target'] = df.apply(lambda x: 'key_smash' if x[ks_colummn] <= threshold[1] else x['target'], axis=1) re_colummns = [col for col in df if col.startswith('feature_re')] for re_colummn in re_colummns: diff --git a/hygia/data_pipeline/feature_engineering/feature_engineering.py b/hygia/data_pipeline/feature_engineering/feature_engineering.py index 794601d8..73f3e591 100644 --- a/hygia/data_pipeline/feature_engineering/feature_engineering.py +++ b/hygia/data_pipeline/feature_engineering/feature_engineering.py @@ -22,7 +22,17 @@ class FeatureEngineering: \endcode """ - def __init__(self, lang:str='es', dimensions:int=25, model:str='bytepair', country:str=None, context_words_file:str=None): + def __init__(self, lang:str='es', + dimensions:int=25, + model:str='bytepair', + country:str=None, + context_words_file:str=None, + ignore_word_embedding:bool=False, + ignore_ratio_of_numeric_digits_squared:bool=True, + ignore_shannon_entropy:bool=True, + ignore_repeated_bigram_ratio:bool=True, + ignore_unique_char_ratio:bool=True, + ignore_regex_features:bool=False) -> None: """ Initialize the FeatureEngineering class. 
@@ -34,8 +44,12 @@ def __init__(self, lang:str='es', dimensions:int=25, model:str='bytepair', count print(f'{Style.BRIGHT}language -> {Style.NORMAL}{lang}') print(f'{Style.BRIGHT}dimensions -> {Style.NORMAL}{dimensions}') - - self.key_smash = KeySmash() + self.ignore_word_embedding = ignore_word_embedding + self.ignore_regex_features = ignore_regex_features + self.key_smash = KeySmash(ignore_ratio_of_numeric_digits_squared, + ignore_shannon_entropy, + ignore_repeated_bigram_ratio, + ignore_unique_char_ratio) self.word_embedding = WordEmbedding(lang=lang, dimensions=dimensions, model=model) self.regex = Regex(country=country, context_words_file=context_words_file) @@ -62,6 +76,8 @@ def extract_features(self, df: pd.DataFrame, text_column: str) -> pd.DataFrame: print(f'extract features from -> {text_column}') df = self.key_smash.extract_key_smash_features(df, text_column) - df = self.word_embedding.extract_word_embedding_features(df, text_column) - df = self.regex.extract_regex_features(df, text_column) + if not self.ignore_word_embedding: + df = self.word_embedding.extract_word_embedding_features(df, text_column) + if not self.ignore_regex_features: + df = self.regex.extract_regex_features(df, text_column) return df diff --git a/hygia/data_pipeline/feature_engineering/key_smash.py b/hygia/data_pipeline/feature_engineering/key_smash.py index 941ec1f6..323f6584 100644 --- a/hygia/data_pipeline/feature_engineering/key_smash.py +++ b/hygia/data_pipeline/feature_engineering/key_smash.py @@ -1,7 +1,8 @@ -from statistics import mean import pandas as pd +import numpy as np import re +MAX_STRING_LENGTH = 128 class KeySmash: """ A class for calculating metrics to indicate key smashing behavior in a text. @@ -20,13 +21,22 @@ class KeySmash: \endcode """ - def __init__(self): + def __init__(self, + ignore_ratio_of_numeric_digits_squared:bool=True, + ignore_shannon_entropy:bool=True, + ignore_repeated_bigram_ratio:bool=True, + ignore_unique_char_ratio:bool=True + ): """ Initialize the KeySmash class. """ + self.ignore_shannon_entropy = ignore_shannon_entropy + self.ignore_ratio_of_numeric_digits_squared = ignore_ratio_of_numeric_digits_squared + self.ignore_repeated_bigram_ratio = ignore_repeated_bigram_ratio + self.ignore_unique_char_ratio = ignore_unique_char_ratio self.char_sets = { "vowels": 'aeiouáéíóúãõ', - "consonants": 'bcdfghjklmnñpqrstvwxyz', + "consonants": 'bcdfghjklmnñpqrstvwxz', # except 'y' "special_characters": '!@#$%^¨|\'\"&*()_+:;~`´]}{[}ºª=-.¿¡' } @@ -160,18 +170,57 @@ def ratio_of_numeric_digits_squared(self, text:str) -> float: return num_of_numeric_digits / len(' '.join(text_list)) else: return 0 - - def _normalize_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame: + + def shannon_entropy(self, text:str) -> float: """ - Normalize a given column in a dataframe. + Calculates the Shannon entropy for the given string. + + \param text (Type: str) The text to extract the metric from. - \param df (Type: DataFrame) Dataframe to normalize the column in. - \param column (Type: str) Name of the column to be normalized. + \return (Type: float) Shannon entropy (min bits per byte-character). 
+ """ + text = str(text) + text = text.replace(" ", "") + size = len(text) + if size < 2: + return 0.0 + unique_chars = set(text) + freq_dict = {char: text.count(char) for char in unique_chars} + freq_array = np.array(list(freq_dict.values()), dtype=float) + prob_array = freq_array / size + log_array = np.log2(prob_array) + ent = -np.sum(prob_array * log_array) + return ent + + def __count_repeated_bigrams(self, text:str): + bigrams = [text[i:i+2] for i in range(len(text)-1)] + unique_bigrams = set(bigrams) + count = len(bigrams) - len(unique_bigrams) + return count - \return (Type: DataFrame) The input dataframe with the normalized column. + def repeated_bigram_ratio(self, text:str) -> float: """ - return df[column] / df[column].abs().max() if df[column].abs().max() != 0.0 else 0.0 + Calculates the Repeated Bigrams Ratio for the given string. + \param text (Type: str) The text to extract the metric from. + + \return (Type: float) Repeated Bigrams Ratio (min bits per byte-character). + """ + repeated_bigram_count = self.__count_repeated_bigrams(text) + ratio = repeated_bigram_count * 1.0 / len(text) + 1 + return ratio + + def unique_char_ratio(self, text:str) -> float: + """ + Calculates the Unique Char Ratio for the given string. + + \param text (Type: str) The text to extract the metric from. + + \return (Type: float) Unique Char Ratio (min bits per byte-character). + """ + unique_chars = set(text) + ratio = len(unique_chars) / len(text) + 1 + return ratio def extract_key_smash_features(self, df:pd.DataFrame, column_name:str) -> pd.DataFrame: """ @@ -181,7 +230,15 @@ def extract_key_smash_features(self, df:pd.DataFrame, column_name:str) -> pd.Dat \param column_name (Type: str) Name of the column in the dataframe that contains the text data to extract features from. \param normalize (bool, optional) Indicates whether to normalize the key smash feature columns. Default is True. 
- \return (Type: DataFrame) The input dataframe with additional columns for key smash features: 'irregular_sequence_vowels', 'irregular_sequence_consonants', 'irregular_sequence_special_characters', 'number_count_metric', 'char_frequency_metric' + \return (Type: DataFrame) The input dataframe with additional columns for key smash features: + 'irregular_sequence_vowels', + 'irregular_sequence_consonants', + 'irregular_sequence_special_characters', + 'number_count_metric', + 'char_frequency_metric', + 'shannon_entropy', + 'repeated_bigram_ratio', + 'unique_char_ratio' Examples Use this function like this: @@ -198,7 +255,14 @@ def extract_key_smash_features(self, df:pd.DataFrame, column_name:str) -> pd.Dat df[f'feature_ks_count_sequence_squared_vowels_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.count_sequence_squared(x, 'vowels') if len(x) > 0 else 0.0) df[f'feature_ks_count_sequence_squared_consonants_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.count_sequence_squared(x, 'consonants') if len(x) > 0 else 0.0) df[f'feature_ks_count_sequence_squared_special_characters_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.count_sequence_squared(x, 'special_characters') if len(x) > 0 else 0.0) - df[f'feature_ks_ratio_of_numeric_digits_squared_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.ratio_of_numeric_digits_squared(x) if len(x) > 0 else 0.0) + if not self.ignore_ratio_of_numeric_digits_squared: + df[f'feature_ks_ratio_of_numeric_digits_squared_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.ratio_of_numeric_digits_squared(x) if len(x) > 0 else 0.0) df[f'feature_ks_average_of_char_count_squared_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.average_of_char_count_squared(x) if len(x) > 0 else 0.0) + if not self.ignore_shannon_entropy: + df[f'feature_ks_shannon_entropy_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.shannon_entropy(x) if len(x) > 0 else 0.0) + if not self.ignore_repeated_bigram_ratio: + df[f'feature_ks_repeated_bigram_ratio_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.repeated_bigram_ratio(x) if len(x) > 0 else 0.0) + if not self.ignore_unique_char_ratio: + df[f'feature_ks_unique_char_ratio_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.unique_char_ratio(x) if len(x) > 0 else 0.0) return df \ No newline at end of file diff --git a/hygia/data_pipeline/feature_engineering/word_embedding.py b/hygia/data_pipeline/feature_engineering/word_embedding.py index 704d372e..ab309f68 100644 --- a/hygia/data_pipeline/feature_engineering/word_embedding.py +++ b/hygia/data_pipeline/feature_engineering/word_embedding.py @@ -35,9 +35,9 @@ def __init__(self, lang: str = 'es', dimensions: int = 25, model: str = 'bytepai self.lang = lang self.dimensions = dimensions self.model = model - self.word_embedding_model = self._load_model() + self.word_embedding_model = self.__load_model() - def _load_model(self) -> Any: + def __load_model(self) -> Any: """ Load the word embedding model. 
@@ -56,7 +56,7 @@ def _load_model(self) -> Any: else: raise ValueError - def _pre_embedding(self, text: str) -> str: + def __pre_embedding(self, text: str) -> str: text = ' '.join(e for e in text.split() if e.isalpha() and len(e) >= 3 and not e.isspace()) return text @@ -85,7 +85,7 @@ def get_embedding(self, text: str) -> np.ndarray: empty_vector = [0.0] * self.dimensions - text = self._pre_embedding(text) + text = self.__pre_embedding(text) # White space string if len(text.strip().split()) == 0: diff --git a/hygia/data_pipeline/model/random_forest.py b/hygia/data_pipeline/model/random_forest.py index 7fe0e818..664bc054 100644 --- a/hygia/data_pipeline/model/random_forest.py +++ b/hygia/data_pipeline/model/random_forest.py @@ -22,7 +22,11 @@ class RandomForestModel: scores \endcode """ - def __init__(self, model_file=None, normalization_absolutes_file=None ,n_estimators=100, max_depth=None, random_state=0, normalize=True): + def __init__(self, model_file=None, + normalization_absolutes_file=None, + n_estimators=100, max_depth=None, + random_state=0, + normalize=True) -> None: """ Initialize the RandomForestModel class. @@ -44,7 +48,7 @@ def __init__(self, model_file=None, normalization_absolutes_file=None ,n_estimat self.random_state = random_state self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth, random_state=self.random_state) - def _get_absolute_maximums(self, df, features_columns_to_normalize, concatened_column_name): + def __get_absolute_maximums(self, df, features_columns_to_normalize, concatened_column_name): if self.normalization_absolutes: return absolutes_dict = {} @@ -53,7 +57,7 @@ def _get_absolute_maximums(self, df, features_columns_to_normalize, concatened_c absolutes_dict[column.replace(f"_{concatened_column_name}", '')] = [absolute_maximum] if absolute_maximum else [1.0] self.normalization_absolutes = pd.DataFrame(absolutes_dict) - def _normalization(self, df, features_columns_to_normalize, concatened_column_name): + def __normalization(self, df, features_columns_to_normalize, concatened_column_name): if not self.normalize: return df for features_column_to_normalize in features_columns_to_normalize: @@ -83,8 +87,8 @@ def train_and_get_scores(self, df, concatened_column_name, all_features_columns, # Normalization key_smash_features_columns = [column for column in all_features_columns if column.startswith('feature_ks')] - self._get_absolute_maximums(df_balanced, key_smash_features_columns, concatened_column_name) - df_balanced_normalized = self._normalization(df_balanced.copy(), key_smash_features_columns, concatened_column_name) + self.__get_absolute_maximums(df_balanced, key_smash_features_columns, concatened_column_name) + df_balanced_normalized = self.__normalization(df_balanced.copy(), key_smash_features_columns, concatened_column_name) # Train/Test split X = df_balanced_normalized[[*all_features_columns]].values @@ -123,7 +127,7 @@ def predict(self, X, concatened_column_name): print(f'{Fore.YELLOW}running model...{Fore.WHITE}') key_smash_features_columns = [column for column in X.columns if column.startswith('feature_ks')] - X = self._normalization(X.copy(), key_smash_features_columns, concatened_column_name) + X = self.__normalization(X.copy(), key_smash_features_columns, concatened_column_name) return self.model.predict(X.values) diff --git a/hygia/data_pipeline/pre_process_data/pre_process_data.py b/hygia/data_pipeline/pre_process_data/pre_process_data.py index 807ef8b8..13132a77 100644 --- 
a/hygia/data_pipeline/pre_process_data/pre_process_data.py +++ b/hygia/data_pipeline/pre_process_data/pre_process_data.py @@ -7,7 +7,8 @@ class PreProcessData: """ This class presents a series of functions that help in data pre-processing. As concatenate columns, replace abbreviation, and etc. - + + Some abbreviations were taken from this website: https://en.wikipedia.org/wiki/Template:Mexico_State-Abbreviation_Codes Examples - Use this class like this: @@ -67,14 +68,14 @@ def handle_nulls(self, df, column_name): """ print(f'handle null values in the column {Style.BRIGHT}{column_name}{Style.NORMAL}') - df[column_name].fillna('').astype(str) + df[column_name] = df[column_name].fillna('').astype(str) return df def handle_extra_spaces(self, df, column_name:str) -> str: df[column_name] = df[column_name].apply(lambda x: ' '.join(x.split())) return df - def _replace_abbreviation(self, text:str) -> str: + def __replace_abbreviation(self, text:str) -> str: """ Function that identifies abbreviations and according to the dictionary changes the names @@ -93,7 +94,7 @@ def handle_abreviations(self, df, column_name): \param column_name (Type: str) Column name to check """ - df[column_name] = df[column_name].apply(lambda x: self._replace_abbreviation(x)) + df[column_name] = df[column_name].apply(lambda x: self.__replace_abbreviation(x)) return df def pre_process_data(self, df, columns_to_concat=None, column_name=None): diff --git a/tests/data_pipeline/annotate_data/test_annotate_data.py b/tests/data_pipeline/annotate_data/test_annotate_data.py index 5e485929..dde4cf81 100644 --- a/tests/data_pipeline/annotate_data/test_annotate_data.py +++ b/tests/data_pipeline/annotate_data/test_annotate_data.py @@ -16,11 +16,12 @@ def test_annotate_data(self): }) key_smash_thresholds = { - 'count_sequence_squared_vowels': 0.9, - 'count_sequence_squared_consonants': 0.9, - 'count_sequence_squared_special_characters': 0.9, - 'ratio_of_numeric_digits_squared': 0.9, - 'average_of_char_count_squared': 0.9, + 'count_sequence_squared_vowels': ['above', 1.00], + 'count_sequence_squared_consonants':['above', 1.999], + 'count_sequence_squared_special_characters': ['above', 2.2499], + # 'ratio_of_numeric_digits_squared': ['above', 2.9], + 'average_of_char_count_squared': ['above', 2.78], + 'shannon_entropy' : ['below', 2.0] } result = self.annotate_data.annotate_data(df, concatened_column_name='concat_address', ks_thresholds=key_smash_thresholds) diff --git a/tests/data_pipeline/feature_engineering/test_feature_engineering.py b/tests/data_pipeline/feature_engineering/test_feature_engineering.py index 3c973ecf..564de197 100644 --- a/tests/data_pipeline/feature_engineering/test_feature_engineering.py +++ b/tests/data_pipeline/feature_engineering/test_feature_engineering.py @@ -20,6 +20,5 @@ def test_extract_features(self, feature_engineering, dataframe): assert 'feature_ks_count_sequence_squared_vowels_text_column' in df.columns assert 'feature_ks_count_sequence_squared_consonants_text_column' in df.columns assert 'feature_ks_count_sequence_squared_special_characters_text_column' in df.columns - assert 'feature_ks_ratio_of_numeric_digits_squared_text_column' in df.columns assert 'feature_ks_average_of_char_count_squared_text_column' in df.columns assert 'feature_we_0_text_column' in df.columns diff --git a/tests/data_pipeline/feature_engineering/test_key_smash.py b/tests/data_pipeline/feature_engineering/test_key_smash.py index e72bcec6..bb5bb88b 100644 --- a/tests/data_pipeline/feature_engineering/test_key_smash.py +++ 
b/tests/data_pipeline/feature_engineering/test_key_smash.py @@ -5,7 +5,7 @@ class TestKeySmash: def setup_method(self): - self.key_smash = KeySmash() + self.key_smash = KeySmash(ignore_shannon_entropy=False) @pytest.mark.parametrize("data, expected_output", [ ("PUENTECILLA KM. 1.7", 1.121212121212121), @@ -16,7 +16,8 @@ def test_average_of_char_count_squared(self, data, expected_output): @pytest.mark.parametrize("data, opt, expected_output", [ ("PUENTECILLA KM. 1.7", "vowels", 0.0), - ("ASDASD XXXX", "consonants", 2.272727272727273) + ("ASDASD XXXX", "consonants", 2.272727272727273), + ("ABC123 !@#$%", "special_characters", 2.0833333333333335) ]) def test_count_sequence_squared(self, data, opt, expected_output): assert self.key_smash.count_sequence_squared(data, opt) == expected_output @@ -29,6 +30,33 @@ def test_count_sequence_squared(self, data, opt, expected_output): def test_ratio_of_numeric_digits_squared(self, data, expected_output): assert self.key_smash.ratio_of_numeric_digits_squared(data) == expected_output + @pytest.mark.parametrize("data, expected_output", [ + ("PUENTECILLA KM. 1.7",3.7345216647797517), + ("ASDASD XXXX",1.9219280948873623), + ("AS AA",0.8112781244591328), + ("XX XX",-0.0) + ]) + def test_shannon_entropy(self, data, expected_output): + assert self.key_smash.shannon_entropy(data) == expected_output + + @pytest.mark.parametrize("data, expected_output", [ + ("PUENTECILLA KM. 1.7",1.0), + ("ASDASD XXXX",1.3636363636363638), + ("AAAAAA AAAA",1.6363636363636362), + ("XX XX",1.2) + ]) + def test_repeated_bigram_ratio(self, data, expected_output): + assert self.key_smash.repeated_bigram_ratio(data) == expected_output + + @pytest.mark.parametrize("data, expected_output", [ + ("PUENTECILLA KM. 1.7",1.7894736842105263), + ("ASDASD XXXX",1.4545454545454546), + ("AAAAAA AAAA",1.1818181818181819), + ("XX XX",1.4) + ]) + def test_unique_char_ratio(self, data, expected_output): + assert self.key_smash.unique_char_ratio(data) == expected_output + def test_extract_key_smash_features(self): df = pd.DataFrame({"text_column": ["abcdefgh", "ijklmnop", "qrstuvwxyz"]}) result = self.key_smash.extract_key_smash_features(df, "text_column") @@ -36,6 +64,6 @@ def test_extract_key_smash_features(self): assert 'feature_ks_count_sequence_squared_vowels_text_column' in result.columns assert 'feature_ks_count_sequence_squared_consonants_text_column' in result.columns assert 'feature_ks_count_sequence_squared_special_characters_text_column' in result.columns - assert 'feature_ks_ratio_of_numeric_digits_squared_text_column' in result.columns assert 'feature_ks_average_of_char_count_squared_text_column' in result.columns + assert 'feature_ks_shannon_entropy_text_column' in result.columns assert result.shape[1] == 6 # Ensure no extra columns are added \ No newline at end of file diff --git a/tests/data_pipeline/feature_engineering/test_word_embedding.py b/tests/data_pipeline/feature_engineering/test_word_embedding.py index 1c1d688f..5bdfd300 100644 --- a/tests/data_pipeline/feature_engineering/test_word_embedding.py +++ b/tests/data_pipeline/feature_engineering/test_word_embedding.py @@ -1,26 +1,25 @@ import pytest import pandas as pd import numpy as np -from whatlies.language import BytePairLanguage from hygia import WordEmbedding class TestWordEmbedding: - def setup_method(self): + @pytest.fixture(autouse=True) + def setup_class(self): self.word_embedding = WordEmbedding() - def test_load_model(self): - assert isinstance(self.word_embedding._load_model(), BytePairLanguage) - + def 
test_pre_embedding(self): + text = 'A test with ABC123 AVENUE' + pre_embedding = self.word_embedding._WordEmbedding__pre_embedding(text) + assert pre_embedding == 'test with AVENUE' + def test_get_embedding(self): embedding = self.word_embedding.get_embedding("This is a sample text.") assert isinstance(embedding, np.ndarray) - def test_pre_embedding(self): - assert self.word_embedding._pre_embedding("A test with ABC123 AVENUE") == "test with AVENUE" - def test_extract_word_embedding_features(self): df = pd.DataFrame({"text_column": ["This is a sample text.", "Another sample text."]}) result = self.word_embedding.extract_word_embedding_features(df, "text_column") assert isinstance(result, pd.DataFrame) - assert any(col.startswith("feature_we_") for col in result.columns) \ No newline at end of file + assert any(col.startswith("feature_we_") for col in result.columns) diff --git a/tests/data_pipeline/model/test_random_forest.py b/tests/data_pipeline/model/test_random_forest.py index 433bcc75..4d9dff47 100644 --- a/tests/data_pipeline/model/test_random_forest.py +++ b/tests/data_pipeline/model/test_random_forest.py @@ -1,10 +1,20 @@ import pandas as pd +from sklearn.datasets import make_classification from hygia import RandomForestModel class TestRandomForestModel: - def setup_method(self): - self.random_forest = RandomForestModel() - - def test_random_forest(self): - # TODO improve model tests - assert self.random_forest \ No newline at end of file + def test_random_forest_model(self): + X, y = make_classification(random_state=42) + + columns = ['feature_'+str(i) for i in range(X.shape[1])] + df = pd.DataFrame(X, columns=columns) + df['target'] = ['valid' if label == 0 else 'key_smash' for label in y] + + model = RandomForestModel(normalize=False) + + scores = model.train_and_get_scores(df, 'target', columns) + + assert scores['accuracy'] >= 0.0 and scores['accuracy'] <= 1 + assert scores['precision'] >= 0.0 and scores['precision'] <= 1 + assert scores['recall'] >= 0.0 and scores['recall'] <= 1 + assert scores['f1'] >= 0.0 and scores['f1'] <= 1 diff --git a/tests/data_pipeline/pre_process_data/test_pre_process_data.py b/tests/data_pipeline/pre_process_data/test_pre_process_data.py index ea54d05a..1b223816 100644 --- a/tests/data_pipeline/pre_process_data/test_pre_process_data.py +++ b/tests/data_pipeline/pre_process_data/test_pre_process_data.py @@ -1,23 +1,43 @@ import pytest - +import pandas as pd from hygia.data_pipeline.pre_process_data.pre_process_data import PreProcessData -from hygia.paths.paths import root_path -@pytest.mark.parametrize("abbreviation, expected_replacement", [ - ('NO', "NUMBER"), - ('no', "NUMBER"), - ('no123', "NUMBER123"), - ('no 123', "NUMBER 123"), - ('123 no', "123 NUMBER"), - ('not', "not"), - ('NOT', "NOT"), - ('ono', "ono") -]) class TestPreProcessData: - def test_replace_abbreviation_coutry(self, abbreviation, expected_replacement): - pre_process_data = PreProcessData(country='MEXICO') - assert pre_process_data._replace_abbreviation(abbreviation) == expected_replacement - - def test_replace_abbreviation_abbreviations_file(self, abbreviation, expected_replacement): - pre_process_data = PreProcessData(abbreviations_file=root_path + '/data/dicts/mexico_abbreviations.csv') - assert pre_process_data._replace_abbreviation(abbreviation) == expected_replacement + def setup_method(self): + self.pre_processor = PreProcessData(country='MEXICO') + + def test_concatenate_columns(self): + data = {'A': ['a', 'b', 'c'], 'B': ['d', 'e', 'f']} + df = pd.DataFrame(data) + 
expected_output = ['a d', 'b e', 'c f'] + output = self.pre_processor.concatenate_columns(df, ['A', 'B'], 'C') + assert list(output['C']) == expected_output + + def test_handle_nulls(self): + data = {'A': ['a', 'b', None]} + df = pd.DataFrame(data) + expected_output = ['a', 'b', ''] + output = self.pre_processor.handle_nulls(df, 'A') + assert list(output['A']) == expected_output + + def test_handle_extra_spaces(self): + data = {'A': [' a ', ' b ', ' c']} + df = pd.DataFrame(data) + expected_output = ['a', 'b', 'c'] + output = self.pre_processor.handle_extra_spaces(df, 'A') + assert list(output['A']) == expected_output + + def test_handle_abreviations(self): + data = {'A': ['CDMX', 'MZ', 'BCN']} + df = pd.DataFrame(data) + expected_output = ['Ciudad de México', 'MANZANA', 'Baja California'] + output = self.pre_processor.handle_abreviations(df, 'A') + assert list(output['A']) == expected_output + + def test_pre_process_data(self): + data = {'A': [' a ', ' b ', ' c'], 'B': ['d', 'e', 'f'], 'C': ['NLE', 'CANCUN', 'NLE Monterrey']} + df = pd.DataFrame(data) + expected_output = ['a d', 'b e', 'c f'] + output = self.pre_processor.pre_process_data(df, ['A', 'B'], 'D') + assert 'D' in output.columns + assert list(output['D']) == expected_output
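Taken together, these changes let the pipeline run with only the key-smash features: the expanded Mexico abbreviation dictionary feeds pre-processing, and the new ignore_* flags switch the word-embedding and regex features off while opting in to the three new metrics. A rough end-to-end sketch under those assumptions; the import paths, column names, and sample rows are illustrative, not part of the patch:

    import pandas as pd

    # Import paths assumed from the repository layout; adjust if the package exports differ.
    from hygia.data_pipeline.pre_process_data.pre_process_data import PreProcessData
    from hygia.data_pipeline.feature_engineering.feature_engineering import FeatureEngineering

    df = pd.DataFrame({'street': ['BLVD JUAREZ', 'asdfgh qweqwe'],
                       'state': ['NLE', 'QR']})

    # Concatenate and clean the address columns using the Mexico dictionary.
    pre = PreProcessData(country='MEXICO')
    df = pre.pre_process_data(df, ['street', 'state'], 'concat_address')

    # Extract only the key-smash features: embeddings and regex are switched off,
    # the three new metrics are switched on.
    fe = FeatureEngineering(ignore_word_embedding=True,
                            ignore_regex_features=True,
                            ignore_shannon_entropy=False,
                            ignore_repeated_bigram_ratio=False,
                            ignore_unique_char_ratio=False)
    df = fe.extract_features(df, 'concat_address')

    print([c for c in df.columns if c.startswith('feature_ks_')])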