diff --git a/data/dicts/mexico_abbreviations.csv b/data/dicts/mexico_abbreviations.csv index 0a0cbf31..4bd38473 100644 --- a/data/dicts/mexico_abbreviations.csv +++ b/data/dicts/mexico_abbreviations.csv @@ -8,4 +8,100 @@ BLVD,BOULEVARD LT,LOTE MZ,MANZANA CDMX,Ciudad de México -DF,Distrito Federal \ No newline at end of file +DF,Distrito Federal +AGU,Aguascalientes +BCN,Baja California +BCS,Baja California Sur +CAM,Campeche +CHP,Chiapas +CHH,Chihuahua +COA,Coahuila +CL,Colima +DUR,Durango +MEX,Estado de México +GTO,Guanajuato +GRO,Guerrero +HGO,Hidalgo +JAL,Jalisco +MIC,Michoacán +MOR,Morelos +NAY,Nayarit +NLE,Nuevo León +OAX,Oaxaca +PUE,Puebla +QRO,Querétaro +QR,Quintana Roo +SLP,San Luis Potosí +SIN,Sinaloa +SON,Sonora +TAB,Tabasco +TAM,Tamaulipas +TLAX,Tlaxcala +VER,Veracruz +YUC,Yucatán +ZAC,Zacatecas +CDMX,Ciudad de México +ARS,Aguascalientes +AG,Aguascalientes +B.C,Baja California +BC,Baja California +B.C.S,Baja California Sur +BCS,Baja California Sur +Camp,Campeche +CM,Campeche +Chis,Chiapas +CS,Chiapas +Chih,Chihuahua +CH,Chihuahua +Coah,Coahuila +CO,Coahuila +Col,Colima +CL,Colima +CDMX,Ciudad de México +DF,Ciudad de México +Dgo,Durango +DG,Durango +Gto,Guanajuato +GT,Guanajuato +Gro,Guerrero +GR,Guerrero +Hgo,Hidalgo +HG,Hidalgo +Jal,Jalisco +JA,Jalisco +Edomex,Mexico +MEX,Mexico +Mich,Michoacán +MI,Michoacán +Mor,Morelos +MO,Morelos +Nay,Nayarit +NA,Nayarit +N.L,Nuevo León +NL,Nuevo León +Oax,Oaxaca +OA,Oaxaca +Pue,Puebla +PU,Puebla +Qro,Querétaro +QT,Querétaro +Q.R,Quintana Roo +QR,Quintana Roo +S.L.P,San Luis Potosí +SL,San Luis Potosí +Sin,Sinaloa +SI,Sinaloa +Son,Sonora +SO,Sonora +Tab,Tabasco +TB,Tabasco +Tamps,Tamaulipas +TM,Tamaulipas +Tlax,Tlaxcala +TL,Tlaxcala +Ver,Veracruz +VE,Veracruz +Yuc,Yucatán +YU,Yucatán +Zac,Zacatecas +ZA,Zacatecas diff --git a/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl index 73da2d97..2b582ac3 100644 Binary files a/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl and b/data/models/RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl new file mode 100644 index 00000000..9f734744 Binary files /dev/null and b/data/models/RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl new file mode 100644 index 00000000..0a4c278a Binary files /dev/null and b/data/models/RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl new file mode 100644 index 00000000..719cacf1 Binary files /dev/null and b/data/models/RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl b/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl index 85585477..e7e421ae 100644 Binary files 
a/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl and b/data/models/RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl differ diff --git a/data/models/normalization_absolutes_rforest_ksmash_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_regex_normal.csv new file mode 100644 index 00000000..8d3908d4 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy,feature_ks_repeated_bigram_ratio,feature_ks_unique_char_ratio +15.03125,30.0,30.0,30.0,4.735393717824877,1.9491525423728815,2.0 diff --git a/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal.csv new file mode 100644 index 00000000..239a4e65 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy,feature_ks_repeated_bigram_ratio,feature_ks_unique_char_ratio +15.03125,30.0,30.0,30.0,4.680689288944333,1.9491525423728815,2.0 diff --git a/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_wembedding_regex_normal.csv new file mode 100644 index 00000000..b955f54d --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_shannon_bigram_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy,feature_ks_repeated_bigram_ratio +15.03125,30.0,30.0,30.0,4.680689288944333,1.9491525423728815 diff --git a/data/models/normalization_absolutes_rforest_ksmash_shannon_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_shannon_wembedding_regex_normal.csv new file mode 100644 index 00000000..ef125b19 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_shannon_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_shannon_entropy +15.03125,30.0,30.0,30.0,4.779780045430954 diff --git a/data/models/normalization_absolutes_rforest_ksmash_wembedding_regex_normal.csv b/data/models/normalization_absolutes_rforest_ksmash_wembedding_regex_normal.csv new file mode 100644 index 00000000..073ac606 --- /dev/null +++ b/data/models/normalization_absolutes_rforest_ksmash_wembedding_regex_normal.csv @@ -0,0 +1,2 @@ +feature_ks_count_sequence_squared_vowels,feature_ks_count_sequence_squared_consonants,feature_ks_count_sequence_squared_special_characters,feature_ks_average_of_char_count_squared,feature_ks_repeated_bigram_ratio,feature_ks_unique_char_ratio +15.03125,30.0,30.0,30.0,1.9491525423728815,2.0 diff --git a/examples/MEXICO_predict_sets.ipynb b/examples/MEXICO_predict_sets.ipynb 
new file mode 100644 index 00000000..44af215e --- /dev/null +++ b/examples/MEXICO_predict_sets.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Illustrative Examples of Library Function Usage\n", + "\n", + "This notebook is designed for Hygia library users and provides examples of utilizing the main functions in the library. It is one of the resources offered by the Hygia community to support new users. For further information, please visit our documentation at https://hygia-org.github.io/hygia/.\n", + "\n", + "The example pipeline demonstrated in this notebook covers the following steps: importing dependencies, loading the model, pre-processing the data (e.g., concatenating and creating new columns), using the prediction and model functions, and finally saving the model results.\n", + "\n", + "## Imports and class instantiations\n", + "\n", + "To take advantage of the library's functions and proceed with the pipeline, you will first need to import the Pandas and Hygia libraries.\n", + "\n", + "As a starting point, when first using the library, it is recommended to initialize the pre-processing and feature engineering classes. This will set the foundation for selecting the desired model stored in the .pkl format in the folder (/data/models/).\n", + "\n", + "Before utilizing the library functions, it is important to familiarize yourself with the pre-processing and feature engineering classes, which play a crucial role in the data preparation process. Once you have a clear understanding of these classes, you can select the model that best fits your needs from the available options stored in the folder (/data/models/). With the right model selected, you can execute the pipeline and achieve the desired results."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning feature engineering with configs below...\u001b[37m\n", + "\u001b[1mlanguage -> \u001b[22mes\n", + "\u001b[1mdimensions -> \u001b[22m25\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import hygia as hg\n", + "\n", + "# Chose your model based on the configs sets below\n", + "set_0 = {\n", + " 'set_name': 'rforest_ksmash_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': True,\n", + " 'model_output': 'RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n", + "}\n", + "set_1 = {\n", + " 'set_name': 'rforest_ksmash_regex_normal',\n", + " 'ignore_word_embedding': True,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "set_2 = {\n", + " 'set_name': 'rforest_ksmash_shannon_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_3 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': True,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_4 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "chosen_set = set_3\n", + "\n", + "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n", + "augment_data = hg.AugmentData(country=\"MEXICO\")\n", + "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\",\n", + " ignore_word_embedding=chosen_set.get('ignore_word_embedding'),\n", + " ignore_shannon_entropy=chosen_set.get('ignore_shannon_entropy'),\n", + " ignore_repeated_bigram_ratio=chosen_set.get('ignore_repeated_bigram_ratio'),\n", + " ignore_unique_char_ratio=chosen_set.get('ignore_unique_char_ratio'),\n", + " )\n", + "rf_model = hg.RandomForestModel(f\"../data/models/{chosen_set['model_output']}\",\n", + " normalization_absolutes_file=f\"../data/models/normalization_absolutes_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n", + "\n", + "To showcase the capabilities of the Hygia library, we have provided a small sample of context-free data. However, the library is designed to handle a wide range of data types and can be customized to meet the unique needs of different datasets.\n", + "\n", + "We have leveraged the pandas library to read in the sample data, which is stored in a .csv file format. The following code block provides an example of how to import the pandas library and read in the sample data file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n", + "df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Augment Data with context validations" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = augment_data.augment_data(df, zipcode_column_name='ZIP_CODE_L')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add new columns\n", + "\n", + "The Hygia library is designed to meet the needs of data scientists, and as such, it generates new columns in the data provided to better facilitate the data analysis process. This helps users keep track of the pre-processing steps taken on the data and the features generated. Two distinct types of columns are generated:\n", + "\n", + "1. Concatenate address\n", + "2. All features columns:\n", + " - Key Smash\n", + " - Regex\n", + " - Word Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n", + "handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n", + "extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + } + ], + "source": [ + "concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n", + "df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n", + "df = feature_engineering.extract_features(df, concatened_column_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check new columns names" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_9_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_10_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_11_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_12_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_13_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_14_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_15_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 
'feature_we_16_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_17_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_18_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_19_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_20_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_21_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_empty_concat_STREET_ADDRESS_1_STREET_ADDRESS_2']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n", + "model_features_columns = all_features_columns\n", + "model_features_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predict using pre-trained model\n", + "\n", + "This section shows how to run predictions on your data with a pre-trained model, using pandas to inspect the results. It illustrates how the Hygia library can generate new information from the data at hand, and highlights that it can be used in combination with other libraries such as pandas." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning model...\u001b[37m\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0 2512460\n", + "1.0 7836\n", + "Name: prediction_is_key_smash, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['prediction_is_key_smash'] = rf_model.predict(df[model_features_columns], concatened_column_name)\n", + "df['prediction_is_key_smash'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save predicted data\n", + "\n", + "Finally, an example of how to save the data and the model results stored in the prediction field."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['prediction_is_key_smash'] == 1][[concatened_column_name, 'prediction_is_key_smash']] \\\n", + " .drop_duplicates(subset=[concatened_column_name]) \\\n", + " .to_csv(f\"../data/tmp/prediction_{chosen_set['set_name']}.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "acd904f7927719ac3bd428a31e6feadbc6c298bbba280a82d6227cca902ecf8e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/MEXICO_retrain_predict_example.ipynb b/examples/MEXICO_retrain_predict_example.ipynb index 71b8a78a..6ef08e15 100644 --- a/examples/MEXICO_retrain_predict_example.ipynb +++ b/examples/MEXICO_retrain_predict_example.ipynb @@ -307,13 +307,15 @@ ], "source": [ "key_smash_thresholds = {\n", - " 'count_sequence_squared_vowels': 1.00,\n", - " 'count_sequence_squared_consonants': 1.999,\n", - " 'count_sequence_squared_special_characters': 2.2499,\n", - " # 'ratio_of_numeric_digits_squared': 2.9,\n", - " 'average_of_char_count_squared': 2.78,\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " # 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 2.0],\n", "}\n", "\n", + "\n", "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", "df.drop_duplicates(subset=[concatened_column_name])['target'].value_counts()" ] diff --git a/examples/MEXICO_retrain_predict_example_no_embedding.ipynb b/examples/MEXICO_retrain_predict_example_no_embedding.ipynb index 6263dde5..d7be5ab4 100644 --- a/examples/MEXICO_retrain_predict_example_no_embedding.ipynb +++ b/examples/MEXICO_retrain_predict_example_no_embedding.ipynb @@ -270,11 +270,12 @@ ], "source": [ "key_smash_thresholds = {\n", - " 'count_sequence_squared_vowels': 1.00,\n", - " 'count_sequence_squared_consonants': 1.999,\n", - " 'count_sequence_squared_special_characters': 2.2499,\n", - " # 'ratio_of_numeric_digits_squared': 2.9,\n", - " 'average_of_char_count_squared': 2.78,\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " # 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 2.0],\n", "}\n", "\n", "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", diff --git a/examples/MEXICO_retrain_sets copy.ipynb b/examples/MEXICO_retrain_sets copy.ipynb new file mode 100644 index 00000000..26286304 --- /dev/null +++ b/examples/MEXICO_retrain_sets copy.ipynb @@ -0,0 +1,506 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exemplo de uso para treinar o modelo\n", + "\n", + "Welcome to the Hygia Boilerplate! 
This resource is designed to help data scientists understand and utilize the full capabilities of the Hygia library. The Hygia library provides a comprehensive suite of tools for pre-processing, feature engineering, model training, and prediction. By using this boilerplate, you will gain a deeper understanding of how to effectively use the library to perform various tasks in the data science pipeline.\n", + "\n", + "Starting with pre-processing, the Hygia library provides functions for cleaning and transforming your data. This is an important step in preparing your data for analysis and modeling. The library also includes functions for feature engineering, allowing you to create new features and extract insights from your data." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import hygia as hg\n", + "import time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chose your model based on the configs sets below" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "set_0 = {\n", + " 'set_name': 'rforest_ksmash_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': True,\n", + " 'model_output': 'RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n", + "}\n", + "set_1 = {\n", + " 'set_name': 'rforest_ksmash_regex_normal',\n", + " 'ignore_word_embedding': True,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "set_2 = {\n", + " 'set_name': 'rforest_ksmash_shannon_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_3 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': True,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_4 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "chosen_set = set_0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classes instanciations\n", + "\n", + "As a starting point, when first using the library, it is recommended to initialize the pre-processing, feature engineering, annotate data, and new random forest classes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning feature engineering with configs below...\u001b[37m\n", + "\u001b[1mlanguage -> \u001b[22mes\n", + "\u001b[1mdimensions -> \u001b[22m25\n" + ] + } + ], + "source": [ + "\n", + "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n", + "augment_data = hg.AugmentData(country=\"MEXICO\")\n", + "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\",\n", + " ignore_word_embedding=chosen_set.get('ignore_word_embedding'),\n", + " ignore_shannon_entropy=chosen_set.get('ignore_shannon_entropy'),\n", + " ignore_repeated_bigram_ratio=chosen_set.get('ignore_repeated_bigram_ratio'),\n", + " ignore_unique_char_ratio=chosen_set.get('ignore_unique_char_ratio'),\n", + " )\n", + "annotate_data = hg.AnnotateData()\n", + "new_rf_model = hg.RandomForestModel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n", + "\n", + "To showcase the capabilities of the Hygia library, we have provided a small sample of context-free data. However, the library is designed to handle a wide range of data types and can be customized to meet the unique needs of different datasets.\n", + "\n", + "We have leveraged the pandas library to read in the sample data, which is stored in a .csv file format. The following code block provides an example of how to import the pandas library and read in the sample data file.\n", + "\n", + "NOTE: Please check if the file_path matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n", + "df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add new columns\n", + "\n", + "The Hygia library is designed to meet the needs of data scientists, and as such, it generates new columns in the data provided to better facilitate the data analysis process. This helps users keep track of the pre-processing steps taken on the data and the features generated. Two distinct types of columns are generated:\n", + "\n", + "1. Concatenate address\n", + "2. 
All features columns:\n", + " - Key Smash\n", + " - Regex\n", + " - Word Embedding\n", + "\n", + "NOTE: Please check if the columns names matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n", + "handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n", + "extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + } + ], + "source": [ + "concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n", + "df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n", + "df = feature_engineering.extract_features(df, concatened_column_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check new columns names" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_repeated_bigram_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_unique_char_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_0_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_1_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_2_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_3_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_4_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_5_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_6_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_7_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_8_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_9_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_10_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_11_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_12_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_13_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_14_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_15_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_16_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_17_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_18_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_19_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_20_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_21_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_22_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_23_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_we_24_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 
'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_empty_concat_STREET_ADDRESS_1_STREET_ADDRESS_2']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n", + "all_features_columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Select Features\n", + "- remove word embeddings\n", + "- remove key smash feature: ratio_of_numeric_digits_squared" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "selected_features = all_features_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Annotate data\n", + "\n", + "The Hygia library has a dedicated class to assist in the process of annotating data using keyboard smashing threshold. This information can then be used to improve the performance of machine learning models by providing more relevant training data. The use of the Hygia library's annotation functions is a key step in ensuring that your data is ready for analysis and can lead to more accurate and reliable results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning annotate data with configs below...\u001b[37m\n", + "\u001b[1mthresholds -> \u001b[22m{'count_sequence_squared_vowels': ['above', 1.0], 'count_sequence_squared_consonants': ['above', 1.999], 'count_sequence_squared_special_characters': ['above', 2.2499], 'ratio_of_numeric_digits_squared': ['above', 2.9], 'average_of_char_count_squared': ['above', 2.78], 'shannon_entropy': ['below', 1.0], 'repeated_bigram_ratio': ['above', 1.7058], 'unique_char_ratio': ['below', 1.15789]}\n", + "column -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + }, + { + "data": { + "text/plain": [ + "valid 1337828\n", + "key_smash 645\n", + "contains_email 567\n", + "contains_exactly_the_word_test 177\n", + "only_special_characters 144\n", + "contains_context_invalid_words 128\n", + "contains_exactly_the_word_dell 125\n", + "only_numbers 106\n", + "only_one_char 14\n", + "contains_exactly_invalid_words 10\n", + "is_substring_of_column_name 3\n", + "contains_date 1\n", + "empty 1\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "key_smash_thresholds = {\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 1.0],\n", + " 'repeated_bigram_ratio' : ['above', 1.7058],\n", + " 'unique_char_ratio' : ['below', 1.15789],\n", + "}\n", + "\n", + "\n", + "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", + "df.drop_duplicates(subset=[concatened_column_name])['target'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "valid 2511552\n", + "contains_context_invalid_words 3079\n", + "key_smash 1472\n", + "only_special_characters 1291\n", + "contains_email 1045\n", + "contains_exactly_the_word_test 667\n", + "contains_exactly_the_word_dell 553\n", + "only_one_char 287\n", + "only_numbers 239\n", + "empty 71\n", + "contains_exactly_invalid_words 26\n", + "is_substring_of_column_name 12\n", + "contains_date 2\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['target'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiment: retrain model\n", + "\n", + "In addition to pre-processing and feature engineering, the Hygia library provides tools for training and retraining models. You can use the available models, or train your own using the functions provided. Once you have trained your model, you can use the prediction function to make predictions based on your data. Finally, the library includes functions for saving your model, so that you can use it again in the future." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mtranning model...\u001b[37m\n", + "\u001b[32mdone\u001b[37m\n", + "\u001b[33mget model score...\u001b[37m\n", + "\u001b[1maccuracy -> \u001b[22m0.9857142857142858\n", + "\u001b[1mprecision -> \u001b[22m0.967741935483871\n", + "\u001b[1mrecall -> \u001b[22m0.972972972972973\n", + "\u001b[1mf1 -> \u001b[22m0.9703504043126685\n" + ] + } + ], + "source": [ + "scores = new_rf_model.train_and_get_scores(df, concatened_column_name, selected_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predict using pre-trained model\n", + "\n", + "After retraining the model you can make the prediction and save the results." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning model...\u001b[37m\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0 1337014\n", + "1.0 2735\n", + "Name: prediction, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['prediction'] = new_rf_model.predict(df[selected_features], concatened_column_name)\n", + "df.drop_duplicates(subset=[concatened_column_name])['prediction'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Save model and predicted data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mexporting model and normalization absolutes...\u001b[37m\n" + ] + } + ], + "source": [ + "new_rf_model.export_model(f\"../data/models/{chosen_set['model_output']}\",\n", + " f\"../data/models/normalization_absolutes_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['prediction'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n", + " .drop_duplicates(subset=[concatened_column_name]) \\\n", + " .to_csv(f\"../data/tmp/{time.strftime('%Y%m%d-%H%M%S')}prediction_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We hope that this boilerplate provides you with a clear understanding of the capabilities of the Hygia library and inspires you to explore its full potential. With its comprehensive suite of tools, the Hygia library is a valuable resource for any data scientist looking to streamline their workflow and perform high-quality data analysis and modeling." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "acd904f7927719ac3bd428a31e6feadbc6c298bbba280a82d6227cca902ecf8e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/MEXICO_retrain_sets.ipynb b/examples/MEXICO_retrain_sets.ipynb new file mode 100644 index 00000000..bde5e47b --- /dev/null +++ b/examples/MEXICO_retrain_sets.ipynb @@ -0,0 +1,482 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exemplo de uso para treinar o modelo\n", + "\n", + "Welcome to the Hygia Boilerplate! This resource is designed to help data scientists understand and utilize the full capabilities of the Hygia library. The Hygia library provides a comprehensive suite of tools for pre-processing, feature engineering, model training, and prediction. By using this boilerplate, you will gain a deeper understanding of how to effectively use the library to perform various tasks in the data science pipeline.\n", + "\n", + "Starting with pre-processing, the Hygia library provides functions for cleaning and transforming your data. This is an important step in preparing your data for analysis and modeling. The library also includes functions for feature engineering, allowing you to create new features and extract insights from your data." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import hygia as hg\n", + "import time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chose your model based on the configs sets below" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "set_0 = {\n", + " 'set_name': 'rforest_ksmash_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': True,\n", + " 'model_output': 'RandomForest_Ksmash_Word_Embedding_Regex_Enrichments_Normalization.pkl',\n", + "}\n", + "set_1 = {\n", + " 'set_name': 'rforest_ksmash_regex_normal',\n", + " 'ignore_word_embedding': True,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "set_2 = {\n", + " 'set_name': 'rforest_ksmash_shannon_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_3 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 'ignore_unique_char_ratio': True,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "set_4 = {\n", + " 'set_name': 'rforest_ksmash_shannon_bigram_unique_wembedding_regex_normal',\n", + " 'ignore_word_embedding': False,\n", + " 'ignore_shannon_entropy': False,\n", + " 'ignore_repeated_bigram_ratio': False,\n", + " 
'ignore_unique_char_ratio': False,\n", + " 'model_output': 'RandomForest_Ksmash_Shannon_Bigram_Unique_Word_Embedding_Regex_Enrichments_Normalization.pkl'\n", + "}\n", + "\n", + "chosen_set = set_1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classes instanciations\n", + "\n", + "As a starting point, when first using the library, it is recommended to initialize the pre-processing, feature engineering, annotate data, and new random forest classes." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning feature engineering with configs below...\u001b[37m\n", + "\u001b[1mlanguage -> \u001b[22mes\n", + "\u001b[1mdimensions -> \u001b[22m25\n" + ] + } + ], + "source": [ + "\n", + "pre_process_data = hg.PreProcessData(country=\"MEXICO\")\n", + "augment_data = hg.AugmentData(country=\"MEXICO\")\n", + "feature_engineering = hg.FeatureEngineering(country=\"MEXICO\",\n", + " ignore_word_embedding=chosen_set.get('ignore_word_embedding'),\n", + " ignore_shannon_entropy=chosen_set.get('ignore_shannon_entropy'),\n", + " ignore_repeated_bigram_ratio=chosen_set.get('ignore_repeated_bigram_ratio'),\n", + " ignore_unique_char_ratio=chosen_set.get('ignore_unique_char_ratio'),\n", + " )\n", + "annotate_data = hg.AnnotateData()\n", + "new_rf_model = hg.RandomForestModel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data\n", + "\n", + "To showcase the capabilities of the Hygia library, we have provided a small sample of context-free data. However, the library is designed to handle a wide range of data types and can be customized to meet the unique needs of different datasets.\n", + "\n", + "We have leveraged the pandas library to read in the sample data, which is stored in a .csv file format. The following code block provides an example of how to import the pandas library and read in the sample data file.\n", + "\n", + "NOTE: Please check if the file_path matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '../data/tmp/AI_LATA_ADDRESS_MEX_modificado.csv'\n", + "df = pd.read_csv(file_path, sep='¨', nrows=None, engine='python')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add new columns\n", + "\n", + "The Hygia library is designed to meet the needs of data scientists, and as such, it generates new columns in the data provided to better facilitate the data analysis process. This helps users keep track of the pre-processing steps taken on the data and the features generated. Two distinct types of columns are generated:\n", + "\n", + "1. Concatenate address\n", + "2. 
All features columns:\n", + " - Key Smash\n", + " - Regex\n", + " - Word Embedding\n", + "\n", + "NOTE: Please check if the columns names matches your data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aliases indified: \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2 -> \u001b[22m['STREET_ADDRESS_1', 'STREET_ADDRESS_2']\n", + "handle null values in the column \u001b[1mconcat_STREET_ADDRESS_1_STREET_ADDRESS_2\u001b[22m\n", + "extract features from -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + } + ], + "source": [ + "concatened_column_name = 'concat_STREET_ADDRESS_1_STREET_ADDRESS_2'\n", + "df = pre_process_data.pre_process_data(df, ['STREET_ADDRESS_1', 'STREET_ADDRESS_2'], concatened_column_name)\n", + "df = feature_engineering.extract_features(df, concatened_column_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check new columns names" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['feature_ks_count_sequence_squared_vowels_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_consonants_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_count_sequence_squared_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_average_of_char_count_squared_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_shannon_entropy_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_repeated_bigram_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_ks_unique_char_ratio_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_context_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_dell_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_the_word_test_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_numbers_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_special_characters_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_email_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_url_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_date_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_contains_exactly_invalid_words_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_is_substring_of_column_name_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_one_char_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_only_white_spaces_concat_STREET_ADDRESS_1_STREET_ADDRESS_2',\n", + " 'feature_re_empty_concat_STREET_ADDRESS_1_STREET_ADDRESS_2']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_features_columns = [col for col in df if col.startswith('feature_ks') or col.startswith('feature_we') or col.startswith('feature_re')]\n", + "all_features_columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Select Features\n", + "- remove word embeddings\n", + "- remove key smash feature: ratio_of_numeric_digits_squared" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "selected_features = all_features_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "# Annotate data\n", + "\n", + "The Hygia library has a dedicated class to assist in the process of annotating data using keyboard smashing threshold. This information can then be used to improve the performance of machine learning models by providing more relevant training data. The use of the Hygia library's annotation functions is a key step in ensuring that your data is ready for analysis and can lead to more accurate and reliable results." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning annotate data with configs below...\u001b[37m\n", + "\u001b[1mthresholds -> \u001b[22m{'count_sequence_squared_vowels': ['above', 1.0], 'count_sequence_squared_consonants': ['above', 1.999], 'count_sequence_squared_special_characters': ['above', 2.2499], 'ratio_of_numeric_digits_squared': ['above', 2.9], 'average_of_char_count_squared': ['above', 2.78], 'shannon_entropy': ['below', 1.0], 'repeated_bigram_ratio': ['above', 1.7058], 'unique_char_ratio': ['below', 1.15789]}\n", + "column -> concat_STREET_ADDRESS_1_STREET_ADDRESS_2\n" + ] + }, + { + "data": { + "text/plain": [ + "valid 1337757\n", + "key_smash 716\n", + "contains_email 567\n", + "contains_exactly_the_word_test 177\n", + "only_special_characters 144\n", + "contains_context_invalid_words 128\n", + "contains_exactly_the_word_dell 125\n", + "only_numbers 106\n", + "only_one_char 14\n", + "contains_exactly_invalid_words 10\n", + "is_substring_of_column_name 3\n", + "contains_date 1\n", + "empty 1\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "key_smash_thresholds = {\n", + " 'count_sequence_squared_vowels': ['above', 1.00],\n", + " 'count_sequence_squared_consonants':['above', 1.999],\n", + " 'count_sequence_squared_special_characters': ['above', 2.2499],\n", + " 'ratio_of_numeric_digits_squared': ['above', 2.9],\n", + " 'average_of_char_count_squared': ['above', 2.78],\n", + " 'shannon_entropy' : ['below', 1.0],\n", + " 'repeated_bigram_ratio' : ['above', 1.7058],\n", + " 'unique_char_ratio' : ['below', 1.15789],\n", + "}\n", + "\n", + "\n", + "df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds)\n", + "df.drop_duplicates(subset=[concatened_column_name])['target'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "valid 2510903\n", + "contains_context_invalid_words 3079\n", + "key_smash 2121\n", + "only_special_characters 1291\n", + "contains_email 1045\n", + "contains_exactly_the_word_test 667\n", + "contains_exactly_the_word_dell 553\n", + "only_one_char 287\n", + "only_numbers 239\n", + "empty 71\n", + "contains_exactly_invalid_words 26\n", + "is_substring_of_column_name 12\n", + "contains_date 2\n", + "Name: target, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['target'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiment: retrain model\n", + "\n", + "In addition to pre-processing and feature engineering, the Hygia library provides tools for training and retraining models. You can use the available models, or train your own using the functions provided. 
Once you have trained your model, you can use the prediction function to make predictions based on your data. Finally, the library includes functions for saving your model, so that you can use it again in the future." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mtranning model...\u001b[37m\n", + "\u001b[32mdone\u001b[37m\n", + "\u001b[33mget model score...\u001b[37m\n", + "\u001b[1maccuracy -> \u001b[22m0.991389913899139\n", + "\u001b[1mprecision -> \u001b[22m0.9774774774774775\n", + "\u001b[1mrecall -> \u001b[22m0.9908675799086758\n", + "\u001b[1mf1 -> \u001b[22m0.9841269841269841\n" + ] + } + ], + "source": [ + "scores = new_rf_model.train_and_get_scores(df, concatened_column_name, selected_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predict using pre-trained model\n", + "\n", + "After retraining the model you can make the prediction and save the results." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mrunning model...\u001b[37m\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0 1338136\n", + "1.0 1613\n", + "Name: prediction, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['prediction'] = new_rf_model.predict(df[selected_features], concatened_column_name)\n", + "df.drop_duplicates(subset=[concatened_column_name])['prediction'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Save model and predicted data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mexporting model and normalization absolutes...\u001b[37m\n" + ] + } + ], + "source": [ + "new_rf_model.export_model(f\"../data/models/{chosen_set['model_output']}\",\n", + " f\"../data/models/normalization_absolutes_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['prediction'] == 1][[concatened_column_name, 'target', 'prediction']] \\\n", + " .drop_duplicates(subset=[concatened_column_name]) \\\n", + " .to_csv(f\"../data/tmp/{time.strftime('%Y%m%d-%H%M%S')}prediction_{chosen_set['set_name']}.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We hope that this boilerplate provides you with a clear understanding of the capabilities of the Hygia library and inspires you to explore its full potential. With its comprehensive suite of tools, the Hygia library is a valuable resource for any data scientist looking to streamline their workflow and perform high-quality data analysis and modeling." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "acd904f7927719ac3bd428a31e6feadbc6c298bbba280a82d6227cca902ecf8e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/hygia/data_pipeline/annotate_data/annotate_data.py b/hygia/data_pipeline/annotate_data/annotate_data.py index 4234beee..bddc796a 100644 --- a/hygia/data_pipeline/annotate_data/annotate_data.py +++ b/hygia/data_pipeline/annotate_data/annotate_data.py @@ -13,11 +13,12 @@ class AnnotateData: annotate_data = hg.AnnotateData() key_smash_thresholds = { - 'count_sequence_squared_vowels': 1.00, - 'count_sequence_squared_consonants': 1.999, - 'count_sequence_squared_special_characters': 2.2499, - 'ratio_of_numeric_digits_squared': 2.9, - 'average_of_char_count_squared': 2.78, + 'count_sequence_squared_vowels': ['above', 1.00], + 'count_sequence_squared_consonants': ['above', 1.999], + 'count_sequence_squared_special_characters': ['above', 2.2499], + # 'ratio_of_numeric_digits_squared': ['above', 2.9], + 'average_of_char_count_squared': ['above', 2.78], + 'shannon_entropy' : ['below', 2.0] } df = annotate_data.annotate_data(df, concatened_column_name, key_smash_thresholds) @@ -42,13 +43,13 @@ def annotate_data(self, df, concatened_column_name, ks_thresholds): df['target'] = 'valid' - ks_colummns = [col for col in df if col.startswith('feature_ks')] + ks_colummns = [col for col in df if col.startswith('feature_ks') and col.endswith(concatened_column_name)] for ks_colummn in ks_colummns: - threshold = float("inf") - for th in ks_thresholds: - if th in ks_colummn: - threshold = ks_thresholds[th] - df['target'] = df.apply(lambda x: 'key_smash' if x[ks_colummn] >= threshold else x['target'], axis=1) + threshold = ks_thresholds[ks_colummn.replace('feature_ks_', '').replace(f'_{concatened_column_name}', '')] + if threshold[0] == 'above': + df['target'] = df.apply(lambda x: 'key_smash' if x[ks_colummn] >= threshold[1] else x['target'], axis=1) + elif threshold[0] == 'below': + df['target'] = df.apply(lambda x: 'key_smash' if x[ks_colummn] <= threshold[1] else x['target'], axis=1) re_colummns = [col for col in df if col.startswith('feature_re')] for re_colummn in re_colummns: diff --git a/hygia/data_pipeline/feature_engineering/feature_engineering.py b/hygia/data_pipeline/feature_engineering/feature_engineering.py index 794601d8..73f3e591 100644 --- a/hygia/data_pipeline/feature_engineering/feature_engineering.py +++ b/hygia/data_pipeline/feature_engineering/feature_engineering.py @@ -22,7 +22,17 @@ class FeatureEngineering: \endcode """ - def __init__(self, lang:str='es', dimensions:int=25, model:str='bytepair', country:str=None, context_words_file:str=None): + def __init__(self, lang:str='es', + dimensions:int=25, + model:str='bytepair', + country:str=None, + context_words_file:str=None, + ignore_word_embedding:bool=False, + ignore_ratio_of_numeric_digits_squared:bool=True, + ignore_shannon_entropy:bool=True, + ignore_repeated_bigram_ratio:bool=True, + ignore_unique_char_ratio:bool=True, + ignore_regex_features:bool=False) -> None: """ Initialize the FeatureEngineering class. 
@@ -34,8 +44,12 @@ def __init__(self, lang:str='es', dimensions:int=25, model:str='bytepair', count print(f'{Style.BRIGHT}language -> {Style.NORMAL}{lang}') print(f'{Style.BRIGHT}dimensions -> {Style.NORMAL}{dimensions}') - - self.key_smash = KeySmash() + self.ignore_word_embedding = ignore_word_embedding + self.ignore_regex_features = ignore_regex_features + self.key_smash = KeySmash(ignore_ratio_of_numeric_digits_squared, + ignore_shannon_entropy, + ignore_repeated_bigram_ratio, + ignore_unique_char_ratio) self.word_embedding = WordEmbedding(lang=lang, dimensions=dimensions, model=model) self.regex = Regex(country=country, context_words_file=context_words_file) @@ -62,6 +76,8 @@ def extract_features(self, df: pd.DataFrame, text_column: str) -> pd.DataFrame: print(f'extract features from -> {text_column}') df = self.key_smash.extract_key_smash_features(df, text_column) - df = self.word_embedding.extract_word_embedding_features(df, text_column) - df = self.regex.extract_regex_features(df, text_column) + if not self.ignore_word_embedding: + df = self.word_embedding.extract_word_embedding_features(df, text_column) + if not self.ignore_regex_features: + df = self.regex.extract_regex_features(df, text_column) return df diff --git a/hygia/data_pipeline/feature_engineering/key_smash.py b/hygia/data_pipeline/feature_engineering/key_smash.py index 941ec1f6..323f6584 100644 --- a/hygia/data_pipeline/feature_engineering/key_smash.py +++ b/hygia/data_pipeline/feature_engineering/key_smash.py @@ -1,7 +1,8 @@ -from statistics import mean import pandas as pd +import numpy as np import re +MAX_STRING_LENGTH = 128 class KeySmash: """ A class for calculating metrics to indicate key smashing behavior in a text. @@ -20,13 +21,22 @@ class KeySmash: \endcode """ - def __init__(self): + def __init__(self, + ignore_ratio_of_numeric_digits_squared:bool=True, + ignore_shannon_entropy:bool=True, + ignore_repeated_bigram_ratio:bool=True, + ignore_unique_char_ratio:bool=True + ): """ Initialize the KeySmash class. """ + self.ignore_shannon_entropy = ignore_shannon_entropy + self.ignore_ratio_of_numeric_digits_squared = ignore_ratio_of_numeric_digits_squared + self.ignore_repeated_bigram_ratio = ignore_repeated_bigram_ratio + self.ignore_unique_char_ratio = ignore_unique_char_ratio self.char_sets = { "vowels": 'aeiouáéíóúãõ', - "consonants": 'bcdfghjklmnñpqrstvwxyz', + "consonants": 'bcdfghjklmnñpqrstvwxz', # except 'y' "special_characters": '!@#$%^¨|\'\"&*()_+:;~`´]}{[}ºª=-.¿¡' } @@ -160,18 +170,57 @@ def ratio_of_numeric_digits_squared(self, text:str) -> float: return num_of_numeric_digits / len(' '.join(text_list)) else: return 0 - - def _normalize_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame: + + def shannon_entropy(self, text:str) -> float: """ - Normalize a given column in a dataframe. + Calculates the Shannon entropy for the given string. + + \param text (Type: str) The text to extract the metric from. - \param df (Type: DataFrame) Dataframe to normalize the column in. - \param column (Type: str) Name of the column to be normalized. + \return (Type: float) Shannon entropy (min bits per byte-character). 
+ """ + text = str(text) + text = text.replace(" ", "") + size = len(text) + if size < 2: + return 0.0 + unique_chars = set(text) + freq_dict = {char: text.count(char) for char in unique_chars} + freq_array = np.array(list(freq_dict.values()), dtype=float) + prob_array = freq_array / size + log_array = np.log2(prob_array) + ent = -np.sum(prob_array * log_array) + return ent + + def __count_repeated_bigrams(self, text:str): + bigrams = [text[i:i+2] for i in range(len(text)-1)] + unique_bigrams = set(bigrams) + count = len(bigrams) - len(unique_bigrams) + return count - \return (Type: DataFrame) The input dataframe with the normalized column. + def repeated_bigram_ratio(self, text:str) -> float: """ - return df[column] / df[column].abs().max() if df[column].abs().max() != 0.0 else 0.0 + Calculates the Repeated Bigrams Ratio for the given string. + \param text (Type: str) The text to extract the metric from. + + \return (Type: float) Repeated Bigrams Ratio (min bits per byte-character). + """ + repeated_bigram_count = self.__count_repeated_bigrams(text) + ratio = repeated_bigram_count * 1.0 / len(text) + 1 + return ratio + + def unique_char_ratio(self, text:str) -> float: + """ + Calculates the Unique Char Ratio for the given string. + + \param text (Type: str) The text to extract the metric from. + + \return (Type: float) Unique Char Ratio (min bits per byte-character). + """ + unique_chars = set(text) + ratio = len(unique_chars) / len(text) + 1 + return ratio def extract_key_smash_features(self, df:pd.DataFrame, column_name:str) -> pd.DataFrame: """ @@ -181,7 +230,15 @@ def extract_key_smash_features(self, df:pd.DataFrame, column_name:str) -> pd.Dat \param column_name (Type: str) Name of the column in the dataframe that contains the text data to extract features from. \param normalize (bool, optional) Indicates whether to normalize the key smash feature columns. Default is True. 
- \return (Type: DataFrame) The input dataframe with additional columns for key smash features: 'irregular_sequence_vowels', 'irregular_sequence_consonants', 'irregular_sequence_special_characters', 'number_count_metric', 'char_frequency_metric' + \return (Type: DataFrame) The input dataframe with additional columns for key smash features: + 'irregular_sequence_vowels', + 'irregular_sequence_consonants', + 'irregular_sequence_special_characters', + 'number_count_metric', + 'char_frequency_metric', + 'shannon_entropy', + 'repeated_bigram_ratio', + 'unique_char_ratio' Examples Use this function like this: @@ -198,7 +255,14 @@ def extract_key_smash_features(self, df:pd.DataFrame, column_name:str) -> pd.Dat df[f'feature_ks_count_sequence_squared_vowels_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.count_sequence_squared(x, 'vowels') if len(x) > 0 else 0.0) df[f'feature_ks_count_sequence_squared_consonants_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.count_sequence_squared(x, 'consonants') if len(x) > 0 else 0.0) df[f'feature_ks_count_sequence_squared_special_characters_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.count_sequence_squared(x, 'special_characters') if len(x) > 0 else 0.0) - df[f'feature_ks_ratio_of_numeric_digits_squared_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.ratio_of_numeric_digits_squared(x) if len(x) > 0 else 0.0) + if not self.ignore_ratio_of_numeric_digits_squared: + df[f'feature_ks_ratio_of_numeric_digits_squared_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.ratio_of_numeric_digits_squared(x) if len(x) > 0 else 0.0) df[f'feature_ks_average_of_char_count_squared_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.average_of_char_count_squared(x) if len(x) > 0 else 0.0) + if not self.ignore_shannon_entropy: + df[f'feature_ks_shannon_entropy_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.shannon_entropy(x) if len(x) > 0 else 0.0) + if not self.ignore_repeated_bigram_ratio: + df[f'feature_ks_repeated_bigram_ratio_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.repeated_bigram_ratio(x) if len(x) > 0 else 0.0) + if not self.ignore_unique_char_ratio: + df[f'feature_ks_unique_char_ratio_{column_name}'] = df[column_name].fillna('').apply(lambda x: self.unique_char_ratio(x) if len(x) > 0 else 0.0) return df \ No newline at end of file diff --git a/hygia/data_pipeline/feature_engineering/word_embedding.py b/hygia/data_pipeline/feature_engineering/word_embedding.py index 704d372e..ab309f68 100644 --- a/hygia/data_pipeline/feature_engineering/word_embedding.py +++ b/hygia/data_pipeline/feature_engineering/word_embedding.py @@ -35,9 +35,9 @@ def __init__(self, lang: str = 'es', dimensions: int = 25, model: str = 'bytepai self.lang = lang self.dimensions = dimensions self.model = model - self.word_embedding_model = self._load_model() + self.word_embedding_model = self.__load_model() - def _load_model(self) -> Any: + def __load_model(self) -> Any: """ Load the word embedding model. 
@@ -56,7 +56,7 @@ def _load_model(self) -> Any: else: raise ValueError - def _pre_embedding(self, text: str) -> str: + def __pre_embedding(self, text: str) -> str: text = ' '.join(e for e in text.split() if e.isalpha() and len(e) >= 3 and not e.isspace()) return text @@ -85,7 +85,7 @@ def get_embedding(self, text: str) -> np.ndarray: empty_vector = [0.0] * self.dimensions - text = self._pre_embedding(text) + text = self.__pre_embedding(text) # White space string if len(text.strip().split()) == 0: diff --git a/hygia/data_pipeline/model/random_forest.py b/hygia/data_pipeline/model/random_forest.py index 7fe0e818..664bc054 100644 --- a/hygia/data_pipeline/model/random_forest.py +++ b/hygia/data_pipeline/model/random_forest.py @@ -22,7 +22,11 @@ class RandomForestModel: scores \endcode """ - def __init__(self, model_file=None, normalization_absolutes_file=None ,n_estimators=100, max_depth=None, random_state=0, normalize=True): + def __init__(self, model_file=None, + normalization_absolutes_file=None, + n_estimators=100, max_depth=None, + random_state=0, + normalize=True) -> None: """ Initialize the RandomForestModel class. @@ -44,7 +48,7 @@ def __init__(self, model_file=None, normalization_absolutes_file=None ,n_estimat self.random_state = random_state self.model = RandomForestClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth, random_state=self.random_state) - def _get_absolute_maximums(self, df, features_columns_to_normalize, concatened_column_name): + def __get_absolute_maximums(self, df, features_columns_to_normalize, concatened_column_name): if self.normalization_absolutes: return absolutes_dict = {} @@ -53,7 +57,7 @@ def _get_absolute_maximums(self, df, features_columns_to_normalize, concatened_c absolutes_dict[column.replace(f"_{concatened_column_name}", '')] = [absolute_maximum] if absolute_maximum else [1.0] self.normalization_absolutes = pd.DataFrame(absolutes_dict) - def _normalization(self, df, features_columns_to_normalize, concatened_column_name): + def __normalization(self, df, features_columns_to_normalize, concatened_column_name): if not self.normalize: return df for features_column_to_normalize in features_columns_to_normalize: @@ -83,8 +87,8 @@ def train_and_get_scores(self, df, concatened_column_name, all_features_columns, # Normalization key_smash_features_columns = [column for column in all_features_columns if column.startswith('feature_ks')] - self._get_absolute_maximums(df_balanced, key_smash_features_columns, concatened_column_name) - df_balanced_normalized = self._normalization(df_balanced.copy(), key_smash_features_columns, concatened_column_name) + self.__get_absolute_maximums(df_balanced, key_smash_features_columns, concatened_column_name) + df_balanced_normalized = self.__normalization(df_balanced.copy(), key_smash_features_columns, concatened_column_name) # Train/Test split X = df_balanced_normalized[[*all_features_columns]].values @@ -123,7 +127,7 @@ def predict(self, X, concatened_column_name): print(f'{Fore.YELLOW}running model...{Fore.WHITE}') key_smash_features_columns = [column for column in X.columns if column.startswith('feature_ks')] - X = self._normalization(X.copy(), key_smash_features_columns, concatened_column_name) + X = self.__normalization(X.copy(), key_smash_features_columns, concatened_column_name) return self.model.predict(X.values) diff --git a/hygia/data_pipeline/pre_process_data/pre_process_data.py b/hygia/data_pipeline/pre_process_data/pre_process_data.py index 807ef8b8..13132a77 100644 --- 
a/hygia/data_pipeline/pre_process_data/pre_process_data.py +++ b/hygia/data_pipeline/pre_process_data/pre_process_data.py @@ -7,7 +7,8 @@ class PreProcessData: """ This class presents a series of functions that help in data pre-processing. As concatenate columns, replace abbreviation, and etc. - + + Some abbreviations were taken from this website: https://en.wikipedia.org/wiki/Template:Mexico_State-Abbreviation_Codes Examples - Use this class like this: @@ -67,14 +68,14 @@ def handle_nulls(self, df, column_name): """ print(f'handle null values in the column {Style.BRIGHT}{column_name}{Style.NORMAL}') - df[column_name].fillna('').astype(str) + df[column_name] = df[column_name].fillna('').astype(str) return df def handle_extra_spaces(self, df, column_name:str) -> str: df[column_name] = df[column_name].apply(lambda x: ' '.join(x.split())) return df - def _replace_abbreviation(self, text:str) -> str: + def __replace_abbreviation(self, text:str) -> str: """ Function that identifies abbreviations and according to the dictionary changes the names @@ -93,7 +94,7 @@ def handle_abreviations(self, df, column_name): \param column_name (Type: str) Column name to check """ - df[column_name] = df[column_name].apply(lambda x: self._replace_abbreviation(x)) + df[column_name] = df[column_name].apply(lambda x: self.__replace_abbreviation(x)) return df def pre_process_data(self, df, columns_to_concat=None, column_name=None): diff --git a/tests/data_pipeline/annotate_data/test_annotate_data.py b/tests/data_pipeline/annotate_data/test_annotate_data.py index 5e485929..dde4cf81 100644 --- a/tests/data_pipeline/annotate_data/test_annotate_data.py +++ b/tests/data_pipeline/annotate_data/test_annotate_data.py @@ -16,11 +16,12 @@ def test_annotate_data(self): }) key_smash_thresholds = { - 'count_sequence_squared_vowels': 0.9, - 'count_sequence_squared_consonants': 0.9, - 'count_sequence_squared_special_characters': 0.9, - 'ratio_of_numeric_digits_squared': 0.9, - 'average_of_char_count_squared': 0.9, + 'count_sequence_squared_vowels': ['above', 1.00], + 'count_sequence_squared_consonants':['above', 1.999], + 'count_sequence_squared_special_characters': ['above', 2.2499], + # 'ratio_of_numeric_digits_squared': ['above', 2.9], + 'average_of_char_count_squared': ['above', 2.78], + 'shannon_entropy' : ['below', 2.0] } result = self.annotate_data.annotate_data(df, concatened_column_name='concat_address', ks_thresholds=key_smash_thresholds) diff --git a/tests/data_pipeline/feature_engineering/test_feature_engineering.py b/tests/data_pipeline/feature_engineering/test_feature_engineering.py index 3c973ecf..564de197 100644 --- a/tests/data_pipeline/feature_engineering/test_feature_engineering.py +++ b/tests/data_pipeline/feature_engineering/test_feature_engineering.py @@ -20,6 +20,5 @@ def test_extract_features(self, feature_engineering, dataframe): assert 'feature_ks_count_sequence_squared_vowels_text_column' in df.columns assert 'feature_ks_count_sequence_squared_consonants_text_column' in df.columns assert 'feature_ks_count_sequence_squared_special_characters_text_column' in df.columns - assert 'feature_ks_ratio_of_numeric_digits_squared_text_column' in df.columns assert 'feature_ks_average_of_char_count_squared_text_column' in df.columns assert 'feature_we_0_text_column' in df.columns diff --git a/tests/data_pipeline/feature_engineering/test_key_smash.py b/tests/data_pipeline/feature_engineering/test_key_smash.py index e72bcec6..bb5bb88b 100644 --- a/tests/data_pipeline/feature_engineering/test_key_smash.py +++ 
b/tests/data_pipeline/feature_engineering/test_key_smash.py @@ -5,7 +5,7 @@ class TestKeySmash: def setup_method(self): - self.key_smash = KeySmash() + self.key_smash = KeySmash(ignore_shannon_entropy=False) @pytest.mark.parametrize("data, expected_output", [ ("PUENTECILLA KM. 1.7", 1.121212121212121), @@ -16,7 +16,8 @@ def test_average_of_char_count_squared(self, data, expected_output): @pytest.mark.parametrize("data, opt, expected_output", [ ("PUENTECILLA KM. 1.7", "vowels", 0.0), - ("ASDASD XXXX", "consonants", 2.272727272727273) + ("ASDASD XXXX", "consonants", 2.272727272727273), + ("ABC123 !@#$%", "special_characters", 2.0833333333333335) ]) def test_count_sequence_squared(self, data, opt, expected_output): assert self.key_smash.count_sequence_squared(data, opt) == expected_output @@ -29,6 +30,33 @@ def test_count_sequence_squared(self, data, opt, expected_output): def test_ratio_of_numeric_digits_squared(self, data, expected_output): assert self.key_smash.ratio_of_numeric_digits_squared(data) == expected_output + @pytest.mark.parametrize("data, expected_output", [ + ("PUENTECILLA KM. 1.7",3.7345216647797517), + ("ASDASD XXXX",1.9219280948873623), + ("AS AA",0.8112781244591328), + ("XX XX",-0.0) + ]) + def test_shannon_entropy(self, data, expected_output): + assert self.key_smash.shannon_entropy(data) == expected_output + + @pytest.mark.parametrize("data, expected_output", [ + ("PUENTECILLA KM. 1.7",1.0), + ("ASDASD XXXX",1.3636363636363638), + ("AAAAAA AAAA",1.6363636363636362), + ("XX XX",1.2) + ]) + def test_repeated_bigram_ratio(self, data, expected_output): + assert self.key_smash.repeated_bigram_ratio(data) == expected_output + + @pytest.mark.parametrize("data, expected_output", [ + ("PUENTECILLA KM. 1.7",1.7894736842105263), + ("ASDASD XXXX",1.4545454545454546), + ("AAAAAA AAAA",1.1818181818181819), + ("XX XX",1.4) + ]) + def test_unique_char_ratio(self, data, expected_output): + assert self.key_smash.unique_char_ratio(data) == expected_output + def test_extract_key_smash_features(self): df = pd.DataFrame({"text_column": ["abcdefgh", "ijklmnop", "qrstuvwxyz"]}) result = self.key_smash.extract_key_smash_features(df, "text_column") @@ -36,6 +64,6 @@ def test_extract_key_smash_features(self): assert 'feature_ks_count_sequence_squared_vowels_text_column' in result.columns assert 'feature_ks_count_sequence_squared_consonants_text_column' in result.columns assert 'feature_ks_count_sequence_squared_special_characters_text_column' in result.columns - assert 'feature_ks_ratio_of_numeric_digits_squared_text_column' in result.columns assert 'feature_ks_average_of_char_count_squared_text_column' in result.columns + assert 'feature_ks_shannon_entropy_text_column' in result.columns assert result.shape[1] == 6 # Ensure no extra columns are added \ No newline at end of file diff --git a/tests/data_pipeline/feature_engineering/test_word_embedding.py b/tests/data_pipeline/feature_engineering/test_word_embedding.py index 1c1d688f..5bdfd300 100644 --- a/tests/data_pipeline/feature_engineering/test_word_embedding.py +++ b/tests/data_pipeline/feature_engineering/test_word_embedding.py @@ -1,26 +1,25 @@ import pytest import pandas as pd import numpy as np -from whatlies.language import BytePairLanguage from hygia import WordEmbedding class TestWordEmbedding: - def setup_method(self): + @pytest.fixture(autouse=True) + def setup_class(self): self.word_embedding = WordEmbedding() - def test_load_model(self): - assert isinstance(self.word_embedding._load_model(), BytePairLanguage) - + def 
test_pre_embedding(self): + text = 'A test with ABC123 AVENUE' + pre_embedding = self.word_embedding._WordEmbedding__pre_embedding(text) + assert pre_embedding == 'test with AVENUE' + def test_get_embedding(self): embedding = self.word_embedding.get_embedding("This is a sample text.") assert isinstance(embedding, np.ndarray) - def test_pre_embedding(self): - assert self.word_embedding._pre_embedding("A test with ABC123 AVENUE") == "test with AVENUE" - def test_extract_word_embedding_features(self): df = pd.DataFrame({"text_column": ["This is a sample text.", "Another sample text."]}) result = self.word_embedding.extract_word_embedding_features(df, "text_column") assert isinstance(result, pd.DataFrame) - assert any(col.startswith("feature_we_") for col in result.columns) \ No newline at end of file + assert any(col.startswith("feature_we_") for col in result.columns) diff --git a/tests/data_pipeline/model/test_random_forest.py b/tests/data_pipeline/model/test_random_forest.py index 433bcc75..4d9dff47 100644 --- a/tests/data_pipeline/model/test_random_forest.py +++ b/tests/data_pipeline/model/test_random_forest.py @@ -1,10 +1,20 @@ import pandas as pd +from sklearn.datasets import make_classification from hygia import RandomForestModel class TestRandomForestModel: - def setup_method(self): - self.random_forest = RandomForestModel() - - def test_random_forest(self): - # TODO improve model tests - assert self.random_forest \ No newline at end of file + def test_random_forest_model(self): + X, y = make_classification(random_state=42) + + columns = ['feature_'+str(i) for i in range(X.shape[1])] + df = pd.DataFrame(X, columns=columns) + df['target'] = ['valid' if label == 0 else 'key_smash' for label in y] + + model = RandomForestModel(normalize=False) + + scores = model.train_and_get_scores(df, 'target', columns) + + assert scores['accuracy'] >= 0.0 and scores['accuracy'] <= 1 + assert scores['precision'] >= 0.0 and scores['precision'] <= 1 + assert scores['recall'] >= 0.0 and scores['recall'] <= 1 + assert scores['f1'] >= 0.0 and scores['f1'] <= 1 diff --git a/tests/data_pipeline/pre_process_data/test_pre_process_data.py b/tests/data_pipeline/pre_process_data/test_pre_process_data.py index ea54d05a..1b223816 100644 --- a/tests/data_pipeline/pre_process_data/test_pre_process_data.py +++ b/tests/data_pipeline/pre_process_data/test_pre_process_data.py @@ -1,23 +1,43 @@ import pytest - +import pandas as pd from hygia.data_pipeline.pre_process_data.pre_process_data import PreProcessData -from hygia.paths.paths import root_path -@pytest.mark.parametrize("abbreviation, expected_replacement", [ - ('NO', "NUMBER"), - ('no', "NUMBER"), - ('no123', "NUMBER123"), - ('no 123', "NUMBER 123"), - ('123 no', "123 NUMBER"), - ('not', "not"), - ('NOT', "NOT"), - ('ono', "ono") -]) class TestPreProcessData: - def test_replace_abbreviation_coutry(self, abbreviation, expected_replacement): - pre_process_data = PreProcessData(country='MEXICO') - assert pre_process_data._replace_abbreviation(abbreviation) == expected_replacement - - def test_replace_abbreviation_abbreviations_file(self, abbreviation, expected_replacement): - pre_process_data = PreProcessData(abbreviations_file=root_path + '/data/dicts/mexico_abbreviations.csv') - assert pre_process_data._replace_abbreviation(abbreviation) == expected_replacement + def setup_method(self): + self.pre_processor = PreProcessData(country='MEXICO') + + def test_concatenate_columns(self): + data = {'A': ['a', 'b', 'c'], 'B': ['d', 'e', 'f']} + df = pd.DataFrame(data) + 
expected_output = ['a d', 'b e', 'c f'] + output = self.pre_processor.concatenate_columns(df, ['A', 'B'], 'C') + assert list(output['C']) == expected_output + + def test_handle_nulls(self): + data = {'A': ['a', 'b', None]} + df = pd.DataFrame(data) + expected_output = ['a', 'b', ''] + output = self.pre_processor.handle_nulls(df, 'A') + assert list(output['A']) == expected_output + + def test_handle_extra_spaces(self): + data = {'A': [' a ', ' b ', ' c']} + df = pd.DataFrame(data) + expected_output = ['a', 'b', 'c'] + output = self.pre_processor.handle_extra_spaces(df, 'A') + assert list(output['A']) == expected_output + + def test_handle_abreviations(self): + data = {'A': ['CDMX', 'MZ', 'BCN']} + df = pd.DataFrame(data) + expected_output = ['Ciudad de México', 'MANZANA', 'Baja California'] + output = self.pre_processor.handle_abreviations(df, 'A') + assert list(output['A']) == expected_output + + def test_pre_process_data(self): + data = {'A': [' a ', ' b ', ' c'], 'B': ['d', 'e', 'f'], 'C': ['NLE', 'CANCUN', 'NLE Monterrey']} + df = pd.DataFrame(data) + expected_output = ['a d', 'b e', 'c f'] + output = self.pre_processor.pre_process_data(df, ['A', 'B'], 'D') + assert 'D' in output.columns + assert list(output['D']) == expected_output
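Taken together, these changes let the pipeline run with only the key-smash features: the expanded Mexico abbreviation dictionary feeds pre-processing, and the new ignore_* flags switch the word-embedding and regex features off while opting in to the three new metrics. A rough end-to-end sketch under those assumptions; the import paths, column names, and sample rows are illustrative, not part of the patch:

    import pandas as pd

    # Import paths assumed from the repository layout; adjust if the package exports differ.
    from hygia.data_pipeline.pre_process_data.pre_process_data import PreProcessData
    from hygia.data_pipeline.feature_engineering.feature_engineering import FeatureEngineering

    df = pd.DataFrame({'street': ['BLVD JUAREZ', 'asdfgh qweqwe'],
                       'state': ['NLE', 'QR']})

    # Concatenate and clean the address columns using the Mexico dictionary.
    pre = PreProcessData(country='MEXICO')
    df = pre.pre_process_data(df, ['street', 'state'], 'concat_address')

    # Extract only the key-smash features: embeddings and regex are switched off,
    # the three new metrics are switched on.
    fe = FeatureEngineering(ignore_word_embedding=True,
                            ignore_regex_features=True,
                            ignore_shannon_entropy=False,
                            ignore_repeated_bigram_ratio=False,
                            ignore_unique_char_ratio=False)
    df = fe.extract_features(df, 'concat_address')

    print([c for c in df.columns if c.startswith('feature_ks_')])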