From f7781441e74eed00f2bff5e111aa78f4a93cc5f5 Mon Sep 17 00:00:00 2001 From: Ana Hoffmann Date: Mon, 5 Jun 2023 14:04:35 -0300 Subject: [PATCH] #98 Fixed problems with PRE PROCESSING, FEATURE ENGINEERING and ANNOTATE DATA --- config/default_config2.yaml | 47 +++++++++++++++++++++++++++++++++++++ examples/yaml_example.ipynb | 44 +++++++++++++++++++++++++++++++--- 2 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 config/default_config2.yaml diff --git a/config/default_config2.yaml b/config/default_config2.yaml new file mode 100644 index 00000000..25a621f4 --- /dev/null +++ b/config/default_config2.yaml @@ -0,0 +1,47 @@ +dag: + id: basic_example + description: DAG de teste + output_folder: '../data/tmp/' + data_path: + - '../data/data_example.csv' + data_config: + separator: ',' + engine: 'python' + encoding: 'utf-8' + nrows: 100000 + + pre_processing: + aliases: + - street: ['ADDRESS', 'ZIPCODE'] + + feature_engineering: + - input: + columns: + - 'street' + features: + word_embedding: + data_lang: 'es' + street: + dimensions: 25 + + annotate_data: + - input: + columns: + - 'street' + thresholds: + count_sequence_squared_vowels: 1.00 + count_sequence_squared_consonants: 1.999 + count_sequence_squared_special_characters: 2.2499 + ratio_of_numeric_digits_squared: 2.9 + average_of_char_count_squared: 2.78 + + model: + random_forest: + - input: + trained_model_file: '../data/models/RandomForest_Ksmash_WordEmbedding_Regex.pkl' + type: 'address' + columns: + - 'street' + thresholds: + test_size: 0.3 + n_estimators: 100 diff --git a/examples/yaml_example.ipynb b/examples/yaml_example.ipynb index 20ebdb61..46cbcb6e 100644 --- a/examples/yaml_example.ipynb +++ b/examples/yaml_example.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -11,13 +12,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[35m------ HYGIA ------\u001b[37m\n", + "------------------------------\n", + "\u001b[47m\u001b[30mRunning PRE PROCESSING...\u001b[40m\u001b[37m\n", + "aliases indified: \u001b[1mstreet -> \u001b[22m['ADDRESS', 'ZIPCODE']\n", + "handle null values in the column \u001b[1mstreet\u001b[22m\n", + "------------------------------\n", + "\u001b[47m\u001b[30mRunning FEATURE ENGINEERING...\u001b[40m\u001b[37m\n", + "handle null values in the column \u001b[1mstreet\u001b[22m\n", + "\u001b[33mrunning feature engineering with configs below...\u001b[37m\n", + "\u001b[1mlanguage -> \u001b[22mes\n", + "\u001b[1mdimensions -> \u001b[22m25\n", + "extract features from -> street\n", + "------------------------------\n", + "\u001b[47m\u001b[30mRunning ANNOTATE DATA...\u001b[40m\u001b[37m\n", + "\u001b[33mrunning annotate data with configs below...\u001b[37m\n", + "\u001b[1mthresholds -> \u001b[22m{'count_sequence_squared_vowels': 1.0, 'count_sequence_squared_consonants': 1.999, 'count_sequence_squared_special_characters': 2.2499, 'ratio_of_numeric_digits_squared': 2.9, 'average_of_char_count_squared': 2.78, 'ksmash_sequence_vowels': 1.0, 'ksmash_sequence_consonants': 1.999, 'ksmash_sequence_special_characters': 2.2499, 'ksmash_numbers': 2.9, 'ksmash_char_frequence': 2.78}\n", + "column -> street\n" + ] + }, + { + "ename": "TypeError", + "evalue": "'float' object is not subscriptable", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mhygia\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mhg\u001b[39;00m\n\u001b[1;32m 3\u001b[0m config_file \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m../config/default_config2.yaml\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m result \u001b[39m=\u001b[39m hg\u001b[39m.\u001b[39;49mrun_with_config(config_file)\n\u001b[1;32m 5\u001b[0m result \n", + "File \u001b[0;32m~/hygia/hygia/main.py:79\u001b[0m, in \u001b[0;36mrun_with_config\u001b[0;34m(yaml_path)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[39mfor\u001b[39;00m column \u001b[39min\u001b[39;00m columns: \n\u001b[1;32m 78\u001b[0m thresholds \u001b[39m=\u001b[39m annotate_data_config[\u001b[39m'\u001b[39m\u001b[39mthresholds\u001b[39m\u001b[39m'\u001b[39m]\n\u001b[0;32m---> 79\u001b[0m df \u001b[39m=\u001b[39m annotateData()\u001b[39m.\u001b[39;49mannotate_data(df, column, thresholds) \n\u001b[1;32m 81\u001b[0m \u001b[39m# Model\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m30\u001b[39m\u001b[39m*\u001b[39m\u001b[39m'\u001b[39m\u001b[39m-\u001b[39m\u001b[39m'\u001b[39m)\n", + "File \u001b[0;32m~/hygia/hygia/data_pipeline/annotate_data/annotate_data.py:49\u001b[0m, in \u001b[0;36mAnnotateData.annotate_data\u001b[0;34m(self, df, concatened_column_name, ks_thresholds)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[39mfor\u001b[39;00m ks_colummn \u001b[39min\u001b[39;00m ks_colummns:\n\u001b[1;32m 48\u001b[0m threshold \u001b[39m=\u001b[39m ks_thresholds[ks_colummn\u001b[39m.\u001b[39mreplace(\u001b[39m'\u001b[39m\u001b[39mfeature_ks_\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)]\n\u001b[0;32m---> 49\u001b[0m \u001b[39mif\u001b[39;00m threshold[\u001b[39m0\u001b[39;49m] \u001b[39m==\u001b[39m \u001b[39m'\u001b[39m\u001b[39mabove\u001b[39m\u001b[39m'\u001b[39m:\n\u001b[1;32m 50\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mtarget\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m df\u001b[39m.\u001b[39mapply(\u001b[39mlambda\u001b[39;00m x: \u001b[39m'\u001b[39m\u001b[39mkey_smash\u001b[39m\u001b[39m'\u001b[39m \u001b[39mif\u001b[39;00m x[ks_colummn] \u001b[39m>\u001b[39m\u001b[39m=\u001b[39m threshold[\u001b[39m1\u001b[39m] \u001b[39melse\u001b[39;00m x[\u001b[39m'\u001b[39m\u001b[39mtarget\u001b[39m\u001b[39m'\u001b[39m], axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m) \n\u001b[1;32m 51\u001b[0m \u001b[39melif\u001b[39;00m threshold[\u001b[39m0\u001b[39m] \u001b[39m==\u001b[39m \u001b[39m'\u001b[39m\u001b[39mbelow\u001b[39m\u001b[39m'\u001b[39m:\n", + "\u001b[0;31mTypeError\u001b[0m: 'float' object is not subscriptable" + ] + } + ], "source": [ "import hygia as hg\n", "\n", - "config_file = '../config/default_config.yaml'\n", + "config_file = '../config/default_config2.yaml'\n", "result = hg.run_with_config(config_file)\n", "result " ]