Skip to content

Commit

Permalink
#98 Fixed problems with PRE PROCESSING, FEATURE ENGINEERING and ANNOT…
Browse files Browse the repository at this point in the history
…ATE DATA
  • Loading branch information
AnHoff committed Jun 5, 2023
1 parent 496deb7 commit f778144
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 3 deletions.
47 changes: 47 additions & 0 deletions config/default_config2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Example pipeline configuration. examples/yaml_example.ipynb loads this file
# through hg.run_with_config('../config/default_config2.yaml').
dag:
id: basic_example
# Portuguese for "test DAG".
description: DAG de teste
output_folder: '../data/tmp/'
data_path:
- '../data/data_example.csv'
# presumably options for reading the CSV listed in data_path -- TODO confirm against the loader.
data_config:
separator: ','
engine: 'python'
encoding: 'utf-8'
nrows: 100000

# Declares the alias column `street` from the raw ADDRESS and ZIPCODE columns
# (presumably by concatenating them -- confirm against the pre-processing step).
pre_processing:
aliases:
- street: ['ADDRESS', 'ZIPCODE']

# Features are extracted from the `street` alias; the run log reports
# data_lang as "language" and dimensions as "dimensions".
feature_engineering:
- input:
columns:
- 'street'
features:
word_embedding:
data_lang: 'es'
street:
dimensions: 25

# Each threshold is a [direction, value] pair. AnnotateData.annotate_data reads
# threshold[0] ('above' / 'below') and compares the feature with threshold[1];
# with bare floats the run aborts with
# "TypeError: 'float' object is not subscriptable".
# With 'above', a row is tagged 'key_smash' when the feature is >= value.
# NOTE(review): 'above' is assumed for all five features -- confirm the
# intended direction against AnnotateData.annotate_data.
annotate_data:
- input:
columns:
- 'street'
thresholds:
count_sequence_squared_vowels: ['above', 1.00]
count_sequence_squared_consonants: ['above', 1.999]
count_sequence_squared_special_characters: ['above', 2.2499]
ratio_of_numeric_digits_squared: ['above', 2.9]
average_of_char_count_squared: ['above', 2.78]

# Random-forest model settings applied to the `street` column.
# NOTE(review): trained_model_file is a .pkl; if it is loaded with pickle,
# only use files from a trusted source.
model:
random_forest:
- input:
trained_model_file: '../data/models/RandomForest_Ksmash_WordEmbedding_Regex.pkl'
type: 'address'
columns:
- 'street'
# presumably the train/test split fraction and the number of trees -- TODO confirm.
thresholds:
test_size: 0.3
n_estimators: 100
44 changes: 41 additions & 3 deletions examples/yaml_example.ipynb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -11,13 +12,50 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[35m------ HYGIA ------\u001b[37m\n",
"------------------------------\n",
"\u001b[47m\u001b[30mRunning PRE PROCESSING...\u001b[40m\u001b[37m\n",
"aliases indified: \u001b[1mstreet -> \u001b[22m['ADDRESS', 'ZIPCODE']\n",
"handle null values in the column \u001b[1mstreet\u001b[22m\n",
"------------------------------\n",
"\u001b[47m\u001b[30mRunning FEATURE ENGINEERING...\u001b[40m\u001b[37m\n",
"handle null values in the column \u001b[1mstreet\u001b[22m\n",
"\u001b[33mrunning feature engineering with configs below...\u001b[37m\n",
"\u001b[1mlanguage -> \u001b[22mes\n",
"\u001b[1mdimensions -> \u001b[22m25\n",
"extract features from -> street\n",
"------------------------------\n",
"\u001b[47m\u001b[30mRunning ANNOTATE DATA...\u001b[40m\u001b[37m\n",
"\u001b[33mrunning annotate data with configs below...\u001b[37m\n",
"\u001b[1mthresholds -> \u001b[22m{'count_sequence_squared_vowels': 1.0, 'count_sequence_squared_consonants': 1.999, 'count_sequence_squared_special_characters': 2.2499, 'ratio_of_numeric_digits_squared': 2.9, 'average_of_char_count_squared': 2.78, 'ksmash_sequence_vowels': 1.0, 'ksmash_sequence_consonants': 1.999, 'ksmash_sequence_special_characters': 2.2499, 'ksmash_numbers': 2.9, 'ksmash_char_frequence': 2.78}\n",
"column -> street\n"
]
},
{
"ename": "TypeError",
"evalue": "'float' object is not subscriptable",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mhygia\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mhg\u001b[39;00m\n\u001b[1;32m 3\u001b[0m config_file \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m../config/default_config2.yaml\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m result \u001b[39m=\u001b[39m hg\u001b[39m.\u001b[39;49mrun_with_config(config_file)\n\u001b[1;32m 5\u001b[0m result \n",
"File \u001b[0;32m~/hygia/hygia/main.py:79\u001b[0m, in \u001b[0;36mrun_with_config\u001b[0;34m(yaml_path)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[39mfor\u001b[39;00m column \u001b[39min\u001b[39;00m columns: \n\u001b[1;32m 78\u001b[0m thresholds \u001b[39m=\u001b[39m annotate_data_config[\u001b[39m'\u001b[39m\u001b[39mthresholds\u001b[39m\u001b[39m'\u001b[39m]\n\u001b[0;32m---> 79\u001b[0m df \u001b[39m=\u001b[39m annotateData()\u001b[39m.\u001b[39;49mannotate_data(df, column, thresholds) \n\u001b[1;32m 81\u001b[0m \u001b[39m# Model\u001b[39;00m\n\u001b[1;32m 82\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m30\u001b[39m\u001b[39m*\u001b[39m\u001b[39m'\u001b[39m\u001b[39m-\u001b[39m\u001b[39m'\u001b[39m)\n",
"File \u001b[0;32m~/hygia/hygia/data_pipeline/annotate_data/annotate_data.py:49\u001b[0m, in \u001b[0;36mAnnotateData.annotate_data\u001b[0;34m(self, df, concatened_column_name, ks_thresholds)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[39mfor\u001b[39;00m ks_colummn \u001b[39min\u001b[39;00m ks_colummns:\n\u001b[1;32m 48\u001b[0m threshold \u001b[39m=\u001b[39m ks_thresholds[ks_colummn\u001b[39m.\u001b[39mreplace(\u001b[39m'\u001b[39m\u001b[39mfeature_ks_\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)\u001b[39m.\u001b[39mreplace(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m_\u001b[39m\u001b[39m{\u001b[39;00mconcatened_column_name\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)]\n\u001b[0;32m---> 49\u001b[0m \u001b[39mif\u001b[39;00m threshold[\u001b[39m0\u001b[39;49m] \u001b[39m==\u001b[39m \u001b[39m'\u001b[39m\u001b[39mabove\u001b[39m\u001b[39m'\u001b[39m:\n\u001b[1;32m 50\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mtarget\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m df\u001b[39m.\u001b[39mapply(\u001b[39mlambda\u001b[39;00m x: \u001b[39m'\u001b[39m\u001b[39mkey_smash\u001b[39m\u001b[39m'\u001b[39m \u001b[39mif\u001b[39;00m x[ks_colummn] \u001b[39m>\u001b[39m\u001b[39m=\u001b[39m threshold[\u001b[39m1\u001b[39m] \u001b[39melse\u001b[39;00m x[\u001b[39m'\u001b[39m\u001b[39mtarget\u001b[39m\u001b[39m'\u001b[39m], axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m) \n\u001b[1;32m 51\u001b[0m \u001b[39melif\u001b[39;00m threshold[\u001b[39m0\u001b[39m] \u001b[39m==\u001b[39m \u001b[39m'\u001b[39m\u001b[39mbelow\u001b[39m\u001b[39m'\u001b[39m:\n",
"\u001b[0;31mTypeError\u001b[0m: 'float' object is not subscriptable"
]
}
],
"source": [
"import hygia as hg\n",
"\n",
"config_file = '../config/default_config.yaml'\n",
"config_file = '../config/default_config2.yaml'\n",
"result = hg.run_with_config(config_file)\n",
"result "
]
Expand Down

0 comments on commit f778144

Please sign in to comment.