-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparam.yaml
43 lines (40 loc) · 4.84 KB
/
param.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Unless specifically indicated, every field accepts a single value.
general:
  # Seed used in parent selection, child conception (crossover) and mutation,
  # all of which are single-threaded.
  seed: 41
  # Use ga only for now.
  algo: ga
  # Number of threads used in feature selection and fit computation.
  thread_number: 8
  # Possible values are trace, debug, info, warning or error.
  log_level: debug
  # Possible values are ter,bin,ratio,pow2; see README.md for details.
  # A comma-separated list (no spaces) is accepted, which means the initial
  # population will be split.
  language: ter,bin,ratio,pow2
  # Possible values are raw,prev,log; see README.md for details.
  # As above, a comma-separated list is fine.
  data_type: raw,prev,log
  # Only useful for data_type prevalence (where it is a threshold) or log
  # (where it replaces values below it).
  # Written with a decimal point so that YAML 1.1 parsers also resolve it as a
  # float rather than a string.
  data_type_minimum: 1.0e-5
  # Possible values are auc,specificity,sensitivity; see README.md for details.
  fit: specificity
  # This penalty, multiplied by k (the number of variables used in the model),
  # is deducted from the fit function.
  k_penalty: 0.01
  # Setting this triggers the removal of a random fold from the train
  # population (change fold_number in cv to adjust); the fit delta, multiplied
  # by this coefficient, is then subtracted from the initial fit.
  overfit_penalty: 0.15
  # Used only when fit is specificity or sensitivity: deducts
  # (1 - symmetrical metric) x fr_penalty from the fit.
  fr_penalty: 0.5
  # Number of models to test in the last generation
  # (defaults to 10; 0 means all models).
  nb_best_model_to_test: 30
cv:
  # Used only if overfit_penalty is not null. Also used in the not recommended,
  # experimental (YOLO) 'ga+cv' algo.
  fold_number: 10
data:
  # Features of the train data set.
  X: "samples/Qin2014/Xtrain.tsv"
  # Class description of the train data set
  # (0=class 0, 1=class 1 (the class to be predicted), 2=unknown status).
  y: "samples/Qin2014/Ytrain.tsv"
  # Features of the test data set.
  Xtest: "samples/Qin2014/Xtest.tsv"
  # Class description of the test data set.
  ytest: "samples/Qin2014/Ytest.tsv"
  # Possible values are wilcoxon or studentt. wilcoxon is recommended in most
  # cases.
  pvalue_method: wilcoxon
  # Per class, i.e. features are retained if any of the classes reaches this
  # level.
  feature_minimal_prevalence_pct: 10
  # Features with differences less significant than this (p value above that
  # threshold) will be removed.
  feature_maximal_pvalue: 0.01
  # Features whose mean is below that value are discarded.
  feature_minimal_feature_value: 0.0001
ga:
  # Target number of models per generation (NB: the real number may be lower
  # because of clone removal).
  population_size: 2000
  # Maximum number of generations before stopping (note that you can stop
  # manually before that by sending a kill -1 to the process).
  max_epochs: 500
  # Minimum number of generations to do.
  min_epochs: 300
  # Stopping after min_epochs and before max_epochs will occur only if the
  # best model reaches this age.
  max_age_best_model: 100
  # Minimal number of variables used in the initial population.
  kmin: 1
  # Maximum number of variables used in the initial population
  # (setting it to 0 removes any maximum).
  kmax: 50
  # The % of best models of the previous generation retained: the lower the
  # figure, the more elitist you are.
  select_elite_pct: 2
  # (Optional, defaults to 0.) The % of best models of the previous generation
  # retained but split per language / data type (helps maintain competition
  # between languages / data types).
  select_niche_pct: 20
  # The % of opportunistic models of the previous generation retained: this is
  # split between all the languages/data_types present in the previous
  # generation.
  select_random_pct: 2
  # The % of children submitted to mutation.
  mutated_children_pct: 80
  # The % of mutation per "gene" (e.g. potential variable); keep in mind that
  # most mutations are "non sense", e.g. remove a variable.
  mutated_features_pct: 10
  # The % of "sense" mutations (e.g. the likelihood that a mutation may add a
  # new variable).
  mutation_non_null_chance_pct: 20
  # Not used in ga (used in the experimental "don't use me" "ga+cv" algorithm).
  feature_importance_permutations: 100
  # Keep this setting to false when using gpredomics as a binary (setting it
  # to true is interesting in gpredomicsR).
  keep_all_generation: false