#!/usr/bin/env python
from __future__ import division
#############################
# ChaLearn AutoML challenge #
#############################
# Usage: python run.py input_dir output_dir
# This sample code can be used either
# - to submit RESULTS deposited in the res/ subdirectory or
# - as a template for CODE submission.
#
# The input directory input_dir contains 5 subdirectories named by dataset,
# including:
# dataname/dataname_feat.type -- the feature type "Numerical", "Binary", or "Categorical" (Note: if this file is absent, get the feature type from the dataname.info file)
# dataname/dataname_public.info -- parameters of the data and task, including metric and time_budget
# dataname/dataname_test.data -- test data
# dataname/dataname_train.data -- training data
# dataname/dataname_train.solution -- target values (solutions are given for training data only)
# dataname/dataname_valid.data -- validation data
#
# The output directory will receive the predicted values (no subdirectories):
# dataname_test_000.predict -- Provide predictions at regular intervals to make sure you get some results even if the program crashes
# dataname_test_001.predict
# dataname_test_002.predict
# ...
# dataname_valid_000.predict
# dataname_valid_001.predict
# dataname_valid_002.predict
# ...
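#
# For illustration, a minimal sketch of how such cycle-numbered prediction
# files could be written (assuming the starter kit's data_io.write helper):
#   for cycle in range(max_cycle):
#       Y_valid = M.predict(D.data['X_valid'])
#       filename = '%s_valid_%03d.predict' % (basename, cycle)
#       data_io.write(os.path.join(output_dir, filename), Y_valid)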
#
# Result submission:
# =================
# Search for @RESULT to locate that part of the code.
# ** Always keep this code. **
# If the subdirectory res/ contains result files (predicted values)
# the code just copies them to the output and does not train/test models.
# If no results are found, a model is trained and tested (see code submission).
#
# Code submission:
# ===============
# Search for @CODE to locate that part of the code.
# ** You may keep or modify this template or substitute your own code. **
# The program saves predictions regularly. This way the program produces
# at least some results if it dies (or is terminated) prematurely.
# This also allows us to plot learning curves. The last result is used by the
# scoring program.
# We implemented 2 classes:
# 1) DATA LOADING:
# ------------
# Use/modify
# D = DataManager(basename, input_dir, ...)
# to load and preprocess data.
# Missing values --
# Our default method for replacing missing values is trivial: they are replaced by 0.
# We also add extra indicator features marking where missing values occurred; this
# doubles the number of features (a commented sketch follows these notes).
# Categorical variables --
# The locations of potential categorical variables are indicated in D.feat_type.
# NOTHING special is done about them in this sample code.
# Feature selection --
# We only implemented an ad hoc feature selection filter, effective on the
# Dorothea dataset, to show that performance can improve significantly
# with such a filter. It takes effect only for binary classification problems
# with sparse input matrices and unbalanced classes.
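# For illustration, a minimal NumPy sketch of the zero-fill-plus-indicators
# idea above (an assumption for clarity, not the DataManager internals):
#   import numpy as np
#   def impute_with_indicators(X):
#       missing = np.isnan(X)                      # mask of missing entries
#       X_filled = np.where(missing, 0.0, X)       # replace NaNs by 0
#       return np.hstack([X_filled, missing.astype(float)])  # append indicators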
# 2) LEARNING MACHINE:
# ----------------
# Use/modify
# M = MyAutoML(D.info, ...)
# to create a model.
# Number of base estimators --
# Our models are ensembles. Adding more estimators may improve their accuracy.
# Use M.model.n_estimators = num
# Training --
# M.fit(D.data['X_train'], D.data['Y_train'])
# Fit the parameters and hyper-parameters (all inclusive!)
# What we implemented hard-codes the hyper-parameters; you probably want to
# optimize them. Also, we made a somewhat arbitrary choice of models for
# the various types of data, just to give some baseline results.
# You probably want to do better model selection and/or add your own models.
# Testing --
# Y_valid = M.predict(D.data['X_valid'])
# Y_test = M.predict(D.data['X_test'])
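#
# Putting the two classes together -- a hedged sketch (constructor arguments
# and attribute names as in this starter kit; details may differ):
#   D = DataManager(basename, input_dir)
#   M = MyAutoML(D.info)
#   for cycle in range(3):
#       M.model.n_estimators = 10 * 2 ** cycle  # grow the ensemble each cycle
#       M.fit(D.data['X_train'], D.data['Y_train'])
#       Y_valid = M.predict(D.data['X_valid'])
#       Y_test = M.predict(D.data['X_test'])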
#
# ALL INFORMATION, SOFTWARE, DOCUMENTATION, AND DATA ARE PROVIDED "AS-IS".
# ISABELLE GUYON, CHALEARN, AND/OR OTHER ORGANIZERS OR CODE AUTHORS DISCLAIM
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY PARTICULAR PURPOSE, AND THE
# WARRANTY OF NON-INFRINGEMENT OF ANY THIRD PARTY'S INTELLECTUAL PROPERTY RIGHTS.
# IN NO EVENT SHALL ISABELLE GUYON AND/OR OTHER ORGANIZERS BE LIABLE FOR ANY SPECIAL,
# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF SOFTWARE, DOCUMENTS, MATERIALS,
# PUBLICATIONS, OR INFORMATION MADE AVAILABLE FOR THE CHALLENGE.
#
# Main contributors: Isabelle Guyon and Arthur Pesah, March-October 2014
# Originally inspired by code from Ben Hamner, Kaggle, March 2013
# Modified by Ivan Judson and Christophe Poulain, Microsoft, December 2013
# =========================== BEGIN USER OPTIONS ==============================
# Verbose mode:
##############
# Recommended to keep verbose = True: shows various progress messages
verbose = True # outputs messages to stdout and stderr for debug purposes
# Debug level:
##############
# 0: run the code normally, using the time budget of the tasks
# 1: run the code normally, but limits the time to max_time
# 2: run everything, but do not train, generate random outputs in max_time
# 3: stop before the loop on datasets
# 4: just list the directories and program version
debug_mode = 0
# Time budget
#############
# Maximum time of training in seconds PER DATASET (there are 5 datasets).
# The code should keep track of time spent and NOT exceed the time limit
# in the dataset "info" file, stored in D.info['time_budget'], see code below.
# If debug >=1, you can decrease the maximum time (in sec) with this variable:
max_time = 90
# Maximum number of cycles
##########################
# Your training algorithm may be fast, so you may still want to limit the
# number of points on your learning curve (the schedule is on a log scale, so
# each point uses twice as much time as the previous one).
max_cycle = 50
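# (Worked example: with doubling, cycle k costs about 2**k time units, so the
# cumulative schedule 1, 2, 4, 8, ... exhausts any realistic budget long
# before 50 cycles; in practice the loop stops on time, not on max_cycle.)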
# ZIP your code
###############
# You can create a code submission archive, ready to submit, with zipme = True.
# This is meant to be used on your LOCAL server.
import datetime
zipme = True # use this flag to enable zipping of your code submission
the_date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
submission_filename = 'phase0_submission_' + the_date
# I/O defaults
##############
# Use default location for the input and output data:
# If no arguments to run.py are provided, this is where the data will be found
# and the results written to. Change the root_dir to your local directory.
import os
# import sys
root_dir = os.path.dirname(__file__)
# root_dir = "/Users/isabelleguyon/Documents/Projects/Codalab/AutoMLcompetition/StartingKit/"
# default_input_dir = os.path.join(root_dir, '..', 'data', 'dsss_bin_class_fold_01')
default_input_dir = os.path.join(root_dir, '..', 'data', 'phase_0')
# default_output_dir = os.path.join(root_dir, '..', 'predictions', 'dss_bin_class_fold_01', 'cv_rf_msl')
# default_output_dir = os.path.join(root_dir, '..', 'predictions', 'dss_bin_class_fold_01', 'cv_rf_gbm')
# default_output_dir = os.path.join(root_dir, '..', 'predictions', 'dsss_bin_class_fold_01', 'automl')
# default_output_dir = os.path.join(root_dir, '..', 'predictions', 'dsss_bin_class_fold_01', 'automl_rf')
# default_output_dir = os.path.join(root_dir, '..', 'predictions', 'dsss_bin_class_fold_01', 'ft_cv_rf')
# default_output_dir = os.path.join(root_dir, '..', 'predictions', 'dsss_bin_class_fold_01', 'ft_cv_rf_gbm')
# default_output_dir = os.path.join(root_dir, '..', 'predictions', 'dsss_bin_class_fold_01', 'ft_cv_rf_gbm_v2')
default_output_dir = os.path.join(root_dir, 'res')
# =========================== END USER OPTIONS ================================
# Version of the sample code
# Change in 1.1: time is measured by time.time(), not time.clock(): we keep track of wall time
# Change in 1.2: James Robert Lloyd taking control of code
# Change in 1.3: Prediction code run in a separate process to keep an eye on memory and time
version = 1.3
# General purpose functions
# import os
from sys import argv, path
import numpy as np
import time
overall_start = time.time()
# Our directories
# Note: On Codalab, there is an extra sub-directory called "program"
running_on_codalab = False
run_dir = os.path.abspath(".")
codalab_run_dir = os.path.join(run_dir, "program")
if os.path.isdir(codalab_run_dir):
    run_dir = codalab_run_dir
    running_on_codalab = True
    print("Running on Codalab!")
lib_dir = os.path.join(run_dir, "lib")
res_dir = os.path.join(run_dir, "res")
# Our libraries
path.append(run_dir)
path.append(lib_dir)
from automl_lib import data_io # general purpose input/output functions
from automl_lib.data_io import vprint # print only in verbose mode
from automl_lib.data_manager import DataManager # load/save data and get info about them
from automl_lib.models import MyAutoML # example model from scikit learn
if debug_mode >= 4 or running_on_codalab: # Show library version and directory structure
data_io.show_version()
data_io.show_dir(run_dir)
from multiprocessing import Process
import psutil
import util
import automl
import traceback
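# Note: util.resident_memory_usage(pid) is a project-local helper. A rough
# psutil-based equivalent would be (an assumption, not the actual code):
#   def resident_memory_usage(pid):
#       return psutil.Process(pid).memory_info().rss  # resident set size, bytes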
# Define function to be called to start process
def run_automl(input_dir, output_dir, data_name, time_budget):
print('input_dir = "%s"' % input_dir)
print('output_dir = "%s"' % output_dir)
print('data_name = "%s"' % data_name)
print('time_budget = %s' % time_budget)
try:
# automl.data_doubling_rf(input_dir, output_dir, data_name, time_budget, 20)
# automl.cv_growing_rf(input_dir, output_dir, data_name, time_budget)
# automl.cv_growing_rf_gbm(input_dir, output_dir, data_name, time_budget)
# automl.competition_example(input_dir, output_dir, data_name, time_budget)
# automl.competition_example_only_rf(input_dir, output_dir, data_name, time_budget)
# automl.freeze_thaw_cv_rf(input_dir, output_dir, data_name, time_budget)
# automl.freeze_thaw_cv_rf_gbm(input_dir, output_dir, data_name, time_budget, compute_quantum=10)
automl.automl_phase_0(input_dir, output_dir, data_name, time_budget)
except:
traceback.print_exc()
# =========================== BEGIN PROGRAM ================================
if __name__ == "__main__" and debug_mode < 4:
#### Check whether everything went well (no time exceeded)
execution_success = True
#### INPUT/OUTPUT: Get input and output directory names
    if len(argv) == 1:  # Use the default input and output directories if no arguments are provided
input_dir = default_input_dir
output_dir = default_output_dir
else:
input_dir = argv[1]
        output_dir = os.path.abspath(argv[2])
# Move old results and create a new output directory
data_io.mvdir(output_dir, output_dir+'_'+the_date)
data_io.mkdir(output_dir)
#### INVENTORY DATA (and sort dataset names alphabetically)
datanames = data_io.inventory_data(input_dir)
#### DEBUG MODE: Show dataset list and STOP
    if debug_mode >= 3:
data_io.show_io(input_dir, output_dir)
print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n')
data_io.write_list(datanames)
datanames = [] # Do not proceed with learning and testing
# ==================== @RESULT SUBMISSION (KEEP THIS) =====================
# Always keep this code to enable result submission of pre-calculated results
# deposited in the res/ subdirectory.
    if len(datanames) > 0:
vprint( verbose, "************************************************************************")
vprint( verbose, "****** Attempting to copy files (from res/) for RESULT submission ******")
vprint( verbose, "************************************************************************")
OK = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE!
if OK:
vprint( verbose, "[+] Success")
datanames = [] # Do not proceed with learning and testing
else:
vprint( verbose, "======== Some missing results on current datasets!")
vprint( verbose, "======== Proceeding to train/test:\n")
# =================== End @RESULT SUBMISSION (KEEP THIS) ==================
if zipme:
vprint( verbose, "========= Zipping this directory to prepare for submit ==============")
data_io.zipdir(submission_filename + '.zip', ".")
# ================ @CODE SUBMISSION (SUBSTITUTE YOUR CODE) =================
for basename in datanames:
print('Processing dataset : ' + basename)
# Keep track of time
start_time = time.time()
# Write a file to record start time
open(os.path.join(output_dir, basename + '.firstpost'), 'wb').close()
print('\nStarting\n')
# Read time budget
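        # (Each line of the _public.info file is a 'key = value' pair; a budget
        # line might read, e.g., 'time_budget = 300', in seconds.)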
with open(os.path.join(input_dir, basename, basename + '_public.info'), 'r') as info_file:
for line in info_file:
if line.startswith('time_budget'):
time_budget = int(line.split('=')[-1])
print('Time budget = %ds' % time_budget)
        # Start a separate process to analyse the dataset
p = Process(target=run_automl, args=(input_dir, output_dir, basename, time_budget - (time.time() - start_time)))
p.start()
# Monitor the process, checking to see if it is complete or using too much memory
while True:
time.sleep(1)
if p.is_alive():
try:
process_mem = util.resident_memory_usage(p.pid)
except:
print('Cannot get memory of process %d :(' % p.pid)
else:
system_mem = psutil.virtual_memory().total
prop_mem = process_mem / system_mem
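                    # Terminate the child if it uses over 90% of total system memory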
if prop_mem > 0.9:
print('Memory usage of process %d too damn high' % p.pid)
p.terminate()
p.join()
print('Process %d terminated' % p.pid)
break
print('Proportion of memory used by process %d : %02.1f%%' %
(p.pid, 100 * (process_mem / system_mem)))
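                    # Stop ~10 s before the budget expires, leaving a safety
                    # margin so the latest saved predictions still count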
if (time.time() - start_time) > (time_budget - 10):
print('Time limit approaching - terminating')
p.terminate()
p.join()
print('Process %d terminated' % p.pid)
break
else:
print('Remaining time budget = %s' % (time_budget - (time.time() - start_time)))
else:
print('Process terminated of its own accord')
break
print('\nFinished\n')
if running_on_codalab:
if execution_success:
exit(0)
else:
exit(1)