#!/usr/bin/python
import sys, itertools, os, re, glob
# use glob.glob. it uses os.listdir() and fnmatch.fnmatch(), so it's unix-style pattern matching
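# A quick sketch (added; the file names are hypothetical) of the glob-vs-listdir point above:
#   glob.glob('./sandbox/h2o*stdout*') -> ['./sandbox/h2o-stdout.log']   (full paths)
#   os.listdir('./sandbox')            -> ['h2o-stdout.log', ...]        (basenames only)
# which is why the pattern branch below strips with os.path.basename().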
def check_sandbox_for_errors(LOG_DIR=None, python_test_name='',
        cloudShutdownIsError=False, sandboxIgnoreErrors=False, pattern=None, verbose=False):
    # show the parameters
    ### print "check_sandbox_for_errors:", locals()

    # gets set below on error (returned)
    errorFound = False

    if not LOG_DIR:
        LOG_DIR = './sandbox'

    if not os.path.exists(LOG_DIR):
        if verbose:
            print "directory", LOG_DIR, "not found"
        return

    # FIX! wait for h2o to flush to files? how?
    # Dump any assertion or error line to the screen.
    # Both "passing" and failing tests? I guess that's good.
    # if you find a problem, just keep printing till the end, in that file.
    # The stdout/stderr is shared for the entire cloud session,
    # so we don't want to dump it multiple times.

    # glob gives the full path, so we have to strip to match os.listdir()
    goodLogsList = []

    # if we're using a pattern, ignore the "done" files
    if pattern:
        if verbose:
            print "Only looking at files that match pattern:", pattern

        # search whatever the pattern says
        # need to exclude directories (syn_datasets)
        tempFileList = glob.glob(LOG_DIR + "/" + pattern)
        # have to remove all the line count temp files
        # ignore the json file we copy there also (anything ending in json)
        for filename in tempFileList:
            if os.path.isfile(filename) and not re.search('doneToLine', filename) and not re.search(r'\.json$', filename):
                goodLogsList.append(os.path.basename(filename))
        if len(goodLogsList) == 0:
            raise Exception("Unexpected: h2o_sandbox found 0 files in %s that matched the pattern: %s" % \
                (LOG_DIR, pattern))
    else:
        tempFileList = os.listdir(LOG_DIR)
        if verbose:
            print "tempFileList:", tempFileList
        # don't search the R stdout/stderr
        # this matches the python h2o captured stdout/stderr, and also any downloaded h2o logs
        # (not the commands.log)
        for filename in tempFileList:
            # for h2o on hadoop, in the common unit test stuff, we download zipped logs from h2o
            # at the end and expand them. They will be in sandbox like this, because of the names
            # h2o creates in the zip (I flatten it in sandbox): h2o_172.16.2.178_54321.log
            # So look for that pattern too!
            # we log-roll, so the h2o.*log files can end in a number
            if re.search(r'h2o.*stdout|h2o.*stderr|h2o.*\.log.*', filename) and not re.search('doneToLine', filename):
                goodLogsList.append(filename)
        if verbose:
            print "goodLogsList:", goodLogsList
        if len(goodLogsList) == 0:
            # let this go...sh2junit.py apparently calls h2o_sandbox() looking for h2o logs?
            emsg = "Unexpected: h2o_sandbox found 0 files in %s that matched the stdout/stderr or log pattern" % LOG_DIR
            if sandboxIgnoreErrors:
                print emsg
                return
            else:
                # FIX! have to figure out what to do when there are logs available to check
                # for h2o on hadoop, and when to not care if they're not there
                pass
                # raise Exception(emsg)

    if verbose:
        print "h2o_sandbox: checking", len(goodLogsList), "files"
    errLines = []
    for filename in goodLogsList:
        goodLogPath = LOG_DIR + "/" + filename
        goodLog = open(goodLogPath, "r")
        if verbose:
            print "\nScraping this log:", goodLogPath

        # if we've already walked it, there will be a matching file
        # with the last line number we checked
        try:
            with open(LOG_DIR + "/doneToLine." + filename) as f:
                # if multiple processes are checking, this file isn't locked
                # if it's empty, treat it as zero
                r = f.readline().rstrip()
                if not r:
                    doneToLine = 0
                else:
                    try:
                        doneToLine = int(r)
                    except ValueError:
                        raise Exception("%s/doneToLine.%s is corrupted (multiprocess issue?): %s" % \
                            (LOG_DIR, filename, r))
        except IOError:
            # no file yet
            doneToLine = 0

        # if we're using a pattern, ignore the doneToLine stuff (always start at 0)
        if pattern:
            doneToLine = 0
        # just in case error/assert is lower or upper case
        # FIX! aren't we going to get the cloud building info failure messages
        # oh well...if so ..it's a bug! "killing" is temp to detect jar mismatch error
        regex1String = 'found multiple|exception|error|ERRR|assert|killing|killed|required ports|FATAL'
        if cloudShutdownIsError:
            regex1String += '|shutdown command'
        regex1 = re.compile(regex1String, re.IGNORECASE)
        regex2 = re.compile('Caused', re.IGNORECASE)
        # regex3 = re.compile('warn|info|TCP', re.IGNORECASE)
        # FIX! temp to avoid the INFO in jan's latest logging. don't print any info?
        # don't want the tcp_active in the cloud status. Ok to not look for tcp stuff now
        # regex3 = re.compile('warn|TCP', re.IGNORECASE)
        regex3 = re.compile('warn|Retrying after IO error', re.IGNORECASE)
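        # A sketch (added; the log lines are hypothetical) of how the three regexes divide the work:
        #   regex1 ("bad words") starts the multi-line error dump below, e.g. on
        #       'java.lang.AssertionError: ...' or '... ERRR WATER: ...'
        #   regex2 keeps an in-progress dump going, e.g. on 'Caused by: java.io.IOException'
        #   regex3 flags single-line warnings that are printed but not treated as errors,
        #       e.g. '04:12:33.456 WARN WATER: ...'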
        # many hdfs/apache messages have 'error' in the text. treat as warning if they have '[WARN]'
        # i.e. they start with:
        # [WARN]

        # if we started due to "warning" ...then if we hit exception, we don't want to stop
        # we want that to act like a new beginning. Maybe just treat "warning" and "info" as
        # single line events? that's better

        printing = 0  # "printing" is per file.
        lines = 0  # count per file! errLines accumulates for multiple files.
        currentLine = 0
        log_python_test_name = None
        if verbose:
            print "Using doneToLine line marker %s with %s" % (doneToLine, goodLogPath)
        for line in goodLog:
            currentLine += 1

            m = re.search('(python_test_name:) (.*)', line)
            if m:
                log_python_test_name = m.group(2)
                # if log_python_test_name == python_test_name:
                #     print "Found log_python_test_name:", log_python_test_name

            # don't check if we've already checked
            if currentLine <= doneToLine:
                continue

            # if log_python_test_name and (log_python_test_name != python_test_name):
            #     print "h2o_sandbox.py: ignoring because wrong test name:", currentLine

            # JIT reporting looks like this..don't detect that as an error
            printSingleWarning = False
            foundBad = False
            if ' bytes)' not in line:
                # no multiline FSM on this
                printSingleWarning = regex3.search(line)
                # 13190 280 ### sun.nio.ch.DatagramChannelImpl::ensureOpen (16 bytes)

                # don't detect these class loader info messages as errors
                # [Loaded java.lang.Error from /usr/lib/jvm/java-7-oracle/jre/lib/rt.jar]
                foundBadPartial = regex1.search(line)
                foundBad = foundBadPartial and not (
                    ('classification error is' in line) or
                    ('STARTING TEST' in line) or  # R test named ..._pop_assert..
                    ('INFO:' in line and 'Error' in line) or
                    ('Failed to instantiate schema class:' in line) or
                    ('WARNING: found non-Schema Iced field:' in line) or  # arno has 'errors' as a field
                    ('ti-UDP-R ERRR:' in line) or  # from Shutdown AIOOBE
                    ('ti-UDP-R FATAL:' in line) or  # from Shutdown
                    ('Skipping field that lacks an annotation' in line) or  # can have DeepLearningModel$Errors
                    ('python_test_name' in line) or
                    ('Retrying after IO error' in line) or
                    ('Error on' in line) or
                    # temporary hack. getting these on shutdown in multi-machine
                    # ApiWatch ERRR WATER: ApiPortWatchdog:
                    # Failed trying to connect to REST API IP and Port (/10.73.149.39:54323, 30000 ms)
                    ('ApiPortWatchdog' in line) or
                    ('Error reduced' in line) or
                    ('out-of-bag error estimation' in line) or
                    ('reconstruction error' in line) or
                    ('Prediction error' in line) or
                    (('Act/Prd' in line) and ('Error' in line)) or
                    (('AUC' in line) and ('Gini' in line) and ('Precision' in line)) or
                    ('Error on training data' in line) or
                    ('Error on validation data' in line) or
                    # These are real!
                    # ('water.DException' in line) or
                    # the manyfiles data has eRRr in a warning about test/train data
                    ('WARN SCORM' in line) or
                    # ignore the long, long lines that the JStack prints as INFO
                    ('stack_traces' in line) or
                    # shows up as param to url for h2o
                    ('out_of_bag_error_estimate' in line) or
                    # R stdout confusion matrix. Probably need to figure out how to exclude R logs
                    ('Training Error' in line) or
                    # now from GBM
                    ('Mean Squared Error' in line) or
                    ('Error' in line and 'Actual' in line) or
                    # fvec
                    ('prediction error' in line) or
                    ('errors on' in line) or
                    # R
                    ('class.error' in line) or
                    # original RF
                    ('error rate' in line) or
                    ('[Loaded ' in line) or
                    ('[WARN]' in line) or
                    ('CalcSquareErrorsTasks' in line))
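            # Added note: "printing" is a small 3-state machine over the lines of one file:
            #   0 = scanning; 1 = inside an error region (echo and collect lines);
            #   2 = hit a fresh error after a long run, so stop echoing for this file.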
            if printing == 0 and foundBad:
                printing = 1
                lines = 1
            elif printing == 1:
                lines += 1
                # if we've been printing, stop when you get to another error
                # keep printing if the pattern match for the condition
                # is on a line with "Caused" in it ("Caused by")
                # only use Caused for overriding an end condition
                foundCaused = regex2.search(line)
                # since the "at ..." lines may have the "bad words" in them, we also don't want
                # to stop if a line has " *at " at the beginning.
                # Update: Assertion can be followed by Exception.
                # Make sure we keep printing for a minimum run of lines first
                foundAt = re.match(r'[\t ]+at ', line)
                if foundBad and (lines > 10) and not (foundCaused or foundAt):
                    printing = 2

            if printing == 1:
                # to avoid an extra newline from print; line already has one
                errLines.append(line)
                sys.stdout.write(line)
            # don't double print if warning
            elif printSingleWarning:
                # don't print these lines
                if not (
                        ('Unable to load native-hadoop library' in line) or
                        ('stack_traces' in line) or
                        ('Multiple local IPs detected' in line) or
                        ('[Loaded ' in line) or
                        ('RestS3Service' in line)):
                    sys.stdout.write(line)

        goodLog.close()
        # remember what you've checked so far, with a file that matches, plus a suffix.
        # this is for the case of multiple tests sharing the same log files:
        # only the test that caused the error should report it (don't flag the subsequent ones as fail)
        # overwrite if exists
        with open(LOG_DIR + "/" + "doneToLine." + filename, "w") as f:
            f.write(str(currentLine) + "\n")
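        # Added note: the marker file just holds the last line number scanned, e.g. (hypothetical)
        #   $ cat sandbox/doneToLine.h2o_172.16.2.178_54321.log
        #   8842
        # so the next test sharing this sandbox resumes the scan at line 8843.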
    sys.stdout.flush()

    # each errLine already has \n
    # doing this kludge to put the multiple-line message in the python traceback,
    # so it will be reported by jenkins. The problem with printing it to stdout
    # is that we're in the tearDown class, and jenkins won't have this captured until
    # after it thinks the test is done (tearDown is separate from the test).
    # we probably could have a tearDown with the test rather than the class, but we
    # would have to update all tests.
    if len(errLines) != 0:
        # check if the lines all start with INFO: (or are broken-pipe noise)
        justInfo = 0
        for e in errLines:
            # very hacky. try to ignore the captured broken pipe exceptions.
            # if any line has this, ignore the whole group (may miss something)
            if "Broken pipe" in e:
                justInfo = 1
            # if every line has this (beginning of line match)
            elif justInfo == 0 and not re.match("INFO:", e):
                justInfo = 2

        if justInfo == 2:
            emsg1 = " check_sandbox_for_errors: Errors in sandbox stdout or stderr (or R stdout/stderr).\n" + \
                "Could have occurred at any prior time\n\n"
            emsg2 = "".join(errLines)
            errorFound = True
            errorMessage = str(python_test_name) + emsg1 + emsg2

            # just print if using the pattern match
            if pattern or sandboxIgnoreErrors:
                print "###############################################################################################"
                print errorMessage
                print "###############################################################################################"
            else:
                raise Exception(errorMessage)

    if errorFound:
        return errorMessage
    else:
        if verbose:
            print "h2o_sandbox: h2o logs seem okay"
        return
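
# A minimal usage sketch (added; the test class and names are hypothetical), assuming the
# common pattern where a unittest tearDown scrapes the shared sandbox:
#
#   import unittest, h2o_sandbox
#   class TestBasic(unittest.TestCase):
#       def tearDown(self):
#           h2o_sandbox.check_sandbox_for_errors(
#               LOG_DIR='./sandbox', python_test_name='test_basic')
#
# With the defaults (sandboxIgnoreErrors=False, no pattern), flagged log lines surface as
# an Exception from tearDown, which is what puts the multi-line message in the traceback.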
#*********************************************************************************************
# for use from the command line
if __name__ == "__main__":
    arg_names = ['me', 'LOG_DIR', 'python_test_name', 'cloudShutdownIsError', 'sandboxIgnoreErrors', 'verbose']
    # https://docs.python.org/2/library/itertools.html
    # Nice..Learning itertools stuff:
    # izip_longest: Make an iterator that aggregates elements from each of the iterables.
    # If the iterables are of uneven length, missing values are filled-in with fillvalue.
    # Iteration continues until the longest iterable is exhausted.
    args = dict(itertools.izip_longest(arg_names, sys.argv))
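    # Added example (hypothetical invocation) of what the izip_longest trick builds:
    #   python h2o_sandbox.py ./sandbox test_foo
    # -> {'me': 'h2o_sandbox.py', 'LOG_DIR': './sandbox', 'python_test_name': 'test_foo',
    #     'cloudShutdownIsError': None, 'sandboxIgnoreErrors': None, 'verbose': None}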
    # if you're running this from the command line, remove any existing doneToLine markers
    if not args['LOG_DIR']:
        LOG_DIR = './sandbox'
    else:
        LOG_DIR = args['LOG_DIR']

    if os.path.exists(LOG_DIR):
        print "Checking for any marker files to remove first.." + \
            " (the multi-test cloud log scrape uses them, and they're always left behind)"
        for f in glob.glob(LOG_DIR + '/*doneToLine*'):
            print "cleaning marker file:", f
            os.remove(f)

    # if you call from the command line, we'll just pass the first two positionally.
    # here's a low budget argparse :) (args are optional!)
    errorMessage = check_sandbox_for_errors(
        LOG_DIR=LOG_DIR,
        python_test_name=args['python_test_name'],
        cloudShutdownIsError=args['cloudShutdownIsError'],
        sandboxIgnoreErrors=True,
        verbose=True,
    )
    # sandboxIgnoreErrors=args['sandboxIgnoreErrors'],
    # verbose=args['verbose'],