VirtualBox

source: vbox/trunk/src/VBox/ValidationKit/testmanager/batch/virtual_test_sheriff.py@ 81115

Last change on this file since 81115 was 81115, checked in by vboxsync, 5 years ago

vsheriff: disabled the non-functional reset code as it screws up my comments on testboxmem1 and is way to eager to reset stuff (fortunatly that doesn't work because of incorrect password for 99 of the boxes).

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 84.4 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# $Id: virtual_test_sheriff.py 81115 2019-10-05 01:42:17Z vboxsync $
4# pylint: disable=line-too-long
5
6"""
7Virtual Test Sheriff.
8
9Duties:
10 - Try to a assign failure reasons to recently failed tests.
11 - Reboot or disable bad test boxes.
12
13"""
14
15from __future__ import print_function;
16
17__copyright__ = \
18"""
19Copyright (C) 2012-2019 Oracle Corporation
20
21This file is part of VirtualBox Open Source Edition (OSE), as
22available from http://www.virtualbox.org. This file is free software;
23you can redistribute it and/or modify it under the terms of the GNU
24General Public License (GPL) as published by the Free Software
25Foundation, in version 2 as it comes in the "COPYING" file of the
26VirtualBox OSE distribution. VirtualBox OSE is distributed in the
27hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
28
29The contents of this file may alternatively be used under the terms
30of the Common Development and Distribution License Version 1.0
31(CDDL) only, as it comes in the "COPYING.CDDL" file of the
32VirtualBox OSE distribution, in which case the provisions of the
33CDDL are applicable instead of those of the GPL.
34
35You may elect to license modified versions of this file under the
36terms and conditions of either the GPL or the CDDL or both.
37"""
38__version__ = "$Revision: 81115 $"
39
40
41# Standard python imports
42import hashlib;
43import os;
44import re;
45import smtplib;
46import subprocess;
47import sys;
48from email.mime.multipart import MIMEMultipart;
49from email.mime.text import MIMEText;
50from email.utils import COMMASPACE;
51
52if sys.version_info[0] >= 3:
53 from io import StringIO as StringIO; # pylint: disable=import-error,no-name-in-module,useless-import-alias
54else:
55 from StringIO import StringIO as StringIO; # pylint: disable=import-error,no-name-in-module,useless-import-alias
56from optparse import OptionParser; # pylint: disable=deprecated-module
57from PIL import Image; # pylint: disable=import-error
58
59# Add Test Manager's modules path
60g_ksTestManagerDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))));
61sys.path.append(g_ksTestManagerDir);
62
63# Test Manager imports
64from common import utils;
65from testmanager.core.db import TMDatabaseConnection;
66from testmanager.core.build import BuildDataEx;
67from testmanager.core.failurereason import FailureReasonLogic;
68from testmanager.core.testbox import TestBoxLogic, TestBoxData;
69from testmanager.core.testcase import TestCaseDataEx;
70from testmanager.core.testgroup import TestGroupData;
71from testmanager.core.testset import TestSetLogic, TestSetData;
72from testmanager.core.testresults import TestResultLogic, TestResultFileData;
73from testmanager.core.testresultfailures import TestResultFailureLogic, TestResultFailureData;
74from testmanager.core.useraccount import UserAccountLogic;
75from testmanager.config import g_ksSmtpHost, g_kcSmtpPort, g_ksAlertFrom, \
76 g_ksAlertSubject, g_asAlertList, g_ksLomPassword;
77
78# Python 3 hacks:
79if sys.version_info[0] >= 3:
80 xrange = range; # pylint: disable=redefined-builtin,invalid-name
81
82
83class VirtualTestSheriffCaseFile(object):
84 """
85 A failure investigation case file.
86
87 """
88
89
90 ## Max log file we'll read into memory. (256 MB)
91 kcbMaxLogRead = 0x10000000;
92
93 def __init__(self, oSheriff, oTestSet, oTree, oBuild, oTestBox, oTestGroup, oTestCase):
94 self.oSheriff = oSheriff;
95 self.oTestSet = oTestSet; # TestSetData
96 self.oTree = oTree; # TestResultDataEx
97 self.oBuild = oBuild; # BuildDataEx
98 self.oTestBox = oTestBox; # TestBoxData
99 self.oTestGroup = oTestGroup; # TestGroupData
100 self.oTestCase = oTestCase; # TestCaseDataEx
101 self.sMainLog = ''; # The main log file. Empty string if not accessible.
102 self.sSvcLog = ''; # The VBoxSVC log file. Empty string if not accessible.
103
104 # Generate a case file name.
105 self.sName = '#%u: %s' % (self.oTestSet.idTestSet, self.oTestCase.sName,)
106 self.sLongName = '#%u: "%s" on "%s" running %s %s (%s), "%s" by %s, using %s %s %s r%u' \
107 % ( self.oTestSet.idTestSet,
108 self.oTestCase.sName,
109 self.oTestBox.sName,
110 self.oTestBox.sOs,
111 self.oTestBox.sOsVersion,
112 self.oTestBox.sCpuArch,
113 self.oTestBox.sCpuName,
114 self.oTestBox.sCpuVendor,
115 self.oBuild.oCat.sProduct,
116 self.oBuild.oCat.sBranch,
117 self.oBuild.oCat.sType,
118 self.oBuild.iRevision, );
119
120 # Investigation notes.
121 self.tReason = None; # None or one of the ktReason_XXX constants.
122 self.dReasonForResultId = {}; # Reason assignments indexed by idTestResult.
123 self.dCommentForResultId = {}; # Comment assignments indexed by idTestResult.
124
125 #
126 # Reason.
127 #
128
129 def noteReason(self, tReason):
130 """ Notes down a possible reason. """
131 self.oSheriff.dprint(u'noteReason: %s -> %s' % (self.tReason, tReason,));
132 self.tReason = tReason;
133 return True;
134
135 def noteReasonForId(self, tReason, idTestResult, sComment = None):
136 """ Notes down a possible reason for a specific test result. """
137 self.oSheriff.dprint(u'noteReasonForId: %u: %s -> %s%s'
138 % (idTestResult, self.dReasonForResultId.get(idTestResult, None), tReason,
139 (u' (%s)' % (sComment,)) if sComment is not None else ''));
140 self.dReasonForResultId[idTestResult] = tReason;
141 if sComment is not None:
142 self.dCommentForResultId[idTestResult] = sComment;
143 return True;
144
145
146 #
147 # Test classification.
148 #
149
150 def isVBoxTest(self):
151 """ Test classification: VirtualBox (using the build) """
152 return self.oBuild.oCat.sProduct.lower() in [ 'virtualbox', 'vbox' ];
153
154 def isVBoxUnitTest(self):
155 """ Test case classification: The unit test doing all our testcase/*.cpp stuff. """
156 return self.isVBoxTest() \
157 and (self.oTestCase.sName.lower() == 'unit tests' or self.oTestCase.sName.lower() == 'misc: unit tests');
158
159 def isVBoxInstallTest(self):
160 """ Test case classification: VirtualBox Guest installation test. """
161 return self.isVBoxTest() \
162 and self.oTestCase.sName.lower().startswith('install:');
163
164 def isVBoxUnattendedInstallTest(self):
165 """ Test case classification: VirtualBox Guest installation test. """
166 return self.isVBoxTest() \
167 and self.oTestCase.sName.lower().startswith('uinstall:');
168
169 def isVBoxUSBTest(self):
170 """ Test case classification: VirtualBox USB test. """
171 return self.isVBoxTest() \
172 and self.oTestCase.sName.lower().startswith('usb:');
173
174 def isVBoxStorageTest(self):
175 """ Test case classification: VirtualBox Storage test. """
176 return self.isVBoxTest() \
177 and self.oTestCase.sName.lower().startswith('storage:');
178
179 def isVBoxGAsTest(self):
180 """ Test case classification: VirtualBox Guest Additions test. """
181 return self.isVBoxTest() \
182 and ( self.oTestCase.sName.lower().startswith('guest additions')
183 or self.oTestCase.sName.lower().startswith('ga\'s tests'));
184
185 def isVBoxAPITest(self):
186 """ Test case classification: VirtualBox API test. """
187 return self.isVBoxTest() \
188 and self.oTestCase.sName.lower().startswith('api:');
189
190 def isVBoxBenchmarkTest(self):
191 """ Test case classification: VirtualBox Benchmark test. """
192 return self.isVBoxTest() \
193 and self.oTestCase.sName.lower().startswith('benchmark:');
194
195 def isVBoxSmokeTest(self):
196 """ Test case classification: Smoke test. """
197 return self.isVBoxTest() \
198 and self.oTestCase.sName.lower().startswith('smoketest');
199
200 def isVBoxSerialTest(self):
201 """ Test case classification: Smoke test. """
202 return self.isVBoxTest() \
203 and self.oTestCase.sName.lower().startswith('serial:');
204
205
206 #
207 # Utility methods.
208 #
209
210 def getMainLog(self):
211 """
212 Tries to read the main log file since this will be the first source of information.
213 """
214 if self.sMainLog:
215 return self.sMainLog;
216 (oFile, oSizeOrError, _) = self.oTestSet.openFile('main.log', 'rb');
217 if oFile is not None:
218 try:
219 self.sMainLog = oFile.read(min(self.kcbMaxLogRead, oSizeOrError)).decode('utf-8', 'replace');
220 except Exception as oXcpt:
221 self.oSheriff.vprint(u'Error reading main log file: %s' % (oXcpt,))
222 self.sMainLog = '';
223 else:
224 self.oSheriff.vprint(u'Error opening main log file: %s' % (oSizeOrError,));
225 return self.sMainLog;
226
227 def getLogFile(self, oFile):
228 """
229 Tries to read the given file as a utf-8 log file.
230 oFile is a TestFileDataEx instance.
231 Returns empty string if problems opening or reading the file.
232 """
233 sContent = '';
234 (oFile, oSizeOrError, _) = self.oTestSet.openFile(oFile.sFile, 'rb');
235 if oFile is not None:
236 try:
237 sContent = oFile.read(min(self.kcbMaxLogRead, oSizeOrError)).decode('utf-8', 'replace');
238 except Exception as oXcpt:
239 self.oSheriff.vprint(u'Error reading the "%s" log file: %s' % (oFile.sFile, oXcpt,))
240 else:
241 self.oSheriff.vprint(u'Error opening the "%s" log file: %s' % (oFile.sFile, oSizeOrError,));
242 return sContent;
243
244 def getSvcLog(self):
245 """
246 Tries to read the VBoxSVC log file as it typically not associated with a failing test result.
247 Note! Returns the first VBoxSVC log file we find.
248 """
249 if not self.sSvcLog:
250 aoSvcLogFiles = self.oTree.getListOfLogFilesByKind(TestResultFileData.ksKind_LogReleaseSvc);
251 if aoSvcLogFiles:
252 self.sSvcLog = self.getLogFile(aoSvcLogFiles[0]);
253 return self.sSvcLog;
254
255 def getScreenshotSha256(self, oFile):
256 """
257 Tries to read the given screenshot file, uncompress it, and do SHA-2
258 on the raw pixels.
259 Returns SHA-2 digest string on success, None on failure.
260 """
261 (oImgFile, _, _) = self.oTestSet.openFile(oFile.sFile, 'rb');
262 try:
263 abImageFile = oImgFile.read();
264 except Exception as oXcpt:
265 self.oSheriff.vprint(u'Error reading the "%s" image file: %s' % (oFile.sFile, oXcpt,))
266 else:
267 try:
268 oImage = Image.open(StringIO(abImageFile));
269 except Exception as oXcpt:
270 self.oSheriff.vprint(u'Error opening the "%s" image bytes using PIL.Image.open: %s' % (oFile.sFile, oXcpt,))
271 else:
272 try:
273 oHash = hashlib.sha256();
274 oHash.update(oImage.tostring());
275 except Exception as oXcpt:
276 self.oSheriff.vprint(u'Error hashing the uncompressed image bytes for "%s": %s' % (oFile.sFile, oXcpt,))
277 else:
278 return oHash.hexdigest();
279 return None;
280
281
282
283 def isSingleTestFailure(self):
284 """
285 Figure out if this is a single test failing or if it's one of the
286 more complicated ones.
287 """
288 if self.oTree.cErrors == 1:
289 return True;
290 if self.oTree.deepCountErrorContributers() <= 1:
291 return True;
292 return False;
293
294
295
296class VirtualTestSheriff(object): # pylint: disable=too-few-public-methods
297 """
298 Add build info into Test Manager database.
299 """
300
301 ## The user account for the virtual sheriff.
302 ksLoginName = 'vsheriff';
303
304 def __init__(self):
305 """
306 Parse command line.
307 """
308 self.oDb = None;
309 self.tsNow = None;
310 self.oTestResultLogic = None;
311 self.oTestSetLogic = None;
312 self.oFailureReasonLogic = None; # FailureReasonLogic;
313 self.oTestResultFailureLogic = None; # TestResultFailureLogic
314 self.oLogin = None;
315 self.uidSelf = -1;
316 self.oLogFile = None;
317 self.asBsodReasons = [];
318 self.asUnitTestReasons = [];
319
320 oParser = OptionParser();
321 oParser.add_option('--start-hours-ago', dest = 'cStartHoursAgo', metavar = '<hours>', default = 0, type = 'int',
322 help = 'When to start specified as hours relative to current time. Defauls is right now.', );
323 oParser.add_option('--hours-period', dest = 'cHoursBack', metavar = '<period-in-hours>', default = 2, type = 'int',
324 help = 'Work period specified in hours. Defauls is 2 hours.');
325 oParser.add_option('--real-run-back', dest = 'fRealRun', action = 'store_true', default = False,
326 help = 'Whether to commit the findings to the database. Default is a dry run.');
327 oParser.add_option('--testset', dest = 'aidTestSets', metavar = '<id>', default = [], type = 'int', action = 'append',
328 help = 'Only investigate this one. Accumulates IDs when repeated.');
329 oParser.add_option('-q', '--quiet', dest = 'fQuiet', action = 'store_true', default = False,
330 help = 'Quiet execution');
331 oParser.add_option('-l', '--log', dest = 'sLogFile', metavar = '<logfile>', default = None,
332 help = 'Where to log messages.');
333 oParser.add_option('--debug', dest = 'fDebug', action = 'store_true', default = False,
334 help = 'Enables debug mode.');
335
336 (self.oConfig, _) = oParser.parse_args();
337
338 if self.oConfig.sLogFile:
339 self.oLogFile = open(self.oConfig.sLogFile, "a");
340 self.oLogFile.write('VirtualTestSheriff: $Revision: 81115 $ \n');
341
342
343 def eprint(self, sText):
344 """
345 Prints error messages.
346 Returns 1 (for exit code usage.)
347 """
348 print('error: %s' % (sText,));
349 if self.oLogFile is not None:
350 self.oLogFile.write((u'error: %s\n' % (sText,)).encode('utf-8'));
351 return 1;
352
353 def dprint(self, sText):
354 """
355 Prints debug info.
356 """
357 if self.oConfig.fDebug:
358 if not self.oConfig.fQuiet:
359 print('debug: %s' % (sText, ));
360 if self.oLogFile is not None:
361 self.oLogFile.write((u'debug: %s\n' % (sText,)).encode('utf-8'));
362 return 0;
363
364 def vprint(self, sText):
365 """
366 Prints verbose info.
367 """
368 if not self.oConfig.fQuiet:
369 print('info: %s' % (sText,));
370 if self.oLogFile is not None:
371 self.oLogFile.write((u'info: %s\n' % (sText,)).encode('utf-8'));
372 return 0;
373
374 def getFailureReason(self, tReason):
375 """ Gets the failure reason object for tReason. """
376 return self.oFailureReasonLogic.cachedLookupByNameAndCategory(tReason[1], tReason[0]);
377
378 def selfCheck(self):
379 """ Does some self checks, looking up things we expect to be in the database and such. """
380 rcExit = 0;
381 for sAttr in dir(self.__class__):
382 if sAttr.startswith('ktReason_'):
383 tReason = getattr(self.__class__, sAttr);
384 oFailureReason = self.getFailureReason(tReason);
385 if oFailureReason is None:
386 rcExit = self.eprint(u'Failed to find failure reason "%s" in category "%s" in the database!'
387 % (tReason[1], tReason[0],));
388
389 # Check the user account as well.
390 if self.oLogin is None:
391 oLogin = UserAccountLogic(self.oDb).tryFetchAccountByLoginName(VirtualTestSheriff.ksLoginName);
392 if oLogin is None:
393 rcExit = self.eprint(u'Cannot find my user account "%s"!' % (VirtualTestSheriff.ksLoginName,));
394 return rcExit;
395
396 def sendEmailAlert(self, uidAuthor, sBodyText):
397 """
398 Sends email alert.
399 """
400
401 # Get author email
402 self.oDb.execute('SELECT sEmail FROM Users WHERE uid=%s', (uidAuthor,));
403 sFrom = self.oDb.fetchOne();
404 if sFrom is not None:
405 sFrom = sFrom[0];
406 else:
407 sFrom = g_ksAlertFrom;
408
409 # Gather recipient list.
410 asEmailList = [];
411 for sUser in g_asAlertList:
412 self.oDb.execute('SELECT sEmail FROM Users WHERE sUsername=%s', (sUser,));
413 sEmail = self.oDb.fetchOne();
414 if sEmail:
415 asEmailList.append(sEmail[0]);
416 if not asEmailList:
417 return self.eprint('No email addresses to send alter to!');
418
419 # Compose the message.
420 oMsg = MIMEMultipart();
421 oMsg['From'] = sFrom;
422 oMsg['To'] = COMMASPACE.join(asEmailList);
423 oMsg['Subject'] = g_ksAlertSubject;
424 oMsg.attach(MIMEText(sBodyText, 'plain'))
425
426 # Try send it.
427 try:
428 oSMTP = smtplib.SMTP(g_ksSmtpHost, g_kcSmtpPort);
429 oSMTP.sendmail(sFrom, asEmailList, oMsg.as_string())
430 oSMTP.quit()
431 except smtplib.SMTPException as oXcpt:
432 return self.eprint('Failed to send mail: %s' % (oXcpt,));
433
434 return 0;
435
436 def badTestBoxManagement(self):
437 """
438 Looks for bad test boxes and first tries once to reboot them then disables them.
439 """
440 rcExit = 0;
441
442 #
443 # We skip this entirely if we're running in the past and not in harmless debug mode.
444 #
445 if self.oConfig.cStartHoursAgo != 0 \
446 and (not self.oConfig.fDebug or self.oConfig.fRealRun):
447 return rcExit;
448 tsNow = self.tsNow if self.oConfig.fDebug else None;
449 cHoursBack = self.oConfig.cHoursBack if self.oConfig.fDebug else 2;
450 oTestBoxLogic = TestBoxLogic(self.oDb);
451
452 #
453 # Generate a list of failures reasons we consider bad-testbox behavior.
454 #
455 aidFailureReasons = [
456 self.getFailureReason(self.ktReason_Host_DriverNotLoaded).idFailureReason,
457 self.getFailureReason(self.ktReason_Host_DriverNotUnloading).idFailureReason,
458 self.getFailureReason(self.ktReason_Host_DriverNotCompilable).idFailureReason,
459 self.getFailureReason(self.ktReason_Host_InstallationFailed).idFailureReason,
460 ];
461
462 #
463 # Get list of bad test boxes for given period and check them out individually.
464 #
465 aidBadTestBoxes = self.oTestSetLogic.fetchBadTestBoxIds(cHoursBack = cHoursBack, tsNow = tsNow,
466 aidFailureReasons = aidFailureReasons);
467 for idTestBox in aidBadTestBoxes:
468 # Skip if the testbox is already disabled or has a pending reboot command.
469 try:
470 oTestBox = TestBoxData().initFromDbWithId(self.oDb, idTestBox);
471 except Exception as oXcpt:
472 rcExit = self.eprint('Failed to get data for test box #%u in badTestBoxManagement: %s' % (idTestBox, oXcpt,));
473 continue;
474 if not oTestBox.fEnabled:
475 self.dprint(u'badTestBoxManagement: Skipping test box #%u (%s) as it has been disabled already.'
476 % ( idTestBox, oTestBox.sName, ));
477 continue;
478 if oTestBox.enmPendingCmd != TestBoxData.ksTestBoxCmd_None:
479 self.dprint(u'badTestBoxManagement: Skipping test box #%u (%s) as it has a command pending: %s'
480 % ( idTestBox, oTestBox.sName, oTestBox.enmPendingCmd));
481 continue;
482
483 # Get the most recent testsets for this box (descending on tsDone) and see how bad it is.
484 aoSets = self.oTestSetLogic.fetchSetsForTestBox(idTestBox, cHoursBack = cHoursBack, tsNow = tsNow);
485 cOkay = 0;
486 cBad = 0;
487 iFirstOkay = len(aoSets);
488 for iSet, oSet in enumerate(aoSets):
489 if oSet.enmStatus == TestSetData.ksTestStatus_BadTestBox:
490 cBad += 1;
491 else:
492 # Check for bad failure reasons.
493 oFailure = None;
494 if oSet.enmStatus in TestSetData.kasBadTestStatuses:
495 (oTree, _ ) = self.oTestResultLogic.fetchResultTree(oSet.idTestSet)
496 aoFailedResults = oTree.getListOfFailures();
497 for oFailedResult in aoFailedResults:
498 oFailure = self.oTestResultFailureLogic.getById(oFailedResult.idTestResult);
499 if oFailure is not None and oFailure.idFailureReason in aidFailureReasons:
500 break;
501 else:
502 oFailure = None;
503 if oFailure is not None:
504 cBad += 1;
505 else:
506 # This is an okay test result then.
507 ## @todo maybe check the elapsed time here, it could still be a bad run?
508 cOkay += 1;
509 if iFirstOkay > iSet:
510 iFirstOkay = iSet;
511 if iSet > 10:
512 break;
513
514 # We react if there are two or more bad-testbox statuses at the head of the
515 # history and at least three in the last 10 results.
516 if iFirstOkay >= 2 and cBad > 2:
517 if oTestBoxLogic.hasTestBoxRecentlyBeenRebooted(idTestBox, cHoursBack = cHoursBack, tsNow = tsNow):
518 sComment = u'Disabling testbox #%u (%s) - iFirstOkay=%u cBad=%u cOkay=%u' \
519 % (idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay);
520 self.vprint(sComment);
521 self.sendEmailAlert(self.uidSelf, sComment);
522 if self.oConfig.fRealRun is True:
523 try:
524 oTestBoxLogic.disableTestBox(idTestBox, self.uidSelf, fCommit = True,
525 sComment = 'Automatically disabled (iFirstOkay=%u cBad=%u cOkay=%u)'
526 % (iFirstOkay, cBad, cOkay),);
527 except Exception as oXcpt:
528 rcExit = self.eprint(u'Error disabling testbox #%u (%u): %s\n' % (idTestBox, oTestBox.sName, oXcpt,));
529 else:
530 sComment = u'Rebooting testbox #%u (%s) - iFirstOkay=%u cBad=%u cOkay=%u' \
531 % (idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay);
532 self.vprint(sComment);
533 self.sendEmailAlert(self.uidSelf, sComment);
534 if self.oConfig.fRealRun is True:
535 try:
536 oTestBoxLogic.rebootTestBox(idTestBox, self.uidSelf, fCommit = True,
537 sComment = 'Automatically rebooted (iFirstOkay=%u cBad=%u cOkay=%u)'
538 % (iFirstOkay, cBad, cOkay),);
539 except Exception as oXcpt:
540 rcExit = self.eprint(u'Error rebooting testbox #%u (%s): %s\n' % (idTestBox, oTestBox.sName, oXcpt,));
541 else:
542 self.dprint(u'badTestBoxManagement: #%u (%s) looks ok: iFirstOkay=%u cBad=%u cOkay=%u'
543 % ( idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay));
544
545 ## @todo r=bird: review + rewrite;
546 ## - no selecting here, that belongs in the core/*.py files.
547 ## - preserve existing comments.
548 ## - doing way too much in the try/except block.
549 ## - No password quoting in the sshpass command that always fails (127).
550 ## - Timeout is way to low. testboxmem1 need more than 10 min to take a dump, ages to
551 ## get thru POST and another 5 just to time out in grub. Should be an hour or so.
552 ## Besides, it need to be constant elsewhere in the file, not a variable here.
553 ##
554 ##
555 ## Reset hanged testboxes
556 ##
557 #cStatusTimeoutMins = 10;
558 #
559 #self.oDb.execute('SELECT TestBoxStatuses.idTestBox\n'
560 # ' FROM TestBoxStatuses, TestBoxes\n'
561 # ' WHERE TestBoxStatuses.tsUpdated >= (CURRENT_TIMESTAMP - interval \'%s hours\')\n'
562 # ' AND TestBoxStatuses.tsUpdated < (CURRENT_TIMESTAMP - interval \'%s minutes\')\n'
563 # ' AND TestBoxStatuses.idTestBox = TestBoxes.idTestBox\n'
564 # ' AND Testboxes.tsExpire = \'infinity\'::timestamp', (cHoursBack,cStatusTimeoutMins));
565 #for idTestBox in self.oDb.fetchAll():
566 # idTestBox = idTestBox[0];
567 # try:
568 # oTestBox = TestBoxData().initFromDbWithId(self.oDb, idTestBox);
569 # except Exception as oXcpt:
570 # rcExit = self.eprint('Failed to get data for test box #%u in badTestBoxManagement: %s' % (idTestBox, oXcpt,));
571 # continue;
572 # # Skip if the testbox is already disabled, already reset or there's no iLOM
573 # if not oTestBox.fEnabled or oTestBox.ipLom is None or oTestBox.sComment is not None and oTestBox.sComment.find('Automatically reset') >= 0:
574 # self.dprint(u'badTestBoxManagement: Skipping test box #%u (%s) as it has been disabled already.'
575 # % ( idTestBox, oTestBox.sName, ));
576 # continue;
577 # ## @todo get iLOM credentials from a table?
578 # sCmd = 'sshpass -p%s ssh -oStrictHostKeyChecking=no root@%s show /SP && reset /SYS' % (g_ksLomPassword, oTestBox.ipLom,);
579 # try:
580 # oPs = subprocess.Popen(sCmd, stdout=subprocess.PIPE, shell=True);
581 # sStdout = oPs.communicate()[0];
582 # iRC = oPs.wait();
583 #
584 # oTestBox.sComment = 'Automatically reset (iRC=%u sStdout=%s)' % (iRC, sStdout,);
585 # oTestBoxLogic.editEntry(oTestBox, self.uidSelf, fCommit = True);
586 #
587 # sComment = u'Reset testbox #%u (%s) - iRC=%u sStduot=%s' % ( idTestBox, oTestBox.sName, iRC, sStdout);
588 # self.vprint(sComment);
589 # self.sendEmailAlert(self.uidSelf, sComment);
590 #
591 # except Exception as oXcpt:
592 # rcExit = self.eprint(u'Error resetting testbox #%u (%s): %s\n' % (idTestBox, oTestBox.sName, oXcpt,));
593 #
594 return rcExit;
595
596
597 ## @name Failure reasons we know.
598 ## @{
599 ktReason_Add_CopyToGuest_Timeout = ( 'Additions', 'CopyToGuest Timeout' );
600 ktReason_Add_FlushViewOfFile = ( 'Additions', 'FlushViewOfFile' );
601 ktReason_Add_Mmap_Coherency = ( 'Additions', 'mmap coherency' );
602 ktReason_BSOD_Recovery = ( 'BSOD', 'Recovery' );
603 ktReason_BSOD_Automatic_Repair = ( 'BSOD', 'Automatic Repair' );
604 ktReason_BSOD_0000007F = ( 'BSOD', '0x0000007F' );
605 ktReason_BSOD_000000D1 = ( 'BSOD', '0x000000D1' );
606 ktReason_BSOD_C0000225 = ( 'BSOD', '0xC0000225 (boot)' );
607 ktReason_Guru_Generic = ( 'Guru Meditations', 'Generic Guru Meditation' );
608 ktReason_Guru_VERR_IEM_INSTR_NOT_IMPLEMENTED = ( 'Guru Meditations', 'VERR_IEM_INSTR_NOT_IMPLEMENTED' );
609 ktReason_Guru_VERR_IEM_ASPECT_NOT_IMPLEMENTED = ( 'Guru Meditations', 'VERR_IEM_ASPECT_NOT_IMPLEMENTED' );
610 ktReason_Guru_VERR_TRPM_DONT_PANIC = ( 'Guru Meditations', 'VERR_TRPM_DONT_PANIC' );
611 ktReason_Guru_VERR_PGM_PHYS_PAGE_RESERVED = ( 'Guru Meditations', 'VERR_PGM_PHYS_PAGE_RESERVED' );
612 ktReason_Guru_VERR_VMX_INVALID_GUEST_STATE = ( 'Guru Meditations', 'VERR_VMX_INVALID_GUEST_STATE' );
613 ktReason_Guru_VINF_EM_TRIPLE_FAULT = ( 'Guru Meditations', 'VINF_EM_TRIPLE_FAULT' );
614 ktReason_Host_HostMemoryLow = ( 'Host', 'HostMemoryLow' );
615 ktReason_Host_DriverNotLoaded = ( 'Host', 'Driver not loaded' );
616 ktReason_Host_DriverNotUnloading = ( 'Host', 'Driver not unloading' );
617 ktReason_Host_DriverNotCompilable = ( 'Host', 'Driver not compilable' );
618 ktReason_Host_InstallationFailed = ( 'Host', 'Installation failed' );
619 ktReason_Host_NotSignedWithBuildCert = ( 'Host', 'Not signed with build cert' );
620 ktReason_Host_DoubleFreeHeap = ( 'Host', 'Double free or corruption' );
621 ktReason_Host_LeftoverService = ( 'Host', 'Leftover service' );
622 ktReason_Host_Reboot_OSX_Watchdog_Timeout = ( 'Host Reboot', 'OSX Watchdog Timeout' );
623 ktReason_Host_Modprobe_Failed = ( 'Host', 'Modprobe failed' );
624 ktReason_Host_Install_Hang = ( 'Host', 'Install hang' );
625 ktReason_Host_NetworkMisconfiguration = ( 'Host', 'Network misconfiguration' );
626 ktReason_Host_TSTInfo_Accuracy_OOR = ( 'Host', 'TSTInfo accuracy out of range' );
627 ktReason_Networking_Nonexistent_host_nic = ( 'Networking', 'Nonexistent host networking interface' );
628 ktReason_Networking_VERR_INTNET_FLT_IF_NOT_FOUND = ( 'Networking', 'VERR_INTNET_FLT_IF_NOT_FOUND' );
629 ktReason_OSInstall_GRUB_hang = ( 'O/S Install', 'GRUB hang' );
630 ktReason_OSInstall_Udev_hang = ( 'O/S Install', 'udev hang' );
631 ktReason_OSInstall_Sata_no_BM = ( 'O/S Install', 'SATA busmaster bit not set' );
632 ktReason_Panic_BootManagerC000000F = ( 'Panic', 'Hardware Changed' );
633 ktReason_Panic_MP_BIOS_IO_APIC = ( 'Panic', 'MP-BIOS/IO-APIC' );
634 ktReason_Panic_HugeMemory = ( 'Panic', 'Huge memory assertion' );
635 ktReason_Panic_IOAPICDoesntWork = ( 'Panic', 'IO-APIC and timer does not work' );
636 ktReason_Panic_TxUnitHang = ( 'Panic', 'Tx Unit Hang' );
637 ktReason_API_std_bad_alloc = ( 'API / (XP)COM', 'std::bad_alloc' );
638 ktReason_API_Digest_Mismatch = ( 'API / (XP)COM', 'Digest mismatch' );
639 ktReason_API_MoveVM_SharingViolation = ( 'API / (XP)COM', 'MoveVM sharing violation' );
640 ktReason_API_MoveVM_InvalidParameter = ( 'API / (XP)COM', 'MoveVM invalid parameter' );
641 ktReason_API_Open_Session_Failed = ( 'API / (XP)COM', 'Open session failed' );
642 ktReason_XPCOM_Exit_Minus_11 = ( 'API / (XP)COM', 'exit -11' );
643 ktReason_XPCOM_VBoxSVC_Hang = ( 'API / (XP)COM', 'VBoxSVC hang' );
644 ktReason_XPCOM_VBoxSVC_Hang_Plus_Heap_Corruption = ( 'API / (XP)COM', 'VBoxSVC hang + heap corruption' );
645 ktReason_XPCOM_NS_ERROR_CALL_FAILED = ( 'API / (XP)COM', 'NS_ERROR_CALL_FAILED' );
646 ktReason_BootManager_Image_corrupt = ( 'Unknown', 'BOOTMGR Image corrupt' );
647 ktReason_Unknown_Heap_Corruption = ( 'Unknown', 'Heap corruption' );
648 ktReason_Unknown_Reboot_Loop = ( 'Unknown', 'Reboot loop' );
649 ktReason_Unknown_File_Not_Found = ( 'Unknown', 'File not found' );
650 ktReason_Unknown_HalReturnToFirmware = ( 'Unknown', 'HalReturnToFirmware' );
651 ktReason_Unknown_VM_Crash = ( 'Unknown', 'VM crash' );
652 ktReason_Unknown_VM_Start_Error = ( 'Unknown', 'VM Start Error' );
653 ktReason_Unknown_VM_Runtime_Error = ( 'Unknown', 'VM Runtime Error' );
654 ktReason_VMM_kvm_lock_spinning = ( 'VMM', 'kvm_lock_spinning' );
655 ktReason_Ignore_Buggy_Test_Driver = ( 'Ignore', 'Buggy test driver' );
656 ktReason_Ignore_Stale_Files = ( 'Ignore', 'Stale files' );
657 ktReason_Buggy_Build_Broken_Build = ( 'Broken Build', 'Buggy build' );
658 ktReason_GuestBug_CompizVBoxQt = ( 'Guest Bug', 'Compiz + VirtualBox Qt GUI crash' );
659 ## @}
660
661 ## BSOD category.
662 ksBsodCategory = 'BSOD';
663 ## Special reason indicating that the flesh and blood sheriff has work to do.
664 ksBsodAddNew = 'Add new BSOD';
665
666 ## Unit test category.
667 ksUnitTestCategory = 'Unit';
668 ## Special reason indicating that the flesh and blood sheriff has work to do.
669 ksUnitTestAddNew = 'Add new';
670
671 ## Used for indica that we shouldn't report anything for this test result ID and
672 ## consider promoting the previous error to test set level if it's the only one.
673 ktHarmless = ( 'Probably', 'Caused by previous error' );
674
675
676 def caseClosed(self, oCaseFile):
677 """
678 Reports the findings in the case and closes it.
679 """
680 #
681 # Log it and create a dReasonForReasultId we can use below.
682 #
683 dCommentForResultId = oCaseFile.dCommentForResultId;
684 if oCaseFile.dReasonForResultId:
685 # Must weed out ktHarmless.
686 dReasonForResultId = {};
687 for idKey, tReason in oCaseFile.dReasonForResultId.items():
688 if tReason is not self.ktHarmless:
689 dReasonForResultId[idKey] = tReason;
690 if not dReasonForResultId:
691 self.vprint(u'TODO: Closing %s without a real reason, only %s.'
692 % (oCaseFile.sName, oCaseFile.dReasonForResultId));
693 return False;
694
695 # Try promote to single reason.
696 atValues = dReasonForResultId.values();
697 fSingleReason = True;
698 if len(dReasonForResultId) == 1 and dReasonForResultId.keys()[0] != oCaseFile.oTestSet.idTestResult:
699 self.dprint(u'Promoting single reason to whole set: %s' % (atValues[0],));
700 elif len(dReasonForResultId) > 1 and len(atValues) == atValues.count(atValues[0]):
701 self.dprint(u'Merged %d reasons to a single one: %s' % (len(atValues), atValues[0]));
702 else:
703 fSingleReason = False;
704 if fSingleReason:
705 dReasonForResultId = { oCaseFile.oTestSet.idTestResult: atValues[0], };
706 if dCommentForResultId:
707 dCommentForResultId = { oCaseFile.oTestSet.idTestResult: dCommentForResultId.values()[0], };
708 elif oCaseFile.tReason is not None:
709 dReasonForResultId = { oCaseFile.oTestSet.idTestResult: oCaseFile.tReason, };
710 else:
711 self.vprint(u'Closing %s without a reason - this should not happen!' % (oCaseFile.sName,));
712 return False;
713
714 self.vprint(u'Closing %s with following reason%s: %s'
715 % ( oCaseFile.sName, 's' if dReasonForResultId > 0 else '', dReasonForResultId, ));
716
717 #
718 # Add the test failure reason record(s).
719 #
720 for idTestResult, tReason in dReasonForResultId.items():
721 oFailureReason = self.getFailureReason(tReason);
722 if oFailureReason is not None:
723 sComment = 'Set by $Revision: 81115 $' # Handy for reverting later.
724 if idTestResult in dCommentForResultId:
725 sComment += ': ' + dCommentForResultId[idTestResult];
726
727 oAdd = TestResultFailureData();
728 oAdd.initFromValues(idTestResult = idTestResult,
729 idFailureReason = oFailureReason.idFailureReason,
730 uidAuthor = self.uidSelf,
731 idTestSet = oCaseFile.oTestSet.idTestSet,
732 sComment = sComment,);
733 if self.oConfig.fRealRun:
734 try:
735 self.oTestResultFailureLogic.addEntry(oAdd, self.uidSelf, fCommit = True);
736 except Exception as oXcpt:
737 self.eprint(u'caseClosed: Exception "%s" while adding reason %s for %s'
738 % (oXcpt, oAdd, oCaseFile.sLongName,));
739 else:
740 self.eprint(u'caseClosed: Cannot locate failure reason: %s / %s' % ( tReason[0], tReason[1],));
741 return True;
742
743 #
744 # Tools for assiting log parsing.
745 #
746
747 @staticmethod
748 def matchFollowedByLines(sStr, off, asFollowingLines):
749 """ Worker for isThisFollowedByTheseLines. """
750
751 # Advance off to the end of the line.
752 off = sStr.find('\n', off);
753 if off < 0:
754 return False;
755 off += 1;
756
757 # Match each string with the subsequent lines.
758 for iLine, sLine in enumerate(asFollowingLines):
759 offEnd = sStr.find('\n', off);
760 if offEnd < 0:
761 return iLine + 1 == len(asFollowingLines) and sStr.find(sLine, off) < 0;
762 if sLine and sStr.find(sLine, off, offEnd) < 0:
763 return False;
764
765 # next line.
766 off = offEnd + 1;
767
768 return True;
769
770 @staticmethod
771 def isThisFollowedByTheseLines(sStr, sFirst, asFollowingLines):
772 """
773 Looks for a line contining sFirst which is then followed by lines
774 with the strings in asFollowingLines. (No newline chars anywhere!)
775 Returns True / False.
776 """
777 off = sStr.find(sFirst, 0);
778 while off >= 0:
779 if VirtualTestSheriff.matchFollowedByLines(sStr, off, asFollowingLines):
780 return True;
781 off = sStr.find(sFirst, off + 1);
782 return False;
783
784 @staticmethod
785 def findAndReturnRestOfLine(sHaystack, sNeedle):
786 """
787 Looks for sNeedle in sHaystack.
788 Returns The text following the needle up to the end of the line.
789 Returns None if not found.
790 """
791 if sHaystack is None:
792 return None;
793 off = sHaystack.find(sNeedle);
794 if off < 0:
795 return None;
796 off += len(sNeedle)
797 offEol = sHaystack.find('\n', off);
798 if offEol < 0:
799 offEol = len(sHaystack);
800 return sHaystack[off:offEol]
801
802 @staticmethod
803 def findInAnyAndReturnRestOfLine(asHaystacks, sNeedle):
804 """
805 Looks for sNeedle in zeroe or more haystacks (asHaystack).
806 Returns The text following the first needed found up to the end of the line.
807 Returns None if not found.
808 """
809 for sHaystack in asHaystacks:
810 sRet = VirtualTestSheriff.findAndReturnRestOfLine(sHaystack, sNeedle);
811 if sRet is not None:
812 return sRet;
813 return None;
814
815
816 #
817 # The investigative units.
818 #
819
820 katSimpleInstallUninstallMainLogReasons = [
821 # ( Whether to stop on hit, reason tuple, needle text. )
822 ( False, ktReason_Host_LeftoverService,
823 'SERVICE_NAME: vbox' ),
824 ( False, ktReason_Host_LeftoverService,
825 'Seems installation was skipped. Old version lurking behind? Not the fault of this build/test run!'),
826 ];
827
828 kdatSimpleInstallUninstallMainLogReasonsPerOs = {
829 'darwin': [
830 # ( Whether to stop on hit, reason tuple, needle text. )
831 ( True, ktReason_Host_DriverNotUnloading,
832 'Can\'t remove kext org.virtualbox.kext.VBoxDrv; services failed to terminate - 0xe00002c7' ),
833 ],
834 'linux': [
835 # ( Whether to stop on hit, reason tuple, needle text. )
836 ( True, ktReason_Host_DriverNotCompilable,
837 'This system is not currently set up to build kernel modules' ),
838 ( True, ktReason_Host_DriverNotCompilable,
839 'This system is currently not set up to build kernel modules' ),
840 ( True, ktReason_Host_InstallationFailed,
841 'vboxdrv.sh: failed: Look at /var/log/vbox-install.log to find out what went wrong.' ),
842 ( True, ktReason_Host_DriverNotUnloading,
843 'Cannot unload module vboxdrv'),
844 ],
845 'solaris': [
846 # ( Whether to stop on hit, reason tuple, needle text. )
847 ( True, ktReason_Host_DriverNotUnloading, 'can\'t unload the module: Device busy' ),
848 ( True, ktReason_Host_DriverNotUnloading, 'Unloading: Host module ...FAILED!' ),
849 ( True, ktReason_Host_DriverNotUnloading, 'Unloading: NetFilter (Crossbow) module ...FAILED!' ),
850 ( True, ktReason_Host_InstallationFailed, 'svcadm: Couldn\'t bind to svc.configd.' ),
851 ( True, ktReason_Host_InstallationFailed, 'pkgadd: ERROR: postinstall script did not complete successfully' ),
852 ],
853 };
854
855
856 def investigateInstallUninstallFailure(self, oCaseFile, oFailedResult, sResultLog, fInstall):
857 """
858 Investigates an install or uninstall failure.
859
860 We lump the two together since the installation typically also performs
861 an uninstall first and will be seeing similar issues to the uninstall.
862 """
863 self.dprint(u'%s + %s <<\n%s\n<<' % (oFailedResult.tsCreated, oFailedResult.tsElapsed, sResultLog,));
864
865 if fInstall and oFailedResult.enmStatus == TestSetData.ksTestStatus_TimedOut:
866 oCaseFile.noteReasonForId(self.ktReason_Host_Install_Hang, oFailedResult.idTestResult)
867 return True;
868
869 atSimple = self.katSimpleInstallUninstallMainLogReasons;
870 if oCaseFile.oTestBox.sOs in self.kdatSimpleInstallUninstallMainLogReasonsPerOs:
871 atSimple = self.kdatSimpleInstallUninstallMainLogReasonsPerOs[oCaseFile.oTestBox.sOs] + atSimple;
872
873 fFoundSomething = False;
874 for fStopOnHit, tReason, sNeedle in atSimple:
875 if sResultLog.find(sNeedle) > 0:
876 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
877 if fStopOnHit:
878 return True;
879 fFoundSomething = True;
880
881 return fFoundSomething if fFoundSomething else None;
882
883
884 def investigateBadTestBox(self, oCaseFile):
885 """
886 Checks out bad-testbox statuses.
887 """
888 _ = oCaseFile;
889 return False;
890
891
892 def investigateVBoxUnitTest(self, oCaseFile):
893 """
894 Checks out a VBox unittest problem.
895 """
896
897 #
898 # Process simple test case failures first, using their name as reason.
899 # We do the reason management just like for BSODs.
900 #
901 cRelevantOnes = 0;
902 sMainLog = oCaseFile.getMainLog();
903 aoFailedResults = oCaseFile.oTree.getListOfFailures();
904 for oFailedResult in aoFailedResults:
905 if oFailedResult is oCaseFile.oTree:
906 self.vprint('TODO: toplevel failure');
907 cRelevantOnes += 1
908
909 elif oFailedResult.sName == 'Installing VirtualBox':
910 sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
911 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = True)
912 cRelevantOnes += 1
913
914 elif oFailedResult.sName == 'Uninstalling VirtualBox':
915 sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
916 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = False)
917 cRelevantOnes += 1
918
919 elif oFailedResult.oParent is not None:
920 # Get the 2nd level node because that's where we'll find the unit test name.
921 while oFailedResult.oParent.oParent is not None:
922 oFailedResult = oFailedResult.oParent;
923
924 # Only report a failure once.
925 if oFailedResult.idTestResult not in oCaseFile.dReasonForResultId:
926 sKey = oFailedResult.sName;
927 if sKey.startswith('testcase/'):
928 sKey = sKey[9:];
929 if sKey in self.asUnitTestReasons:
930 tReason = ( self.ksUnitTestCategory, sKey );
931 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
932 else:
933 self.dprint(u'Unit test failure "%s" not found in %s;' % (sKey, self.asUnitTestReasons));
934 tReason = ( self.ksUnitTestCategory, self.ksUnitTestAddNew );
935 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult, sComment = sKey);
936 cRelevantOnes += 1
937 else:
938 self.vprint(u'Internal error: expected oParent to NOT be None for %s' % (oFailedResult,));
939
940 #
941 # If we've caught all the relevant ones by now, report the result.
942 #
943 if len(oCaseFile.dReasonForResultId) >= cRelevantOnes:
944 return self.caseClosed(oCaseFile);
945 return False;
946
947 def extractGuestCpuStack(self, sInfoText):
948 """
949 Extracts the guest CPU stacks from the input file.
950
951 Returns a dictionary keyed by the CPU number, value being a list of
952 raw stack lines (no header).
953 Returns empty dictionary if no stacks where found.
954 """
955 dRet = {};
956 off = 0;
957 while True:
958 # Find the stack.
959 offStart = sInfoText.find('=== start guest stack VCPU ', off);
960 if offStart < 0:
961 break;
962 offEnd = sInfoText.find('=== end guest stack', offStart + 20);
963 if offEnd >= 0:
964 offEnd += 3;
965 else:
966 offEnd = sInfoText.find('=== start guest stack VCPU', offStart + 20);
967 if offEnd < 0:
968 offEnd = len(sInfoText);
969
970 sStack = sInfoText[offStart : offEnd];
971 sStack = sStack.replace('\r',''); # paranoia
972 asLines = sStack.split('\n');
973
974 # Figure the CPU.
975 asWords = asLines[0].split();
976 if len(asWords) < 6 or not asWords[5].isdigit():
977 break;
978 iCpu = int(asWords[5]);
979
980 # Add it and advance.
981 dRet[iCpu] = [sLine.rstrip() for sLine in asLines[2:-1]]
982 off = offEnd;
983 return dRet;
984
985 def investigateInfoKvmLockSpinning(self, oCaseFile, sInfoText, dLogs):
986 """ Investigates kvm_lock_spinning deadlocks """
987 #
988 # Extract the stacks. We need more than one CPU to create a deadlock.
989 #
990 dStacks = self.extractGuestCpuStack(sInfoText);
991 self.dprint('kvm_lock_spinning: found %s stacks' % (len(dStacks),));
992 if len(dStacks) >= 2:
993 #
994 # Examin each of the stacks. Each must have kvm_lock_spinning in
995 # one of the first three entries.
996 #
997 cHits = 0;
998 for iCpu in dStacks:
999 asBacktrace = dStacks[iCpu];
1000 for iFrame in xrange(min(3, len(asBacktrace))):
1001 if asBacktrace[iFrame].find('kvm_lock_spinning') >= 0:
1002 cHits += 1;
1003 break;
1004 self.dprint('kvm_lock_spinning: %s/%s hits' % (cHits, len(dStacks),));
1005 if cHits == len(dStacks):
1006 return (True, self.ktReason_VMM_kvm_lock_spinning);
1007
1008 _ = dLogs; _ = oCaseFile;
1009 return (False, None);
1010
1011 def investigateInfoHalReturnToFirmware(self, oCaseFile, sInfoText, dLogs):
1012 """ Investigates HalReturnToFirmware hangs """
1013 del oCaseFile
1014 del sInfoText
1015 del dLogs
1016 # hope that's sufficient
1017 return (True, self.ktReason_Unknown_HalReturnToFirmware);
1018
1019 ## Things we search a main or VM log for to figure out why something went bust.
1020 ## @note DO NOT ADD MORE STUFF HERE!
1021 ## Please use katSimpleMainLogReasons and katSimpleVmLogReasons instead!
1022 katSimpleMainAndVmLogReasonsDeprecated = [
1023 # ( Whether to stop on hit, reason tuple, needle text. )
1024 ( False, ktReason_Guru_Generic, 'GuruMeditation' ),
1025 ( False, ktReason_Guru_Generic, 'Guru Meditation' ),
1026 ( True, ktReason_Guru_VERR_IEM_INSTR_NOT_IMPLEMENTED, 'VERR_IEM_INSTR_NOT_IMPLEMENTED' ),
1027 ( True, ktReason_Guru_VERR_IEM_ASPECT_NOT_IMPLEMENTED, 'VERR_IEM_ASPECT_NOT_IMPLEMENTED' ),
1028 ( True, ktReason_Guru_VERR_TRPM_DONT_PANIC, 'VERR_TRPM_DONT_PANIC' ),
1029 ( True, ktReason_Guru_VERR_PGM_PHYS_PAGE_RESERVED, 'VERR_PGM_PHYS_PAGE_RESERVED' ),
1030 ( True, ktReason_Guru_VERR_VMX_INVALID_GUEST_STATE, 'VERR_VMX_INVALID_GUEST_STATE' ),
1031 ( True, ktReason_Guru_VINF_EM_TRIPLE_FAULT, 'VINF_EM_TRIPLE_FAULT' ),
1032 ( True, ktReason_Networking_Nonexistent_host_nic,
1033 'rc=E_FAIL text="Nonexistent host networking interface, name \'eth0\' (VERR_INTERNAL_ERROR)"' ),
1034 ( True, ktReason_Networking_VERR_INTNET_FLT_IF_NOT_FOUND,
1035 'Failed to attach the network LUN (VERR_INTNET_FLT_IF_NOT_FOUND)' ),
1036 ( True, ktReason_Host_Reboot_OSX_Watchdog_Timeout, ': "OSX Watchdog Timeout: ' ),
1037 ( False, ktReason_XPCOM_NS_ERROR_CALL_FAILED,
1038 'Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))' ),
1039 ( True, ktReason_API_std_bad_alloc, 'Unexpected exception: std::bad_alloc' ),
1040 ( True, ktReason_Host_HostMemoryLow, 'HostMemoryLow' ),
1041 ( True, ktReason_Host_HostMemoryLow, 'Failed to procure handy pages; rc=VERR_NO_MEMORY' ),
1042 ( True, ktReason_Unknown_File_Not_Found,
1043 'Error: failed to start machine. Error message: File not found. (VERR_FILE_NOT_FOUND)' ),
1044 ( True, ktReason_Unknown_File_Not_Found, # lump it in with file-not-found for now.
1045 'Error: failed to start machine. Error message: Not supported. (VERR_NOT_SUPPORTED)' ),
1046 ( False, ktReason_Unknown_VM_Crash, 'txsDoConnectViaTcp: Machine state: Aborted' ),
1047 ( True, ktReason_Host_Modprobe_Failed, 'Kernel driver not installed' ),
1048 ( True, ktReason_OSInstall_Sata_no_BM, 'PCHS=14128/14134/8224' ),
1049 ( True, ktReason_Host_DoubleFreeHeap, 'double free or corruption' ),
1050 #( False, ktReason_Unknown_VM_Start_Error, 'VMSetError: ' ), - false positives for stuff like:
1051 # "VMSetError: VD: Backend 'VBoxIsoMaker' does not support async I/O"
1052 ( False, ktReason_Unknown_VM_Start_Error, 'error: failed to open session for' ),
1053 ( False, ktReason_Unknown_VM_Runtime_Error, 'Console: VM runtime error: fatal=true' ),
1054 ];
1055
1056 ## This we search a main log for to figure out why something went bust.
1057 katSimpleMainLogReasons = [
1058 # ( Whether to stop on hit, reason tuple, needle text. )
1059 ];
1060
1061 ## This we search a VM log for to figure out why something went bust.
1062 katSimpleVmLogReasons = [
1063 # ( Whether to stop on hit, reason tuple, needle text. )
1064 ];
1065
1066 ## Things we search a VBoxHardening.log file for to figure out why something went bust.
1067 katSimpleVBoxHardeningLogReasons = [
1068 # ( Whether to stop on hit, reason tuple, needle text. )
1069 ( True, ktReason_Host_DriverNotLoaded, 'Error opening VBoxDrvStub: STATUS_OBJECT_NAME_NOT_FOUND' ),
1070 ( True, ktReason_Host_NotSignedWithBuildCert, 'Not signed with the build certificate' ),
1071 ( True, ktReason_Host_TSTInfo_Accuracy_OOR, 'RTCRTSPTSTINFO::Accuracy::Millis: Out of range' ),
1072 ( False, ktReason_Unknown_VM_Crash, 'Quitting: ExitCode=0xc0000005 (rcNtWait=' ),
1073 ];
1074
1075 ## Things we search a kernel.log file for to figure out why something went bust.
1076 katSimpleKernelLogReasons = [
1077 # ( Whether to stop on hit, reason tuple, needle text. )
1078 ( True, ktReason_Panic_HugeMemory, 'mm/huge_memory.c:1988' ),
1079 ( True, ktReason_Panic_IOAPICDoesntWork, 'IO-APIC + timer doesn\'t work' ),
1080 ( True, ktReason_Panic_TxUnitHang, 'Detected Tx Unit Hang' ),
1081 ( True, ktReason_GuestBug_CompizVBoxQt, 'error 4 in libQt5CoreVBox' ),
1082 ( True, ktReason_GuestBug_CompizVBoxQt, 'error 4 in libgtk-3' ),
1083 ];
1084
1085 ## Things we search the _RIGHT_ _STRIPPED_ vgatext for.
1086 katSimpleVgaTextReasons = [
1087 # ( Whether to stop on hit, reason tuple, needle text. )
1088 ( True, ktReason_Panic_MP_BIOS_IO_APIC,
1089 "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n\n" ),
1090 ( True, ktReason_Panic_MP_BIOS_IO_APIC,
1091 "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"
1092 "...trying to set up timer (IRQ0) through the 8259A ... failed.\n"
1093 "...trying to set up timer as Virtual Wire IRQ... failed.\n"
1094 "...trying to set up timer as ExtINT IRQ... failed :(.\n"
1095 "Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug\n"
1096 "and send a report. Then try booting with the 'noapic' option\n"
1097 "\n" ),
1098 ( True, ktReason_OSInstall_GRUB_hang,
1099 "-----\nGRUB Loading stage2..\n\n\n\n" ),
1100 ( True, ktReason_OSInstall_GRUB_hang,
1101 "-----\nGRUB Loading stage2...\n\n\n\n" ), # the 3 dot hang appears to be less frequent
1102 ( True, ktReason_OSInstall_GRUB_hang,
1103 "-----\nGRUB Loading stage2....\n\n\n\n" ), # the 4 dot hang appears to be very infrequent
1104 ( True, ktReason_OSInstall_GRUB_hang,
1105 "-----\nGRUB Loading stage2.....\n\n\n\n" ), # the 5 dot hang appears to be more frequent again
1106 ( True, ktReason_OSInstall_Udev_hang,
1107 "\nStarting udev:\n\n\n\n" ),
1108 ( True, ktReason_OSInstall_Udev_hang,
1109 "\nStarting udev:\n------" ),
1110 ( True, ktReason_Panic_BootManagerC000000F,
1111 "Windows failed to start. A recent hardware or software change might be the" ),
1112 ( True, ktReason_BootManager_Image_corrupt,
1113 "BOOTMGR image is corrupt. The system cannot boot." ),
1114 ];
1115
1116 ## Things we search for in the info.txt file. Require handlers for now.
1117 katInfoTextHandlers = [
1118 # ( Trigger text, handler method )
1119 ( "kvm_lock_spinning", investigateInfoKvmLockSpinning ),
1120 ( "HalReturnToFirmware", investigateInfoHalReturnToFirmware ),
1121 ];
1122
1123 ## Mapping screenshot/failure SHA-256 hashes to failure reasons.
1124 katSimpleScreenshotHashReasons = [
1125 # ( Whether to stop on hit, reason tuple, lowercased sha-256 of PIL.Image.tostring output )
1126 ( True, ktReason_BSOD_Recovery, '576f8e38d62b311cac7e3dc3436a0d0b9bd8cfd7fa9c43aafa95631520a45eac' ),
1127 ( True, ktReason_BSOD_Automatic_Repair, 'c6a72076cc619937a7a39cfe9915b36d94cee0d4e3ce5ce061485792dcee2749' ),
1128 ( True, ktReason_BSOD_Automatic_Repair, '26c4d8a724ff2c5e1051f3d5b650dbda7b5fdee0aa3e3c6059797f7484a515df' ),
1129 ( True, ktReason_BSOD_0000007F, '57e1880619e13042a87100e7a38c8974b85ce3866501be621bea0cc696bb2c63' ),
1130 ( True, ktReason_BSOD_000000D1, '134621281f00a3f8aeeb7660064bffbf6187ed56d5852142328d0bcb18ef0ede' ),
1131 ( True, ktReason_BSOD_000000D1, '279f11258150c9d2fef041eca65501f3141da8df39256d8f6377e897e3b45a93' ),
1132 ( True, ktReason_BSOD_C0000225, 'bd13a144be9dcdfb16bc863ff4c8f02a86e263c174f2cd5ffd27ca5f3aa31789' ),
1133 ( True, ktReason_BSOD_C0000225, '8348b465e7ee9e59dd4e785880c57fd8677de05d11ac21e786bfde935307b42f' ),
1134 ( True, ktReason_BSOD_C0000225, '1316e1fc818a73348412788e6910b8c016f237d8b4e15b20caf4a866f7a7840e' ),
1135 ( True, ktReason_BSOD_C0000225, '54e0acbff365ce20a85abbe42bcd53647b8b9e80c68e45b2cd30e86bf177a0b5' ),
1136 ( True, ktReason_BSOD_C0000225, '50fec50b5199923fa48b3f3e782687cc381e1c8a788ebda14e6a355fbe3bb1b3' ),
1137 ];
1138
1139
1140 def scanLog(self, asLogs, atNeedles, oCaseFile, idTestResult):
1141 """
1142 Scans for atNeedles in sLog.
1143
1144 Returns True if a stop-on-hit neelde was found.
1145 Returns None if a no-stop reason was found.
1146 Returns False if no hit.
1147 """
1148 fRet = False;
1149 for fStopOnHit, tReason, oNeedle in atNeedles:
1150 fMatch = False;
1151 if utils.isString(oNeedle):
1152 for sLog in asLogs:
1153 if sLog:
1154 fMatch |= sLog.find(oNeedle) > 0;
1155 else:
1156 for sLog in asLogs:
1157 if sLog:
1158 fMatch |= oNeedle.search(sLog) is not None;
1159 if fMatch:
1160 oCaseFile.noteReasonForId(tReason, idTestResult);
1161 if fStopOnHit:
1162 return True;
1163 fRet = None;
1164 return fRet;
1165
1166
1167 def investigateGATest(self, oCaseFile, oFailedResult, sResultLog):
1168 """
1169 Investigates a failed VM run.
1170 """
1171 enmReason = None;
1172 if oFailedResult.sName == 'mmap':
1173 if sResultLog.find('FsPerf: Flush issue at offset ') >= 0:
1174 enmReason = self.ktReason_Add_Mmap_Coherency;
1175 elif sResultLog.find('FlushViewOfFile') >= 0:
1176 enmReason = self.ktReason_Add_FlushViewOfFile;
1177 elif oFailedResult.sName == 'Copy to guest':
1178 if sResultLog.find('*** abort action ***') >= 0:
1179 enmReason = self.ktReason_Add_CopyToGuest_Timeout;
1180
1181 if enmReason is not None:
1182 return oCaseFile.noteReasonForId(enmReason, oFailedResult.idTestResult);
1183
1184 self.vprint(u'TODO: Cannot place GA failure idTestResult=%u - %s' % (oFailedResult.idTestResult, oFailedResult.sName,));
1185 self.dprint(u'%s + %s <<\n%s\n<<' % (oFailedResult.tsCreated, oFailedResult.tsElapsed, sResultLog,));
1186 return False;
1187
1188 def isResultFromGATest(self, oFailedResult):
1189 """
1190 Checks if this result and corresponding log snippet looks like a GA test run.
1191 """
1192 while oFailedResult is not None:
1193 if oFailedResult.sName in [ 'Guest Control', 'Shared Folders', 'FsPerf', ]:
1194 return True;
1195 oFailedResult = oFailedResult.oParent;
1196 return False;
1197
1198
1199 def investigateVMResult(self, oCaseFile, oFailedResult, sResultLog):
1200 """
1201 Investigates a failed VM run.
1202 """
1203
1204 def investigateLogSet():
1205 """
1206 Investigates the current set of VM related logs.
1207 """
1208 self.dprint('investigateLogSet: log lengths: result %u, VM %u, kernel %u, vga text %u, info text %u, hard %u'
1209 % ( len(sResultLog if sResultLog else ''),
1210 len(sVMLog if sVMLog else ''),
1211 len(sKrnlLog if sKrnlLog else ''),
1212 len(sVgaText if sVgaText else ''),
1213 len(sInfoText if sInfoText else ''),
1214 len(sNtHardLog if sNtHardLog else ''),));
1215
1216 #self.dprint(u'main.log<<<\n%s\n<<<\n' % (sResultLog,));
1217 #self.dprint(u'vbox.log<<<\n%s\n<<<\n' % (sVMLog,));
1218 #self.dprint(u'krnl.log<<<\n%s\n<<<\n' % (sKrnlLog,));
1219 #self.dprint(u'vgatext.txt<<<\n%s\n<<<\n' % (sVgaText,));
1220 #self.dprint(u'info.txt<<<\n%s\n<<<\n' % (sInfoText,));
1221 #self.dprint(u'hard.txt<<<\n%s\n<<<\n' % (sNtHardLog,));
1222
1223 # TODO: more
1224
1225 #
1226 # Look for BSODs. Some stupid stupid inconsistencies in reason and log messages here, so don't try prettify this.
1227 #
1228 sDetails = self.findInAnyAndReturnRestOfLine([ sVMLog, sResultLog ],
1229 'GIM: HyperV: Guest indicates a fatal condition! P0=');
1230 if sDetails is not None:
1231 # P0=%#RX64 P1=%#RX64 P2=%#RX64 P3=%#RX64 P4=%#RX64 "
1232 sKey = sDetails.split(' ', 1)[0];
1233 try: sKey = '0x%08X' % (int(sKey, 16),);
1234 except: pass;
1235 if sKey in self.asBsodReasons:
1236 tReason = ( self.ksBsodCategory, sKey );
1237 elif sKey.lower() in self.asBsodReasons: # just in case.
1238 tReason = ( self.ksBsodCategory, sKey.lower() );
1239 else:
1240 self.dprint(u'BSOD "%s" not found in %s;' % (sKey, self.asBsodReasons));
1241 tReason = ( self.ksBsodCategory, self.ksBsodAddNew );
1242 return oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult, sComment = sDetails.strip());
1243
1244 fFoundSomething = False;
1245
1246 #
1247 # Look for linux panic.
1248 #
1249 if sKrnlLog is not None:
1250 fRet = self.scanLog([sKrnlLog,], self.katSimpleKernelLogReasons, oCaseFile, oFailedResult.idTestResult);
1251 if fRet is True:
1252 return fRet;
1253 fFoundSomething |= fRet is None;
1254
1255 #
1256 # Loop thru the simple stuff.
1257 #
1258
1259 # Main log.
1260 fRet = self.scanLog([sResultLog,], self.katSimpleMainLogReasons, oCaseFile, oFailedResult.idTestResult);
1261 if fRet is True:
1262 return fRet;
1263 fFoundSomething |= fRet is None;
1264
1265 # VM log.
1266 fRet = self.scanLog([sVMLog,], self.katSimpleVmLogReasons, oCaseFile, oFailedResult.idTestResult);
1267 if fRet is True:
1268 return fRet;
1269 fFoundSomething |= fRet is None;
1270
1271 # Old main + vm log.
1272 fRet = self.scanLog([sResultLog, sVMLog], self.katSimpleMainAndVmLogReasonsDeprecated,
1273 oCaseFile, oFailedResult.idTestResult);
1274 if fRet is True:
1275 return fRet;
1276 fFoundSomething |= fRet is None;
1277
1278 # Continue with vga text.
1279 if sVgaText:
1280 fRet = self.scanLog([sVgaText,], self.katSimpleVgaTextReasons, oCaseFile, oFailedResult.idTestResult);
1281 if fRet is True:
1282 return fRet;
1283 fFoundSomething |= fRet is None;
1284
1285 # Continue with screen hashes.
1286 if sScreenHash is not None:
1287 for fStopOnHit, tReason, sHash in self.katSimpleScreenshotHashReasons:
1288 if sScreenHash == sHash:
1289 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1290 if fStopOnHit:
1291 return True;
1292 fFoundSomething = True;
1293
1294 # Check VBoxHardening.log.
1295 if sNtHardLog is not None:
1296 fRet = self.scanLog([sNtHardLog,], self.katSimpleVBoxHardeningLogReasons, oCaseFile, oFailedResult.idTestResult);
1297 if fRet is True:
1298 return fRet;
1299 fFoundSomething |= fRet is None;
1300
1301 #
1302 # Complicated stuff.
1303 #
1304 dLogs = {
1305 'sVMLog': sVMLog,
1306 'sNtHardLog': sNtHardLog,
1307 'sScreenHash': sScreenHash,
1308 'sKrnlLog': sKrnlLog,
1309 'sVgaText': sVgaText,
1310 'sInfoText': sInfoText,
1311 };
1312
1313 # info.txt.
1314 if sInfoText:
1315 for sNeedle, fnHandler in self.katInfoTextHandlers:
1316 if sInfoText.find(sNeedle) > 0:
1317 (fStop, tReason) = fnHandler(self, oCaseFile, sInfoText, dLogs);
1318 if tReason is not None:
1319 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1320 if fStop:
1321 return True;
1322 fFoundSomething = True;
1323
1324 #
1325 # Check for repeated reboots...
1326 #
1327 if sVMLog is not None:
1328 cResets = sVMLog.count('Changing the VM state from \'RUNNING\' to \'RESETTING\'');
1329 if cResets > 10:
1330 return oCaseFile.noteReasonForId(self.ktReason_Unknown_Reboot_Loop, oFailedResult.idTestResult,
1331 sComment = 'Counted %s reboots' % (cResets,));
1332
1333 return fFoundSomething;
1334
1335 #
1336 # Check if we got any VM or/and kernel logs. Treat them as sets in
1337 # case we run multiple VMs here (this is of course ASSUMING they
1338 # appear in the order that terminateVmBySession uploads them).
1339 #
1340 cTimes = 0;
1341 sVMLog = None;
1342 sNtHardLog = None;
1343 sScreenHash = None;
1344 sKrnlLog = None;
1345 sVgaText = None;
1346 sInfoText = None;
1347 for oFile in oFailedResult.aoFiles:
1348 if oFile.sKind == TestResultFileData.ksKind_LogReleaseVm:
1349 if 'VBoxHardening.log' not in oFile.sFile:
1350 if sVMLog is not None:
1351 if investigateLogSet() is True:
1352 return True;
1353 cTimes += 1;
1354 sInfoText = None;
1355 sVgaText = None;
1356 sKrnlLog = None;
1357 sScreenHash = None;
1358 sNtHardLog = None;
1359 sVMLog = oCaseFile.getLogFile(oFile);
1360 else:
1361 sNtHardLog = oCaseFile.getLogFile(oFile);
1362 elif oFile.sKind == TestResultFileData.ksKind_LogGuestKernel:
1363 sKrnlLog = oCaseFile.getLogFile(oFile);
1364 elif oFile.sKind == TestResultFileData.ksKind_InfoVgaText:
1365 sVgaText = '\n'.join([sLine.rstrip() for sLine in oCaseFile.getLogFile(oFile).split('\n')]);
1366 elif oFile.sKind == TestResultFileData.ksKind_InfoCollection:
1367 sInfoText = oCaseFile.getLogFile(oFile);
1368 elif oFile.sKind == TestResultFileData.ksKind_ScreenshotFailure:
1369 sScreenHash = oCaseFile.getScreenshotSha256(oFile);
1370 if sScreenHash is not None:
1371 sScreenHash = sScreenHash.lower();
1372 self.vprint(u'%s %s' % ( sScreenHash, oFile.sFile,));
1373
1374 if ( sVMLog is not None \
1375 or sNtHardLog is not None \
1376 or cTimes == 0) \
1377 and investigateLogSet() is True:
1378 return True;
1379
1380 return None;
1381
1382 def isResultFromVMRun(self, oFailedResult, sResultLog):
1383 """
1384 Checks if this result and corresponding log snippet looks like a VM run.
1385 """
1386
1387 # Look for startVmEx/ startVmAndConnectToTxsViaTcp and similar output in the log.
1388 if sResultLog.find(' startVm') > 0:
1389 return True;
1390
1391 # Any other indicators? No?
1392 _ = oFailedResult;
1393 return False;
1394
1395
1396 ## Things we search a VBoxSVC log for to figure out why something went bust.
1397 katSimpleSvcLogReasons = [
1398 # ( Whether to stop on hit, reason tuple, needle text. )
1399 ( False, ktReason_Unknown_VM_Crash, re.compile(r'Reaper.* exited normally: -1073741819 \(0xc0000005\)') ),
1400 ( False, ktReason_Unknown_VM_Crash, re.compile(r'Reaper.* was signalled: 11 \(0xb\)') ),
1401 ];
1402
1403 def investigateSvcLogForVMRun(self, oCaseFile, sSvcLog):
1404 """
1405 Check the VBoxSVC log for a single VM run.
1406 """
1407 if sSvcLog:
1408 fRet = self.scanLog([sSvcLog,], self.katSimpleSvcLogReasons, oCaseFile, oCaseFile.oTree.idTestResult);
1409 if fRet is True or fRet is None:
1410 return True;
1411 return False;
1412
1413 def investigateNtHardLogForVMRun(self, oCaseFile):
1414 """
1415 Check if the hardening log for a single VM run contains VM crash indications.
1416 """
1417 aoLogFiles = oCaseFile.oTree.getListOfLogFilesByKind(TestResultFileData.ksKind_LogReleaseVm);
1418 for oLogFile in aoLogFiles:
1419 if oLogFile.sFile.find('VBoxHardening.log') >= 0:
1420 sLog = oCaseFile.getLogFile(oLogFile);
1421 if sLog.find('Quitting: ExitCode=0xc0000005') >= 0:
1422 return oCaseFile.noteReasonForId(self.ktReason_Unknown_VM_Crash, oCaseFile.oTree.idTestResult);
1423 return False;
1424
1425
1426 def investigateVBoxVMTest(self, oCaseFile, fSingleVM):
1427 """
1428 Checks out a VBox VM test.
1429
1430 This is generic investigation of a test running one or more VMs, like
1431 for example a smoke test or a guest installation test.
1432
1433 The fSingleVM parameter is a hint, which probably won't come in useful.
1434 """
1435 _ = fSingleVM;
1436
1437 #
1438 # Get a list of test result failures we should be looking into and the main log.
1439 #
1440 aoFailedResults = oCaseFile.oTree.getListOfFailures();
1441 sMainLog = oCaseFile.getMainLog();
1442
1443 #
1444 # There are a set of errors ending up on the top level result record.
1445 # Should deal with these first.
1446 #
1447 if len(aoFailedResults) == 1 and aoFailedResults[0] == oCaseFile.oTree:
1448 # Check if we've just got that XPCOM client smoke test shutdown issue. This will currently always
1449 # be reported on the top result because vboxinstall.py doesn't add an error for it. It is easy to
1450 # ignore other failures in the test if we're not a little bit careful here.
1451 if sMainLog.find('vboxinstaller: Exit code: -11 (') > 0:
1452 oCaseFile.noteReason(self.ktReason_XPCOM_Exit_Minus_11);
1453 return self.caseClosed(oCaseFile);
1454
1455 # Hang after starting VBoxSVC (e.g. idTestSet=136307258)
1456 if self.isThisFollowedByTheseLines(sMainLog, 'oVBoxMgr=<vboxapi.VirtualBoxManager object at',
1457 (' Timeout: ', ' Attempting to abort child...',) ):
1458 if sMainLog.find('*** glibc detected *** /') > 0:
1459 oCaseFile.noteReason(self.ktReason_XPCOM_VBoxSVC_Hang_Plus_Heap_Corruption);
1460 else:
1461 oCaseFile.noteReason(self.ktReason_XPCOM_VBoxSVC_Hang);
1462 return self.caseClosed(oCaseFile);
1463
1464 # Look for heap corruption without visible hang.
1465 if sMainLog.find('*** glibc detected *** /') > 0 \
1466 or sMainLog.find("-1073740940") > 0: # STATUS_HEAP_CORRUPTION / 0xc0000374
1467 oCaseFile.noteReason(self.ktReason_Unknown_Heap_Corruption);
1468 return self.caseClosed(oCaseFile);
1469
1470 # Out of memory w/ timeout.
1471 if sMainLog.find('sErrId=HostMemoryLow') > 0:
1472 oCaseFile.noteReason(self.ktReason_Host_HostMemoryLow);
1473 return self.caseClosed(oCaseFile);
1474
1475 # Stale files like vts_rm.exe (windows).
1476 offEnd = sMainLog.rfind('*** The test driver exits successfully. ***');
1477 if offEnd > 0 and sMainLog.find('[Error 145] The directory is not empty: ', offEnd) > 0:
1478 oCaseFile.noteReason(self.ktReason_Ignore_Stale_Files);
1479 return self.caseClosed(oCaseFile);
1480
1481 #
1482 # XPCOM screwup
1483 #
1484 if sMainLog.find('AttributeError: \'NoneType\' object has no attribute \'addObserver\'') > 0:
1485 oCaseFile.noteReason(self.ktReason_Buggy_Build_Broken_Build);
1486 return self.caseClosed(oCaseFile);
1487
1488 #
1489 # Go thru each failed result.
1490 #
1491 for oFailedResult in aoFailedResults:
1492 self.dprint(u'Looking at test result #%u - %s' % (oFailedResult.idTestResult, oFailedResult.getFullName(),));
1493 sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
1494 if oFailedResult.sName == 'Installing VirtualBox':
1495 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = True)
1496
1497 elif oFailedResult.sName == 'Uninstalling VirtualBox':
1498 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = False)
1499
1500 elif self.isResultFromVMRun(oFailedResult, sResultLog):
1501 self.investigateVMResult(oCaseFile, oFailedResult, sResultLog);
1502
1503 elif self.isResultFromGATest(oFailedResult):
1504 self.investigateGATest(oCaseFile, oFailedResult, sResultLog);
1505
1506 elif sResultLog.find('most likely not unique') > 0:
1507 oCaseFile.noteReasonForId(self.ktReason_Host_NetworkMisconfiguration, oFailedResult.idTestResult)
1508 elif sResultLog.find('Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))') > 0:
1509 oCaseFile.noteReasonForId(self.ktReason_XPCOM_NS_ERROR_CALL_FAILED, oFailedResult.idTestResult);
1510
1511 elif sResultLog.find('The machine is not mutable (state is ') > 0:
1512 self.vprint('Ignoring "machine not mutable" error as it is probably due to an earlier problem');
1513 oCaseFile.noteReasonForId(self.ktHarmless, oFailedResult.idTestResult);
1514
1515 elif sResultLog.find('** error: no action was specified') > 0 \
1516 or sResultLog.find('(len(self._asXml, asText))') > 0:
1517 oCaseFile.noteReasonForId(self.ktReason_Ignore_Buggy_Test_Driver, oFailedResult.idTestResult);
1518
1519 else:
1520 self.vprint(u'TODO: Cannot place idTestResult=%u - %s' % (oFailedResult.idTestResult, oFailedResult.sName,));
1521 self.dprint(u'%s + %s <<\n%s\n<<' % (oFailedResult.tsCreated, oFailedResult.tsElapsed, sResultLog,));
1522
1523 #
1524 # Check VBoxSVC.log and VBoxHardening.log for VM crashes if inconclusive on single VM runs.
1525 #
1526 if fSingleVM and len(oCaseFile.dReasonForResultId) < len(aoFailedResults):
1527 self.dprint(u'Got %u out of %u - checking VBoxSVC.log...'
1528 % (len(oCaseFile.dReasonForResultId), len(aoFailedResults)));
1529 if self.investigateSvcLogForVMRun(oCaseFile, oCaseFile.getSvcLog()):
1530 return self.caseClosed(oCaseFile);
1531 if self.investigateNtHardLogForVMRun(oCaseFile):
1532 return self.caseClosed(oCaseFile);
1533
1534 #
1535 # Report home and close the case if we got them all, otherwise log it.
1536 #
1537 if len(oCaseFile.dReasonForResultId) >= len(aoFailedResults):
1538 return self.caseClosed(oCaseFile);
1539
1540 if oCaseFile.dReasonForResultId:
1541 self.vprint(u'TODO: Got %u out of %u - close, but no cigar. :-/'
1542 % (len(oCaseFile.dReasonForResultId), len(aoFailedResults)));
1543 else:
1544 self.vprint(u'XXX: Could not figure out anything at all! :-(');
1545 return False;
1546
1547
1548 ## Things we search a main log for to figure out why something in the API test went bust.
1549 katSimpleApiMainLogReasons = [
1550 # ( Whether to stop on hit, reason tuple, needle text. )
1551 ( True, ktReason_Networking_Nonexistent_host_nic,
1552 'rc=E_FAIL text="Nonexistent host networking interface, name \'eth0\' (VERR_INTERNAL_ERROR)"' ),
1553 ( False, ktReason_XPCOM_NS_ERROR_CALL_FAILED,
1554 'Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))' ),
1555 ( True, ktReason_API_std_bad_alloc, 'Unexpected exception: std::bad_alloc' ),
1556 ( True, ktReason_API_Digest_Mismatch, 'Digest mismatch (VERR_NOT_EQUAL)' ),
1557 ( True, ktReason_API_MoveVM_SharingViolation, 'rc=VBOX_E_IPRT_ERROR text="Could not copy the log file ' ),
1558 ( True, ktReason_API_MoveVM_InvalidParameter,
1559 'rc=VBOX_E_IPRT_ERROR text="Could not copy the setting file ' ),
1560 ( True, ktReason_API_Open_Session_Failed, 'error: failed to open session for' ),
1561 ];
1562
1563 def investigateVBoxApiTest(self, oCaseFile):
1564 """
1565 Checks out a VBox API test.
1566 """
1567
1568 #
1569 # Get a list of test result failures we should be looking into and the main log.
1570 #
1571 aoFailedResults = oCaseFile.oTree.getListOfFailures();
1572 sMainLog = oCaseFile.getMainLog();
1573
1574 #
1575 # Go thru each failed result.
1576 #
1577 for oFailedResult in aoFailedResults:
1578 self.dprint(u'Looking at test result #%u - %s' % (oFailedResult.idTestResult, oFailedResult.getFullName(),));
1579 sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
1580 if oFailedResult.sName == 'Installing VirtualBox':
1581 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = True)
1582
1583 elif oFailedResult.sName == 'Uninstalling VirtualBox':
1584 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = False)
1585
1586 elif sResultLog.find('Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))') > 0:
1587 oCaseFile.noteReasonForId(self.ktReason_XPCOM_NS_ERROR_CALL_FAILED, oFailedResult.idTestResult);
1588
1589 else:
1590 fFoundSomething = False;
1591 for fStopOnHit, tReason, sNeedle in self.katSimpleApiMainLogReasons:
1592 if sResultLog.find(sNeedle) > 0:
1593 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1594 fFoundSomething = True;
1595 if fStopOnHit:
1596 break;
1597 if fFoundSomething:
1598 self.vprint(u'TODO: Cannot place idTestResult=%u - %s' % (oFailedResult.idTestResult, oFailedResult.sName,));
1599 self.dprint(u'%s + %s <<\n%s\n<<' % (oFailedResult.tsCreated, oFailedResult.tsElapsed, sResultLog,));
1600
1601 #
1602 # Report home and close the case if we got them all, otherwise log it.
1603 #
1604 if len(oCaseFile.dReasonForResultId) >= len(aoFailedResults):
1605 return self.caseClosed(oCaseFile);
1606
1607 if oCaseFile.dReasonForResultId:
1608 self.vprint(u'TODO: Got %u out of %u - close, but no cigar. :-/'
1609 % (len(oCaseFile.dReasonForResultId), len(aoFailedResults)));
1610 else:
1611 self.vprint(u'XXX: Could not figure out anything at all! :-(');
1612 return False;
1613
1614
1615 def reasoningFailures(self):
1616 """
1617 Guess the reason for failures.
1618 """
1619 #
1620 # Get a list of failed test sets without any assigned failure reason.
1621 #
1622 cGot = 0;
1623 if not self.oConfig.aidTestSets:
1624 aoTestSets = self.oTestSetLogic.fetchFailedSetsWithoutReason(cHoursBack = self.oConfig.cHoursBack,
1625 tsNow = self.tsNow);
1626 else:
1627 aoTestSets = [self.oTestSetLogic.getById(idTestSet) for idTestSet in self.oConfig.aidTestSets];
1628 for oTestSet in aoTestSets:
1629 self.dprint(u'----------------------------------- #%u, status %s -----------------------------------'
1630 % ( oTestSet.idTestSet, oTestSet.enmStatus,));
1631
1632 #
1633 # Open a case file and assign it to the right investigator.
1634 #
1635 (oTree, _ ) = self.oTestResultLogic.fetchResultTree(oTestSet.idTestSet);
1636 oBuild = BuildDataEx().initFromDbWithId( self.oDb, oTestSet.idBuild, oTestSet.tsCreated);
1637 oTestBox = TestBoxData().initFromDbWithGenId( self.oDb, oTestSet.idGenTestBox);
1638 oTestGroup = TestGroupData().initFromDbWithId( self.oDb, oTestSet.idTestGroup, oTestSet.tsCreated);
1639 oTestCase = TestCaseDataEx().initFromDbWithGenId( self.oDb, oTestSet.idGenTestCase, oTestSet.tsConfig);
1640
1641 oCaseFile = VirtualTestSheriffCaseFile(self, oTestSet, oTree, oBuild, oTestBox, oTestGroup, oTestCase);
1642
1643 if oTestSet.enmStatus == TestSetData.ksTestStatus_BadTestBox:
1644 self.dprint(u'investigateBadTestBox is taking over %s.' % (oCaseFile.sLongName,));
1645 fRc = self.investigateBadTestBox(oCaseFile);
1646
1647 elif oCaseFile.isVBoxUnitTest():
1648 self.dprint(u'investigateVBoxUnitTest is taking over %s.' % (oCaseFile.sLongName,));
1649 fRc = self.investigateVBoxUnitTest(oCaseFile);
1650
1651 elif oCaseFile.isVBoxInstallTest() or oCaseFile.isVBoxUnattendedInstallTest():
1652 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1653 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1654
1655 elif oCaseFile.isVBoxUSBTest():
1656 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1657 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1658
1659 elif oCaseFile.isVBoxStorageTest():
1660 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1661 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1662
1663 elif oCaseFile.isVBoxGAsTest():
1664 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1665 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1666
1667 elif oCaseFile.isVBoxAPITest():
1668 self.dprint(u'investigateVBoxApiTest is taking over %s.' % (oCaseFile.sLongName,));
1669 fRc = self.investigateVBoxApiTest(oCaseFile);
1670
1671 elif oCaseFile.isVBoxBenchmarkTest():
1672 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1673 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = False);
1674
1675 elif oCaseFile.isVBoxSmokeTest():
1676 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1677 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = False);
1678
1679 elif oCaseFile.isVBoxSerialTest():
1680 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1681 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = False);
1682
1683 else:
1684 self.vprint(u'reasoningFailures: Unable to classify test set: %s' % (oCaseFile.sLongName,));
1685 fRc = False;
1686 cGot += fRc is True;
1687
1688 self.vprint(u'reasoningFailures: Got %u out of %u' % (cGot, len(aoTestSets), ));
1689 return 0;
1690
1691
1692 def main(self):
1693 """
1694 The 'main' function.
1695 Return exit code (0, 1, etc).
1696 """
1697 # Database stuff.
1698 self.oDb = TMDatabaseConnection()
1699 self.oTestResultLogic = TestResultLogic(self.oDb);
1700 self.oTestSetLogic = TestSetLogic(self.oDb);
1701 self.oFailureReasonLogic = FailureReasonLogic(self.oDb);
1702 self.oTestResultFailureLogic = TestResultFailureLogic(self.oDb);
1703 self.asBsodReasons = self.oFailureReasonLogic.fetchForSheriffByNamedCategory(self.ksBsodCategory);
1704 self.asUnitTestReasons = self.oFailureReasonLogic.fetchForSheriffByNamedCategory(self.ksUnitTestCategory);
1705
1706 # Get a fix on our 'now' before we do anything..
1707 self.oDb.execute('SELECT CURRENT_TIMESTAMP - interval \'%s hours\'', (self.oConfig.cStartHoursAgo,));
1708 self.tsNow = self.oDb.fetchOne();
1709
1710 # If we're suppost to commit anything we need to get our user ID.
1711 rcExit = 0;
1712 if self.oConfig.fRealRun:
1713 self.oLogin = UserAccountLogic(self.oDb).tryFetchAccountByLoginName(VirtualTestSheriff.ksLoginName);
1714 if self.oLogin is None:
1715 rcExit = self.eprint('Cannot find my user account "%s"!' % (VirtualTestSheriff.ksLoginName,));
1716 else:
1717 self.uidSelf = self.oLogin.uid;
1718
1719 #
1720 # Do the stuff.
1721 #
1722 if rcExit == 0:
1723 rcExit = self.selfCheck();
1724 if rcExit == 0:
1725 rcExit = self.badTestBoxManagement();
1726 rcExit2 = self.reasoningFailures();
1727 if rcExit == 0:
1728 rcExit = rcExit2;
1729 # Redo the bad testbox management after failure reasons have been assigned (got timing issues).
1730 if rcExit == 0:
1731 rcExit = self.badTestBoxManagement();
1732
1733 # Cleanup.
1734 self.oFailureReasonLogic = None;
1735 self.oTestResultFailureLogic = None;
1736 self.oTestSetLogic = None;
1737 self.oTestResultLogic = None;
1738 self.oDb.close();
1739 self.oDb = None;
1740 if self.oLogFile is not None:
1741 self.oLogFile.close();
1742 self.oLogFile = None;
1743 return rcExit;
1744
1745if __name__ == '__main__':
1746 sys.exit(VirtualTestSheriff().main());
1747
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette