VirtualBox

source: vbox/trunk/src/VBox/ValidationKit/testmanager/batch/virtual_test_sheriff.py@ 71714

Last change on this file since 71714 was 71690, checked in by vboxsync, 7 years ago

vsheriff: Detect another variant of RHEL5 udev hang.

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 65.9 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# $Id: virtual_test_sheriff.py 71690 2018-04-05 15:24:22Z vboxsync $
4# pylint: disable=C0301
5
6"""
7Virtual Test Sheriff.
8
9Duties:
10 - Try to a assign failure reasons to recently failed tests.
11 - Reboot or disable bad test boxes.
12
13"""
14
15from __future__ import print_function;
16
17__copyright__ = \
18"""
19Copyright (C) 2012-2017 Oracle Corporation
20
21This file is part of VirtualBox Open Source Edition (OSE), as
22available from http://www.virtualbox.org. This file is free software;
23you can redistribute it and/or modify it under the terms of the GNU
24General Public License (GPL) as published by the Free Software
25Foundation, in version 2 as it comes in the "COPYING" file of the
26VirtualBox OSE distribution. VirtualBox OSE is distributed in the
27hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
28
29The contents of this file may alternatively be used under the terms
30of the Common Development and Distribution License Version 1.0
31(CDDL) only, as it comes in the "COPYING.CDDL" file of the
32VirtualBox OSE distribution, in which case the provisions of the
33CDDL are applicable instead of those of the GPL.
34
35You may elect to license modified versions of this file under the
36terms and conditions of either the GPL or the CDDL or both.
37"""
38__version__ = "$Revision: 71690 $"
39
40
41# Standard python imports
42import sys;
43import os;
44import hashlib;
45if sys.version_info[0] >= 3:
46 from io import StringIO as StringIO; # pylint: disable=import-error,no-name-in-module
47else:
48 from StringIO import StringIO as StringIO; # pylint: disable=import-error,no-name-in-module
49from optparse import OptionParser; # pylint: disable=deprecated-module
50from PIL import Image; # pylint: disable=import-error
51
52# Add Test Manager's modules path
53g_ksTestManagerDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))));
54sys.path.append(g_ksTestManagerDir);
55
56# Test Manager imports
57from testmanager.core.db import TMDatabaseConnection;
58from testmanager.core.build import BuildDataEx;
59from testmanager.core.failurereason import FailureReasonLogic;
60from testmanager.core.testbox import TestBoxLogic, TestBoxData;
61from testmanager.core.testcase import TestCaseDataEx;
62from testmanager.core.testgroup import TestGroupData;
63from testmanager.core.testset import TestSetLogic, TestSetData;
64from testmanager.core.testresults import TestResultLogic, TestResultFileData;
65from testmanager.core.testresultfailures import TestResultFailureLogic, TestResultFailureData;
66from testmanager.core.useraccount import UserAccountLogic;
67
68# Python 3 hacks:
69if sys.version_info[0] >= 3:
70 xrange = range; # pylint: disable=redefined-builtin,invalid-name
71
72
73class VirtualTestSheriffCaseFile(object):
74 """
75 A failure investigation case file.
76
77 """
78
79
80 ## Max log file we'll read into memory. (256 MB)
81 kcbMaxLogRead = 0x10000000;
82
83 def __init__(self, oSheriff, oTestSet, oTree, oBuild, oTestBox, oTestGroup, oTestCase):
84 self.oSheriff = oSheriff;
85 self.oTestSet = oTestSet; # TestSetData
86 self.oTree = oTree; # TestResultDataEx
87 self.oBuild = oBuild; # BuildDataEx
88 self.oTestBox = oTestBox; # TestBoxData
89 self.oTestGroup = oTestGroup; # TestGroupData
90 self.oTestCase = oTestCase; # TestCaseDataEx
91 self.sMainLog = ''; # The main log file. Empty string if not accessible.
92
93 # Generate a case file name.
94 self.sName = '#%u: %s' % (self.oTestSet.idTestSet, self.oTestCase.sName,)
95 self.sLongName = '#%u: "%s" on "%s" running %s %s (%s), "%s" by %s, using %s %s %s r%u' \
96 % ( self.oTestSet.idTestSet,
97 self.oTestCase.sName,
98 self.oTestBox.sName,
99 self.oTestBox.sOs,
100 self.oTestBox.sOsVersion,
101 self.oTestBox.sCpuArch,
102 self.oTestBox.sCpuName,
103 self.oTestBox.sCpuVendor,
104 self.oBuild.oCat.sProduct,
105 self.oBuild.oCat.sBranch,
106 self.oBuild.oCat.sType,
107 self.oBuild.iRevision, );
108
109 # Investigation notes.
110 self.tReason = None; # None or one of the ktReason_XXX constants.
111 self.dReasonForResultId = {}; # Reason assignments indexed by idTestResult.
112 self.dCommentForResultId = {}; # Comment assignments indexed by idTestResult.
113
114 #
115 # Reason.
116 #
117
118 def noteReason(self, tReason):
119 """ Notes down a possible reason. """
120 self.oSheriff.dprint(u'noteReason: %s -> %s' % (self.tReason, tReason,));
121 self.tReason = tReason;
122 return True;
123
124 def noteReasonForId(self, tReason, idTestResult, sComment = None):
125 """ Notes down a possible reason for a specific test result. """
126 self.oSheriff.dprint(u'noteReasonForId: %u: %s -> %s%s'
127 % (idTestResult, self.dReasonForResultId.get(idTestResult, None), tReason,
128 (u' (%s)' % (sComment,)) if sComment is not None else ''));
129 self.dReasonForResultId[idTestResult] = tReason;
130 if sComment is not None:
131 self.dCommentForResultId[idTestResult] = sComment;
132 return True;
133
134
135 #
136 # Test classification.
137 #
138
139 def isVBoxTest(self):
140 """ Test classification: VirtualBox (using the build) """
141 return self.oBuild.oCat.sProduct.lower() in [ 'virtualbox', 'vbox' ];
142
143 def isVBoxUnitTest(self):
144 """ Test case classification: The unit test doing all our testcase/*.cpp stuff. """
145 return self.isVBoxTest() \
146 and (self.oTestCase.sName.lower() == 'unit tests' or self.oTestCase.sName.lower() == 'misc: unit tests');
147
148 def isVBoxInstallTest(self):
149 """ Test case classification: VirtualBox Guest installation test. """
150 return self.isVBoxTest() \
151 and self.oTestCase.sName.lower().startswith('install:');
152
153 def isVBoxUSBTest(self):
154 """ Test case classification: VirtualBox USB test. """
155 return self.isVBoxTest() \
156 and self.oTestCase.sName.lower().startswith('usb:');
157
158 def isVBoxStorageTest(self):
159 """ Test case classification: VirtualBox Storage test. """
160 return self.isVBoxTest() \
161 and self.oTestCase.sName.lower().startswith('storage:');
162
163 def isVBoxGAsTest(self):
164 """ Test case classification: VirtualBox Guest Additions test. """
165 return self.isVBoxTest() \
166 and self.oTestCase.sName.lower().startswith('ga\'s tests');
167
168 def isVBoxAPITest(self):
169 """ Test case classification: VirtualBox API test. """
170 return self.isVBoxTest() \
171 and self.oTestCase.sName.lower().startswith('api:');
172
173 def isVBoxBenchmarkTest(self):
174 """ Test case classification: VirtualBox Benchmark test. """
175 return self.isVBoxTest() \
176 and self.oTestCase.sName.lower().startswith('benchmark:');
177
178 def isVBoxSmokeTest(self):
179 """ Test case classification: Smoke test. """
180 return self.isVBoxTest() \
181 and self.oTestCase.sName.lower().startswith('smoketest');
182
183
184 #
185 # Utility methods.
186 #
187
188 def getMainLog(self):
189 """
190 Tries to read the main log file since this will be the first source of information.
191 """
192 if self.sMainLog:
193 return self.sMainLog;
194 (oFile, oSizeOrError, _) = self.oTestSet.openFile('main.log', 'rb');
195 if oFile is not None:
196 try:
197 self.sMainLog = oFile.read(min(self.kcbMaxLogRead, oSizeOrError)).decode('utf-8', 'replace');
198 except Exception as oXcpt:
199 self.oSheriff.vprint(u'Error reading main log file: %s' % (oXcpt,))
200 self.sMainLog = '';
201 else:
202 self.oSheriff.vprint(u'Error opening main log file: %s' % (oSizeOrError,));
203 return self.sMainLog;
204
205 def getLogFile(self, oFile):
206 """
207 Tries to read the given file as a utf-8 log file.
208 oFile is a TestFileDataEx instance.
209 Returns empty string if problems opening or reading the file.
210 """
211 sContent = '';
212 (oFile, oSizeOrError, _) = self.oTestSet.openFile(oFile.sFile, 'rb');
213 if oFile is not None:
214 try:
215 sContent = oFile.read(min(self.kcbMaxLogRead, oSizeOrError)).decode('utf-8', 'replace');
216 except Exception as oXcpt:
217 self.oSheriff.vprint(u'Error reading the "%s" log file: %s' % (oFile.sFile, oXcpt,))
218 else:
219 self.oSheriff.vprint(u'Error opening the "%s" log file: %s' % (oFile.sFile, oSizeOrError,));
220 return sContent;
221
222 def getScreenshotSha256(self, oFile):
223 """
224 Tries to read the given screenshot file, uncompress it, and do SHA-2
225 on the raw pixels.
226 Returns SHA-2 digest string on success, None on failure.
227 """
228 (oImgFile, _, _) = self.oTestSet.openFile(oFile.sFile, 'rb');
229 try:
230 abImageFile = oImgFile.read();
231 except Exception as oXcpt:
232 self.oSheriff.vprint(u'Error reading the "%s" image file: %s' % (oFile.sFile, oXcpt,))
233 else:
234 try:
235 oImage = Image.open(StringIO(abImageFile));
236 except Exception as oXcpt:
237 self.oSheriff.vprint(u'Error opening the "%s" image bytes using PIL.Image.open: %s' % (oFile.sFile, oXcpt,))
238 else:
239 try:
240 oHash = hashlib.sha256();
241 oHash.update(oImage.tostring());
242 except Exception as oXcpt:
243 self.oSheriff.vprint(u'Error hashing the uncompressed image bytes for "%s": %s' % (oFile.sFile, oXcpt,))
244 else:
245 return oHash.hexdigest();
246 return None;
247
248
249
250 def isSingleTestFailure(self):
251 """
252 Figure out if this is a single test failing or if it's one of the
253 more complicated ones.
254 """
255 if self.oTree.cErrors == 1:
256 return True;
257 if self.oTree.deepCountErrorContributers() <= 1:
258 return True;
259 return False;
260
261
262
263class VirtualTestSheriff(object): # pylint: disable=R0903
264 """
265 Add build info into Test Manager database.
266 """
267
268 ## The user account for the virtual sheriff.
269 ksLoginName = 'vsheriff';
270
271 def __init__(self):
272 """
273 Parse command line.
274 """
275 self.oDb = None;
276 self.tsNow = None;
277 self.oTestResultLogic = None;
278 self.oTestSetLogic = None;
279 self.oFailureReasonLogic = None; # FailureReasonLogic;
280 self.oTestResultFailureLogic = None; # TestResultFailureLogic
281 self.oLogin = None;
282 self.uidSelf = -1;
283 self.oLogFile = None;
284 self.asBsodReasons = [];
285 self.asUnitTestReasons = [];
286
287 oParser = OptionParser();
288 oParser.add_option('--start-hours-ago', dest = 'cStartHoursAgo', metavar = '<hours>', default = 0, type = 'int',
289 help = 'When to start specified as hours relative to current time. Defauls is right now.', );
290 oParser.add_option('--hours-period', dest = 'cHoursBack', metavar = '<period-in-hours>', default = 2, type = 'int',
291 help = 'Work period specified in hours. Defauls is 2 hours.');
292 oParser.add_option('--real-run-back', dest = 'fRealRun', action = 'store_true', default = False,
293 help = 'Whether to commit the findings to the database. Default is a dry run.');
294 oParser.add_option('-q', '--quiet', dest = 'fQuiet', action = 'store_true', default = False,
295 help = 'Quiet execution');
296 oParser.add_option('-l', '--log', dest = 'sLogFile', metavar = '<logfile>', default = None,
297 help = 'Where to log messages.');
298 oParser.add_option('--debug', dest = 'fDebug', action = 'store_true', default = False,
299 help = 'Enables debug mode.');
300
301 (self.oConfig, _) = oParser.parse_args();
302
303 if self.oConfig.sLogFile:
304 self.oLogFile = open(self.oConfig.sLogFile, "a");
305 self.oLogFile.write('VirtualTestSheriff: $Revision: 71690 $ \n');
306
307
308 def eprint(self, sText):
309 """
310 Prints error messages.
311 Returns 1 (for exit code usage.)
312 """
313 print('error: %s' % (sText,));
314 if self.oLogFile is not None:
315 self.oLogFile.write((u'error: %s\n' % (sText,)).encode('utf-8'));
316 return 1;
317
318 def dprint(self, sText):
319 """
320 Prints debug info.
321 """
322 if self.oConfig.fDebug:
323 if not self.oConfig.fQuiet:
324 print('debug: %s' % (sText, ));
325 if self.oLogFile is not None:
326 self.oLogFile.write((u'debug: %s\n' % (sText,)).encode('utf-8'));
327 return 0;
328
329 def vprint(self, sText):
330 """
331 Prints verbose info.
332 """
333 if not self.oConfig.fQuiet:
334 print('info: %s' % (sText,));
335 if self.oLogFile is not None:
336 self.oLogFile.write((u'info: %s\n' % (sText,)).encode('utf-8'));
337 return 0;
338
339 def getFailureReason(self, tReason):
340 """ Gets the failure reason object for tReason. """
341 return self.oFailureReasonLogic.cachedLookupByNameAndCategory(tReason[1], tReason[0]);
342
343 def selfCheck(self):
344 """ Does some self checks, looking up things we expect to be in the database and such. """
345 rcExit = 0;
346 for sAttr in dir(self.__class__):
347 if sAttr.startswith('ktReason_'):
348 tReason = getattr(self.__class__, sAttr);
349 oFailureReason = self.getFailureReason(tReason);
350 if oFailureReason is None:
351 rcExit = self.eprint(u'Failed to find failure reason "%s" in category "%s" in the database!'
352 % (tReason[1], tReason[0],));
353
354 # Check the user account as well.
355 if self.oLogin is None:
356 oLogin = UserAccountLogic(self.oDb).tryFetchAccountByLoginName(VirtualTestSheriff.ksLoginName);
357 if oLogin is None:
358 rcExit = self.eprint(u'Cannot find my user account "%s"!' % (VirtualTestSheriff.ksLoginName,));
359 return rcExit;
360
361
362
363 def badTestBoxManagement(self):
364 """
365 Looks for bad test boxes and first tries once to reboot them then disables them.
366 """
367 rcExit = 0;
368
369 #
370 # We skip this entirely if we're running in the past and not in harmless debug mode.
371 #
372 if self.oConfig.cStartHoursAgo != 0 \
373 and (not self.oConfig.fDebug or self.oConfig.fRealRun):
374 return rcExit;
375 tsNow = self.tsNow if self.oConfig.fDebug else None;
376 cHoursBack = self.oConfig.cHoursBack if self.oConfig.fDebug else 2;
377 oTestBoxLogic = TestBoxLogic(self.oDb);
378
379 #
380 # Generate a list of failures reasons we consider bad-testbox behavior.
381 #
382 aidFailureReasons = [
383 self.getFailureReason(self.ktReason_Host_DriverNotUnloading).idFailureReason,
384 self.getFailureReason(self.ktReason_Host_DriverNotCompilable).idFailureReason,
385 self.getFailureReason(self.ktReason_Host_InstallationFailed).idFailureReason,
386 ];
387
388 #
389 # Get list of bad test boxes for given period and check them out individually.
390 #
391 aidBadTestBoxes = self.oTestSetLogic.fetchBadTestBoxIds(cHoursBack = cHoursBack, tsNow = tsNow,
392 aidFailureReasons = aidFailureReasons);
393 for idTestBox in aidBadTestBoxes:
394 # Skip if the testbox is already disabled or has a pending reboot command.
395 try:
396 oTestBox = TestBoxData().initFromDbWithId(self.oDb, idTestBox);
397 except Exception as oXcpt:
398 rcExit = self.eprint('Failed to get data for test box #%u in badTestBoxManagement: %s' % (idTestBox, oXcpt,));
399 continue;
400 if not oTestBox.fEnabled:
401 self.dprint(u'badTestBoxManagement: Skipping test box #%u (%s) as it has been disabled already.'
402 % ( idTestBox, oTestBox.sName, ));
403 continue;
404 if oTestBox.enmPendingCmd != TestBoxData.ksTestBoxCmd_None:
405 self.dprint(u'badTestBoxManagement: Skipping test box #%u (%s) as it has a command pending: %s'
406 % ( idTestBox, oTestBox.sName, oTestBox.enmPendingCmd));
407 continue;
408
409 # Get the most recent testsets for this box (descending on tsDone) and see how bad it is.
410 aoSets = self.oTestSetLogic.fetchSetsForTestBox(idTestBox, cHoursBack = cHoursBack, tsNow = tsNow);
411 cOkay = 0;
412 cBad = 0;
413 iFirstOkay = len(aoSets);
414 for iSet, oSet in enumerate(aoSets):
415 if oSet.enmStatus == TestSetData.ksTestStatus_BadTestBox:
416 cBad += 1;
417 else:
418 # Check for bad failure reasons.
419 oFailure = None;
420 if oSet.enmStatus in TestSetData.kasBadTestStatuses:
421 oFailure = self.oTestResultFailureLogic.getById(oSet.idTestResult);
422 if oFailure is not None and oFailure.idFailureReason in aidFailureReasons:
423 cBad += 1;
424 else:
425 # This is an okay test result then.
426 ## @todo maybe check the elapsed time here, it could still be a bad run?
427 cOkay += 1;
428 if iFirstOkay > iSet:
429 iFirstOkay = iSet;
430 if iSet > 10:
431 break;
432
433 # We react if there are two or more bad-testbox statuses at the head of the
434 # history and at least three in the last 10 results.
435 if iFirstOkay >= 2 and cBad > 2:
436 # Frank: For now don't reboot boxes automatically
437 if True or oTestBoxLogic.hasTestBoxRecentlyBeenRebooted(idTestBox, cHoursBack = cHoursBack, tsNow = tsNow):
438 self.vprint(u'Disabling testbox #%u (%s) - iFirstOkay=%u cBad=%u cOkay=%u'
439 % ( idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay));
440 if self.oConfig.fRealRun is True:
441 try:
442 oTestBoxLogic.disableTestBox(idTestBox, self.uidSelf, fCommit = True,
443 sComment = 'Automatically disabled (iFirstOkay=%u cBad=%u cOkay=%u)'
444 % (iFirstOkay, cBad, cOkay),);
445 except Exception as oXcpt:
446 rcExit = self.eprint(u'Error disabling testbox #%u (%u): %s\n' % (idTestBox, oTestBox.sName, oXcpt,));
447 else:
448 self.vprint(u'Rebooting testbox #%u (%s) - iFirstOkay=%u cBad=%u cOkay=%u'
449 % ( idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay));
450 if self.oConfig.fRealRun is True:
451 try:
452 oTestBoxLogic.rebootTestBox(idTestBox, self.uidSelf, fCommit = True,
453 sComment = 'Automatically rebooted (iFirstOkay=%u cBad=%u cOkay=%u)'
454 % (iFirstOkay, cBad, cOkay),);
455 except Exception as oXcpt:
456 rcExit = self.eprint(u'Error rebooting testbox #%u (%u): %s\n' % (idTestBox, oTestBox.sName, oXcpt,));
457 else:
458 self.dprint(u'badTestBoxManagement: #%u (%s) looks ok: iFirstOkay=%u cBad=%u cOkay=%u'
459 % ( idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay));
460 return rcExit;
461
462
463 ## @name Failure reasons we know.
464 ## @{
465 ktReason_BSOD_Recovery = ( 'BSOD', 'Recovery' );
466 ktReason_BSOD_Automatic_Repair = ( 'BSOD', 'Automatic Repair' );
467 ktReason_BSOD_0000007F = ( 'BSOD', '0x0000007F' );
468 ktReason_BSOD_000000D1 = ( 'BSOD', '0x000000D1' );
469 ktReason_BSOD_C0000225 = ( 'BSOD', '0xC0000225 (boot)' );
470 ktReason_Guru_Generic = ( 'Guru Meditations', 'Generic Guru Meditation' );
471 ktReason_Guru_VERR_IEM_INSTR_NOT_IMPLEMENTED = ( 'Guru Meditations', 'VERR_IEM_INSTR_NOT_IMPLEMENTED' );
472 ktReason_Guru_VERR_IEM_ASPECT_NOT_IMPLEMENTED = ( 'Guru Meditations', 'VERR_IEM_ASPECT_NOT_IMPLEMENTED' );
473 ktReason_Guru_VERR_TRPM_DONT_PANIC = ( 'Guru Meditations', 'VERR_TRPM_DONT_PANIC' );
474 ktReason_Guru_VERR_PGM_PHYS_PAGE_RESERVED = ( 'Guru Meditations', 'VERR_PGM_PHYS_PAGE_RESERVED' );
475 ktReason_Guru_VERR_VMX_INVALID_GUEST_STATE = ( 'Guru Meditations', 'VERR_VMX_INVALID_GUEST_STATE' );
476 ktReason_Guru_VINF_EM_TRIPLE_FAULT = ( 'Guru Meditations', 'VINF_EM_TRIPLE_FAULT' );
477 ktReason_Host_HostMemoryLow = ( 'Host', 'HostMemoryLow' );
478 ktReason_Host_DriverNotLoaded = ( 'Host', 'Driver not loaded' );
479 ktReason_Host_DriverNotUnloading = ( 'Host', 'Driver not unloading' );
480 ktReason_Host_DriverNotCompilable = ( 'Host', 'Driver not compilable' );
481 ktReason_Host_InstallationFailed = ( 'Host', 'Installation failed' );
482 ktReason_Host_NotSignedWithBuildCert = ( 'Host', 'Not signed with build cert' );
483 ktReason_Host_DoubleFreeHeap = ( 'Host', 'Double free or corruption' );
484 ktReason_Host_LeftoverService = ( 'Host', 'Leftover service' );
485 ktReason_Host_Reboot_OSX_Watchdog_Timeout = ( 'Host Reboot', 'OSX Watchdog Timeout' );
486 ktReason_Host_Modprobe_Failed = ( 'Host', 'Modprobe failed' );
487 ktReason_Host_Install_Hang = ( 'Host', 'Install hang' );
488 ktReason_Host_NetworkMisconfiguration = ( 'Host', 'Network misconfiguration' );
489 ktReason_Networking_Nonexistent_host_nic = ( 'Networking', 'Nonexistent host networking interface' );
490 ktReason_OSInstall_GRUB_hang = ( 'O/S Install', 'GRUB hang' );
491 ktReason_OSInstall_Udev_hang = ( 'O/S Install', 'udev hang' );
492 ktReason_OSInstall_Sata_no_BM = ( 'O/S Install', 'SATA busmaster bit not set' );
493 ktReason_Panic_BootManagerC000000F = ( 'Panic', 'Hardware Changed' );
494 ktReason_BootManager_Image_corrupt = ( 'Unknown', 'BOOTMGR Image corrupt' );
495 ktReason_Panic_MP_BIOS_IO_APIC = ( 'Panic', 'MP-BIOS/IO-APIC' );
496 ktReason_Panic_HugeMemory = ( 'Panic', 'Huge memory assertion' );
497 ktReason_Panic_IOAPICDoesntWork = ( 'Panic', 'IO-APIC and timer does not work' );
498 ktReason_Panic_TxUnitHang = ( 'Panic', 'Tx Unit Hang' );
499 ktReason_XPCOM_Exit_Minus_11 = ( 'API / (XP)COM', 'exit -11' );
500 ktReason_XPCOM_VBoxSVC_Hang = ( 'API / (XP)COM', 'VBoxSVC hang' );
501 ktReason_XPCOM_VBoxSVC_Hang_Plus_Heap_Corruption = ( 'API / (XP)COM', 'VBoxSVC hang + heap corruption' );
502 ktReason_XPCOM_NS_ERROR_CALL_FAILED = ( 'API / (XP)COM', 'NS_ERROR_CALL_FAILED' );
503 ktReason_Unknown_Heap_Corruption = ( 'Unknown', 'Heap corruption' );
504 ktReason_Unknown_Reboot_Loop = ( 'Unknown', 'Reboot loop' );
505 ktReason_Unknown_File_Not_Found = ( 'Unknown', 'File not found' );
506 ktReason_Unknown_VM_Crash = ( 'Unknown', 'VM crash' );
507 ktReason_Unknown_HalReturnToFirmware = ( 'Unknown', 'HalReturnToFirmware' );
508 ktReason_VMM_kvm_lock_spinning = ( 'VMM', 'kvm_lock_spinning' );
509 ktReason_Ignore_Buggy_Test_Driver = ( 'Ignore', 'Buggy test driver' );
510 ktReason_Ignore_Stale_Files = ( 'Ignore', 'Stale files' );
511 ktReason_Buggy_Build_Broken_Build = ( 'Broken Build', 'Buggy build' );
512 ktReason_Unknown_VM_Start_Error = ( 'Unknown', 'VM Start Error' );
513 ktReason_Unknown_VM_Runtime_Error = ( 'Unknown', 'VM Runtime Error' );
514 ## @}
515
516 ## BSOD category.
517 ksBsodCategory = 'BSOD';
518 ## Special reason indicating that the flesh and blood sheriff has work to do.
519 ksBsodAddNew = 'Add new BSOD';
520
521 ## Unit test category.
522 ksUnitTestCategory = 'Unit';
523 ## Special reason indicating that the flesh and blood sheriff has work to do.
524 ksUnitTestAddNew = 'Add new';
525
526 ## Used for indica that we shouldn't report anything for this test result ID and
527 ## consider promoting the previous error to test set level if it's the only one.
528 ktHarmless = ( 'Probably', 'Caused by previous error' );
529
530
531 def caseClosed(self, oCaseFile):
532 """
533 Reports the findings in the case and closes it.
534 """
535 #
536 # Log it and create a dReasonForReasultId we can use below.
537 #
538 dCommentForResultId = oCaseFile.dCommentForResultId;
539 if oCaseFile.dReasonForResultId:
540 # Must weed out ktHarmless.
541 dReasonForResultId = {};
542 for idKey, tReason in oCaseFile.dReasonForResultId.items():
543 if tReason is not self.ktHarmless:
544 dReasonForResultId[idKey] = tReason;
545 if not dReasonForResultId:
546 self.vprint(u'TODO: Closing %s without a real reason, only %s.'
547 % (oCaseFile.sName, oCaseFile.dReasonForResultId));
548 return False;
549
550 # Try promote to single reason.
551 atValues = dReasonForResultId.values();
552 fSingleReason = True;
553 if len(dReasonForResultId) == 1 and dReasonForResultId.keys()[0] != oCaseFile.oTestSet.idTestResult:
554 self.dprint(u'Promoting single reason to whole set: %s' % (atValues[0],));
555 elif len(dReasonForResultId) > 1 and len(atValues) == atValues.count(atValues[0]):
556 self.dprint(u'Merged %d reasons to a single one: %s' % (len(atValues), atValues[0]));
557 else:
558 fSingleReason = False;
559 if fSingleReason:
560 dReasonForResultId = { oCaseFile.oTestSet.idTestResult: atValues[0], };
561 if dCommentForResultId:
562 dCommentForResultId = { oCaseFile.oTestSet.idTestResult: dCommentForResultId.values()[0], };
563 elif oCaseFile.tReason is not None:
564 dReasonForResultId = { oCaseFile.oTestSet.idTestResult: oCaseFile.tReason, };
565 else:
566 self.vprint(u'Closing %s without a reason - this should not happen!' % (oCaseFile.sName,));
567 return False;
568
569 self.vprint(u'Closing %s with following reason%s: %s'
570 % ( oCaseFile.sName, 's' if dReasonForResultId > 0 else '', dReasonForResultId, ));
571
572 #
573 # Add the test failure reason record(s).
574 #
575 for idTestResult, tReason in dReasonForResultId.items():
576 oFailureReason = self.getFailureReason(tReason);
577 if oFailureReason is not None:
578 sComment = 'Set by $Revision: 71690 $' # Handy for reverting later.
579 if idTestResult in dCommentForResultId:
580 sComment += ': ' + dCommentForResultId[idTestResult];
581
582 oAdd = TestResultFailureData();
583 oAdd.initFromValues(idTestResult = idTestResult,
584 idFailureReason = oFailureReason.idFailureReason,
585 uidAuthor = self.uidSelf,
586 idTestSet = oCaseFile.oTestSet.idTestSet,
587 sComment = sComment,);
588 if self.oConfig.fRealRun:
589 try:
590 self.oTestResultFailureLogic.addEntry(oAdd, self.uidSelf, fCommit = True);
591 except Exception as oXcpt:
592 self.eprint(u'caseClosed: Exception "%s" while adding reason %s for %s'
593 % (oXcpt, oAdd, oCaseFile.sLongName,));
594 else:
595 self.eprint(u'caseClosed: Cannot locate failure reason: %s / %s' % ( tReason[0], tReason[1],));
596 return True;
597
598 #
599 # Tools for assiting log parsing.
600 #
601
602 @staticmethod
603 def matchFollowedByLines(sStr, off, asFollowingLines):
604 """ Worker for isThisFollowedByTheseLines. """
605
606 # Advance off to the end of the line.
607 off = sStr.find('\n', off);
608 if off < 0:
609 return False;
610 off += 1;
611
612 # Match each string with the subsequent lines.
613 for iLine, sLine in enumerate(asFollowingLines):
614 offEnd = sStr.find('\n', off);
615 if offEnd < 0:
616 return iLine + 1 == len(asFollowingLines) and sStr.find(sLine, off) < 0;
617 if sLine and sStr.find(sLine, off, offEnd) < 0:
618 return False;
619
620 # next line.
621 off = offEnd + 1;
622
623 return True;
624
625 @staticmethod
626 def isThisFollowedByTheseLines(sStr, sFirst, asFollowingLines):
627 """
628 Looks for a line contining sFirst which is then followed by lines
629 with the strings in asFollowingLines. (No newline chars anywhere!)
630 Returns True / False.
631 """
632 off = sStr.find(sFirst, 0);
633 while off >= 0:
634 if VirtualTestSheriff.matchFollowedByLines(sStr, off, asFollowingLines):
635 return True;
636 off = sStr.find(sFirst, off + 1);
637 return False;
638
639 @staticmethod
640 def findAndReturnRestOfLine(sHaystack, sNeedle):
641 """
642 Looks for sNeedle in sHaystack.
643 Returns The text following the needle up to the end of the line.
644 Returns None if not found.
645 """
646 if sHaystack is None:
647 return None;
648 off = sHaystack.find(sNeedle);
649 if off < 0:
650 return None;
651 off += len(sNeedle)
652 offEol = sHaystack.find('\n', off);
653 if offEol < 0:
654 offEol = len(sHaystack);
655 return sHaystack[off:offEol]
656
657 @staticmethod
658 def findInAnyAndReturnRestOfLine(asHaystacks, sNeedle):
659 """
660 Looks for sNeedle in zeroe or more haystacks (asHaystack).
661 Returns The text following the first needed found up to the end of the line.
662 Returns None if not found.
663 """
664 for sHaystack in asHaystacks:
665 sRet = VirtualTestSheriff.findAndReturnRestOfLine(sHaystack, sNeedle);
666 if sRet is not None:
667 return sRet;
668 return None;
669
670
671 #
672 # The investigative units.
673 #
674
675 katSimpleInstallUninstallMainLogReasons = [
676 # ( Whether to stop on hit, reason tuple, needle text. )
677 ( False, ktReason_Host_LeftoverService,
678 'SERVICE_NAME: vbox' ),
679 ];
680
681 kdatSimpleInstallUninstallMainLogReasonsPerOs = {
682 'darwin': [
683 # ( Whether to stop on hit, reason tuple, needle text. )
684 ( True, ktReason_Host_DriverNotUnloading,
685 'Can\'t remove kext org.virtualbox.kext.VBoxDrv; services failed to terminate - 0xe00002c7' ),
686 ],
687 'linux': [
688 # ( Whether to stop on hit, reason tuple, needle text. )
689 ( True, ktReason_Host_DriverNotCompilable,
690 'This system is not currently set up to build kernel modules' ),
691 ( True, ktReason_Host_DriverNotCompilable,
692 'This system is currently not set up to build kernel modules' ),
693 ( True, ktReason_Host_InstallationFailed,
694 'vboxdrv.sh: failed: Look at /var/log/vbox-install.log to find out what went wrong.' ),
695 ( True, ktReason_Host_DriverNotUnloading,
696 'Cannot unload module vboxdrv'),
697 ],
698 };
699
700
701 def investigateInstallUninstallFailure(self, oCaseFile, oFailedResult, sResultLog, fInstall):
702 """
703 Investigates an install or uninstall failure.
704
705 We lump the two together since the installation typically also performs
706 an uninstall first and will be seeing similar issues to the uninstall.
707 """
708
709 if fInstall and oFailedResult.enmStatus == TestSetData.ksTestStatus_TimedOut:
710 oCaseFile.noteReasonForId(self.ktReason_Host_Install_Hang, oFailedResult.idTestResult)
711 return True;
712
713 atSimple = self.katSimpleInstallUninstallMainLogReasons;
714 if oCaseFile.oTestBox.sOs in self.kdatSimpleInstallUninstallMainLogReasonsPerOs:
715 atSimple = self.kdatSimpleInstallUninstallMainLogReasonsPerOs[oCaseFile.oTestBox.sOs] + atSimple;
716
717 fFoundSomething = False;
718 for fStopOnHit, tReason, sNeedle in atSimple:
719 if sResultLog.find(sNeedle) > 0:
720 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
721 if fStopOnHit:
722 return True;
723 fFoundSomething = True;
724
725 return fFoundSomething if fFoundSomething else None;
726
727
728 def investigateBadTestBox(self, oCaseFile):
729 """
730 Checks out bad-testbox statuses.
731 """
732 _ = oCaseFile;
733 return False;
734
735
736 def investigateVBoxUnitTest(self, oCaseFile):
737 """
738 Checks out a VBox unittest problem.
739 """
740
741 #
742 # Process simple test case failures first, using their name as reason.
743 # We do the reason management just like for BSODs.
744 #
745 cRelevantOnes = 0;
746 sMainLog = oCaseFile.getMainLog();
747 aoFailedResults = oCaseFile.oTree.getListOfFailures();
748 for oFailedResult in aoFailedResults:
749 if oFailedResult is oCaseFile.oTree:
750 self.vprint('TODO: toplevel failure');
751 cRelevantOnes += 1
752
753 elif oFailedResult.sName == 'Installing VirtualBox':
754 sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
755 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = True)
756 cRelevantOnes += 1
757
758 elif oFailedResult.sName == 'Uninstalling VirtualBox':
759 sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
760 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = False)
761 cRelevantOnes += 1
762
763 elif oFailedResult.oParent is not None:
764 # Get the 2nd level node because that's where we'll find the unit test name.
765 while oFailedResult.oParent.oParent is not None:
766 oFailedResult = oFailedResult.oParent;
767
768 # Only report a failure once.
769 if oFailedResult.idTestResult not in oCaseFile.dReasonForResultId:
770 sKey = oFailedResult.sName;
771 if sKey.startswith('testcase/'):
772 sKey = sKey[9:];
773 if sKey in self.asUnitTestReasons:
774 tReason = ( self.ksUnitTestCategory, sKey );
775 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
776 else:
777 self.dprint(u'Unit test failure "%s" not found in %s;' % (sKey, self.asUnitTestReasons));
778 tReason = ( self.ksUnitTestCategory, self.ksUnitTestAddNew );
779 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult, sComment = sKey);
780 cRelevantOnes += 1
781 else:
782 self.vprint(u'Internal error: expected oParent to NOT be None for %s' % (oFailedResult,));
783
784 #
785 # If we've caught all the relevant ones by now, report the result.
786 #
787 if len(oCaseFile.dReasonForResultId) >= cRelevantOnes:
788 return self.caseClosed(oCaseFile);
789 return False;
790
791 def extractGuestCpuStack(self, sInfoText):
792 """
793 Extracts the guest CPU stacks from the input file.
794
795 Returns a dictionary keyed by the CPU number, value being a list of
796 raw stack lines (no header).
797 Returns empty dictionary if no stacks where found.
798 """
799 dRet = {};
800 off = 0;
801 while True:
802 # Find the stack.
803 offStart = sInfoText.find('=== start guest stack VCPU ', off);
804 if offStart < 0:
805 break;
806 offEnd = sInfoText.find('=== end guest stack', offStart + 20);
807 if offEnd >= 0:
808 offEnd += 3;
809 else:
810 offEnd = sInfoText.find('=== start guest stack VCPU', offStart + 20);
811 if offEnd < 0:
812 offEnd = len(sInfoText);
813
814 sStack = sInfoText[offStart : offEnd];
815 sStack = sStack.replace('\r',''); # paranoia
816 asLines = sStack.split('\n');
817
818 # Figure the CPU.
819 asWords = asLines[0].split();
820 if len(asWords) < 6 or not asWords[5].isdigit():
821 break;
822 iCpu = int(asWords[5]);
823
824 # Add it and advance.
825 dRet[iCpu] = [sLine.rstrip() for sLine in asLines[2:-1]]
826 off = offEnd;
827 return dRet;
828
829 def investigateInfoKvmLockSpinning(self, oCaseFile, sInfoText, dLogs):
830 """ Investigates kvm_lock_spinning deadlocks """
831 #
832 # Extract the stacks. We need more than one CPU to create a deadlock.
833 #
834 dStacks = self.extractGuestCpuStack(sInfoText);
835 self.dprint('kvm_lock_spinning: found %s stacks' % (len(dStacks),));
836 if len(dStacks) >= 2:
837 #
838 # Examin each of the stacks. Each must have kvm_lock_spinning in
839 # one of the first three entries.
840 #
841 cHits = 0;
842 for iCpu in dStacks:
843 asBacktrace = dStacks[iCpu];
844 for iFrame in xrange(min(3, len(asBacktrace))):
845 if asBacktrace[iFrame].find('kvm_lock_spinning') >= 0:
846 cHits += 1;
847 break;
848 self.dprint('kvm_lock_spinning: %s/%s hits' % (cHits, len(dStacks),));
849 if cHits == len(dStacks):
850 return (True, self.ktReason_VMM_kvm_lock_spinning);
851
852 _ = dLogs; _ = oCaseFile;
853 return (False, None);
854
855 def investigateInfoHalReturnToFirmware(self, oCaseFile, sInfoText, dLogs):
856 """ Investigates HalReturnToFirmware hangs """
857 del oCaseFile
858 del sInfoText
859 del dLogs
860 # hope that's sufficient
861 return (True, self.ktReason_Unknown_HalReturnToFirmware);
862
863 ## Things we search a main or VM log for to figure out why something went bust.
864 katSimpleMainAndVmLogReasons = [
865 # ( Whether to stop on hit, reason tuple, needle text. )
866 ( False, ktReason_Guru_Generic, 'GuruMeditation' ),
867 ( False, ktReason_Guru_Generic, 'Guru Meditation' ),
868 ( True, ktReason_Guru_VERR_IEM_INSTR_NOT_IMPLEMENTED, 'VERR_IEM_INSTR_NOT_IMPLEMENTED' ),
869 ( True, ktReason_Guru_VERR_IEM_ASPECT_NOT_IMPLEMENTED, 'VERR_IEM_ASPECT_NOT_IMPLEMENTED' ),
870 ( True, ktReason_Guru_VERR_TRPM_DONT_PANIC, 'VERR_TRPM_DONT_PANIC' ),
871 ( True, ktReason_Guru_VERR_PGM_PHYS_PAGE_RESERVED, 'VERR_PGM_PHYS_PAGE_RESERVED' ),
872 ( True, ktReason_Guru_VERR_VMX_INVALID_GUEST_STATE, 'VERR_VMX_INVALID_GUEST_STATE' ),
873 ( True, ktReason_Guru_VINF_EM_TRIPLE_FAULT, 'VINF_EM_TRIPLE_FAULT' ),
874 ( True, ktReason_Networking_Nonexistent_host_nic,
875 'rc=E_FAIL text="Nonexistent host networking interface, name \'eth0\' (VERR_INTERNAL_ERROR)"' ),
876 ( True, ktReason_Host_Reboot_OSX_Watchdog_Timeout, ': "OSX Watchdog Timeout: ' ),
877 ( False, ktReason_XPCOM_NS_ERROR_CALL_FAILED,
878 'Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))' ),
879 ( True, ktReason_Host_HostMemoryLow, 'HostMemoryLow' ),
880 ( True, ktReason_Host_HostMemoryLow, 'Failed to procure handy pages; rc=VERR_NO_MEMORY' ),
881 ( True, ktReason_Unknown_File_Not_Found,
882 'Error: failed to start machine. Error message: File not found. (VERR_FILE_NOT_FOUND)' ),
883 ( True, ktReason_Unknown_File_Not_Found, # lump it in with file-not-found for now.
884 'Error: failed to start machine. Error message: Not supported. (VERR_NOT_SUPPORTED)' ),
885 ( False, ktReason_Unknown_VM_Crash, 'txsDoConnectViaTcp: Machine state: Aborted' ),
886 ( True, ktReason_Host_Modprobe_Failed, 'Kernel driver not installed' ),
887 ( True, ktReason_OSInstall_Sata_no_BM, 'PCHS=14128/14134/8224' ),
888 ( True, ktReason_Host_DoubleFreeHeap, 'double free or corruption' ),
889 ( False, ktReason_Unknown_VM_Start_Error, 'VMSetError: ' ),
890 ( False, ktReason_Unknown_VM_Runtime_Error, 'Console: VM runtime error: fatal=true' ),
891 ];
892
893 ## Things we search a VBoxHardening.log file for to figure out why something went bust.
894 katSimpleVBoxHardeningLogReasons = [
895 # ( Whether to stop on hit, reason tuple, needle text. )
896 ( True, ktReason_Host_DriverNotLoaded, 'Error opening VBoxDrvStub: STATUS_OBJECT_NAME_NOT_FOUND' ),
897 ( True, ktReason_Host_NotSignedWithBuildCert, 'Not signed with the build certificate' ),
898 ];
899
900 ## Things we search a kernel.log file for to figure out why something went bust.
901 katSimpleKernelLogReasons = [
902 # ( Whether to stop on hit, reason tuple, needle text. )
903 ( True, ktReason_Panic_HugeMemory, 'mm/huge_memory.c:1988' ),
904 ( True, ktReason_Panic_IOAPICDoesntWork, 'IO-APIC + timer doesn''t work' ),
905 ( True, ktReason_Panic_TxUnitHang, 'Detected Tx Unit Hang' ),
906 ];
907
908 ## Things we search the _RIGHT_ _STRIPPED_ vgatext for.
909 katSimpleVgaTextReasons = [
910 # ( Whether to stop on hit, reason tuple, needle text. )
911 ( True, ktReason_Panic_MP_BIOS_IO_APIC,
912 "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n\n" ),
913 ( True, ktReason_Panic_MP_BIOS_IO_APIC,
914 "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"
915 "...trying to set up timer (IRQ0) through the 8259A ... failed.\n"
916 "...trying to set up timer as Virtual Wire IRQ... failed.\n"
917 "...trying to set up timer as ExtINT IRQ... failed :(.\n"
918 "Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug\n"
919 "and send a report. Then try booting with the 'noapic' option\n"
920 "\n" ),
921 ( True, ktReason_OSInstall_GRUB_hang,
922 "-----\nGRUB Loading stage2..\n\n\n\n" ),
923 ( True, ktReason_OSInstall_GRUB_hang,
924 "-----\nGRUB Loading stage2...\n\n\n\n" ), # the 3 dot hang appears to be less frequent
925 ( True, ktReason_OSInstall_GRUB_hang,
926 "-----\nGRUB Loading stage2....\n\n\n\n" ), # the 4 dot hang appears to be very infrequent
927 ( True, ktReason_OSInstall_GRUB_hang,
928 "-----\nGRUB Loading stage2.....\n\n\n\n" ), # the 5 dot hang appears to be more frequent again
929 ( True, ktReason_OSInstall_Udev_hang,
930 "\nStarting udev:\n\n\n\n" ),
931 ( True, ktReason_OSInstall_Udev_hang,
932 "\nStarting udev:\n------" ),
933 ( True, ktReason_Panic_BootManagerC000000F,
934 "Windows failed to start. A recent hardware or software change might be the" ),
935 ( True, ktReason_BootManager_Image_corrupt,
936 "BOOTMGR image is corrupt. The system cannot boot." ),
937 ];
938
939 ## Things we search for in the info.txt file. Require handlers for now.
940 katInfoTextHandlers = [
941 # ( Trigger text, handler method )
942 ( "kvm_lock_spinning", investigateInfoKvmLockSpinning ),
943 ( "HalReturnToFirmware", investigateInfoHalReturnToFirmware ),
944 ];
945
946 ## Mapping screenshot/failure SHA-256 hashes to failure reasons.
947 katSimpleScreenshotHashReasons = [
948 # ( Whether to stop on hit, reason tuple, lowercased sha-256 of PIL.Image.tostring output )
949 ( True, ktReason_BSOD_Recovery, '576f8e38d62b311cac7e3dc3436a0d0b9bd8cfd7fa9c43aafa95631520a45eac' ),
950 ( True, ktReason_BSOD_Automatic_Repair, 'c6a72076cc619937a7a39cfe9915b36d94cee0d4e3ce5ce061485792dcee2749' ),
951 ( True, ktReason_BSOD_Automatic_Repair, '26c4d8a724ff2c5e1051f3d5b650dbda7b5fdee0aa3e3c6059797f7484a515df' ),
952 ( True, ktReason_BSOD_0000007F, '57e1880619e13042a87100e7a38c8974b85ce3866501be621bea0cc696bb2c63' ),
953 ( True, ktReason_BSOD_000000D1, '134621281f00a3f8aeeb7660064bffbf6187ed56d5852142328d0bcb18ef0ede' ),
954 ( True, ktReason_BSOD_000000D1, '279f11258150c9d2fef041eca65501f3141da8df39256d8f6377e897e3b45a93' ),
955 ( True, ktReason_BSOD_C0000225, 'bd13a144be9dcdfb16bc863ff4c8f02a86e263c174f2cd5ffd27ca5f3aa31789' ),
956 ( True, ktReason_BSOD_C0000225, '8348b465e7ee9e59dd4e785880c57fd8677de05d11ac21e786bfde935307b42f' ),
957 ( True, ktReason_BSOD_C0000225, '1316e1fc818a73348412788e6910b8c016f237d8b4e15b20caf4a866f7a7840e' ),
958 ( True, ktReason_BSOD_C0000225, '54e0acbff365ce20a85abbe42bcd53647b8b9e80c68e45b2cd30e86bf177a0b5' ),
959 ( True, ktReason_BSOD_C0000225, '50fec50b5199923fa48b3f3e782687cc381e1c8a788ebda14e6a355fbe3bb1b3' ),
960 ];
961
962 def investigateVMResult(self, oCaseFile, oFailedResult, sResultLog):
963 """
964 Investigates a failed VM run.
965 """
966
967 def investigateLogSet():
968 """
969 Investigates the current set of VM related logs.
970 """
971 self.dprint('investigateLogSet: lengths: result log %u, VM log %u, kernel log %u, vga text %u, info text %u'
972 % ( len(sResultLog if sResultLog else ''),
973 len(sVMLog if sVMLog else ''),
974 len(sKrnlLog if sKrnlLog else ''),
975 len(sVgaText if sVgaText else ''),
976 len(sInfoText if sInfoText else ''), ));
977
978 #self.dprint(u'main.log<<<\n%s\n<<<\n' % (sResultLog,));
979 #self.dprint(u'vbox.log<<<\n%s\n<<<\n' % (sVMLog,));
980 #self.dprint(u'krnl.log<<<\n%s\n<<<\n' % (sKrnlLog,));
981 #self.dprint(u'vgatext.txt<<<\n%s\n<<<\n' % (sVgaText,));
982 #self.dprint(u'info.txt<<<\n%s\n<<<\n' % (sInfoText,));
983
984 # TODO: more
985
986 #
987 # Look for BSODs. Some stupid stupid inconsistencies in reason and log messages here, so don't try prettify this.
988 #
989 sDetails = self.findInAnyAndReturnRestOfLine([ sVMLog, sResultLog ],
990 'GIM: HyperV: Guest indicates a fatal condition! P0=');
991 if sDetails is not None:
992 # P0=%#RX64 P1=%#RX64 P2=%#RX64 P3=%#RX64 P4=%#RX64 "
993 sKey = sDetails.split(' ', 1)[0];
994 try: sKey = '0x%08X' % (int(sKey, 16),);
995 except: pass;
996 if sKey in self.asBsodReasons:
997 tReason = ( self.ksBsodCategory, sKey );
998 elif sKey.lower() in self.asBsodReasons: # just in case.
999 tReason = ( self.ksBsodCategory, sKey.lower() );
1000 else:
1001 self.dprint(u'BSOD "%s" not found in %s;' % (sKey, self.asBsodReasons));
1002 tReason = ( self.ksBsodCategory, self.ksBsodAddNew );
1003 return oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult, sComment = sDetails.strip());
1004
1005 #
1006 # Look for linux panic.
1007 #
1008 if sKrnlLog is not None:
1009 for fStopOnHit, tReason, sNeedle in self.katSimpleKernelLogReasons:
1010 if sKrnlLog.find(sNeedle) > 0:
1011 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1012 if fStopOnHit:
1013 return True;
1014 fFoundSomething = True;
1015
1016 #
1017 # Loop thru the simple stuff.
1018 #
1019 fFoundSomething = False;
1020 for fStopOnHit, tReason, sNeedle in self.katSimpleMainAndVmLogReasons:
1021 if sResultLog.find(sNeedle) > 0 or (sVMLog is not None and sVMLog.find(sNeedle) > 0):
1022 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1023 if fStopOnHit:
1024 return True;
1025 fFoundSomething = True;
1026
1027 # Continue with vga text.
1028 if sVgaText:
1029 for fStopOnHit, tReason, sNeedle in self.katSimpleVgaTextReasons:
1030 if sVgaText.find(sNeedle) > 0:
1031 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1032 if fStopOnHit:
1033 return True;
1034 fFoundSomething = True;
1035 _ = sInfoText;
1036
1037 # Continue with screen hashes.
1038 if sScreenHash is not None:
1039 for fStopOnHit, tReason, sHash in self.katSimpleScreenshotHashReasons:
1040 if sScreenHash == sHash:
1041 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1042 if fStopOnHit:
1043 return True;
1044 fFoundSomething = True;
1045
1046 # Check VBoxHardening.log.
1047 if sNtHardLog is not None:
1048 for fStopOnHit, tReason, sNeedle in self.katSimpleVBoxHardeningLogReasons:
1049 if sNtHardLog.find(sNeedle) > 0:
1050 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1051 if fStopOnHit:
1052 return True;
1053 fFoundSomething = True;
1054
1055 #
1056 # Complicated stuff.
1057 #
1058 dLogs = {
1059 'sVMLog': sVMLog,
1060 'sNtHardLog': sNtHardLog,
1061 'sScreenHash': sScreenHash,
1062 'sKrnlLog': sKrnlLog,
1063 'sVgaText': sVgaText,
1064 'sInfoText': sInfoText,
1065 };
1066
1067 # info.txt.
1068 if sInfoText:
1069 for sNeedle, fnHandler in self.katInfoTextHandlers:
1070 if sInfoText.find(sNeedle) > 0:
1071 (fStop, tReason) = fnHandler(self, oCaseFile, sInfoText, dLogs);
1072 if tReason is not None:
1073 oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
1074 if fStop:
1075 return True;
1076 fFoundSomething = True;
1077
1078 #
1079 # Check for repeated reboots...
1080 #
1081 if sVMLog is not None:
1082 cResets = sVMLog.count('Changing the VM state from \'RUNNING\' to \'RESETTING\'');
1083 if cResets > 10:
1084 return oCaseFile.noteReasonForId(self.ktReason_Unknown_Reboot_Loop, oFailedResult.idTestResult,
1085 sComment = 'Counted %s reboots' % (cResets,));
1086
1087 return fFoundSomething;
1088
1089 #
1090 # Check if we got any VM or/and kernel logs. Treat them as sets in
1091 # case we run multiple VMs here (this is of course ASSUMING they
1092 # appear in the order that terminateVmBySession uploads them).
1093 #
1094 cTimes = 0;
1095 sVMLog = None;
1096 sNtHardLog = None;
1097 sScreenHash = None;
1098 sKrnlLog = None;
1099 sVgaText = None;
1100 sInfoText = None;
1101 for oFile in oFailedResult.aoFiles:
1102 if oFile.sKind == TestResultFileData.ksKind_LogReleaseVm:
1103 if 'VBoxHardening.log' not in oFile.sFile:
1104 if sVMLog is not None:
1105 if investigateLogSet() is True:
1106 return True;
1107 cTimes += 1;
1108 sInfoText = None;
1109 sVgaText = None;
1110 sKrnlLog = None;
1111 sScreenHash = None;
1112 sNtHardLog = None;
1113 sVMLog = oCaseFile.getLogFile(oFile);
1114 else:
1115 sNtHardLog = oCaseFile.getLogFile(oFile);
1116 elif oFile.sKind == TestResultFileData.ksKind_LogGuestKernel:
1117 sKrnlLog = oCaseFile.getLogFile(oFile);
1118 elif oFile.sKind == TestResultFileData.ksKind_InfoVgaText:
1119 sVgaText = '\n'.join([sLine.rstrip() for sLine in oCaseFile.getLogFile(oFile).split('\n')]);
1120 elif oFile.sKind == TestResultFileData.ksKind_InfoCollection:
1121 sInfoText = oCaseFile.getLogFile(oFile);
1122 elif oFile.sKind == TestResultFileData.ksKind_ScreenshotFailure:
1123 sScreenHash = oCaseFile.getScreenshotSha256(oFile);
1124 if sScreenHash is not None:
1125 sScreenHash = sScreenHash.lower();
1126 self.vprint(u'%s %s' % ( sScreenHash, oFile.sFile,));
1127
1128 if ( sVMLog is not None \
1129 or sNtHardLog is not None \
1130 or cTimes == 0) \
1131 and investigateLogSet() is True:
1132 return True;
1133
1134 return None;
1135
1136
1137 def isResultFromVMRun(self, oFailedResult, sResultLog):
1138 """
1139 Checks if this result and corresponding log snippet looks like a VM run.
1140 """
1141
1142 # Look for startVmEx/ startVmAndConnectToTxsViaTcp and similar output in the log.
1143 if sResultLog.find(' startVm') > 0:
1144 return True;
1145
1146 # Any other indicators? No?
1147 _ = oFailedResult;
1148 return False;
1149
1150 def investigateVBoxVMTest(self, oCaseFile, fSingleVM):
1151 """
1152 Checks out a VBox VM test.
1153
1154 This is generic investigation of a test running one or more VMs, like
1155 for example a smoke test or a guest installation test.
1156
1157 The fSingleVM parameter is a hint, which probably won't come in useful.
1158 """
1159 _ = fSingleVM;
1160
1161 #
1162 # Get a list of test result failures we should be looking into and the main log.
1163 #
1164 aoFailedResults = oCaseFile.oTree.getListOfFailures();
1165 sMainLog = oCaseFile.getMainLog();
1166
1167 #
1168 # There are a set of errors ending up on the top level result record.
1169 # Should deal with these first.
1170 #
1171 if len(aoFailedResults) == 1 and aoFailedResults[0] == oCaseFile.oTree:
1172 # Check if we've just got that XPCOM client smoke test shutdown issue. This will currently always
1173 # be reported on the top result because vboxinstall.py doesn't add an error for it. It is easy to
1174 # ignore other failures in the test if we're not a little bit careful here.
1175 if sMainLog.find('vboxinstaller: Exit code: -11 (') > 0:
1176 oCaseFile.noteReason(self.ktReason_XPCOM_Exit_Minus_11);
1177 return self.caseClosed(oCaseFile);
1178
1179 # Hang after starting VBoxSVC (e.g. idTestSet=136307258)
1180 if self.isThisFollowedByTheseLines(sMainLog, 'oVBoxMgr=<vboxapi.VirtualBoxManager object at',
1181 (' Timeout: ', ' Attempting to abort child...',) ):
1182 if sMainLog.find('*** glibc detected *** /') > 0:
1183 oCaseFile.noteReason(self.ktReason_XPCOM_VBoxSVC_Hang_Plus_Heap_Corruption);
1184 else:
1185 oCaseFile.noteReason(self.ktReason_XPCOM_VBoxSVC_Hang);
1186 return self.caseClosed(oCaseFile);
1187
1188 # Look for heap corruption without visible hang.
1189 if sMainLog.find('*** glibc detected *** /') > 0 \
1190 or sMainLog.find("-1073740940") > 0: # STATUS_HEAP_CORRUPTION / 0xc0000374
1191 oCaseFile.noteReason(self.ktReason_Unknown_Heap_Corruption);
1192 return self.caseClosed(oCaseFile);
1193
1194 # Out of memory w/ timeout.
1195 if sMainLog.find('sErrId=HostMemoryLow') > 0:
1196 oCaseFile.noteReason(self.ktReason_Host_HostMemoryLow);
1197 return self.caseClosed(oCaseFile);
1198
1199 # Stale files like vts_rm.exe (windows).
1200 offEnd = sMainLog.rfind('*** The test driver exits successfully. ***');
1201 if offEnd > 0 and sMainLog.find('[Error 145] The directory is not empty: ', offEnd) > 0:
1202 oCaseFile.noteReason(self.ktReason_Ignore_Stale_Files);
1203 return self.caseClosed(oCaseFile);
1204
1205 #
1206 # XPCOM screwup
1207 #
1208 if sMainLog.find('AttributeError: \'NoneType\' object has no attribute \'addObserver\'') > 0:
1209 oCaseFile.noteReason(self.ktReason_Buggy_Build_Broken_Build);
1210 return self.caseClosed(oCaseFile);
1211
1212 #
1213 # Go thru each failed result.
1214 #
1215 for oFailedResult in aoFailedResults:
1216 self.dprint(u'Looking at test result #%u - %s' % (oFailedResult.idTestResult, oFailedResult.getFullName(),));
1217 sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
1218 if oFailedResult.sName == 'Installing VirtualBox':
1219 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = True)
1220
1221 elif oFailedResult.sName == 'Uninstalling VirtualBox':
1222 self.investigateInstallUninstallFailure(oCaseFile, oFailedResult, sResultLog, fInstall = False)
1223
1224 elif self.isResultFromVMRun(oFailedResult, sResultLog):
1225 self.investigateVMResult(oCaseFile, oFailedResult, sResultLog);
1226
1227 elif sResultLog.find('most likely not unique') > 0:
1228 oCaseFile.noteReasonForId(self.ktReason_Host_NetworkMisconfiguration, oFailedResult.idTestResult)
1229 elif sResultLog.find('Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))') > 0:
1230 oCaseFile.noteReasonForId(self.ktReason_XPCOM_NS_ERROR_CALL_FAILED, oFailedResult.idTestResult);
1231
1232 elif sResultLog.find('The machine is not mutable (state is ') > 0:
1233 self.vprint('Ignoring "machine not mutable" error as it is probably due to an earlier problem');
1234 oCaseFile.noteReasonForId(self.ktHarmless, oFailedResult.idTestResult);
1235
1236 elif sResultLog.find('** error: no action was specified') > 0 \
1237 or sResultLog.find('(len(self._asXml, asText))') > 0:
1238 oCaseFile.noteReasonForId(self.ktReason_Ignore_Buggy_Test_Driver, oFailedResult.idTestResult);
1239
1240 else:
1241 self.vprint(u'TODO: Cannot place idTestResult=%u - %s' % (oFailedResult.idTestResult, oFailedResult.sName,));
1242 self.dprint(u'%s + %s <<\n%s\n<<' % (oFailedResult.tsCreated, oFailedResult.tsElapsed, sResultLog,));
1243
1244 #
1245 # Report home and close the case if we got them all, otherwise log it.
1246 #
1247 if len(oCaseFile.dReasonForResultId) >= len(aoFailedResults):
1248 return self.caseClosed(oCaseFile);
1249
1250 if oCaseFile.dReasonForResultId:
1251 self.vprint(u'TODO: Got %u out of %u - close, but no cigar. :-/'
1252 % (len(oCaseFile.dReasonForResultId), len(aoFailedResults)));
1253 else:
1254 self.vprint(u'XXX: Could not figure out anything at all! :-(');
1255 return False;
1256
1257
1258 def reasoningFailures(self):
1259 """
1260 Guess the reason for failures.
1261 """
1262 #
1263 # Get a list of failed test sets without any assigned failure reason.
1264 #
1265 cGot = 0;
1266 aoTestSets = self.oTestSetLogic.fetchFailedSetsWithoutReason(cHoursBack = self.oConfig.cHoursBack, tsNow = self.tsNow);
1267 for oTestSet in aoTestSets:
1268 self.dprint(u'');
1269 self.dprint(u'reasoningFailures: Checking out test set #%u, status %s' % ( oTestSet.idTestSet, oTestSet.enmStatus,))
1270
1271 #
1272 # Open a case file and assign it to the right investigator.
1273 #
1274 (oTree, _ ) = self.oTestResultLogic.fetchResultTree(oTestSet.idTestSet);
1275 oBuild = BuildDataEx().initFromDbWithId( self.oDb, oTestSet.idBuild, oTestSet.tsCreated);
1276 oTestBox = TestBoxData().initFromDbWithGenId( self.oDb, oTestSet.idGenTestBox);
1277 oTestGroup = TestGroupData().initFromDbWithId( self.oDb, oTestSet.idTestGroup, oTestSet.tsCreated);
1278 oTestCase = TestCaseDataEx().initFromDbWithGenId( self.oDb, oTestSet.idGenTestCase, oTestSet.tsConfig);
1279
1280 oCaseFile = VirtualTestSheriffCaseFile(self, oTestSet, oTree, oBuild, oTestBox, oTestGroup, oTestCase);
1281
1282 if oTestSet.enmStatus == TestSetData.ksTestStatus_BadTestBox:
1283 self.dprint(u'investigateBadTestBox is taking over %s.' % (oCaseFile.sLongName,));
1284 fRc = self.investigateBadTestBox(oCaseFile);
1285
1286 elif oCaseFile.isVBoxUnitTest():
1287 self.dprint(u'investigateVBoxUnitTest is taking over %s.' % (oCaseFile.sLongName,));
1288 fRc = self.investigateVBoxUnitTest(oCaseFile);
1289
1290 elif oCaseFile.isVBoxInstallTest():
1291 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1292 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1293
1294 elif oCaseFile.isVBoxUSBTest():
1295 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1296 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1297
1298 elif oCaseFile.isVBoxStorageTest():
1299 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1300 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1301
1302 elif oCaseFile.isVBoxGAsTest():
1303 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1304 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1305
1306 elif oCaseFile.isVBoxAPITest():
1307 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1308 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
1309
1310 elif oCaseFile.isVBoxBenchmarkTest():
1311 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1312 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = False);
1313
1314 elif oCaseFile.isVBoxSmokeTest():
1315 self.dprint(u'investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
1316 fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = False);
1317
1318 else:
1319 self.vprint(u'reasoningFailures: Unable to classify test set: %s' % (oCaseFile.sLongName,));
1320 fRc = False;
1321 cGot += fRc is True;
1322
1323 self.vprint(u'reasoningFailures: Got %u out of %u' % (cGot, len(aoTestSets), ));
1324 return 0;
1325
1326
1327 def main(self):
1328 """
1329 The 'main' function.
1330 Return exit code (0, 1, etc).
1331 """
1332 # Database stuff.
1333 self.oDb = TMDatabaseConnection()
1334 self.oTestResultLogic = TestResultLogic(self.oDb);
1335 self.oTestSetLogic = TestSetLogic(self.oDb);
1336 self.oFailureReasonLogic = FailureReasonLogic(self.oDb);
1337 self.oTestResultFailureLogic = TestResultFailureLogic(self.oDb);
1338 self.asBsodReasons = self.oFailureReasonLogic.fetchForSheriffByNamedCategory(self.ksBsodCategory);
1339 self.asUnitTestReasons = self.oFailureReasonLogic.fetchForSheriffByNamedCategory(self.ksUnitTestCategory);
1340
1341 # Get a fix on our 'now' before we do anything..
1342 self.oDb.execute('SELECT CURRENT_TIMESTAMP - interval \'%s hours\'', (self.oConfig.cStartHoursAgo,));
1343 self.tsNow = self.oDb.fetchOne();
1344
1345 # If we're suppost to commit anything we need to get our user ID.
1346 rcExit = 0;
1347 if self.oConfig.fRealRun:
1348 self.oLogin = UserAccountLogic(self.oDb).tryFetchAccountByLoginName(VirtualTestSheriff.ksLoginName);
1349 if self.oLogin is None:
1350 rcExit = self.eprint('Cannot find my user account "%s"!' % (VirtualTestSheriff.ksLoginName,));
1351 else:
1352 self.uidSelf = self.oLogin.uid;
1353
1354 #
1355 # Do the stuff.
1356 #
1357 if rcExit == 0:
1358 rcExit = self.selfCheck();
1359 if rcExit == 0:
1360 rcExit = self.badTestBoxManagement();
1361 rcExit2 = self.reasoningFailures();
1362 if rcExit == 0:
1363 rcExit = rcExit2;
1364 # Redo the bad testbox management after failure reasons have been assigned (got timing issues).
1365 if rcExit == 0:
1366 rcExit = self.badTestBoxManagement();
1367
1368 # Cleanup.
1369 self.oFailureReasonLogic = None;
1370 self.oTestResultFailureLogic = None;
1371 self.oTestSetLogic = None;
1372 self.oTestResultLogic = None;
1373 self.oDb.close();
1374 self.oDb = None;
1375 if self.oLogFile is not None:
1376 self.oLogFile.close();
1377 self.oLogFile = None;
1378 return rcExit;
1379
1380if __name__ == '__main__':
1381 sys.exit(VirtualTestSheriff().main());
1382
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette