virtual_test_sheriff.py@ 61476

Last change on this file since 61476 was 61460, checked in by vboxsync, 9 years ago
investigateVBoxUnitTest: bugfix for nested failures (we only care about unit test level).
Property svn:eol-style set to `LF` Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 40.1 KB

Line
1	#!/usr/bin/env python
2	# -- coding: utf-8 --
3	# $Id: virtual_test_sheriff.py 61460 2016-06-04 00:19:16Z vboxsync $
4	# pylint: disable=C0301
5
6	"""
7	Virtual Test Sheriff.
8
9	Duties:
10	- Try to a assign failure reasons to recently failed tests.
11	- Reboot or disable bad test boxes.
12
13	"""
14
15	__copyright__ = \
16	"""
17	Copyright (C) 2012-2016 Oracle Corporation
18
19	This file is part of VirtualBox Open Source Edition (OSE), as
20	available from http://www.virtualbox.org. This file is free software;
21	you can redistribute it and/or modify it under the terms of the GNU
22	General Public License (GPL) as published by the Free Software
23	Foundation, in version 2 as it comes in the "COPYING" file of the
24	VirtualBox OSE distribution. VirtualBox OSE is distributed in the
25	hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
26
27	The contents of this file may alternatively be used under the terms
28	of the Common Development and Distribution License Version 1.0
29	(CDDL) only, as it comes in the "COPYING.CDDL" file of the
30	VirtualBox OSE distribution, in which case the provisions of the
31	CDDL are applicable instead of those of the GPL.
32
33	You may elect to license modified versions of this file under the
34	terms and conditions of either the GPL or the CDDL or both.
35	"""
36	__version__ = "$Revision: 61460 $"
37
38
39	# Standard python imports
40	import sys;
41	import os;
42	from optparse import OptionParser;
43
44	# Add Test Manager's modules path
45	g_ksTestManagerDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))));
46	sys.path.append(g_ksTestManagerDir);
47
48	# Test Manager imports
49	from testmanager.core.db import TMDatabaseConnection;
50	from testmanager.core.build import BuildDataEx;
51	from testmanager.core.failurereason import FailureReasonLogic;
52	from testmanager.core.testbox import TestBoxLogic, TestBoxData;
53	from testmanager.core.testcase import TestCaseDataEx;
54	from testmanager.core.testgroup import TestGroupData;
55	from testmanager.core.testset import TestSetLogic, TestSetData;
56	from testmanager.core.testresults import TestResultLogic, TestResultFileData;
57	from testmanager.core.testresultfailures import TestResultFailureLogic, TestResultFailureData;
58	from testmanager.core.useraccount import UserAccountLogic;
59
60
61	class VirtualTestSheriffCaseFile(object):
62	"""
63	A failure investigation case file.
64
65	"""
66
67
68	## Max log file we'll read into memory. (256 MB)
69	kcbMaxLogRead = 0x10000000;
70
71	def __init__(self, oSheriff, oTestSet, oTree, oBuild, oTestBox, oTestGroup, oTestCase):
72	self.oSheriff = oSheriff;
73	self.oTestSet = oTestSet; # TestSetData
74	self.oTree = oTree; # TestResultDataEx
75	self.oBuild = oBuild; # BuildDataEx
76	self.oTestBox = oTestBox; # TestBoxData
77	self.oTestGroup = oTestGroup; # TestGroupData
78	self.oTestCase = oTestCase; # TestCaseDataEx
79	self.sMainLog = ''; # The main log file. Empty string if not accessible.
80
81	# Generate a case file name.
82	self.sName = '#%u: %s' % (self.oTestSet.idTestSet, self.oTestCase.sName,)
83	self.sLongName = '#%u: "%s" on "%s" running %s %s (%s), "%s" by %s, using %s %s %s r%u' \
84	% ( self.oTestSet.idTestSet,
85	self.oTestCase.sName,
86	self.oTestBox.sName,
87	self.oTestBox.sOs,
88	self.oTestBox.sOsVersion,
89	self.oTestBox.sCpuArch,
90	self.oTestBox.sCpuName,
91	self.oTestBox.sCpuVendor,
92	self.oBuild.oCat.sProduct,
93	self.oBuild.oCat.sBranch,
94	self.oBuild.oCat.sType,
95	self.oBuild.iRevision, );
96
97	# Investigation notes.
98	self.tReason = None; # None or one of the ktReason_XXX constants.
99	self.dReasonForResultId = {}; # Reason assignments indexed by idTestResult.
100	self.dCommentForResultId = {}; # Comment assignments indexed by idTestResult.
101
102	#
103	# Reason.
104	#
105
106	def noteReason(self, tReason):
107	""" Notes down a possible reason. """
108	self.oSheriff.dprint('noteReason: %s -> %s' % (self.tReason, tReason,));
109	self.tReason = tReason;
110	return True;
111
112	def noteReasonForId(self, tReason, idTestResult, sComment = None):
113	""" Notes down a possible reason for a specific test result. """
114	self.oSheriff.dprint('noteReasonForId: %u: %s -> %s%s'
115	% (idTestResult, self.dReasonForResultId.get(idTestResult, None), tReason,
116	(' (%s)' % (sComment,)) if sComment is not None else ''));
117	self.dReasonForResultId[idTestResult] = tReason;
118	if sComment is not None:
119	self.dCommentForResultId[idTestResult] = sComment;
120	return True;
121
122
123	#
124	# Test classification.
125	#
126
127	def isVBoxTest(self):
128	""" Test classification: VirtualBox (using the build) """
129	return self.oBuild.oCat.sProduct.lower() in [ 'virtualbox', 'vbox' ];
130
131	def isVBoxUnitTest(self):
132	""" Test case classification: The unit test doing all our testcase/*.cpp stuff. """
133	return self.isVBoxTest() \
134	and self.oTestCase.sName.lower() == 'unit tests';
135
136	def isVBoxInstallTest(self):
137	""" Test case classification: VirtualBox Guest installation test. """
138	return self.isVBoxTest() \
139	and self.oTestCase.sName.lower().startswith('install:');
140
141	def isVBoxSmokeTest(self):
142	""" Test case classification: Smoke test. """
143	return self.isVBoxTest() \
144	and self.oTestCase.sName.lower().startswith('smoketest');
145
146
147	#
148	# Utility methods.
149	#
150
151	def getMainLog(self):
152	"""
153	Tries to reads the main log file since this will be the first source of information.
154	"""
155	if len(self.sMainLog) > 0:
156	return self.sMainLog;
157	(oFile, oSizeOrError, _) = self.oTestSet.openFile('main.log', 'rb');
158	if oFile is not None:
159	try:
160	self.sMainLog = oFile.read(min(self.kcbMaxLogRead, oSizeOrError)).decode('utf-8', 'replace');
161	except Exception as oXcpt:
162	self.oSheriff.vprint('Error reading main log file: %s' % (oXcpt,))
163	self.sMainLog = '';
164	else:
165	self.oSheriff.vprint('Error opening main log file: %s' % (oSizeOrError,));
166	return self.sMainLog;
167
168	def getLogFile(self, oFile):
169	"""
170	Tries to reads the given file as a utf-8 log file.
171	oFile is a TestFileDataEx instance.
172	Returns empty string if problems opening or reading the file.
173	"""
174	sContent = '';
175	(oFile, oSizeOrError, _) = self.oTestSet.openFile(oFile.sFile, 'rb');
176	if oFile is not None:
177	try:
178	sContent = oFile.read(min(self.kcbMaxLogRead, oSizeOrError)).decode('utf-8', 'replace');
179	except Exception as oXcpt:
180	self.oSheriff.vprint('Error reading the "%s" log file: %s' % (oFile.sFile, oXcpt,))
181	else:
182	self.oSheriff.vprint('Error opening the "%s" log file: %s' % (oFile.sFile, oSizeOrError,));
183	return sContent;
184
185
186	def isSingleTestFailure(self):
187	"""
188	Figure out if this is a single test failing or if it's one of the
189	more complicated ones.
190	"""
191	if self.oTree.cErrors == 1:
192	return True;
193	if self.oTree.deepCountErrorContributers() <= 1:
194	return True;
195	return False;
196
197
198
199	class VirtualTestSheriff(object): # pylint: disable=R0903
200	"""
201	Add build info into Test Manager database.
202	"""
203
204	## The user account for the virtual sheriff.
205	ksLoginName = 'vsheriff';
206
207	def __init__(self):
208	"""
209	Parse command line.
210	"""
211	self.oDb = None;
212	self.tsNow = None;
213	self.oTestResultLogic = None;
214	self.oTestSetLogic = None;
215	self.oFailureReasonLogic = None; # FailureReasonLogic;
216	self.oTestResultFailureLogic = None; # TestResultFailureLogic
217	self.oLogin = None;
218	self.uidSelf = -1;
219	self.oLogFile = None;
220	self.asBsodReasons = [];
221	self.asUnitTestReasons = [];
222
223	oParser = OptionParser();
224	oParser.add_option('--start-hours-ago', dest = 'cStartHoursAgo', metavar = '<hours>', default = 0, type = 'int',
225	help = 'When to start specified as hours relative to current time. Defauls is right now.', );
226	oParser.add_option('--hours-period', dest = 'cHoursBack', metavar = '<period-in-hours>', default = 2, type = 'int',
227	help = 'Work period specified in hours. Defauls is 2 hours.');
228	oParser.add_option('--real-run-back', dest = 'fRealRun', action = 'store_true', default = False,
229	help = 'Whether to commit the findings to the database. Default is a dry run.');
230	oParser.add_option('-q', '--quiet', dest = 'fQuiet', action = 'store_true', default = False,
231	help = 'Quiet execution');
232	oParser.add_option('-l', '--log', dest = 'sLogFile', metavar = '<logfile>', default = None,
233	help = 'Where to log messages.');
234	oParser.add_option('--debug', dest = 'fDebug', action = 'store_true', default = False,
235	help = 'Enables debug mode.');
236
237	(self.oConfig, _) = oParser.parse_args();
238
239	if self.oConfig.sLogFile is not None and len(self.oConfig.sLogFile) > 0:
240	self.oLogFile = open(self.oConfig.sLogFile, "a");
241	self.oLogFile.write('VirtualTestSheriff: $Revision: 61460 $ \n');
242
243
244	def eprint(self, sText):
245	"""
246	Prints error messages.
247	Returns 1 (for exit code usage.)
248	"""
249	print 'error: %s' % (sText,);
250	if self.oLogFile is not None:
251	self.oLogFile.write('error: %s\n' % (sText,));
252	return 1;
253
254	def dprint(self, sText):
255	"""
256	Prints debug info.
257	"""
258	if self.oConfig.fDebug:
259	if not self.oConfig.fQuiet:
260	print 'debug: %s' % (sText, );
261	if self.oLogFile is not None:
262	self.oLogFile.write('debug: %s\n' % (sText,));
263	return 0;
264
265	def vprint(self, sText):
266	"""
267	Prints verbose info.
268	"""
269	if not self.oConfig.fQuiet:
270	print 'info: %s' % (sText,);
271	if self.oLogFile is not None:
272	self.oLogFile.write('info: %s\n' % (sText,));
273	return 0;
274
275
276	def selfCheck(self):
277	""" Does some self checks, looking up things we expect to be in the database and such. """
278	rcExit = 0;
279	for sAttr in dir(self.__class__):
280	if sAttr.startswith('ktReason_'):
281	tReason = getattr(self.__class__, sAttr);
282	oFailureReason = self.oFailureReasonLogic.cachedLookupByNameAndCategory(tReason[1], tReason[0]);
283	if oFailureReason is None:
284	rcExit = self.eprint('Failured to find failure reason "%s" in category "%s" in the database!'
285	% (tReason[1], tReason[0],));
286
287	# Check the user account as well.
288	if self.oLogin is None:
289	oLogin = UserAccountLogic(self.oDb).tryFetchAccountByLoginName(VirtualTestSheriff.ksLoginName);
290	if oLogin is None:
291	rcExit = self.eprint('Cannot find my user account "%s"!' % (VirtualTestSheriff.ksLoginName,));
292	return rcExit;
293
294
295
296	def badTestBoxManagement(self):
297	"""
298	Looks for bad test boxes and first tries once to reboot them then disables them.
299	"""
300	rcExit = 0;
301
302	#
303	# We skip this entirely if we're running in the past and not in harmless debug mode.
304	#
305	if self.oConfig.cStartHoursAgo != 0 \
306	and (not self.oConfig.fDebug or self.oConfig.fRealRun):
307	return rcExit;
308	tsNow = self.tsNow if self.oConfig.fDebug else None;
309	cHoursBack = self.oConfig.cHoursBack if self.oConfig.fDebug else 2;
310	oTestBoxLogic = TestBoxLogic(self.oDb);
311
312	#
313	# Get list of bad test boxes for given period and check them out individually.
314	#
315	aidBadTestBoxes = self.oTestSetLogic.fetchBadTestBoxIds(cHoursBack = cHoursBack, tsNow = tsNow);
316	for idTestBox in aidBadTestBoxes:
317	# Skip if the testbox is already disabled or has a pending reboot command.
318	try:
319	oTestBox = TestBoxData().initFromDbWithId(self.oDb, idTestBox);
320	except Exception as oXcpt:
321	rcExit = self.eprint('Failed to get data for test box #%u in badTestBoxManagement: %s' % (idTestBox, oXcpt,));
322	continue;
323	if not oTestBox.fEnabled:
324	self.dprint('badTestBoxManagement: Skipping test box #%u (%s) as it has been disabled already.'
325	% ( idTestBox, oTestBox.sName, ));
326	continue;
327	if oTestBox.enmPendingCmd != TestBoxData.ksTestBoxCmd_None:
328	self.dprint('badTestBoxManagement: Skipping test box #%u (%s) as it has a command pending: %s'
329	% ( idTestBox, oTestBox.sName, oTestBox.enmPendingCmd));
330	continue;
331
332	# Get the most recent testsets for this box (descending on tsDone) and see how bad it is.
333	aoSets = self.oTestSetLogic.fetchSetsForTestBox(idTestBox, cHoursBack = cHoursBack, tsNow = tsNow);
334	cOkay = 0;
335	cBad = 0;
336	iFirstOkay = len(aoSets);
337	for iSet, oSet in enumerate(aoSets):
338	if oSet.enmStatus == TestSetData.ksTestStatus_BadTestBox:
339	cBad += 1;
340	else:
341	## @todo maybe check the elapsed time here, it could still be a bad run.
342	cOkay += 1;
343	if iFirstOkay > iSet:
344	iFirstOkay = iSet;
345	if iSet > 10:
346	break;
347
348	# We react if there are two or more bad-testbox statuses at the head of the
349	# history and at least three in the last 10 results.
350	if iFirstOkay >= 2 and cBad > 2:
351	if oTestBoxLogic.hasTestBoxRecentlyBeenRebooted(idTestBox, cHoursBack = cHoursBack, tsNow = tsNow):
352	self.vprint('Disabling testbox #%u (%s) - iFirstOkay=%u cBad=%u cOkay=%u'
353	% ( idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay));
354	if self.oConfig.fRealRun is True:
355	try:
356	oTestBoxLogic.disableTestBox(idTestBox, self.uidSelf, fCommit = True,
357	sComment = 'Automatically disabled (iFirstOkay=%u cBad=%u cOkay=%u)'
358	% (iFirstOkay, cBad, cOkay),);
359	except Exception as oXcpt:
360	rcExit = self.eprint('Error disabling testbox #%u (%u): %s\n' % (idTestBox, oTestBox.sName, oXcpt,));
361	else:
362	self.vprint('Rebooting testbox #%u (%s) - iFirstOkay=%u cBad=%u cOkay=%u'
363	% ( idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay));
364	if self.oConfig.fRealRun is True:
365	try:
366	oTestBoxLogic.rebootTestBox(idTestBox, self.uidSelf, fCommit = True,
367	sComment = 'Automatically rebooted (iFirstOkay=%u cBad=%u cOkay=%u)'
368	% (iFirstOkay, cBad, cOkay),);
369	except Exception as oXcpt:
370	rcExit = self.eprint('Error rebooting testbox #%u (%u): %s\n' % (idTestBox, oTestBox.sName, oXcpt,));
371	else:
372	self.dprint('badTestBoxManagement: #%u (%s) looks ok: iFirstOkay=%u cBad=%u cOkay=%u'
373	% ( idTestBox, oTestBox.sName, iFirstOkay, cBad, cOkay));
374	return rcExit;
375
376
377	## @name Failure reasons we know.
378	## @{
379	ktReason_Guru_Generic = ( 'Guru Meditations', 'Generic Guru Meditation' );
380	ktReason_Guru_VERR_IEM_INSTR_NOT_IMPLEMENTED = ( 'Guru Meditations', 'VERR_IEM_INSTR_NOT_IMPLEMENTED' );
381	ktReason_Guru_VERR_IEM_ASPECT_NOT_IMPLEMENTED = ( 'Guru Meditations', 'VERR_IEM_ASPECT_NOT_IMPLEMENTED' );
382	ktReason_Guru_VERR_TRPM_DONT_PANIC = ( 'Guru Meditations', 'VERR_TRPM_DONT_PANIC' );
383	ktReason_Guru_VERR_PGM_PHYS_PAGE_RESERVED = ( 'Guru Meditations', 'VERR_PGM_PHYS_PAGE_RESERVED' );
384	ktReason_Guru_VINF_EM_TRIPLE_FAULT = ( 'Guru Meditations', 'VINF_EM_TRIPLE_FAULT' );
385	ktReason_XPCOM_Exit_Minus_11 = ( 'API / (XP)COM', 'exit -11' );
386	ktReason_XPCOM_VBoxSVC_Hang = ( 'API / (XP)COM', 'VBoxSVC hang' );
387	ktReason_XPCOM_VBoxSVC_Hang_Plus_Heap_Corruption = ( 'API / (XP)COM', 'VBoxSVC hang + heap corruption' );
388	ktReason_XPCOM_NS_ERROR_CALL_FAILED = ( 'API / (XP)COM', 'NS_ERROR_CALL_FAILED' );
389	ktReason_Unknown_Heap_Corruption = ( 'Unknown', 'Heap corruption' );
390	ktReason_Unknown_Reboot_Loop = ( 'Unknown', 'Reboot loop' );
391	## @}
392
393	## BSOD category.
394	ksBsodCategory = 'BSOD';
395	## Special reason indicating that the flesh and blood sheriff has work to do.
396	ksBsodAddNew = 'Add new BSOD';
397
398	## Unit test category.
399	ksUnitTestCategory = 'Unit';
400	## Special reason indicating that the flesh and blood sheriff has work to do.
401	ksUnitTestAddNew = 'Add new';
402
403	## Used for indica that we shouldn't report anything for this test result ID and
404	## consider promoting the previous error to test set level if it's the only one.
405	ktHarmless = ( 'Probably', 'Caused by previous error' );
406
407
408	def caseClosed(self, oCaseFile):
409	"""
410	Reports the findings in the case and closes it.
411	"""
412	#
413	# Log it and create a dReasonForReasultId we can use below.
414	#
415	dCommentForResultId = oCaseFile.dCommentForResultId;
416	if len(oCaseFile.dReasonForResultId) > 0:
417	# Must weed out ktHarmless.
418	dReasonForResultId = {};
419	for idKey, tReason in oCaseFile.dReasonForResultId.items():
420	if tReason is not self.ktHarmless:
421	dReasonForResultId[idKey] = tReason;
422	if len(dReasonForResultId) == 0:
423	self.vprint('TODO: Closing %s without a real reason, only %s.' % (oCaseFile.sName, oCaseFile.dReasonForResultId));
424	return False;
425
426	# Try promote to single reason.
427	atValues = dReasonForResultId.values();
428	fSingleReason = True;
429	if len(dReasonForResultId) == 1 and dReasonForResultId.keys()[0] != oCaseFile.oTestSet.idTestResult:
430	self.dprint('Promoting single reason to whole set: %s' % (atValues[0],));
431	elif len(dReasonForResultId) > 1 and len(atValues) == atValues.count(atValues[0]):
432	self.dprint('Merged %d reasons to a single one: %s' % (len(atValues), atValues[0]));
433	else:
434	fSingleReason = False;
435	if fSingleReason:
436	dReasonForResultId = { oCaseFile.oTestSet.idTestResult: atValues[0], };
437	if len(dCommentForResultId) > 0:
438	dCommentForResultId = { oCaseFile.oTestSet.idTestResult: dCommentForResultId.values()[0], };
439	elif oCaseFile.tReason is not None:
440	dReasonForResultId = { oCaseFile.oTestSet.idTestResult: oCaseFile.tReason, };
441	else:
442	self.vprint('Closing %s without a reason - this should not happen!' % (oCaseFile.sName,));
443	return False;
444
445	self.vprint('Closing %s with following reason%s: %s'
446	% ( oCaseFile.sName, 's' if dReasonForResultId > 0 else '', dReasonForResultId, ));
447
448	#
449	# Add the test failure reason record(s).
450	#
451	for idTestResult, tReason in dReasonForResultId.items():
452	oFailureReason = self.oFailureReasonLogic.cachedLookupByNameAndCategory(tReason[1], tReason[0]);
453	if oFailureReason is not None:
454	sComment = 'Set by $Revision: 61460 $' # Handy for reverting later.
455	if idTestResult in dCommentForResultId:
456	sComment += ': ' + dCommentForResultId[idTestResult];
457
458	oAdd = TestResultFailureData();
459	oAdd.initFromValues(idTestResult = idTestResult,
460	idFailureReason = oFailureReason.idFailureReason,
461	uidAuthor = self.uidSelf,
462	idTestSet = oCaseFile.oTestSet.idTestSet,
463	sComment = sComment,);
464	if self.oConfig.fRealRun:
465	try:
466	self.oTestResultFailureLogic.addEntry(oAdd, self.uidSelf, fCommit = True);
467	except Exception as oXcpt:
468	self.eprint('caseClosed: Exception "%s" while adding reason %s for %s'
469	% (oXcpt, oAdd, oCaseFile.sLongName,));
470	else:
471	self.eprint('caseClosed: Cannot locate failure reason: %s / %s' % ( tReason[0], tReason[1],));
472	return True;
473
474	#
475	# Tools for assiting log parsing.
476	#
477
478	@staticmethod
479	def matchFollowedByLines(sStr, off, asFollowingLines):
480	""" Worker for isThisFollowedByTheseLines. """
481
482	# Advance off to the end of the line.
483	off = sStr.find('\n', off);
484	if off < 0:
485	return False;
486	off += 1;
487
488	# Match each string with the subsequent lines.
489	for iLine, sLine in enumerate(asFollowingLines):
490	offEnd = sStr.find('\n', off);
491	if offEnd < 0:
492	return iLine + 1 == len(asFollowingLines) and sStr.find(sLine, off) < 0;
493	if len(sLine) > 0 and sStr.find(sLine, off, offEnd) < 0:
494	return False;
495
496	# next line.
497	off = offEnd + 1;
498
499	return True;
500
501	@staticmethod
502	def isThisFollowedByTheseLines(sStr, sFirst, asFollowingLines):
503	"""
504	Looks for a line contining sFirst which is then followed by lines
505	with the strings in asFollowingLines. (No newline chars anywhere!)
506	Returns True / False.
507	"""
508	off = sStr.find(sFirst, 0);
509	while off >= 0:
510	if VirtualTestSheriff.matchFollowedByLines(sStr, off, asFollowingLines):
511	return True;
512	off = sStr.find(sFirst, off + 1);
513	return False;
514
515	@staticmethod
516	def findAndReturnResetOfLine(sHaystack, sNeedle):
517	"""
518	Looks for sNeedle in sHaystack.
519	Returns The text following the needle up to the end of the line.
520	Returns None if not found.
521	"""
522	off = sHaystack.find(sNeedle);
523	if off < 0:
524	return None;
525	off += len(sNeedle)
526	offEol = sHaystack.find('\n', off);
527	if offEol < 0:
528	offEol = len(sHaystack);
529	return sHaystack[off:offEol]
530
531	@staticmethod
532	def findInAnyAndReturnResetOfLine(asHaystacks, sNeedle):
533	"""
534	Looks for sNeedle in zeroe or more haystacks (asHaystack).
535	Returns The text following the first needed found up to the end of the line.
536	Returns None if not found.
537	"""
538	for sHaystack in asHaystacks:
539	sRet = VirtualTestSheriff.findAndReturnResetOfLine(sHaystack, sNeedle);
540	if sRet is not None:
541	return sRet;
542	return None;
543
544
545	#
546	# The investigative units.
547	#
548
549	def investigateBadTestBox(self, oCaseFile):
550	"""
551	Checks out bad-testbox statuses.
552	"""
553	_ = oCaseFile;
554	return False;
555
556
557	def investigateVBoxUnitTest(self, oCaseFile):
558	"""
559	Checks out a VBox unittest problem.
560	"""
561
562	#
563	# Process simple test case failures first, using their name as reason.
564	# We do the reason management just like for BSODs.
565	#
566	cRelevantOnes = 0;
567	aoFailedResults = oCaseFile.oTree.getListOfFailures();
568	for oFailedResult in aoFailedResults:
569	if oFailedResult is oCaseFile.oTree:
570	self.vprint('TODO: toplevel failure');
571	cRelevantOnes += 1
572	elif oFailedResult.sName == 'Installing VirtualBox':
573	self.vprint('TODO: Installation failure');
574	cRelevantOnes += 1
575	elif oFailedResult.sName == 'Uninstalling VirtualBox':
576	self.vprint('TODO: Uninstallation failure');
577	cRelevantOnes += 1
578	elif oFailedResult.oParent is not None:
579	# Get the 2nd level node because that's where we'll find the unit test name.
580	while oFailedResult.oParent.oParent is not None:
581	oFailedResult = oFailedResult.oParent;
582
583	# Only report a failure once.
584	if oFailedResult.idTestResult not in oCaseFile.dReasonForResultId:
585	sKey = oFailedResult.sName;
586	if sKey.startswith('testcase/'):
587	sKey = sKey[9:];
588	if sKey in self.asUnitTestReasons:
589	tReason = ( self.ksUnitTestCategory, sKey );
590	oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
591	else:
592	self.dprint('Unit test failure "%s" not found in %s;' % (sKey, self.asUnitTestReasons));
593	tReason = ( self.ksUnitTestCategory, self.ksUnitTestAddNew );
594	oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult, sComment = sKey);
595	cRelevantOnes += 1
596	else:
597	self.vprint('Internal error: expected oParent to NOT be None for %s' % (oFailedResult,));
598
599	#
600	# If we've caught all the relevant ones by now, report the result.
601	#
602	if len(oCaseFile.dReasonForResultId) >= cRelevantOnes:
603	return self.caseClosed(oCaseFile);
604	return False;
605
606
607	## Thing we search a main or VM log for to figure out why something went bust.
608	katSimpleMainAndVmLogReasons = [
609	# ( Whether to stop on hit, needle, reason tuple ),
610	( False, 'GuruMeditation', ktReason_Guru_Generic ),
611	( False, 'Guru Meditation', ktReason_Guru_Generic ),
612	( True, 'VERR_IEM_INSTR_NOT_IMPLEMENTED', ktReason_Guru_VERR_IEM_INSTR_NOT_IMPLEMENTED ),
613	( True, 'VERR_IEM_ASPECT_NOT_IMPLEMENTED', ktReason_Guru_VERR_IEM_ASPECT_NOT_IMPLEMENTED ),
614	( True, 'VERR_TRPM_DONT_PANIC', ktReason_Guru_VERR_TRPM_DONT_PANIC ),
615	( True, 'VERR_PGM_PHYS_PAGE_RESERVED', ktReason_Guru_VERR_PGM_PHYS_PAGE_RESERVED ),
616	( True, 'VINF_EM_TRIPLE_FAULT', ktReason_Guru_VINF_EM_TRIPLE_FAULT ),
617	( False, 'Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))',
618	ktReason_XPCOM_NS_ERROR_CALL_FAILED ),
619	];
620
621	def investigateVMResult(self, oCaseFile, oFailedResult, sResultLog):
622	"""
623	Investigates a failed VM run.
624	"""
625
626	def investigateLogSet():
627	"""
628	Investigates the current set of VM related logs.
629	"""
630	self.dprint('investigateLogSet: %u chars result log, %u chars VM log, %u chars kernel log'
631	% ( len(sResultLog) if sResultLog is not None else 0,
632	len(sVMLog) if sVMLog is not None else 0,
633	len(sKrnlLog) if sKrnlLog is not None else 0), );
634	#self.dprint('main.log<<<\n%s\n<<<\n' % (sResultLog,));
635	#self.dprint('vbox.log<<<\n%s\n<<<\n' % (sVMLog,));
636	#self.dprint('krnl.log<<<\n%s\n<<<\n' % (sKrnlLog,));
637
638	# TODO: more
639
640	#
641	# Look for BSODs. Some stupid stupid inconsistencies in reason and log messages here, so don't try prettify this.
642	#
643	sDetails = self.findInAnyAndReturnResetOfLine([ sVMLog, sResultLog ],
644	'GIM: HyperV: Guest indicates a fatal condition! P0=');
645	if sDetails is not None:
646	# P0=%#RX64 P1=%#RX64 P2=%#RX64 P3=%#RX64 P4=%#RX64 "
647	sKey = sDetails.split(' ', 1)[0];
648	try: sKey = '0x%08X' % (int(sKey, 16),);
649	except: pass;
650	if sKey in self.asBsodReasons:
651	tReason = ( self.ksBsodCategory, sKey );
652	elif sKey.lower() in self.asBsodReasons: # just in case.
653	tReason = ( self.ksBsodCategory, sKey.lower() );
654	else:
655	self.dprint('BSOD "%s" not found in %s;' % (sKey, self.asBsodReasons));
656	tReason = ( self.ksBsodCategory, self.ksBsodAddNew );
657	return oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult, sComment = sDetails.strip());
658
659	#
660	# Look for linux panic.
661	#
662	if sKrnlLog is not None:
663	pass; ## @todo
664
665	#
666	# Loop thru the simple stuff.
667	#
668	fFoundSomething = False;
669	for fStopOnHit, sNeedle, tReason in self.katSimpleMainAndVmLogReasons:
670	if sResultLog.find(sNeedle) > 0 or sVMLog.find(sNeedle) > 0:
671	oCaseFile.noteReasonForId(tReason, oFailedResult.idTestResult);
672	if fStopOnHit:
673	return True;
674	fFoundSomething = True;
675
676	#
677	# Check for repeated reboots...
678	#
679	cResets = sVMLog.count('Changing the VM state from \'RUNNING\' to \'RESETTING\'');
680	if cResets > 10:
681	return oCaseFile.noteReasonForId(self.ktReason_Unknown_Reboot_Loop, oFailedResult.idTestResult,
682	sComment = 'Counted %s reboots' % (cResets,));
683
684	return fFoundSomething;
685
686	#
687	# Check if we got any VM or/and kernel logs. Treat them as sets in
688	# case we run multiple VMs here.
689	#
690	sVMLog = None;
691	sKrnlLog = None;
692	for oFile in oFailedResult.aoFiles:
693	if oFile.sKind == TestResultFileData.ksKind_LogReleaseVm:
694	if sVMLog is not None:
695	if investigateLogSet() is True:
696	return True;
697	sKrnlLog = None;
698	sVMLog = oCaseFile.getLogFile(oFile);
699	elif oFile.sKind == TestResultFileData.ksKind_LogGuestKernel:
700	sKrnlLog = oCaseFile.getLogFile(oFile);
701	if sVMLog is not None and investigateLogSet() is True:
702	return True;
703
704	return None;
705
706
707	def isResultFromVMRun(self, oFailedResult, sResultLog):
708	"""
709	Checks if this result and corresponding log snippet looks like a VM run.
710	"""
711
712	# Look for startVmEx/ startVmAndConnectToTxsViaTcp and similar output in the log.
713	if sResultLog.find(' startVm') > 0:
714	return True;
715
716	# Any other indicators? No?
717	_ = oFailedResult;
718	return False;
719
720
721	def investigateVBoxVMTest(self, oCaseFile, fSingleVM):
722	"""
723	Checks out a VBox VM test.
724
725	This is generic investigation of a test running one or more VMs, like
726	for example a smoke test or a guest installation test.
727
728	The fSingleVM parameter is a hint, which probably won't come in useful.
729	"""
730	_ = fSingleVM;
731
732	#
733	# Get a list of test result failures we should be looking into and the main log.
734	#
735	aoFailedResults = oCaseFile.oTree.getListOfFailures();
736	sMainLog = oCaseFile.getMainLog();
737
738	#
739	# There are a set of errors ending up on the top level result record.
740	# Should deal with these first.
741	#
742	if len(aoFailedResults) == 1 and aoFailedResults[0] == oCaseFile.oTree:
743	# Check if we've just got that XPCOM client smoke test shutdown issue. This will currently always
744	# be reported on the top result because vboxinstall.py doesn't add an error for it. It is easy to
745	# ignore other failures in the test if we're not a little bit careful here.
746	if sMainLog.find('vboxinstaller: Exit code: -11 (') > 0:
747	oCaseFile.noteReason(self.ktReason_XPCOM_Exit_Minus_11);
748	return self.caseClosed(oCaseFile);
749
750	# Hang after starting VBoxSVC (e.g. idTestSet=136307258)
751	if self.isThisFollowedByTheseLines(sMainLog, 'oVBoxMgr=<vboxapi.VirtualBoxManager object at',
752	(' Timeout: ', ' Attempting to abort child...',) ):
753	if sMainLog.find('* glibc detected * /') > 0:
754	oCaseFile.noteReason(self.ktReason_XPCOM_VBoxSVC_Hang_Plus_Heap_Corruption);
755	else:
756	oCaseFile.noteReason(self.ktReason_XPCOM_VBoxSVC_Hang);
757	return self.caseClosed(oCaseFile);
758
759
760
761	# Look for heap corruption without visible hang.
762	if sMainLog.find('* glibc detected * /') > 0 \
763	or sMainLog.find("-1073740940") > 0: # STATUS_HEAP_CORRUPTION / 0xc0000374
764	oCaseFile.noteReason(self.ktReason_Unknown_Heap_Corruption);
765	return self.caseClosed(oCaseFile);
766
767	#
768	# Go thru each failed result.
769	#
770	for oFailedResult in aoFailedResults:
771	self.dprint('Looking at test result #%u - %s' % (oFailedResult.idTestResult, oFailedResult.getFullName(),));
772	sResultLog = TestSetData.extractLogSectionElapsed(sMainLog, oFailedResult.tsCreated, oFailedResult.tsElapsed);
773	if oFailedResult.sName == 'Installing VirtualBox':
774	self.vprint('TODO: Installation failure');
775	elif oFailedResult.sName == 'Uninstalling VirtualBox':
776	self.vprint('TODO: Uninstallation failure');
777	elif self.isResultFromVMRun(oFailedResult, sResultLog):
778	self.investigateVMResult(oCaseFile, oFailedResult, sResultLog);
779	elif sResultLog.find('Exception: 0x800706be (Call to remote object failed (NS_ERROR_CALL_FAILED))') > 0:
780	oCaseFile.noteReasonForId(self.ktReason_XPCOM_NS_ERROR_CALL_FAILED, oFailedResult.idTestResult);
781	elif sResultLog.find('The machine is not mutable (state is ') > 0:
782	self.vprint('Ignorining "machine not mutable" error as it is probably due to an earlier problem');
783	oCaseFile.noteReasonForId(self.ktHarmless, oFailedResult.idTestResult);
784	else:
785	self.vprint('TODO: Cannot place idTestResult=%u - %s' % (oFailedResult.idTestResult, oFailedResult.sName,));
786	self.dprint('%s + %s <<\n%s\n<<' % (oFailedResult.tsCreated, oFailedResult.tsElapsed, sResultLog,));
787
788	#
789	# Report home and close the case if we got them all, otherwise log it.
790	#
791	if len(oCaseFile.dReasonForResultId) >= len(aoFailedResults):
792	return self.caseClosed(oCaseFile);
793
794	if len(oCaseFile.dReasonForResultId) > 0:
795	self.vprint('TODO: Got %u out of %u - close, but no cigar. :-/'
796	% (len(oCaseFile.dReasonForResultId), len(aoFailedResults)));
797	else:
798	self.vprint('XXX: Could not figure out anything at all! :-(');
799	return False;
800
801
802	def reasoningFailures(self):
803	"""
804	Guess the reason for failures.
805	"""
806	#
807	# Get a list of failed test sets without any assigned failure reason.
808	#
809	cGot = 0;
810	aoTestSets = self.oTestSetLogic.fetchFailedSetsWithoutReason(cHoursBack = self.oConfig.cHoursBack, tsNow = self.tsNow);
811	for oTestSet in aoTestSets:
812	self.dprint('');
813	self.dprint('reasoningFailures: Checking out test set #%u, status %s' % ( oTestSet.idTestSet, oTestSet.enmStatus,))
814
815	#
816	# Open a case file and assign it to the right investigator.
817	#
818	(oTree, _ ) = self.oTestResultLogic.fetchResultTree(oTestSet.idTestSet);
819	oBuild = BuildDataEx().initFromDbWithId( self.oDb, oTestSet.idBuild, oTestSet.tsCreated);
820	oTestBox = TestBoxData().initFromDbWithGenId( self.oDb, oTestSet.idGenTestBox);
821	oTestGroup = TestGroupData().initFromDbWithId( self.oDb, oTestSet.idTestGroup, oTestSet.tsCreated);
822	oTestCase = TestCaseDataEx().initFromDbWithGenId( self.oDb, oTestSet.idGenTestCase, oTestSet.tsConfig);
823
824	oCaseFile = VirtualTestSheriffCaseFile(self, oTestSet, oTree, oBuild, oTestBox, oTestGroup, oTestCase);
825
826	if oTestSet.enmStatus == TestSetData.ksTestStatus_BadTestBox:
827	self.dprint('investigateBadTestBox is taking over %s.' % (oCaseFile.sLongName,));
828	fRc = self.investigateBadTestBox(oCaseFile);
829	elif oCaseFile.isVBoxUnitTest():
830	self.dprint('investigateVBoxUnitTest is taking over %s.' % (oCaseFile.sLongName,));
831	fRc = self.investigateVBoxUnitTest(oCaseFile);
832	elif oCaseFile.isVBoxInstallTest():
833	self.dprint('investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
834	fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = True);
835	elif oCaseFile.isVBoxSmokeTest():
836	self.dprint('investigateVBoxVMTest is taking over %s.' % (oCaseFile.sLongName,));
837	fRc = self.investigateVBoxVMTest(oCaseFile, fSingleVM = False);
838	else:
839	self.vprint('reasoningFailures: Unable to classify test set: %s' % (oCaseFile.sLongName,));
840	fRc = False;
841	cGot += fRc is True;
842	self.vprint('reasoningFailures: Got %u out of %u' % (cGot, len(aoTestSets), ));
843	return 0;
844
845
846	def main(self):
847	"""
848	The 'main' function.
849	Return exit code (0, 1, etc).
850	"""
851	# Database stuff.
852	self.oDb = TMDatabaseConnection()
853	self.oTestResultLogic = TestResultLogic(self.oDb);
854	self.oTestSetLogic = TestSetLogic(self.oDb);
855	self.oFailureReasonLogic = FailureReasonLogic(self.oDb);
856	self.oTestResultFailureLogic = TestResultFailureLogic(self.oDb);
857	self.asBsodReasons = self.oFailureReasonLogic.fetchForSheriffByNamedCategory(self.ksBsodCategory);
858	self.asUnitTestReasons = self.oFailureReasonLogic.fetchForSheriffByNamedCategory(self.ksUnitTestCategory);
859
860	# Get a fix on our 'now' before we do anything..
861	self.oDb.execute('SELECT CURRENT_TIMESTAMP - interval \'%s hours\'', (self.oConfig.cStartHoursAgo,));
862	self.tsNow = self.oDb.fetchOne();
863
864	# If we're suppost to commit anything we need to get our user ID.
865	rcExit = 0;
866	if self.oConfig.fRealRun:
867	self.oLogin = UserAccountLogic(self.oDb).tryFetchAccountByLoginName(VirtualTestSheriff.ksLoginName);
868	if self.oLogin is None:
869	rcExit = self.eprint('Cannot find my user account "%s"!' % (VirtualTestSheriff.ksLoginName,));
870	else:
871	self.uidSelf = self.oLogin.uid;
872
873	# Do the stuff.
874	if rcExit == 0:
875	rcExit = self.selfCheck();
876	if rcExit == 0:
877	rcExit = self.badTestBoxManagement();
878	rcExit2 = self.reasoningFailures();
879	if rcExit == 0:
880	rcExit = rcExit2;
881
882	# Cleanup.
883	self.oFailureReasonLogic = None;
884	self.oTestResultFailureLogic = None;
885	self.oTestSetLogic = None;
886	self.oTestResultLogic = None;
887	self.oDb.close();
888	self.oDb = None;
889	if self.oLogFile is not None:
890	self.oLogFile.close();
891	self.oLogFile = None;
892	return rcExit;
893
894	if __name__ == '__main__':
895	sys.exit(VirtualTestSheriff().main());
896

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/ValidationKit/testmanager/batch/virtual_test_sheriff.py@ 61476

Download in other formats: