Keir Fraser writes ("Re: [Xen-devel] [PATCH] detect and report qemu-dm
failure"):
> This patch doesn't apply to tip.
Updated (and given a quick retest).
Ian.
Signed-off-by: Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
diff -r 4ac0898df538 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Thu Jun 12 16:18:31 2008 +0100
+++ b/tools/python/xen/xend/XendDomain.py Thu Jun 12 16:46:56 2008 +0100
@@ -34,7 +34,7 @@ import xen.lowlevel.xc
from xen.xend import XendOptions, XendCheckpoint, XendDomainInfo
from xen.xend.PrettyPrint import prettyprint
-from xen.xend import XendConfig
+from xen.xend import XendConfig, image
from xen.xend.XendError import XendError, XendInvalidDomain, VmError
from xen.xend.XendError import VMBadState
from xen.xend.XendLogging import log
@@ -178,6 +178,8 @@ class XendDomain:
except Exception:
log.exception("Failed to create reference to running "
"domain id: %d" % dom['domid'])
+
+ image.cleanup_stale_sentinel_fifos()
# add all managed domains as dormant domains.
for dom in managed:
diff -r 4ac0898df538 tools/python/xen/xend/XendLogging.py
--- a/tools/python/xen/xend/XendLogging.py Thu Jun 12 16:18:31 2008 +0100
+++ b/tools/python/xen/xend/XendLogging.py Thu Jun 12 16:46:56 2008 +0100
@@ -25,10 +25,10 @@ import types
import types
import logging
import logging.handlers
-import fcntl
from xen.util import mkdir
from xen.xend.server import params
+from xen.util import oshelp
__all__ = [ 'log', 'init', 'getLogFilename' ]
@@ -103,9 +103,7 @@ class XendRotatingFileHandler(logging.ha
# entire FileHandler, StreamHandler & RotatingFileHandler classes which
# is even worse
def setCloseOnExec(self):
- flags = fcntl.fcntl(self.stream.fileno(), fcntl.F_GETFD)
- flags |= fcntl.FD_CLOEXEC
- fcntl.fcntl(self.stream.fileno(), fcntl.F_SETFD, flags)
+ oshelp.fcntl_setfd_cloexec(self.stream, True)
def init(filename, level):
diff -r 4ac0898df538 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py Thu Jun 12 16:18:31 2008 +0100
+++ b/tools/python/xen/xend/image.py Thu Jun 12 16:47:49 2008 +0100
@@ -22,6 +22,12 @@ import math
import math
import time
import signal
+import thread
+import fcntl
+import sys
+import errno
+import glob
+import traceback
import xen.lowlevel.xc
from xen.xend.XendConstants import *
@@ -32,11 +38,23 @@ from xen.xend.xenstore.xswatch import xs
from xen.xend.xenstore.xswatch import xswatch
from xen.xend import arch
from xen.xend import XendOptions
+from xen.util import oshelp
+from xen.util import utils
xc = xen.lowlevel.xc.xc()
MAX_GUEST_CMDLINE = 1024
+sentinel_path_prefix = '/var/run/xend/dm-'
+sentinel_fifos_inuse = { }
+
+def cleanup_stale_sentinel_fifos():
+ for path in glob.glob(sentinel_path_prefix + '*.fifo'):
+ if path in sentinel_fifos_inuse: continue
+ try: os.unlink(path)
+ except OSError, e:
+ log.warning('could not delete stale fifo %s: %s',
+ path, utils.exception_string(e))
def create(vm, vmConfig):
"""Create an image handler for a vm.
@@ -324,6 +342,13 @@ class ImageHandler:
args = args + self.dmargs
return args
+ def _openSentinel(self, sentinel_path_fifo):
+ self.sentinel_fifo = file(sentinel_path_fifo, 'r')
+ self.sentinel_lock = thread.allocate_lock()
+ oshelp.fcntl_setfd_cloexec(self.sentinel_fifo, True)
+ sentinel_fifos_inuse[sentinel_path_fifo] = 1
+ self.sentinel_path_fifo = sentinel_path_fifo
+
def createDeviceModel(self, restore = False):
if self.device_model is None:
return
@@ -339,21 +364,29 @@ class ImageHandler:
env['XAUTHORITY'] = self.xauthority
if self.vncconsole:
args = args + ([ "-vncviewer" ])
+ unique_id = "%i-%i" % (self.vm.getDomid(), time.time())
+ sentinel_path = sentinel_path_prefix + unique_id
+ sentinel_path_fifo = sentinel_path + '.fifo'
+ os.mkfifo(sentinel_path_fifo, 0600)
+ sentinel_write = file(sentinel_path_fifo, 'r+')
+ self._openSentinel(sentinel_path_fifo)
+ self.vm.storeDom("image/device-model-fifo", sentinel_path_fifo)
xstransact.Mkdir("/local/domain/0/device-model/%i" %
self.vm.getDomid())
xstransact.SetPermissions("/local/domain/0/device-model/%i" %
self.vm.getDomid(),
{ 'dom': self.vm.getDomid(), 'read': True, 'write':
True })
log.info("spawning device models: %s %s", self.device_model, args)
# keep track of pid and spawned options to kill it later
- logfile = "/var/log/xen/qemu-dm-%s.log" %
str(self.vm.info['name_label'])
- if os.path.exists(logfile):
- if os.path.exists(logfile + ".1"):
- os.unlink(logfile + ".1")
- os.rename(logfile, logfile + ".1")
+ self.logfile = "/var/log/xen/qemu-dm-%s.log" %
str(self.vm.info['name_label'])
+ if os.path.exists(self.logfile):
+ if os.path.exists(self.logfile + ".1"):
+ os.unlink(self.logfile + ".1")
+ os.rename(self.logfile, self.logfile + ".1")
null = os.open("/dev/null", os.O_RDONLY)
- logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC)
+ logfd = os.open(self.logfile,
os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
+ sys.stderr.flush()
pid = os.fork()
if pid == 0: #child
try:
@@ -362,18 +395,26 @@ class ImageHandler:
os.dup2(logfd, 2)
os.close(null)
os.close(logfd)
+ self.sentinel_fifo.close()
try:
os.execve(self.device_model, args, env)
- except:
- os._exit(127)
+ except Exception, e:
+ print >>sys.stderr, (
+ 'failed to set up fds or execute dm %s: %s' %
+ (self.device_model, utils.exception_string(e)))
+ os._exit(126)
except:
os._exit(127)
else:
self.pid = pid
os.close(null)
os.close(logfd)
+ sentinel_write.close()
self.vm.storeDom("image/device-model-pid", self.pid)
log.info("device model pid: %d", self.pid)
+ # we would very much prefer not to have a thread here and instead
+ # have a callback but sadly we don't have Twisted in xend
+ self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,())
def signalDeviceModel(self, cmd, ret, par = None):
if self.device_model is None:
@@ -419,44 +460,116 @@ class ImageHandler:
xstransact.Store("/local/domain/0/device-model/%i"
% self.vm.getDomid(), ('command', 'continue'))
+ def _dmfailed(self, message):
+ log.warning("domain %s: %s", self.vm.getName(), message)
+ # ideally we would like to forcibly crash the domain with
+ # something like
+ # xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH)
+ # but this can easily lead to very rapid restart loops against
+ # which we currently have no protection
+
def recreate(self):
if self.device_model is None:
return
- self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+ name = self.vm.getName()
+ sentinel_path_fifo = self.vm.readDom('image/device-model-fifo')
+ fifo_fd = -1
+ log.debug("rediscovering %s", sentinel_path_fifo)
+ if sentinel_path_fifo is None:
+ log.debug("%s device model no sentinel, cannot rediscover", name)
+ else:
+ try:
+ # We open it O_WRONLY because that fails ENXIO if no-one
+ # has it open for reading (see SuSv3). The dm process got
+ # a read/write descriptor from our earlier invocation.
+ fifo_fd = os.open(sentinel_path_fifo,
os.O_WRONLY|os.O_NONBLOCK)
+ except OSError, e:
+ if e.errno == errno.ENXIO:
+ self._dmfailed("%s device model no longer running"%name)
+ elif e.errno == errno.ENOENT:
+ log.debug("%s device model sentinel %s absent!",
+ name, sentinel_path_fifo)
+ else:
+ raise
+ if fifo_fd >= 0:
+ self._openSentinel(sentinel_path_fifo)
+ os.close(fifo_fd)
+ self.pid = self.vm.gatherDom(('image/device-model-pid', int))
+ log.debug("%s device model rediscovered, pid %s sentinel fifo %s",
+ name, self.pid, sentinel_path_fifo)
+ self.sentinel_thread =
thread.start_new_thread(self._sentinel_watch,())
+
+ def _sentinel_watch(self):
+ log.info("waiting for sentinel_fifo")
+ try: self.sentinel_fifo.read(1)
+ except OSError, e: pass
+ self.sentinel_lock.acquire()
+ try:
+ if self.pid:
+ (p,st) = os.waitpid(self.pid, os.WNOHANG)
+ if p == self.pid:
+ message = oshelp.waitstatus_description(st)
+ else:
+ # obviously it is malfunctioning, kill it now
+ try:
+ os.kill(self.pid, signal.SIGKILL)
+ message = "malfunctioning (closed sentinel), killed"
+ except:
+ message = "malfunctioning or died ?"
+ message = "pid %d: %s" % (self.pid, message)
+ else:
+ message = "no longer running"
+ except Exception, e:
+ message = "waitpid failed: %s" % utils.exception_string(e)
+ message = "device model failure: %s" % message
+ try: message += "; see %s " % self.logfile
+ except: pass
+ self._dmfailed(message)
+ self.pid = None
+ self.sentinel_lock.release()
def destroyDeviceModel(self):
if self.device_model is None:
return
if self.pid:
+ self.sentinel_lock.acquire()
try:
- os.kill(self.pid, signal.SIGHUP)
- except OSError, exn:
- log.exception(exn)
- try:
- # Try to reap the child every 100ms for 10s. Then SIGKILL it.
- for i in xrange(100):
- (p, rv) = os.waitpid(self.pid, os.WNOHANG)
- if p == self.pid:
- break
- time.sleep(0.1)
- else:
- log.warning("DeviceModel %d took more than 10s "
- "to terminate: sending SIGKILL" % self.pid)
+ try:
+ os.kill(self.pid, signal.SIGHUP)
+ except OSError, exn:
+ log.exception(exn)
+ try:
+ # Try to reap the child every 100ms for 10s. Then SIGKILL
it.
+ for i in xrange(100):
+ (p, rv) = os.waitpid(self.pid, os.WNOHANG)
+ if p == self.pid:
+ break
+ time.sleep(0.1)
+ else:
+ log.warning("DeviceModel %d took more than 10s "
+ "to terminate: sending SIGKILL" % self.pid)
+ os.kill(self.pid, signal.SIGKILL)
+ os.waitpid(self.pid, 0)
+ except OSError, exn:
+ # This is expected if Xend has been restarted within the
+ # life of this domain. In this case, we can kill the
process,
+ # but we can't wait for it because it's not our child.
+ # We just make really sure it's going away (SIGKILL) first.
os.kill(self.pid, signal.SIGKILL)
- os.waitpid(self.pid, 0)
- except OSError, exn:
- # This is expected if Xend has been restarted within the
- # life of this domain. In this case, we can kill the process,
- # but we can't wait for it because it's not our child.
- # We just make really sure it's going away (SIGKILL) first.
- os.kill(self.pid, signal.SIGKILL)
- self.pid = None
- state = xstransact.Remove("/local/domain/0/device-model/%i"
- % self.vm.getDomid())
+ state = xstransact.Remove("/local/domain/0/device-model/%i"
+ % self.vm.getDomid())
+ finally:
+ self.pid = None
+ self.sentinel_lock.release()
try:
os.unlink('/var/run/tap/qemu-read-%d' % self.vm.getDomid())
os.unlink('/var/run/tap/qemu-write-%d' % self.vm.getDomid())
+ except:
+ pass
+ try:
+ del sentinel_fifos_inuse[self.sentinel_path_fifo]
+ os.unlink(self.sentinel_path_fifo)
except:
pass
diff -r 4ac0898df538 tools/python/xen/util/oshelp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/util/oshelp.py Thu Jun 12 16:46:56 2008 +0100
@@ -0,0 +1,20 @@
+import fcntl
+import os
+
+def fcntl_setfd_cloexec(file, bool):
+ f = fcntl.fcntl(file, fcntl.F_GETFD)
+ if bool: f |= fcntl.FD_CLOEXEC
+ else: f &= ~fcntl.FD_CLOEXEC
+ fcntl.fcntl(file, fcntl.F_SETFD)
+
+def waitstatus_description(st):
+ if os.WIFEXITED(st):
+ es = os.WEXITSTATUS(st)
+ if es: return "exited with nonzero status %i" % es
+ else: return "exited"
+ elif os.WIFSIGNALED(st):
+ s = "died due to signal %i" % os.WTERMSIG(st)
+ if os.WCOREDUMP(st): s += " (core dumped)"
+ return s
+ else:
+ return "failed with unexpected wait status %i" % st
diff -r 4ac0898df538 tools/python/xen/util/utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/util/utils.py Thu Jun 12 16:46:56 2008 +0100
@@ -0,0 +1,6 @@
+import traceback
+import sys
+
+def exception_string(e):
+ (ty,v,tb) = sys.exc_info()
+ return traceback.format_exception_only(ty,v)
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|