Hi,
There are a number of problems in xenbus and the paravirtualized drivers
when it comes to kexec'ing a domU kernel. It's not specific to kexec,
but to handing over devices to someone else within the same domU. The
same issues will come up when someone tries to combine mini-os and grub
to build some kind of in-domain bootloader.
Fundamental issue is that there is no way to cleanly shutdown the
frontend/backend drivers without destroying the domain and the backend
devices. The patch addresses some but not all of the problems arising
from that. It also has some debug stuff left in and needs some more
polishing before final submission.
Here are the changes:
(1) ballon: print unconditionally driver_pages in /proc/xen/balloon.
(2) xenbus: unregister watches in xs_suspend()
(3) blk/netback: allow Closing -> Initialising state transition.
(4) blk/netfront: shutdown (but *not* destroy) devices
when devices_shutdown is called, by jumping from Closing to
Initialising state.
(5) Some cleanups along the way.
Not solved yet:
The netfront driver leaks pages in case it is shutdown while the
interface is active. Problem is that it can't get back the rx buffers,
the netback driver sits on them and doesn't release them until it has
some packet data to fill in. The patch adds a netif_release_rx_bufs()
function, but it doesn't work reliable, usually it gets back only a few
pages.
To reproduce this you don't need to play with kexec. Just apply the
balloon driver patch. Boot a domU with the new kernel. Configure the
eth0 interface, "xm network-detach ...", "xm network-attach", repeat.
Watch driver_pages grow in /proc/xen/balloon.
cheers,
Gerd
PS: documentation and complete domU kexec patches are available from
http://www.suse.de/~kraxel/xen/kexec.html
--
Gerd Hoffmann <kraxel@xxxxxxx>
http://www.suse.de/~kraxel/julika-dora.jpeg
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c
--- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Sun Aug 13
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Mon Aug 14
12:49:11 2006 +0200
@@ -440,15 +440,17 @@ static int balloon_read(char *page, char
"Requested target: %8lu kB\n"
"Low-mem balloon: %8lu kB\n"
"High-mem balloon: %8lu kB\n"
+ "Driver pages: %8lu kB\n"
"Xen hard limit: ",
PAGES2KB(current_pages), PAGES2KB(target_pages),
- PAGES2KB(balloon_low), PAGES2KB(balloon_high));
+ PAGES2KB(balloon_low), PAGES2KB(balloon_high),
+ PAGES2KB(driver_pages));
if (hard_limit != ~0UL) {
len += sprintf(
page + len,
- "%8lu kB (inc. %8lu kB driver headroom)\n",
- PAGES2KB(hard_limit), PAGES2KB(driver_pages));
+ "%8lu kB\n",
+ PAGES2KB(hard_limit));
} else {
len += sprintf(
page + len,
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Sun Aug 13 09:44:07
2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Mon Aug 14 12:49:11
2006 +0200
@@ -305,6 +305,11 @@ static void frontend_changed(struct xenb
switch (frontend_state) {
case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosing) {
+ printk("%s: %s: prepare for reconnect\n",
+ __FUNCTION__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
break;
case XenbusStateInitialised:
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Sun Aug 13
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Mon Aug 14
12:49:11 2006 +0200
@@ -272,10 +272,13 @@ static void backend_changed(struct xenbu
xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
down(&bd->bd_sem);
- if (info->users > 0)
+ if (info->users)
+ printk("%-20s: %s: %d user(s) left\n", __FUNCTION__,
+ dev->nodename, info->users);
+ if (info->users > 0 && !dev->shutdown_in_progress) {
xenbus_dev_error(dev, -EBUSY,
"Device in use; refusing to close");
- else
+ } else
blkfront_closing(dev);
up(&bd->bd_sem);
bdput(bd);
@@ -359,7 +362,7 @@ static void blkfront_closing(struct xenb
xlvbd_del(info);
- xenbus_switch_state(dev, XenbusStateClosed);
+ xenbus_closing_done(dev);
}
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Sun Aug 13 09:44:07
2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Mon Aug 14 12:49:11
2006 +0200
@@ -228,10 +228,25 @@ static void frontend_changed(struct xenb
switch (frontend_state) {
case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosing) {
+ printk("%s: %s: prepare for reconnect\n",
+ __FUNCTION__, dev->nodename);
+ if (be->netif) {
+ netif_disconnect(be->netif);
+ be->netif = NULL;
+ }
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
case XenbusStateInitialised:
break;
case XenbusStateConnected:
+ if (!be->netif) {
+ /* reconnect: setup be->netif */
+ backend_changed(&be->backend_watch, NULL, 0);
+ }
maybe_connect(be);
break;
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netfront/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/Makefile Sun Aug 13
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/Makefile Mon Aug 14
12:49:11 2006 +0200
@@ -1,3 +1,4 @@
+EXTRA_CFLAGS += -DDEBUG=1
obj-$(CONFIG_XEN_NETDEV_FRONTEND) := xennet.o
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Sun Aug 13
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Mon Aug 14
12:49:11 2006 +0200
@@ -436,7 +436,7 @@ static void backend_changed(struct xenbu
struct netfront_info *np = dev->dev.driver_data;
struct net_device *netdev = np->netdev;
- DPRINTK("\n");
+ DPRINTK("%s\n", xenbus_strstate(backend_state));
switch (backend_state) {
case XenbusStateInitialising:
@@ -447,9 +447,11 @@ static void backend_changed(struct xenbu
break;
case XenbusStateInitWait:
- network_connect(netdev);
- xenbus_switch_state(dev, XenbusStateConnected);
- (void)send_fake_arp(netdev);
+ if (!dev->shutdown_in_progress) {
+ network_connect(netdev);
+ xenbus_switch_state(dev, XenbusStateConnected);
+ (void)send_fake_arp(netdev);
+ }
break;
case XenbusStateClosing:
@@ -492,6 +494,7 @@ static int network_open(struct net_devic
{
struct netfront_info *np = netdev_priv(dev);
+ DPRINTK("%s\n", np->xbdev->nodename);
memset(&np->stats, 0, sizeof(np->stats));
network_alloc_rx_buffers(dev);
@@ -651,6 +654,8 @@ no_skb:
((np->rx_target *= 2) > np->rx_max_target))
np->rx_target = np->rx_max_target;
+// printk("%s: %d bufs, target %d\n", __FUNCTION__, i, np->rx_target);
+
refill:
for (i = 0; ; i++) {
if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
@@ -1223,6 +1228,7 @@ err:
/* Some pages are no longer absent... */
balloon_update_driver_allowance(-pages_done);
+// printk("%s: %d driver pages\n", __FUNCTION__, work_done);
/* Do all the remapping work, and M2P updates, in one big hypercall. */
if (likely(pages_done)) {
@@ -1283,10 +1289,80 @@ err:
return more_to_do;
}
+static void netif_release_rx_bufs(struct netfront_info *np)
+{
+ struct mmu_update *mmu = np->rx_mmu;
+ struct multicall_entry *mcl = np->rx_mcl;
+ struct sk_buff *skb;
+ unsigned long mfn;
+ int bufs = 0, gnttab = 0, unused = 0;
+ int id, ref;
+
+ spin_lock(&np->rx_lock);
+
+ for (id = 0; id < NET_RX_RING_SIZE; id++) {
+ if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
+ unused++;
+ continue;
+ }
+ if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) {
+ gnttab++;
+ continue;
+ }
+
+ gnttab_release_grant_reference(&np->gref_rx_head, ref);
+ np->grant_rx_ref[id] = GRANT_INVALID_REF;
+
+ skb = np->rx_skbs[id];
+ add_id_to_freelist(np->rx_skbs, id);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Remap the page. */
+ MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
+ pfn_pte_ma(mfn, PAGE_KERNEL),
+ 0);
+ mcl++;
+ mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+ | MMU_MACHPHYS_UPDATE;
+ mmu->val = __pa(skb->head) >> PAGE_SHIFT;
+ mmu++;
+
+ set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
+ mfn);
+ }
+ bufs++;
+
+#if 0 /* FIXME */
+ dev_kfree_skb(skb);
+#endif
+ }
+
+ printk("%s: %d released ok, %d gnttab errs, %d unused slots\n",
+ __FUNCTION__, bufs, gnttab, unused);
+ if (0 == bufs)
+ return;
+
+ /* Some pages are no longer absent... */
+ balloon_update_driver_allowance(-bufs);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /* Do all the remapping work, and M2P updates, in one big
hypercall. */
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)np->rx_mmu;
+ mcl->args[1] = mmu - np->rx_mmu;
+ mcl->args[2] = 0;
+ mcl->args[3] = DOMID_SELF;
+ mcl++;
+ HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
+ }
+
+ spin_unlock(&np->rx_lock);
+}
static int network_close(struct net_device *dev)
{
struct netfront_info *np = netdev_priv(dev);
+ DPRINTK("%s\n", np->xbdev->nodename);
netif_stop_queue(np->netdev);
return 0;
}
@@ -1425,6 +1501,8 @@ static void netif_uninit(struct net_devi
static void netif_uninit(struct net_device *dev)
{
struct netfront_info *np = netdev_priv(dev);
+ DPRINTK("%s\n", np->xbdev->nodename);
+ netif_release_rx_bufs(np);
gnttab_free_grant_references(np->gref_tx_head);
gnttab_free_grant_references(np->gref_rx_head);
}
@@ -1720,11 +1798,11 @@ static void netfront_closing(struct xenb
{
struct netfront_info *info = dev->dev.driver_data;
- DPRINTK("netfront_closing: %s removed\n", dev->nodename);
+ DPRINTK("%s\n", dev->nodename);
close_netdev(info);
-
- xenbus_switch_state(dev, XenbusStateClosed);
+// netif_release_rx_bufs(info);
+ xenbus_closing_done(dev);
}
@@ -1740,9 +1818,9 @@ static int __devexit netfront_remove(str
return 0;
}
-
static void close_netdev(struct netfront_info *info)
{
+ DPRINTK("%s\n", info->xbdev->nodename);
del_timer_sync(&info->rx_refill_timer);
xennet_sysfs_delif(info->netdev);
@@ -1752,6 +1830,7 @@ static void close_netdev(struct netfront
static void netif_disconnect_backend(struct netfront_info *info)
{
+ DPRINTK("%s\n", info->xbdev->nodename);
/* Stop old i/f to prevent errors whilst we rebuild the state. */
spin_lock_irq(&info->tx_lock);
spin_lock(&info->rx_lock);
@@ -1774,6 +1853,7 @@ static void netif_disconnect_backend(str
static void netif_free(struct netfront_info *info)
{
+ DPRINTK("%s\n", info->xbdev->nodename);
close_netdev(info);
netif_disconnect_backend(info);
free_netdev(info->netdev);
@@ -1797,9 +1877,9 @@ static struct xenbus_device_id netfront_
static struct xenbus_driver netfront = {
+ .ids = netfront_ids,
.name = "vif",
.owner = THIS_MODULE,
- .ids = netfront_ids,
.probe = netfront_probe,
.remove = __devexit_p(netfront_remove),
.resume = netfront_resume,
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c
--- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c Sun Aug 13
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c Mon Aug 14
12:49:11 2006 +0200
@@ -41,6 +41,21 @@ extern char *kasprintf(const char *fmt,
#define DPRINTK(fmt, args...) \
pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__,
##args)
+char *xenbus_strstate(enum xenbus_state state)
+{
+ static char *name[] = {
+ [ XenbusStateUnknown ] = "Unknown",
+ [ XenbusStateInitialising ] = "Initialising",
+ [ XenbusStateInitWait ] = "InitWait",
+ [ XenbusStateInitialised ] = "Initialised",
+ [ XenbusStateConnected ] = "Connected",
+ [ XenbusStateClosing ] = "Closing",
+ [ XenbusStateClosed ] = "Closed",
+ };
+ return state < sizeof(name)/sizeof(name[0])
+ ? name[state] : "INVALID";
+}
+
int xenbus_watch_path(struct xenbus_device *dev, const char *path,
struct xenbus_watch *watch,
void (*callback)(struct xenbus_watch *,
@@ -124,6 +139,23 @@ int xenbus_switch_state(struct xenbus_de
}
EXPORT_SYMBOL_GPL(xenbus_switch_state);
+int xenbus_closing_done(struct xenbus_device *dev)
+{
+ if (dev->shutdown_in_progress) {
+ /* frontend (we) triggered closing, because of
devices_shutdown(),
+ * prepare for possible re-connect */
+ printk("%-20s: %s: => Initialising\n", __FUNCTION__,
dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitialising);
+ } else {
+ /* backend triggered closing, probably the device went away,
+ * thus we are going to close down too. */
+ printk("%-20s: %s: => Closed\n", __FUNCTION__, dev->nodename);
+ xenbus_switch_state(dev, XenbusStateClosed);
+ }
+ complete(&dev->down);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_closing_done);
/**
* Return the path to the error node for the given device, or NULL on failure.
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c
--- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Sun Aug 13
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Mon Aug 14
12:49:11 2006 +0200
@@ -60,6 +60,15 @@ static struct notifier_block *xenstore_c
static void wait_for_devices(struct xenbus_driver *xendrv);
+static int xenbus_probe_frontend(const char *type, const char *name);
+static int xenbus_uevent_backend(struct device *dev, char **envp,
+ int num_envp, char *buffer, int buffer_size);
+static int xenbus_probe_backend(const char *type, const char *domid);
+
+static int xenbus_dev_probe(struct device *_dev);
+static int xenbus_dev_remove(struct device *_dev);
+static void xenbus_dev_shutdown(struct device *_dev);
+
/* If something in array of ids matches this device, return it. */
static const struct xenbus_device_id *
match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -168,15 +177,17 @@ static int read_frontend_details(struct
/* Bus type for frontend drivers. */
-static int xenbus_probe_frontend(const char *type, const char *name);
static struct xen_bus_type xenbus_frontend = {
.root = "device",
.levels = 2, /* device/type/<id> */
.get_bus_id = frontend_bus_id,
.probe = xenbus_probe_frontend,
.bus = {
- .name = "xen",
- .match = xenbus_match,
+ .name = "xen",
+ .match = xenbus_match,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+ .shutdown = xenbus_dev_shutdown,
},
.dev = {
.bus_id = "xen",
@@ -221,18 +232,18 @@ static int backend_bus_id(char bus_id[BU
return 0;
}
-static int xenbus_uevent_backend(struct device *dev, char **envp,
- int num_envp, char *buffer, int buffer_size);
-static int xenbus_probe_backend(const char *type, const char *domid);
static struct xen_bus_type xenbus_backend = {
.root = "backend",
.levels = 3, /* backend/type/<frontend>/<id> */
.get_bus_id = backend_bus_id,
.probe = xenbus_probe_backend,
.bus = {
- .name = "xen-backend",
- .match = xenbus_match,
- .uevent = xenbus_uevent_backend,
+ .name = "xen-backend",
+ .match = xenbus_match,
+ .probe = xenbus_dev_probe,
+ .remove = xenbus_dev_remove,
+// .shutdown = xenbus_dev_shutdown,
+ .uevent = xenbus_uevent_backend,
},
.dev = {
.bus_id = "xen-backend",
@@ -391,6 +402,26 @@ static int xenbus_dev_remove(struct devi
return 0;
}
+static void xenbus_dev_shutdown(struct device *_dev)
+{
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+ unsigned long timeout = 5*HZ;
+
+ get_device(&dev->dev);
+ if (dev->state != XenbusStateConnected) {
+ printk("%s: %s: %s != Connected, skipping\n", __FUNCTION__,
+ dev->nodename, xenbus_strstate(dev->state));
+ goto out;
+ }
+ dev->shutdown_in_progress = 1;
+ xenbus_switch_state(dev, XenbusStateClosing);
+ timeout = wait_for_completion_timeout(&dev->down, timeout);
+ if (!timeout)
+ printk("%s: %s timeout closing device\n", __FUNCTION__,
dev->nodename);
+ out:
+ put_device(&dev->dev);
+}
+
static int xenbus_register_driver_common(struct xenbus_driver *drv,
struct xen_bus_type *bus)
{
@@ -399,8 +430,6 @@ static int xenbus_register_driver_common
drv->driver.name = drv->name;
drv->driver.bus = &bus->bus;
drv->driver.owner = drv->owner;
- drv->driver.probe = xenbus_dev_probe;
- drv->driver.remove = xenbus_dev_remove;
mutex_lock(&xenwatch_mutex);
ret = driver_register(&drv->driver);
@@ -575,6 +604,7 @@ static int xenbus_probe_node(struct xen_
tmpstring += strlen(tmpstring) + 1;
strcpy(tmpstring, type);
xendev->devicetype = tmpstring;
+ init_completion(&xendev->down);
xendev->dev.parent = &bus->dev;
xendev->dev.bus = &bus->bus;
diff -r befab551b0e1 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c
--- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Sun Aug 13
09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Mon Aug 14
12:49:11 2006 +0200
@@ -665,7 +665,18 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watc
void xs_suspend(void)
{
+ struct xenbus_watch *watch;
+ char token[sizeof(watch) * 2 + 1];
+
down_write(&xs_state.suspend_mutex);
+
+ /* No need for watches_lock: the suspend_mutex is sufficient. */
+ list_for_each_entry(watch, &watches, list) {
+ sprintf(token, "%lX", (long)watch);
+ xs_unwatch(watch->node, token);
+// printk("%s: unwatch %s\n", __FUNCTION__, watch->node);
+ }
+
mutex_lock(&xs_state.request_mutex);
}
diff -r befab551b0e1 linux-2.6-xen-sparse/include/xen/xenbus.h
--- a/linux-2.6-xen-sparse/include/xen/xenbus.h Sun Aug 13 09:44:07 2006 +0100
+++ b/linux-2.6-xen-sparse/include/xen/xenbus.h Mon Aug 14 12:49:11 2006 +0200
@@ -37,6 +37,7 @@
#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/mutex.h>
+#include <linux/completion.h>
#include <xen/interface/xen.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/xenbus.h>
@@ -74,6 +75,9 @@ struct xenbus_device {
struct xenbus_watch otherend_watch;
struct device dev;
enum xenbus_state state;
+
+ struct completion down;
+ int shutdown_in_progress;
};
static inline struct xenbus_device *to_xenbus_device(struct device *dev)
@@ -296,4 +300,7 @@ void xenbus_dev_fatal(struct xenbus_devi
...);
+int xenbus_closing_done(struct xenbus_device *dev);
+char *xenbus_strstate(enum xenbus_state state);
+
#endif /* _XEN_XENBUS_H */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|