On Fri, 3 Dec 2010, Alex Bligh wrote:
> > I was referring to the device name in the disk config file option,
> > usually is "hda" in HVM config files. Some blkfront development versions
> > don't always create an xvd device, so if you manually specify xvda in
> > the disk config file you would rule that problem out (even though it
> > also depends on the behaviour toolstack and I don't remember what xend 3.3
> > would do).
>
> Would this not break booting a non-PV driver equipped kernel? In the
> general case, we don't know what guests will be booted (up to the
> customer).
It wouldn't break anything because qemu emulates the IDE disks anyway,
but I am not recommending this in production, just for debugging this
particular issue.
> > But in any case upstream blkfront should always create xvd devices so I am
> > not sure how you could end up with a /dev/sda there.
>
> The /dev/sda that is there is the emulated devices. xen_watch (judging
> by the call trace) is trying to create another /dev/sda, presumably
> for the paravirtualised device (given the call path through blkfront).
>
This shouldn't happen. What kernel are you using? Can I see the sources?
> > If you'd like a stock kernel, the best thing to do is upgrade xen, then
> > you don't need any changes in the guest.
> > Keep in mind that xen 3.4 was released almost 2 years ago.
> > If you know what you are doing you can manually remove the check for the
> > unplug protocol in the kernel, just hack
> > arch/x86/xen/platform-pci-unplug.c:check_platform_magic.
>
> That's easier said than done on a running platform with large numbers
> of nodes, a huge variety of operating systems etc.; we have Xen 4 working
> in the lab, but the code change our end is very substantial,
>
> My first priority is to ensure 2.6.37 doesn't break anything (that's
> fine, because it comes up without PV drivers which is no worse than
> 2.6.36). My second priority is to try and allow end users to get
> 2.6.37+ working with PV drivers easily (so adding grub command line
> options is doable).
>
Another option would be to backport the appended patch that implements
the unplug protocol in qemu to your qemu-xen tree:
commit e7911109f4321e9ba0cc56a253b653600aa46bea
Author: Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
Date: Wed Dec 31 16:00:20 2008 +0000
disable qemu PCI devices in HVM domains
Magic ioport (0x10) protocol for negotating with guest PV drivers
during startup, and allowing PV drivers to disable hardware emulations
thus preventing guest from seeing the same device through two paths.
Protocol and implementation from the Citrix Xenserver product line.
Documentation (protocol spec) will follow in a moment.
Contributed-By: Steven Smith <steven.smith@xxxxxxxxxxxxx>
Signed-off-by: Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 044d1f4..f705c51 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -51,6 +51,7 @@
#include <sys/ioctl.h>
#include <linux/cdrom.h>
#include <linux/fd.h>
+#include <sys/mount.h>
#endif
#ifdef __FreeBSD__
#include <signal.h>
@@ -792,6 +793,10 @@ static void raw_close(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
if (s->fd >= 0) {
+#ifndef CONFIG_STUBDOM
+ /* Invalidate buffer cache for this device. */
+ ioctl(s->fd, BLKFLSBUF, 0);
+#endif
close(s->fd);
s->fd = -1;
if (s->aligned_buf != NULL)
diff --git a/hw/ide.c b/hw/ide.c
index bf97d69..da7057c 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -501,6 +501,7 @@ typedef struct PCIIDEState {
int type; /* see IDE_TYPE_xxx */
} PCIIDEState;
+static PCIIDEState *principal_ide_controller;
#if defined(__ia64__)
#include <xen/hvm/ioreq.h>
@@ -2906,6 +2907,47 @@ static void ide_reset(IDEState *s)
s->media_changed = 0;
}
+/* Unplug all of the IDE hard disks, starting at index @start in the
+ table. */
+static void _ide_unplug_harddisks(int start)
+{
+ IDEState *s;
+ int i, j;
+
+ if (!principal_ide_controller) {
+ fprintf(stderr, "No principal controller?\n");
+ return;
+ }
+ for (i = start; i < 4; i++) {
+ s = principal_ide_controller->ide_if + i;
+ if (!s->bs)
+ continue; /* drive not present */
+ if (s->is_cdrom)
+ continue; /* cdrom */
+ /* Is a hard disk, unplug it. */
+ for (j = 0; j < nb_drives; j++)
+ if (drives_table[j].bdrv == s->bs)
+ drives_table[j].bdrv = NULL;
+ bdrv_close(s->bs);
+ s->bs = NULL;
+ ide_reset(s);
+ }
+}
+
+/* Unplug all hard disks except for the primary master (which will
+ almost always be the boot device). */
+void ide_unplug_aux_harddisks(void)
+{
+ _ide_unplug_harddisks(1);
+}
+
+/* Unplug all hard disks, including the boot device. */
+void ide_unplug_harddisks(void)
+{
+ _ide_unplug_harddisks(0);
+}
+
+
struct partition {
uint8_t boot_ind; /* 0x80 - active */
uint8_t head; /* starting head */
@@ -3423,6 +3465,9 @@ void pci_cmd646_ide_init(PCIBus *bus, BlockDriverState
**hd_table,
sizeof(PCIIDEState),
-1,
NULL, NULL);
+ if (principal_ide_controller)
+ abort();
+ principal_ide_controller = d;
d->type = IDE_TYPE_CMD646;
pci_conf = d->dev.config;
pci_conf[0x00] = 0x95; // CMD646
@@ -3557,6 +3602,9 @@ void pci_piix3_ide_init(PCIBus *bus, BlockDriverState
**hd_table, int devfn,
sizeof(PCIIDEState),
devfn,
NULL, NULL);
+ if (principal_ide_controller)
+ abort();
+ principal_ide_controller = d;
d->type = IDE_TYPE_PIIX3;
pci_conf = d->dev.config;
diff --git a/hw/pc.h b/hw/pc.h
index 02b7018..455306d 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -150,6 +150,8 @@ void pci_piix3_ide_init(PCIBus *bus, BlockDriverState
**hd_table, int devfn,
qemu_irq *pic);
void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
qemu_irq *pic);
+void ide_unplug_harddisks(void);
+void ide_unplug_aux_harddisks(void);
/* ne2000.c */
diff --git a/hw/pci.c b/hw/pci.c
index 2d4099b..e72c669 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -26,6 +26,9 @@
#include "console.h"
#include "net.h"
+#include "exec-all.h"
+#include "qemu-xen.h"
+
//#define DEBUG_PCI
struct PCIBus {
@@ -648,6 +651,46 @@ void pci_nic_init(PCIBus *bus, NICInfo *nd, int devfn)
}
}
+void pci_unplug_netifs(void)
+{
+ PCIBus *bus;
+ PCIDevice *dev;
+ PCIIORegion *region;
+ int x;
+ int i;
+
+ /* We only support one PCI bus */
+ for (bus = first_bus; bus; bus = NULL) {
+ for (x = 0; x < 256; x++) {
+ dev = bus->devices[x];
+ if (dev &&
+ dev->config[0xa] == 0 &&
+ dev->config[0xb] == 2) {
+ /* Found a netif. Remove it from the bus. Note that
+ we don't free it here, since there could still be
+ references to it floating around. There are only
+ ever one or two structures leaked, and it's not
+ worth finding them all. */
+ bus->devices[x] = NULL;
+ for (i = 0; i < PCI_NUM_REGIONS; i++) {
+ region = &dev->io_regions[i];
+ if (region->addr == (uint32_t)-1 ||
+ region->size == 0)
+ continue;
+ fprintf(logfile, "region type %d at [%x,%x).\n",
+ region->type, region->addr,
+ region->addr+region->size);
+ if (region->type == PCI_ADDRESS_SPACE_IO) {
+ isa_unassign_ioport(region->addr, region->size);
+ } else if (region->type == PCI_ADDRESS_SPACE_MEM) {
+ unregister_iomem(region->addr);
+ }
+ }
+ }
+ }
+ }
+}
+
typedef struct {
PCIDevice dev;
PCIBus *bus;
diff --git a/hw/xen_platform.c b/hw/xen_platform.c
index 937b802..84a2b19 100644
--- a/hw/xen_platform.c
+++ b/hw/xen_platform.c
@@ -24,13 +24,20 @@
*/
#include "hw.h"
+#include "pc.h"
#include "pci.h"
#include "irq.h"
#include "qemu-xen.h"
+#include <assert.h>
#include <xenguest.h>
+static int drivers_blacklisted;
+static uint16_t driver_product_version;
+static int throttling_disabled;
extern FILE *logfile;
+static char log_buffer[4096];
+static int log_buffer_off;
#define PFFLAG_ROM_LOCK 1 /* Sets whether ROM memory area is RW or RO */
@@ -41,6 +48,88 @@ typedef struct PCIXenPlatformState
uint64_t vga_stolen_ram;
} PCIXenPlatformState;
+/* We throttle access to dom0 syslog, to avoid DOS attacks. This is
+ modelled as a token bucket, with one token for every byte of log.
+ The bucket size is 128KB (->1024 lines of 128 bytes each) and
+ refills at 256B/s. It starts full. The guest is blocked if no
+ tokens are available when it tries to generate a log message. */
+#define BUCKET_MAX_SIZE (128*1024)
+#define BUCKET_FILL_RATE 256
+
+static void throttle(unsigned count)
+{
+ static unsigned available;
+ static struct timespec last_refil;
+ static int started;
+ static int warned;
+
+ struct timespec waiting_for, now;
+ double delay;
+ struct timespec ts;
+
+ if (throttling_disabled)
+ return;
+
+ if (!started) {
+ clock_gettime(CLOCK_MONOTONIC, &last_refil);
+ available = BUCKET_MAX_SIZE;
+ started = 1;
+ }
+
+ if (count > BUCKET_MAX_SIZE) {
+ fprintf(logfile, "tried to get %d tokens, but bucket size is %d\n",
+ BUCKET_MAX_SIZE, count);
+ exit(1);
+ }
+
+ if (available < count) {
+ /* The bucket is empty. Refil it */
+
+ /* When will it be full enough to handle this request? */
+ delay = (double)(count - available) / BUCKET_FILL_RATE;
+ waiting_for = last_refil;
+ waiting_for.tv_sec += delay;
+ waiting_for.tv_nsec += (delay - (int)delay) * 1e9;
+ if (waiting_for.tv_nsec >= 1000000000) {
+ waiting_for.tv_nsec -= 1000000000;
+ waiting_for.tv_sec++;
+ }
+
+ /* How long do we have to wait? (might be negative) */
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ ts.tv_sec = waiting_for.tv_sec - now.tv_sec;
+ ts.tv_nsec = waiting_for.tv_nsec - now.tv_nsec;
+ if (ts.tv_nsec < 0) {
+ ts.tv_sec--;
+ ts.tv_nsec += 1000000000;
+ }
+
+ /* Wait for it. */
+ if (ts.tv_sec > 0 ||
+ (ts.tv_sec == 0 && ts.tv_nsec > 0)) {
+ if (!warned) {
+ fprintf(logfile, "throttling guest access to syslog");
+ warned = 1;
+ }
+ while (nanosleep(&ts, &ts) < 0 && errno == EINTR)
+ ;
+ }
+
+ /* Refil */
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ delay = (now.tv_sec - last_refil.tv_sec) +
+ (now.tv_nsec - last_refil.tv_nsec) * 1.0e-9;
+ available += BUCKET_FILL_RATE * delay;
+ if (available > BUCKET_MAX_SIZE)
+ available = BUCKET_MAX_SIZE;
+ last_refil = now;
+ }
+
+ assert(available >= count);
+
+ available -= count;
+}
+
static uint32_t xen_platform_ioport_readb(void *opaque, uint32_t addr)
{
PCIXenPlatformState *s = opaque;
@@ -68,6 +157,19 @@ static void xen_platform_ioport_writeb(void *opaque,
uint32_t addr, uint32_t val
d->platform_flags = val & PFFLAG_ROM_LOCK;
break;
}
+ case 8:
+ {
+ if (val == '\n' || log_buffer_off == sizeof(log_buffer) - 1) {
+ /* Flush buffer */
+ log_buffer[log_buffer_off] = 0;
+ throttle(log_buffer_off);
+ fprintf(logfile, "%s\n", log_buffer);
+ log_buffer_off = 0;
+ break;
+ }
+ log_buffer[log_buffer_off++] = val;
+ }
+ break;
default:
break;
}
@@ -163,6 +265,113 @@ static void platform_mmio_map(PCIDevice *d, int
region_num,
cpu_register_physical_memory(addr, 0x1000000, mmio_io_addr);
}
+#define UNPLUG_ALL_IDE_DISKS 1
+#define UNPLUG_ALL_NICS 2
+#define UNPLUG_AUX_IDE_DISKS 4
+
+static void platform_fixed_ioport_write2(void *opaque, uint32_t addr, uint32_t
val)
+{
+ switch (addr - 0x10) {
+ case 0:
+ /* Unplug devices. Value is a bitmask of which devices to
+ unplug, with bit 0 the IDE devices, bit 1 the network
+ devices, and bit 2 the non-primary-master IDE devices. */
+ if (val & UNPLUG_ALL_IDE_DISKS)
+ ide_unplug_harddisks();
+ if (val & UNPLUG_ALL_NICS) {
+ pci_unplug_netifs();
+ net_tap_shutdown_all();
+ }
+ if (val & UNPLUG_AUX_IDE_DISKS) {
+ ide_unplug_aux_harddisks();
+ }
+ break;
+ case 2:
+ switch (val) {
+ case 1:
+ fprintf(logfile, "Citrix Windows PV drivers loaded in guest\n");
+ break;
+ case 0:
+ fprintf(logfile, "Guest claimed to be running PV product 0?\n");
+ break;
+ default:
+ fprintf(logfile, "Unknown PV product %d loaded in guest\n", val);
+ break;
+ }
+ driver_product_version = val;
+ break;
+ }
+}
+
+static void platform_fixed_ioport_write4(void *opaque, uint32_t addr,
+ uint32_t val)
+{
+ switch (addr - 0x10) {
+ case 0:
+ /* PV driver version */
+ if (driver_product_version == 0) {
+ fprintf(logfile,
+ "Drivers tried to set their version number (%d) before
setting the product number?\n",
+ val);
+ return;
+ }
+ fprintf(logfile, "PV driver build %d\n", val);
+ if (xenstore_pv_driver_build_blacklisted(driver_product_version,
+ val)) {
+ fprintf(logfile, "Drivers are blacklisted!\n");
+ drivers_blacklisted = 1;
+ }
+ break;
+ }
+}
+
+
+static void platform_fixed_ioport_write1(void *opaque, uint32_t addr, uint32_t
val)
+{
+ switch (addr - 0x10) {
+ case 2:
+ /* Send bytes to syslog */
+ if (val == '\n' || log_buffer_off == sizeof(log_buffer) - 1) {
+ /* Flush buffer */
+ log_buffer[log_buffer_off] = 0;
+ throttle(log_buffer_off);
+ fprintf(logfile, "%s\n", log_buffer);
+ log_buffer_off = 0;
+ break;
+ }
+ log_buffer[log_buffer_off++] = val;
+ break;
+ }
+}
+
+static uint32_t platform_fixed_ioport_read2(void *opaque, uint32_t addr)
+{
+ switch (addr - 0x10) {
+ case 0:
+ if (drivers_blacklisted) {
+ /* The drivers will recognise this magic number and refuse
+ * to do anything. */
+ return 0xd249;
+ } else {
+ /* Magic value so that you can identify the interface. */
+ return 0x49d2;
+ }
+ default:
+ return 0xffff;
+ }
+}
+
+static uint32_t platform_fixed_ioport_read1(void *opaque, uint32_t addr)
+{
+ switch (addr - 0x10) {
+ case 2:
+ /* Version number */
+ return 1;
+ default:
+ return 0xff;
+ }
+}
+
struct pci_config_header {
uint16_t vendor_id;
uint16_t device_id;
@@ -224,6 +433,7 @@ void pci_xen_platform_init(PCIBus *bus)
{
PCIXenPlatformState *d;
struct pci_config_header *pch;
+ struct stat stbuf;
printf("Register xen platform.\n");
d = (PCIXenPlatformState *)pci_register_device(
@@ -255,4 +465,13 @@ void pci_xen_platform_init(PCIBus *bus)
register_savevm("platform", 0, 2, xen_pci_save, xen_pci_load, d);
printf("Done register platform.\n");
+ register_ioport_write(0x10, 16, 4, platform_fixed_ioport_write4, NULL);
+ register_ioport_write(0x10, 16, 2, platform_fixed_ioport_write2, NULL);
+ register_ioport_write(0x10, 16, 1, platform_fixed_ioport_write1, NULL);
+ register_ioport_read(0x10, 16, 2, platform_fixed_ioport_read2, NULL);
+ register_ioport_read(0x10, 16, 1, platform_fixed_ioport_read1, NULL);
+
+ if (stat("/etc/disable-guest-log-throttle", &stbuf) == 0)
+ throttling_disabled = 1;
+
}
diff --git a/i386-dm/exec-dm.c b/i386-dm/exec-dm.c
index 6dbd460..20ac93c 100644
--- a/i386-dm/exec-dm.c
+++ b/i386-dm/exec-dm.c
@@ -272,7 +272,7 @@ void cpu_abort(CPUState *env, const char *fmt, ...)
/* XXX: Simple implementation. Fix later */
#define MAX_MMIO 32
-struct mmio_space {
+static struct mmio_space {
target_phys_addr_t start;
unsigned long size;
unsigned long io_index;
@@ -418,6 +418,17 @@ int iomem_index(target_phys_addr_t addr)
return 0;
}
+void unregister_iomem(target_phys_addr_t start)
+{
+ int index = iomem_index(start);
+ if (index) {
+ fprintf(logfile, "squash iomem [%lx, %lx).\n", mmio[index].start,
+ mmio[index].start + mmio[index].size);
+ mmio[index].start = mmio[index].size = 0;
+ }
+}
+
+
#if defined(__i386__) || defined(__x86_64__)
#define phys_ram_addr(x) (qemu_map_cache(x))
#elif defined(__ia64__)
diff --git a/qemu-xen.h b/qemu-xen.h
index 8375b79..44467a3 100644
--- a/qemu-xen.h
+++ b/qemu-xen.h
@@ -26,8 +26,11 @@ void xen_vga_populate_vram(uint64_t vram_addr);
void xen_vga_vram_map(uint64_t vram_addr, int copy);
#endif
-
+void ide_unplug_harddisks(void);
+void net_tap_shutdown_all(void);
+void pci_unplug_netifs(void);
void destroy_hvm_domain(void);
+void unregister_iomem(target_phys_addr_t start);
#ifdef __ia64__
static inline void xc_domain_shutdown_hook(int xc_handle, uint32_t domid)
@@ -88,6 +91,8 @@ char *xenstore_vm_read(int domid, const char *key, unsigned
int *len);
char *xenstore_device_model_read(int domid, char *key, unsigned int *len);
char *xenstore_read_battery_data(int battery_status);
int xenstore_refresh_battery_status(void);
+int xenstore_pv_driver_build_blacklisted(uint16_t product_number,
+ uint32_t build_nr);
/* xenfbfront.c */
int xenfb_pv_display_init(DisplayState *ds);
diff --git a/vl.c b/vl.c
index ac38c2a..ae9170d 100644
--- a/vl.c
+++ b/vl.c
@@ -275,6 +275,20 @@ uint8_t qemu_uuid[16];
#include "xen-vl-extra.c"
+typedef struct IOHandlerRecord {
+ int fd;
+ IOCanRWHandler *fd_read_poll;
+ IOHandler *fd_read;
+ IOHandler *fd_write;
+ int deleted;
+ void *opaque;
+ /* temporary data */
+ struct pollfd *ufd;
+ struct IOHandlerRecord *next;
+} IOHandlerRecord;
+
+static IOHandlerRecord *first_io_handler;
+
/***********************************************************/
/* x86 ISA bus support */
@@ -4355,6 +4369,7 @@ void do_info_slirp(void)
typedef struct TAPState {
VLANClientState *vc;
int fd;
+ struct TAPState *next;
char down_script[1024];
char script_arg[1024];
} TAPState;
@@ -4392,6 +4407,34 @@ static void tap_send(void *opaque)
}
}
+static TAPState *head_net_tap;
+
+void net_tap_shutdown_all(void)
+{
+ struct IOHandlerRecord **pioh, *ioh;
+
+ while (head_net_tap) {
+ pioh = &first_io_handler;
+ for (;;) {
+ ioh = *pioh;
+ if (ioh == NULL)
+ break;
+ if (ioh->fd == head_net_tap->fd) {
+ *pioh = ioh->next;
+ qemu_free(ioh);
+ break;
+ }
+ pioh = &ioh->next;
+ }
+ if (!ioh)
+ fprintf(stderr,
+ "warning: can't find iohandler for %d to close it
properly.\n",
+ head_net_tap->fd);
+ close(head_net_tap->fd);
+ head_net_tap = head_net_tap->next;
+ }
+}
+
/* fd support */
static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
@@ -4403,6 +4446,8 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
return NULL;
s->fd = fd;
s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s);
+ s->next = head_net_tap;
+ head_net_tap = s;
qemu_set_fd_handler(s->fd, tap_send, NULL, s);
snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd);
return s;
@@ -6133,20 +6178,6 @@ static void dumb_display_init(DisplayState *ds)
#define MAX_IO_HANDLERS 64
-typedef struct IOHandlerRecord {
- int fd;
- IOCanRWHandler *fd_read_poll;
- IOHandler *fd_read;
- IOHandler *fd_write;
- int deleted;
- void *opaque;
- /* temporary data */
- struct pollfd *ufd;
- struct IOHandlerRecord *next;
-} IOHandlerRecord;
-
-static IOHandlerRecord *first_io_handler;
-
/* XXX: fd_read_poll should be suppressed, but an API change is
necessary in the character devices to suppress fd_can_read(). */
int qemu_set_fd_handler2(int fd,
diff --git a/xenstore.c b/xenstore.c
index 86e8b63..e57fd66 100644
--- a/xenstore.c
+++ b/xenstore.c
@@ -778,6 +778,34 @@ void xenstore_record_dm(char *subpath, char *state)
free(path);
}
+int
+xenstore_pv_driver_build_blacklisted(uint16_t product_nr,
+ uint32_t build_nr)
+{
+ char *buf = NULL;
+ char *tmp;
+ const char *product;
+
+ switch (product_nr) {
+ case 1:
+ product = "xensource-windows";
+ break;
+ default:
+ /* Don't know what product this is -> we can't blacklist
+ * it. */
+ return 0;
+ }
+ if (asprintf(&buf, "/mh/driver-blacklist/%s/%d", product, build_nr) < 0)
+ return 0;
+ tmp = xs_read(xsh, XBT_NULL, buf, NULL);
+ free(tmp);
+ free(buf);
+ if (tmp == NULL)
+ return 0;
+ else
+ return 1;
+}
+
void xenstore_record_dm_state(char *state)
{
xenstore_record_dm("state", state);
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|