# HG changeset patch
# User Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
# Date 1306418186 14400
# Node ID 6f9f54084bd3d5969298e50c79d21d7e74a2789a
# Parent 99e599644e12a7e8fded4cc8e4494ee861d3800a
libxl: Add support for passing in the host's E820 for PCI passthrough
The code that populates the E820 is triggered automatically when the guest
configuration contains "pci=['<BDF>,..']" and the guest is PV: in that case
xl sets b_info->u.pv.e820_host.
do_domain_create calls libxl__e820_alloc when it notices that the guest is
PV, has at least one PCI device, and has the e820_host flag set.
libxl__e820_alloc calls xc_get_machine_memory_map to retrieve the system's
E820. The E820 is then sanitized to weed out entries below 1MB and to remove
any E820_RAM or E820_UNUSABLE regions, as the guest does not need to know
about them. The guest only needs the E820_ACPI, E820_NVS and E820_RESERVED
regions to get an idea of where the PCI I/O space is. Mostly: the Linux kernel
assumes that any gap in the E820 is PCI I/O space, which means that if we pass
in the guest 2GB, and the E820_ACPI region and its friends start at 3GB, the
gap between 2GB and 3GB will be considered PCI I/O space. To guard against
that we also create an E820_UNUSABLE region from the end of 'target_kb'
(called ram_end in the code) up to the first E820_[ACPI,NVS,RESERVED] region.
Lastly, the xc_domain_set_memory_map is called to install the new E820.
When tested with another PV guest (NetBSD 5.1) the modified E820 gave
it no trouble. The code has also been tested with older "classic" Xen Linux
and with the newer "pvops" with success (SLES11, RHEL5, Ubuntu Lucid,
Debian Squeeze, 2.6.37, 2.6.38, 2.6.39).
Memory that is slack or for the balloon (up to 'maxmem' in the guest
configuration) is placed after the machine E820 regions, which in most cases
means above 4GB.
The reason for doing the fetching of the E820 using the hypercall in
the toolstack (instead of the guest doing it) is that when a guest
would do a hypercall to 'XENMEM_machine_memory_map' it would
retrieve an E820 with I/O range caps added in. Meaning that the
region after 4GB up to end of possible memory would be marked as unusable
and the kernel would not have any space to allocate a balloon
region.
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Committed-by: Ian Jackson <ian.jackson.citrix.com>
---
diff -r 99e599644e12 -r 6f9f54084bd3 tools/libxl/libxl.idl
--- a/tools/libxl/libxl.idl Thu May 26 15:55:22 2011 +0100
+++ b/tools/libxl/libxl.idl Thu May 26 09:56:26 2011 -0400
@@ -180,6 +180,7 @@
("cmdline", string),
("ramdisk", libxl_file_reference),
("features", string, True),
+ ("e820_host", bool, False, "Use host's
E820 for PCI passthrough."),
])),
])),
],
diff -r 99e599644e12 -r 6f9f54084bd3 tools/libxl/libxl_create.c
--- a/tools/libxl/libxl_create.c Thu May 26 15:55:22 2011 +0100
+++ b/tools/libxl/libxl_create.c Thu May 26 09:56:26 2011 -0400
@@ -525,6 +525,14 @@
for (i = 0; i < d_config->num_pcidevs; i++)
libxl__device_pci_add(gc, domid, &d_config->pcidevs[i], 1);
+ if (!d_config->c_info.hvm && d_config->b_info.u.pv.e820_host) {
+ int rc;
+ rc = libxl__e820_alloc(ctx, domid, d_config);
+ if (rc)
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
+ "Failed while collecting E820 with: %d (errno:%d)\n",
+ rc, errno);
+ }
if ( cb && (d_config->c_info.hvm || d_config->b_info.u.pv.bootloader )) {
if ( (*cb)(ctx, domid, priv) )
goto error_out;
diff -r 99e599644e12 -r 6f9f54084bd3 tools/libxl/libxl_internal.h
--- a/tools/libxl/libxl_internal.h Thu May 26 15:55:22 2011 +0100
+++ b/tools/libxl/libxl_internal.h Thu May 26 09:56:26 2011 -0400
@@ -369,4 +369,5 @@
_hidden int libxl__file_reference_map(libxl_file_reference *f);
_hidden int libxl__file_reference_unmap(libxl_file_reference *f);
+_hidden int libxl__e820_alloc(libxl_ctx *ctx, uint32_t domid,
libxl_domain_config *d_config);
#endif
diff -r 99e599644e12 -r 6f9f54084bd3 tools/libxl/libxl_pci.c
--- a/tools/libxl/libxl_pci.c Thu May 26 15:55:22 2011 +0100
+++ b/tools/libxl/libxl_pci.c Thu May 26 09:56:26 2011 -0400
@@ -1051,3 +1051,167 @@
free(pcidevs);
return 0;
}
+
+/* Translate an E820 region type into a short label for the debug log.
+ * Any type that is not one of the five handled below yields "Unknown". */
+static const char *e820_names(int type)
+{
+    if (type == E820_RAM)
+        return "RAM";
+    if (type == E820_RESERVED)
+        return "Reserved";
+    if (type == E820_ACPI)
+        return "ACPI";
+    if (type == E820_NVS)
+        return "ACPI NVS";
+    if (type == E820_UNUSABLE)
+        return "Unusable";
+
+    return "Unknown";
+}
+
+/*
+ * Rewrite the host E820 map held in src[] into the map handed to a PV guest.
+ *
+ * The sanitized map consists of, in this order:
+ *   1. one E820_RAM region starting at 0 and covering map_limitkb, trimmed
+ *      so that it ends where the lowest kept host region starts;
+ *   2. an E820_UNUSABLE filler from the end of that RAM up to the lowest
+ *      kept host region, when there is a gap between the two;
+ *   3. every host entry whose address is above 1MB and whose type is
+ *      neither E820_RAM nor E820_UNUSABLE (nor the internal "dropped"
+ *      marker 0);
+ *   4. one E820_RAM region placed at 4GB or at the end of the highest kept
+ *      host region, whichever is higher, sized to hold the RAM trimmed off
+ *      in step 1 plus balloon_kb.
+ *
+ * src          in: host map; out: sanitized map (modified in place even on
+ *              some error paths).
+ * nr_entries   in: number of valid entries in src; out: number of entries
+ *              in the sanitized map (never more than E820MAX).
+ * map_limitkb  amount of guest RAM, in kB.
+ * balloon_kb   amount of balloon/slack memory, in kB.
+ *
+ * Returns 0 on success, ERROR_INVAL if a pointer is NULL or any of
+ * map_limitkb, balloon_kb or *nr_entries is zero, and ERROR_NOMEM if the
+ * input or the resulting map would exceed E820MAX entries.
+ */
+static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
+                         uint32_t *nr_entries,
+                         unsigned long map_limitkb,
+                         unsigned long balloon_kb)
+{
+    uint64_t delta_kb = 0, start = 0, start_kb = 0, last = 0, ram_end;
+    uint32_t i, idx = 0, nr, kept = 0, needed;
+    struct e820entry e820[E820MAX];
+
+    if (!src || !map_limitkb || !balloon_kb || !nr_entries)
+        return ERROR_INVAL;
+
+    nr = *nr_entries;
+    if (!nr)
+        return ERROR_INVAL;
+
+    if (nr > E820MAX)
+        return ERROR_NOMEM;
+
+    /* Weed out anything under 1MB: mark such entries with type 0 so the
+     * later passes skip them. */
+    for (i = 0; i < nr; i++) {
+        if (src[i].addr > 0x100000)
+            continue;
+
+        src[i].type = 0;
+        src[i].size = 0;
+        src[i].addr = -1ULL;
+    }
+
+    /* Find the lowest start and highest end among the entries we keep,
+     * skipping over undesired entries, and count how many we keep. */
+    start = -1ULL;
+    last = 0;
+    for (i = 0; i < nr; i++) {
+        uint64_t end;
+
+        if ((src[i].type == E820_RAM) ||
+            (src[i].type == E820_UNUSABLE) ||
+            (src[i].type == 0))
+            continue;
+
+        end = src[i].addr + src[i].size;
+        if (src[i].addr < start)
+            start = src[i].addr;
+        /* Record the end address itself (not the result of the
+         * comparison) so the balloon region can be placed after it. */
+        if (end > last)
+            last = end;
+        kept++;
+    }
+    if (start > 1024)
+        start_kb = start >> 10;
+
+    /* Add the memory RAM region for the guest */
+    e820[idx].addr = 0;
+    e820[idx].size = (uint64_t)map_limitkb << 10;
+    e820[idx].type = E820_RAM;
+
+    /* .. and trim if necessary so it does not run into the first kept
+     * host region. The trimmed amount is re-added in the last region. */
+    if (start_kb && map_limitkb > start_kb) {
+        delta_kb = map_limitkb - start_kb;
+        e820[idx].size -= delta_kb << 10;
+    }
+    /* Note: We don't touch balloon_kb here. Will add it at the end. */
+    ram_end = e820[idx].addr + e820[idx].size;
+    idx++;
+
+    LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "Memory: %"PRIu64"kB End of RAM: "
+               "0x%"PRIx64" (PFN) Delta: %"PRIu64"kB, PCI start: %"PRIu64"kB "
+               "(0x%"PRIx64" PFN), Balloon %"PRIu64"kB\n",
+               (uint64_t)map_limitkb, ram_end >> 12, delta_kb, start_kb,
+               start >> 12, (uint64_t)balloon_kb);
+
+    /* The output holds the guest RAM region, an optional filler, the kept
+     * host entries and the trailing RAM region. Refuse to build a map
+     * that would not fit in e820[] (and hence in the caller's array). */
+    needed = 1 + (start > ram_end ? 1 : 0) + kept + 1;
+    if (needed > E820MAX)
+        return ERROR_NOMEM;
+
+    /* Check if there is a region between ram_end and start. */
+    if (start > ram_end) {
+        /* .. and if not present, add it in. This is to guard against
+           the Linux guest assuming that the gap between the end of
+           RAM region and the start of the E820_[ACPI,NVS,RESERVED]
+           is PCI I/O space. Which it certainly is _not_. */
+        e820[idx].type = E820_UNUSABLE;
+        e820[idx].addr = ram_end;
+        e820[idx].size = start - ram_end;
+        idx++;
+    }
+    /* Almost done: copy them over, ignoring the undesirable ones */
+    for (i = 0; i < nr; i++) {
+        if ((src[i].type == E820_RAM) ||
+            (src[i].type == E820_UNUSABLE) ||
+            (src[i].type == 0))
+            continue;
+
+        e820[idx].type = src[i].type;
+        e820[idx].addr = src[i].addr;
+        e820[idx].size = src[i].size;
+        idx++;
+    }
+    /* At this point we have the mapped RAM + E820 entries from src.
+     * balloon_kb is known to be non-zero (checked on entry), so the
+     * trailing RAM region is always emitted: it carries whatever was
+     * trimmed from the first RAM region plus the balloon memory. */
+    e820[idx].type = E820_RAM;
+    e820[idx].addr = (1ULL << 32) > last ? (1ULL << 32) : last;
+    /* Widen before shifting so the kB -> bytes conversion cannot
+     * overflow where unsigned long is 32 bits wide. */
+    e820[idx].size = (delta_kb << 10) + ((uint64_t)balloon_kb << 10);
+    idx++;
+
+    nr = idx;
+
+    for (i = 0; i < nr; i++) {
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, ":\t[%"PRIx64" -> %"PRIx64"] %s",
+                   e820[i].addr >> 12, (e820[i].addr + e820[i].size) >> 12,
+                   e820_names(e820[i].type));
+    }
+
+    /* Done: copy the sanitized version. */
+    *nr_entries = nr;
+    memcpy(src, e820, nr * sizeof(struct e820entry));
+    return 0;
+}
+
+/*
+ * Build and install the E820 map for a PV guest that asked for the host's
+ * memory layout.
+ *
+ * The machine memory map is fetched with xc_get_machine_memory_map(), passed
+ * through e820_sanitize() using b_info.target_memkb as the RAM size and
+ * (max_memkb - target_memkb) + u.pv.slack_memkb as the balloon size, and the
+ * result is installed for domid with xc_domain_set_memory_map().
+ *
+ * Returns 0 on success; ERROR_INVAL if d_config is NULL, describes an HVM
+ * guest, or does not have u.pv.e820_host set; ERROR_FAIL if fetching,
+ * sanitizing or installing the map fails.
+ *
+ * NOTE(review): when a libxc call fails, errno is overwritten with its
+ * negative return value. errno values are normally positive, so this looks
+ * suspicious -- confirm against the libxc error convention before relying
+ * on errno after a failure here.
+ */
+int libxl__e820_alloc(libxl_ctx *ctx, uint32_t domid,
+                      libxl_domain_config *d_config)
+{
+    struct e820entry host_map[E820MAX];
+    libxl_domain_build_info *info;
+    uint32_t count;
+    int ret;
+
+    /* Only meaningful for PV guests that have the e820_host flag set. */
+    if (!d_config || d_config->c_info.hvm ||
+        !d_config->b_info.u.pv.e820_host)
+        return ERROR_INVAL;
+    info = &d_config->b_info;
+
+    ret = xc_get_machine_memory_map(ctx->xch, host_map, E820MAX);
+    if (ret < 0) {
+        errno = ret;
+        return ERROR_FAIL;
+    }
+    /* A non-negative return is used as the number of entries filled in. */
+    count = ret;
+
+    if (e820_sanitize(ctx, host_map, &count, info->target_memkb,
+                      (info->max_memkb - info->target_memkb) +
+                      info->u.pv.slack_memkb))
+        return ERROR_FAIL;
+
+    ret = xc_domain_set_memory_map(ctx->xch, domid, host_map, count);
+    if (ret < 0) {
+        errno = ret;
+        return ERROR_FAIL;
+    }
+
+    return 0;
+}
diff -r 99e599644e12 -r 6f9f54084bd3 tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c Thu May 26 15:55:22 2011 +0100
+++ b/tools/libxl/xl_cmdimpl.c Thu May 26 09:56:26 2011 -0400
@@ -380,6 +380,7 @@
printf("\t\t\t(kernel %s)\n", b_info->u.pv.kernel.path);
printf("\t\t\t(cmdline %s)\n", b_info->u.pv.cmdline);
printf("\t\t\t(ramdisk %s)\n", b_info->u.pv.ramdisk.path);
+ printf("\t\t\t(e820_host %d)\n", b_info->u.pv.e820_host);
printf("\t\t)\n");
}
printf("\t)\n");
@@ -1001,6 +1002,8 @@
if (!libxl_device_pci_parse_bdf(ctx, pcidev, buf))
d_config->num_pcidevs++;
}
+ if (d_config->num_pcidevs && !c_info->hvm)
+ b_info->u.pv.e820_host = true;
}
switch (xlu_cfg_get_list(config, "cpuid", &cpuids, 0, 1)) {
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|