# HG changeset patch
# User Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
# Date 1306418190 14400
# Node ID 271659b83e1590c5091363a47ea4794291fa2207
# Parent 6f9f54084bd3d5969298e50c79d21d7e74a2789a
libxl: Convert E820_UNUSABLE and E820_RAM to E820_UNUSABLE as appropriate.
On most machines, the RAM regions in the E820 are followed by a couple of
E820_RESERVED regions, along with E820_ACPI and E820_NVS. On some Intel
machines, however, the E820 looks like swiss cheese:
(XEN) Initial Xen-e820 RAM map:
(XEN) 0000000000000000 - 000000000009d000 (usable)
(XEN) 000000000009d000 - 00000000000a0000 (reserved)
(XEN) 00000000000e0000 - 0000000000100000 (reserved)
(XEN) 0000000000100000 - 000000009cf66000 (usable)
(XEN) 000000009cf66000 - 000000009d102000 (ACPI NVS)
(XEN) 000000009d102000 - 000000009f6bd000 (usable) <--
(XEN) 000000009f6bd000 - 000000009f6bf000 (reserved)
(XEN) 000000009f6bf000 - 000000009f714000 (usable) <--
(XEN) 000000009f714000 - 000000009f7bf000 (ACPI NVS)
(XEN) 000000009f7bf000 - 000000009f7e0000 (usable) <--
(XEN) 000000009f7e0000 - 000000009f7ff000 (ACPI data)
(XEN) 000000009f7ff000 - 000000009f800000 (usable) <--
(XEN) 000000009f800000 - 00000000a0000000 (reserved)
(XEN) 00000000a0000000 - 00000000b0000000 (reserved)
(XEN) 00000000fc000000 - 00000000fd000000 (reserved)
(XEN) 00000000ffe00000 - 0000000100000000 (reserved)
(XEN) 0000000100000000 - 0000000160000000 (usable)
This means we have to pay attention to the E820_RAM regions that lie
between the E820_[ACPI,NVS,RESERVED] regions. If we remove those
E820_RAM regions from the E820 (b/c the amount of memory passed to the
guest is less than the address where those E820 regions reside), the
Linux kernel interprets the resulting "gaps" as PCI I/O space.
This is what we are currently doing.
This can be disastrous if we pass in an Intel IGD card which tries
to use the first available PCI I/O space - and ends up
using the MFNs which are actually RAM instead of being the
PCI I/O space.
To make this work, we convert all E820_RAM regions that are above
the 'target_kb' (those that overlap the 'target_kb'
are truncated appropriately) to be E820_UNUSABLE. We also limit this
alteration to regions below 4GB. This means that an E820 for a guest
that currently looks like this (target_kb=1024, maxmem=2048):
[ 0.000000] Set 405658 page(s) to 1-1 mapping.
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] Xen: 0000000000000000 - 00000000000a0000 (usable)
[ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved)
[ 0.000000] Xen: 0000000000100000 - 0000000040000000 (usable)
[ 0.000000] Xen: 0000000040000000 - 000000009cf66000 (unusable)
[ 0.000000] Xen: 000000009cf66000 - 000000009d102000 (ACPI NVS)
[ 0.000000] Xen: 000000009f6bd000 - 000000009f6bf000 (reserved)
[ 0.000000] Xen: 000000009f714000 - 000000009f7bf000 (ACPI NVS)
[ 0.000000] Xen: 000000009f7e0000 - 000000009f7ff000 (ACPI data)
[ 0.000000] Xen: 000000009f800000 - 00000000b0000000 (reserved)
[ 0.000000] Xen: 00000000fc000000 - 00000000fd000000 (reserved)
[ 0.000000] Xen: 00000000fec00000 - 00000000fec01000 (reserved)
[ 0.000000] Xen: 00000000fee00000 - 00000000fee01000 (reserved)
[ 0.000000] Xen: 00000000ffe00000 - 0000000100000000 (reserved)
[ 0.000000] Xen: 0000000100000000 - 0000000140800000 (usable)
Will look as so:
[ 0.000000] Set 395880 page(s) to 1-1 mapping.
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] Xen: 0000000000000000 - 00000000000a0000 (usable)
[ 0.000000] Xen: 00000000000a0000 - 0000000000100000 (reserved)
[ 0.000000] Xen: 0000000000100000 - 0000000040000000 (usable)
[ 0.000000] Xen: 0000000040000000 - 000000009cf66000 (unusable)
[ 0.000000] Xen: 000000009cf66000 - 000000009d102000 (ACPI NVS)
[ 0.000000] Xen: 000000009d102000 - 000000009f6bd000 (unusable)
[ 0.000000] Xen: 000000009f6bd000 - 000000009f6bf000 (reserved)
[ 0.000000] Xen: 000000009f6bf000 - 000000009f714000 (unusable)
[ 0.000000] Xen: 000000009f714000 - 000000009f7bf000 (ACPI NVS)
[ 0.000000] Xen: 000000009f7bf000 - 000000009f7e0000 (unusable)
[ 0.000000] Xen: 000000009f7e0000 - 000000009f7ff000 (ACPI data)
[ 0.000000] Xen: 000000009f7ff000 - 000000009f800000 (unusable)
[ 0.000000] Xen: 000000009f800000 - 00000000b0000000 (reserved)
[ 0.000000] Xen: 00000000fc000000 - 00000000fd000000 (reserved)
[ 0.000000] Xen: 00000000fec00000 - 00000000fec01000 (reserved)
[ 0.000000] Xen: 00000000fee00000 - 00000000fee01000 (reserved)
[ 0.000000] Xen: 00000000ffe00000 - 0000000100000000 (reserved)
[ 0.000000] Xen: 0000000100000000 - 0000000140800000 (usable)
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Committed-by: Ian Jackson <ian.jackson.citrix.com>
---
diff -r 6f9f54084bd3 -r 271659b83e15 tools/libxl/libxl_pci.c
--- a/tools/libxl/libxl_pci.c Thu May 26 09:56:26 2011 -0400
+++ b/tools/libxl/libxl_pci.c Thu May 26 09:56:30 2011 -0400
@@ -1132,21 +1132,98 @@
ram_end >> 12, delta_kb, start_kb ,start >> 12,
(uint64_t)balloon_kb);
+
+ /* This whole code below is to guard against if the Intel IGD is passed
into
+ * the guest. If we don't pass in IGD, this whole code can be ignored.
+ *
+ * The reason for this code is that Intel boxes fill their E820 with
+ * E820_RAM amongst E820_RESERVED and we can't just ditch those E820_RAM.
+ * That is b/c any "gaps" in the E820 is considered PCI I/O space by
+ * Linux and it would be utilized by the Intel IGD as I/O space while
+ * in reality it was an RAM region.
+ *
+ * What this means is that we have to walk the E820 and for any region
+ * that is RAM and below 4GB and above ram_end, needs to change its type
+ * to E820_UNUSED. We also need to move some of the E820_RAM regions if
+ * the overlap with ram_end. */
+ for (i = 0; i < nr; i++) {
+ uint64_t end = src[i].addr + src[i].size;
+
+ /* We don't care about E820_UNUSABLE, but we need to
+ * change the type to zero b/c the loop after this
+ * sticks E820_UNUSABLE on the guest's E820 but ignores
+ * the ones with type zero. */
+ if ((src[i].type == E820_UNUSABLE) ||
+ /* Any region that is within the "RAM region" can
+ * be safely ditched. */
+ (end < ram_end)) {
+ src[i].type = 0;
+ continue;
+ }
+
+ /* Look only at RAM regions. */
+ if (src[i].type != E820_RAM)
+ continue;
+
+ /* We only care about RAM regions below 4GB. */
+ if (src[i].addr >= (1ULL<<32))
+ continue;
+
+ /* E820_RAM overlaps with our RAM region. Move it */
+ if (src[i].addr < ram_end) {
+ uint64_t delta;
+
+ src[i].type = E820_UNUSABLE;
+ delta = ram_end - src[i].addr;
+ /* The end < ram_end should weed this out */
+ if (src[i].size - delta < 0)
+ src[i].type = 0;
+ else {
+ src[i].size -= delta;
+ src[i].addr = ram_end;
+ }
+ if (src[i].addr + src[i].size != end) {
+ /* We messed up somewhere */
+ src[i].type = 0;
+ LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "Computed E820
wrongly. Continuing on.");
+ }
+ }
+ /* Lastly, convert the RAM to UNSUABLE. Look in the Linux kernel
+ at git commit 2f14ddc3a7146ea4cd5a3d1ecd993f85f2e4f948
+ "xen/setup: Inhibit resource API from using System RAM E820
+ gaps as PCI mem gaps" for full explanation. */
+ if (end > ram_end)
+ src[i].type = E820_UNUSABLE;
+ }
+
/* Check if there is a region between ram_end and start. */
if (start > ram_end) {
+ int add_unusable = 1;
+ for (i = 0; i < nr && add_unusable; i++) {
+ if (src[i].type != E820_UNUSABLE)
+ continue;
+ if (ram_end != src[i].addr)
+ continue;
+ if (start != src[i].addr + src[i].size) {
+ /* there is one, adjust it */
+ src[i].size = start - src[i].addr;
+ }
+ add_unusable = 0;
+ }
/* .. and if not present, add it in. This is to guard against
- the Linux guest assuming that the gap between the end of
- RAM region and the start of the E820_[ACPI,NVS,RESERVED]
- is PCI I/O space. Which it certainly is _not_. */
- e820[idx].type = E820_UNUSABLE;
- e820[idx].addr = ram_end;
- e820[idx].size = start - ram_end;
- idx++;
+ the Linux guest assuming that the gap between the end of
+ RAM region and the start of the E820_[ACPI,NVS,RESERVED]
+ is PCI I/O space. Which it certainly is _not_. */
+ if (add_unusable) {
+ e820[idx].type = E820_UNUSABLE;
+ e820[idx].addr = ram_end;
+ e820[idx].size = start - ram_end;
+ idx++;
+ }
}
/* Almost done: copy them over, ignoring the undesireable ones */
for (i = 0; i < nr; i++) {
if ((src[i].type == E820_RAM) ||
- (src[i].type == E820_UNUSABLE) ||
(src[i].type == 0))
continue;
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|