From: Jun Nakajima <jun.nakajima@xxxxxxxxx>
On IA32 host or IA32 PAE host, at present, generally, we can't create
an HVM guest with more than 2G memory, because generally it's almost
impossible for Qemu to find a large enough and consecutive virtual
address space to map an HVM guest's whole physical address space.
The attached patch fixes this issue using dynamic mapping based on
little blocks of memory.
Each call to qemu_get_ram_ptr makes a call to qemu_map_cache with the
lock option, so mapcache will not unmap these ram_ptr.
Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>
Signed-off-by: Anthony PERARD <anthony.perard@xxxxxxxxxx>
Signed-off-by: Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx>
---
Makefile.target | 3 +
configure | 3 +
exec.c | 40 ++++++-
hw/xen.h | 13 +++
xen-all.c | 64 +++++++++++
xen-mapcache-stub.c | 33 ++++++
xen-mapcache.c | 301 +++++++++++++++++++++++++++++++++++++++++++++++++++
xen-mapcache.h | 14 +++
xen-stub.c | 4 +
9 files changed, 471 insertions(+), 4 deletions(-)
create mode 100644 xen-mapcache-stub.c
create mode 100644 xen-mapcache.c
create mode 100644 xen-mapcache.h
diff --git a/Makefile.target b/Makefile.target
index 8126da9..18b3959 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -207,9 +207,12 @@ QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
QEMU_CFLAGS += $(VNC_PNG_CFLAGS)
# xen support
+CONFIG_NO_XEN_MAPCACHE = $(if $(subst n,,$(CONFIG_XEN_MAPCACHE)),n,y)
obj-$(CONFIG_XEN) += xen_interfaces.o
obj-$(CONFIG_XEN) += xen-all.o
obj-$(CONFIG_NO_XEN) += xen-stub.o
+obj-$(CONFIG_XEN_MAPCACHE) += xen-mapcache.o
+obj-$(CONFIG_NO_XEN_MAPCACHE) += xen-mapcache-stub.o
# xen backend driver support
obj-$(CONFIG_XEN) += xen_backend.o xen_devconfig.o
diff --git a/configure b/configure
index fde9bad..c9a13e1 100755
--- a/configure
+++ b/configure
@@ -3069,6 +3069,9 @@ case "$target_arch2" in
echo "CONFIG_XEN=y" >> $config_target_mak
echo "LIBS+=$xen_libs" >> $config_target_mak
echo "CONFIG_XEN_CTRL_INTERFACE_VERSION=$xen_ctrl_version" >>
$config_target_mak
+ if test "$cpu" = "i386" -o "$cpu" = "x86_64"; then
+ echo "CONFIG_XEN_MAPCACHE=y" >> $config_target_mak
+ fi
fi
esac
case "$target_arch2" in
diff --git a/exec.c b/exec.c
index e950df2..3b137dc 100644
--- a/exec.c
+++ b/exec.c
@@ -32,6 +32,7 @@
#include "hw/qdev.h"
#include "osdep.h"
#include "kvm.h"
+#include "hw/xen.h"
#include "qemu-timer.h"
#if defined(CONFIG_USER_ONLY)
#include <qemu.h>
@@ -51,6 +52,8 @@
#include <libutil.h>
#endif
#endif
+#else /* !CONFIG_USER_ONLY */
+#include "xen-mapcache.h"
#endif
//#define DEBUG_TB_INVALIDATE
@@ -2835,6 +2838,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev,
const char *name,
}
}
+ new_block->offset = find_ram_offset(size);
if (host) {
new_block->host = host;
} else {
@@ -2856,13 +2860,15 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev,
const char *name,
PROT_EXEC|PROT_READ|PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
#else
- new_block->host = qemu_vmalloc(size);
+ if (xen_mapcache_enabled()) {
+ xen_ram_alloc(new_block->offset, size);
+ } else {
+ new_block->host = qemu_vmalloc(size);
+ }
#endif
qemu_madvise(new_block->host, size, QEMU_MADV_MERGEABLE);
}
}
-
- new_block->offset = find_ram_offset(size);
new_block->length = size;
QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
@@ -2903,7 +2909,11 @@ void qemu_ram_free(ram_addr_t addr)
#if defined(TARGET_S390X) && defined(CONFIG_KVM)
munmap(block->host, block->length);
#else
- qemu_vfree(block->host);
+ if (xen_mapcache_enabled()) {
+ qemu_invalidate_entry(block->host);
+ } else {
+ qemu_vfree(block->host);
+ }
#endif
}
qemu_free(block);
@@ -2929,6 +2939,15 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
if (addr - block->offset < block->length) {
QLIST_REMOVE(block, next);
QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
+ if (xen_mapcache_enabled()) {
+ /* We need to check if the requested address is in the RAM
+ * because we don't want to map the entire memory in QEMU.
+ */
+ if (block->offset == 0) {
+ return qemu_map_cache(addr, 0, 1);
+ }
+ block->host = qemu_map_cache(block->offset, block->length, 1);
+ }
return block->host + (addr - block->offset);
}
}
@@ -2964,11 +2983,21 @@ int qemu_ram_addr_from_host(void *ptr, ram_addr_t
*ram_addr)
uint8_t *host = ptr;
QLIST_FOREACH(block, &ram_list.blocks, next) {
+ /* This case append when the block is not mapped. */
+ if (block->host == NULL) {
+ continue;
+ }
if (host - block->host < block->length) {
*ram_addr = block->offset + (host - block->host);
return 0;
}
}
+
+ if (xen_mapcache_enabled()) {
+ *ram_addr = qemu_ram_addr_from_mapcache(ptr);
+ return 0;
+ }
+
return -1;
}
@@ -3879,6 +3908,9 @@ void cpu_physical_memory_unmap(void *buffer,
target_phys_addr_t len,
if (is_write) {
cpu_physical_memory_write(bounce.addr, bounce.buffer, access_len);
}
+ if (xen_enabled()) {
+ qemu_invalidate_entry(buffer);
+ }
qemu_vfree(bounce.buffer);
bounce.buffer = NULL;
cpu_notify_map_clients();
diff --git a/hw/xen.h b/hw/xen.h
index 338cf76..dd3fb68 100644
--- a/hw/xen.h
+++ b/hw/xen.h
@@ -31,10 +31,23 @@ static inline int xen_enabled(void)
#endif
}
+static inline int xen_mapcache_enabled(void)
+{
+#ifdef CONFIG_XEN_MAPCACHE
+ return xen_enabled();
+#else
+ return 0;
+#endif
+}
+
int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num);
void xen_piix3_set_irq(void *opaque, int irq_num, int level);
void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len);
int xen_init(int smp_cpus);
+#if defined(NEED_CPU_H) && !defined(CONFIG_USER_ONLY)
+void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size);
+#endif
+
#endif /* QEMU_HW_XEN_H */
diff --git a/xen-all.c b/xen-all.c
index 205cbc4..2b9e71c 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -13,6 +13,8 @@
#include "hw/xen_backend.h"
#include "hw/xen_redirect.h"
+#include "xen-mapcache.h"
+
/* Xen specific function for piix pci */
int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
@@ -55,6 +57,64 @@ qemu_irq *i8259_xen_init(void)
return qemu_allocate_irqs(i8259_set_irq, NULL, 16);
}
+
+/* Memory Ops */
+
+static void xen_ram_init(ram_addr_t ram_size)
+{
+ RAMBlock *new_block;
+ ram_addr_t below_4g_mem_size, above_4g_mem_size = 0;
+
+ new_block = qemu_mallocz(sizeof (*new_block));
+ pstrcpy(new_block->idstr, sizeof (new_block->idstr), "xen.ram");
+ new_block->host = NULL;
+ new_block->offset = 0;
+ new_block->length = ram_size;
+
+ QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
+
+ ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
+ new_block->length >> TARGET_PAGE_BITS);
+ memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
+ 0xff, new_block->length >> TARGET_PAGE_BITS);
+
+ if (ram_size >= 0xe0000000 ) {
+ above_4g_mem_size = ram_size - 0xe0000000;
+ below_4g_mem_size = 0xe0000000;
+ } else {
+ below_4g_mem_size = ram_size;
+ }
+
+ cpu_register_physical_memory(0, below_4g_mem_size, new_block->offset);
+#if TARGET_PHYS_ADDR_BITS > 32
+ if (above_4g_mem_size > 0) {
+ cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size,
+ new_block->offset + below_4g_mem_size);
+ }
+#endif
+}
+
+void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size)
+{
+ unsigned long nr_pfn;
+ xen_pfn_t *pfn_list;
+ int i;
+
+ nr_pfn = size >> TARGET_PAGE_BITS;
+ pfn_list = qemu_malloc(sizeof (*pfn_list) * nr_pfn);
+
+ for (i = 0; i < nr_pfn; i++) {
+ pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i;
+ }
+
+ if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0,
pfn_list)) {
+ hw_error("xen: failed to populate ram at %lx", ram_addr);
+ }
+
+ qemu_free(pfn_list);
+}
+
+
/* Initialise Xen */
int xen_init(int smp_cpus)
@@ -68,5 +128,9 @@ int xen_init(int smp_cpus)
return -1;
}
+ /* Init RAM management */
+ qemu_map_cache_init();
+ xen_ram_init(ram_size);
+
return 0;
}
diff --git a/xen-mapcache-stub.c b/xen-mapcache-stub.c
new file mode 100644
index 0000000..69ce2e7
--- /dev/null
+++ b/xen-mapcache-stub.c
@@ -0,0 +1,33 @@
+#include "config.h"
+
+#include "exec-all.h"
+#include "qemu-common.h"
+#include "cpu-common.h"
+#include "xen-mapcache.h"
+
+int qemu_map_cache_init(void)
+{
+ return 0;
+}
+
+uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size,
uint8_t lock)
+{
+ return qemu_get_ram_ptr(phys_addr);
+}
+
+void qemu_map_cache_unlock(void *buffer)
+{
+}
+
+ram_addr_t qemu_ram_addr_from_mapcache(void *ptr)
+{
+ return -1;
+}
+
+void qemu_invalidate_map_cache(void)
+{
+}
+
+void qemu_invalidate_entry(uint8_t *buffer)
+{
+}
diff --git a/xen-mapcache.c b/xen-mapcache.c
new file mode 100644
index 0000000..3e1cca9
--- /dev/null
+++ b/xen-mapcache.c
@@ -0,0 +1,301 @@
+#include "config.h"
+
+#include "hw/xen_backend.h"
+#include "blockdev.h"
+
+#include <xen/hvm/params.h>
+#include <sys/mman.h>
+
+#include "xen-mapcache.h"
+
+
+//#define MAPCACHE_DEBUG
+
+#ifdef MAPCACHE_DEBUG
+# define DPRINTF(fmt, ...) do { \
+ fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \
+} while (0)
+#else
+# define DPRINTF(fmt, ...) do { } while (0)
+#endif
+
+#if defined(__i386__)
+# define MAX_MCACHE_SIZE 0x40000000 /* 1GB max for x86 */
+# define MCACHE_BUCKET_SHIFT 16
+#elif defined(__x86_64__)
+# define MAX_MCACHE_SIZE 0x1000000000 /* 64GB max for x86_64 */
+# define MCACHE_BUCKET_SHIFT 20
+#endif
+#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+#define BITS_TO_LONGS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]
+#define test_bit(bit, map) \
+ (!!((map)[(bit) / BITS_PER_LONG] & (1UL << ((bit) % BITS_PER_LONG))))
+
+typedef struct MapCacheEntry {
+ target_phys_addr_t paddr_index;
+ uint8_t *vaddr_base;
+ DECLARE_BITMAP(valid_mapping, MCACHE_BUCKET_SIZE >> XC_PAGE_SHIFT);
+ uint8_t lock;
+ struct MapCacheEntry *next;
+} MapCacheEntry;
+
+typedef struct MapCacheRev {
+ uint8_t *vaddr_req;
+ target_phys_addr_t paddr_index;
+ QTAILQ_ENTRY(MapCacheRev) next;
+} MapCacheRev;
+
+typedef struct MapCache {
+ MapCacheEntry *entry;
+ unsigned long nr_buckets;
+ QTAILQ_HEAD(map_cache_head, MapCacheRev) locked_entries;
+
+ /* For most cases (>99.9%), the page address is the same. */
+ target_phys_addr_t last_address_index;
+ uint8_t *last_address_vaddr;
+} MapCache;
+
+static MapCache *mapcache;
+
+
+int qemu_map_cache_init(void)
+{
+ unsigned long size;
+
+ mapcache = qemu_mallocz(sizeof (MapCache));
+
+ QTAILQ_INIT(&mapcache->locked_entries);
+ mapcache->last_address_index = -1;
+
+ mapcache->nr_buckets = (((MAX_MCACHE_SIZE >> XC_PAGE_SHIFT) +
+ (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >>
+ (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT));
+
+ /*
+ * Use mmap() directly: lets us allocate a big hash table with no up-front
+ * cost in storage space. The OS will allocate memory only for the buckets
+ * that we actually use. All others will contain all zeroes.
+ */
+ size = mapcache->nr_buckets * sizeof (MapCacheEntry);
+ size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
+ DPRINTF("qemu_map_cache_init, nr_buckets = %lx size %lu\n",
mapcache->nr_buckets, size);
+ mapcache->entry = mmap(NULL, size, PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_ANON, -1, 0);
+ if (mapcache->entry == MAP_FAILED) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static void qemu_remap_bucket(MapCacheEntry *entry,
+ target_phys_addr_t size,
+ target_phys_addr_t address_index)
+{
+ uint8_t *vaddr_base;
+ xen_pfn_t *pfns;
+ int *err;
+ unsigned int i, j;
+ target_phys_addr_t nb_pfn = size >> XC_PAGE_SHIFT;
+
+ pfns = qemu_mallocz(nb_pfn * sizeof (xen_pfn_t));
+ err = qemu_mallocz(nb_pfn * sizeof (int));
+
+ if (entry->vaddr_base != NULL) {
+ if (munmap(entry->vaddr_base, size) != 0) {
+ perror("unmap fails");
+ exit(-1);
+ }
+ }
+
+ for (i = 0; i < nb_pfn; i++) {
+ pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i;
+ }
+
+ vaddr_base = xc_map_foreign_bulk(xen_xc, xen_domid, PROT_READ|PROT_WRITE,
+ pfns, err, nb_pfn);
+ if (vaddr_base == NULL) {
+ perror("xc_map_foreign_bulk");
+ exit(-1);
+ }
+
+ entry->vaddr_base = vaddr_base;
+ entry->paddr_index = address_index;
+
+ for (i = 0; i < nb_pfn; i += BITS_PER_LONG) {
+ unsigned long word = 0;
+ if ((i + BITS_PER_LONG) > nb_pfn) {
+ j = nb_pfn % BITS_PER_LONG;
+ } else {
+ j = BITS_PER_LONG;
+ }
+ while (j > 0) {
+ word = (word << 1) | !err[i + --j];
+ }
+ entry->valid_mapping[i / BITS_PER_LONG] = word;
+ }
+
+ qemu_free(pfns);
+ qemu_free(err);
+}
+
+uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size,
uint8_t lock)
+{
+ MapCacheEntry *entry, *pentry = NULL;
+ target_phys_addr_t address_index = phys_addr >> MCACHE_BUCKET_SHIFT;
+ target_phys_addr_t address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1);
+
+ if (address_index == mapcache->last_address_index && !lock) {
+ return mapcache->last_address_vaddr + address_offset;
+ }
+
+ entry = &mapcache->entry[address_index % mapcache->nr_buckets];
+
+ while (entry && entry->lock && entry->paddr_index != address_index &&
entry->vaddr_base) {
+ pentry = entry;
+ entry = entry->next;
+ }
+ if (!entry) {
+ entry = qemu_mallocz(sizeof (MapCacheEntry));
+ pentry->next = entry;
+ qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE, address_index);
+ } else if (!entry->lock) {
+ if (!entry->vaddr_base || entry->paddr_index != address_index ||
+ !test_bit(address_offset >> XC_PAGE_SHIFT, entry->valid_mapping)) {
+ qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE,
address_index);
+ }
+ }
+
+ if (!test_bit(address_offset >> XC_PAGE_SHIFT, entry->valid_mapping)) {
+ mapcache->last_address_index = -1;
+ return NULL;
+ }
+
+ mapcache->last_address_index = address_index;
+ mapcache->last_address_vaddr = entry->vaddr_base;
+ if (lock) {
+ MapCacheRev *reventry = qemu_mallocz(sizeof(MapCacheRev));
+ entry->lock++;
+ reventry->vaddr_req = mapcache->last_address_vaddr + address_offset;
+ reventry->paddr_index = mapcache->last_address_index;
+ QTAILQ_INSERT_TAIL(&mapcache->locked_entries, reventry, next);
+ }
+
+ return mapcache->last_address_vaddr + address_offset;
+}
+
+ram_addr_t qemu_ram_addr_from_mapcache(void *ptr)
+{
+ MapCacheRev *reventry;
+ target_phys_addr_t paddr_index;
+ int found = 0;
+
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ if (reventry->vaddr_req == ptr) {
+ paddr_index = reventry->paddr_index;
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ fprintf(stderr, "qemu_ram_addr_from_mapcache, could not find %p\n",
ptr);
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ DPRINTF(" %lx -> %p is present\n", reventry->paddr_index,
+ reventry->vaddr_req);
+ }
+ abort();
+ return 0;
+ }
+
+ return paddr_index << MCACHE_BUCKET_SHIFT;
+}
+
+void qemu_invalidate_entry(uint8_t *buffer)
+{
+ MapCacheEntry *entry = NULL, *pentry = NULL;
+ MapCacheRev *reventry;
+ target_phys_addr_t paddr_index;
+ int found = 0;
+
+ if (mapcache->last_address_vaddr == buffer) {
+ mapcache->last_address_index = -1;
+ }
+
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ if (reventry->vaddr_req == buffer) {
+ paddr_index = reventry->paddr_index;
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ DPRINTF("qemu_invalidate_entry, could not find %p\n", buffer);
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ DPRINTF(" %lx -> %p is present\n", reventry->paddr_index,
reventry->vaddr_req);
+ }
+ return;
+ }
+ QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next);
+ qemu_free(reventry);
+
+ entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
+ while (entry && entry->paddr_index != paddr_index) {
+ pentry = entry;
+ entry = entry->next;
+ }
+ if (!entry) {
+ DPRINTF("Trying to unmap address %p that is not in the mapcache!\n",
buffer);
+ return;
+ }
+ entry->lock--;
+ if (entry->lock > 0 || pentry == NULL) {
+ return;
+ }
+
+ pentry->next = entry->next;
+ if (munmap(entry->vaddr_base, MCACHE_BUCKET_SIZE) != 0) {
+ perror("unmap fails");
+ exit(-1);
+ }
+ qemu_free(entry);
+}
+
+void qemu_invalidate_map_cache(void)
+{
+ unsigned long i;
+ MapCacheRev *reventry;
+
+ qemu_aio_flush();
+
+ QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+ DPRINTF("There should be no locked mappings at this time, "
+ "but %lx -> %p is present\n",
+ reventry->paddr_index, reventry->vaddr_req);
+ }
+
+ mapcache_lock();
+
+ for (i = 0; i < mapcache->nr_buckets; i++) {
+ MapCacheEntry *entry = &mapcache->entry[i];
+
+ if (entry->vaddr_base == NULL) {
+ continue;
+ }
+
+ if (munmap(entry->vaddr_base, MCACHE_BUCKET_SIZE) != 0) {
+ perror("unmap fails");
+ exit(-1);
+ }
+
+ entry->paddr_index = 0;
+ entry->vaddr_base = NULL;
+ }
+
+ mapcache->last_address_index = -1;
+ mapcache->last_address_vaddr = NULL;
+
+ mapcache_unlock();
+}
diff --git a/xen-mapcache.h b/xen-mapcache.h
new file mode 100644
index 0000000..86a017b
--- /dev/null
+++ b/xen-mapcache.h
@@ -0,0 +1,14 @@
+#ifndef XEN_MAPCACHE_H
+#define XEN_MAPCACHE_H
+
+int qemu_map_cache_init(void);
+uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t
size, uint8_t lock);
+void qemu_map_cache_unlock(void *phys_addr);
+ram_addr_t qemu_ram_addr_from_mapcache(void *ptr);
+void qemu_invalidate_entry(uint8_t *buffer);
+void qemu_invalidate_map_cache(void);
+
+#define mapcache_lock() ((void)0)
+#define mapcache_unlock() ((void)0)
+
+#endif /* !XEN_MAPCACHE_H */
diff --git a/xen-stub.c b/xen-stub.c
index 07e64bc..c9f477d 100644
--- a/xen-stub.c
+++ b/xen-stub.c
@@ -24,6 +24,10 @@ void xen_piix_pci_write_config_client(uint32_t address,
uint32_t val, int len)
{
}
+void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size)
+{
+}
+
int xen_init(int smp_cpus)
{
return -ENOSYS;
--
1.7.1
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|