Signed-off-by: Zhai Edwin Signed-off-by: Nakajima Jun diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/hw/cirrus_vga.c --- a/tools/ioemu/hw/cirrus_vga.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/ioemu/hw/cirrus_vga.c Wed Dec 13 22:52:02 2006 +0800 @@ -3010,11 +3010,44 @@ static CPUWriteMemoryFunc *cirrus_mmio_w cirrus_mmio_writel, }; +void cirrus_stop_acc(CirrusVGAState *s) +{ + if (s->map_addr){ + int error; + s->map_addr = 0; + error = unset_vram_mapping(s->cirrus_lfb_addr, + s->cirrus_lfb_end); + fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n"); + + munmap(s->vram_ptr, VGA_RAM_SIZE); + } +} + +void cirrus_restart_acc(CirrusVGAState *s) +{ + if (s->cirrus_lfb_addr && s->cirrus_lfb_end) { + void *vram_pointer, *old_vram; + fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, lfb_end=0x%lx.\n", + s->cirrus_lfb_addr, s->cirrus_lfb_end); + vram_pointer = set_vram_mapping(s->cirrus_lfb_addr ,s->cirrus_lfb_end); + if (!vram_pointer){ + fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n"); + } else { + old_vram = vga_update_vram((VGAState *)s, vram_pointer, + VGA_RAM_SIZE); + qemu_free(old_vram); + s->map_addr = s->cirrus_lfb_addr; + s->map_end = s->cirrus_lfb_end; + } + } +} + /* load/save state */ static void cirrus_vga_save(QEMUFile *f, void *opaque) { CirrusVGAState *s = opaque; + uint8_t vga_acc; qemu_put_be32s(f, &s->latch); qemu_put_8s(f, &s->sr_index); @@ -3049,11 +3082,20 @@ static void cirrus_vga_save(QEMUFile *f, qemu_put_be32s(f, &s->hw_cursor_y); /* XXX: we do not save the bitblt state - we assume we do not save the state when the blitter is active */ + + vga_acc = (!!s->map_addr); + qemu_put_8s(f, &vga_acc); + qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_addr); + qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_end); + qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE); + if (vga_acc) + cirrus_stop_acc(s); } static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id) { CirrusVGAState *s = opaque; + uint8_t vga_acc = 0; if 
(version_id != 1) return -EINVAL; @@ -3091,6 +3133,14 @@ static int cirrus_vga_load(QEMUFile *f, qemu_get_be32s(f, &s->hw_cursor_x); qemu_get_be32s(f, &s->hw_cursor_y); + + qemu_get_8s(f, &vga_acc); + qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_addr); + qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_end); + qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE); + if (vga_acc){ + cirrus_restart_acc(s); + } /* force refresh */ s->graphic_mode = -1; diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/target-i386-dm/helper2.c --- a/tools/ioemu/target-i386-dm/helper2.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/ioemu/target-i386-dm/helper2.c Wed Dec 13 22:52:02 2006 +0800 @@ -525,6 +525,7 @@ int main_loop(void) { extern int vm_running; extern int shutdown_requested; + extern int suspend_requested; CPUState *env = cpu_single_env; int evtchn_fd = xc_evtchn_fd(xce_handle); @@ -542,12 +543,24 @@ int main_loop(void) qemu_system_reset(); reset_requested = 0; } + if (suspend_requested) { + fprintf(logfile, "device model received suspend signal!\n"); + break; + } } /* Wait up to 10 msec. */ main_loop_wait(10); } - destroy_hvm_domain(); + if (!suspend_requested) + destroy_hvm_domain(); + else { + char qemu_file[32]; /* 17-char prefix + up to 11 chars for %d + NUL; the old 20-byte buffer overflowed once domid reached 100 */ + snprintf(qemu_file, sizeof(qemu_file), "/tmp/xen.qemu-dm.%d", domid); + if (qemu_savevm(qemu_file) < 0) + fprintf(stderr, "qemu save fail.\n"); + } + return 0; } diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/target-i386-dm/piix_pci-dm.c --- a/tools/ioemu/target-i386-dm/piix_pci-dm.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/ioemu/target-i386-dm/piix_pci-dm.c Wed Dec 13 22:52:02 2006 +0800 @@ -83,6 +83,11 @@ PCIBus *i440fx_init(void) /* PIIX3 PCI to ISA bridge */ static PCIDevice *piix3_dev; +static int pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num) +{ + /* This is the barber's pole mapping used by Xen. 
*/ + return (irq_num + (pci_dev->devfn >> 3)) & 3; +} static void piix3_write_config(PCIDevice *d, uint32_t address, uint32_t val, int len) @@ -150,3 +155,227 @@ int piix3_init(PCIBus *bus) } void pci_bios_init(void) {} + +/***********************************************************/ +/* XXX: the following should be moved to the PC BIOS */ + +static __attribute__((unused)) uint32_t isa_inb(uint32_t addr) +{ + return cpu_inb(NULL, addr); +} + +static void isa_outb(uint32_t val, uint32_t addr) +{ + cpu_outb(NULL, addr, val); +} + +static __attribute__((unused)) uint32_t isa_inw(uint32_t addr) +{ + return cpu_inw(NULL, addr); +} + +static __attribute__((unused)) void isa_outw(uint32_t val, uint32_t addr) +{ + cpu_outw(NULL, addr, val); +} + +static __attribute__((unused)) uint32_t isa_inl(uint32_t addr) +{ + return cpu_inl(NULL, addr); +} + +static __attribute__((unused)) void isa_outl(uint32_t val, uint32_t addr) +{ + cpu_outl(NULL, addr, val); +} + +static uint32_t pci_bios_io_addr; +static uint32_t pci_bios_mem_addr; +/* host irqs corresponding to PCI irqs A-D */ +static uint8_t pci_irqs[4] = { 5, 6, 10, 11 }; + +static void pci_config_writel(PCIDevice *d, uint32_t addr, uint32_t val) +{ + PCIBus *s = d->bus; + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8); + pci_data_write(s, addr, val, 4); +} + +static void pci_config_writew(PCIDevice *d, uint32_t addr, uint32_t val) +{ + PCIBus *s = d->bus; + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8); + pci_data_write(s, addr, val, 2); +} + +static void pci_config_writeb(PCIDevice *d, uint32_t addr, uint32_t val) +{ + PCIBus *s = d->bus; + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8); + pci_data_write(s, addr, val, 1); +} + +static __attribute__((unused)) uint32_t pci_config_readl(PCIDevice *d, uint32_t addr) +{ + PCIBus *s = d->bus; + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8); + return pci_data_read(s, addr, 4); +} + +static uint32_t pci_config_readw(PCIDevice *d, uint32_t addr) +{ + PCIBus *s = d->bus; + 
addr |= (pci_bus_num(s) << 16) | (d->devfn << 8); + return pci_data_read(s, addr, 2); +} + +static uint32_t pci_config_readb(PCIDevice *d, uint32_t addr) +{ + PCIBus *s = d->bus; + addr |= (pci_bus_num(s) << 16) | (d->devfn << 8); + return pci_data_read(s, addr, 1); +} + +static void pci_set_io_region_addr(PCIDevice *d, int region_num, uint32_t addr) +{ + PCIIORegion *r; + uint16_t cmd; + uint32_t ofs; + + if ( region_num == PCI_ROM_SLOT ) { + ofs = 0x30; + }else{ + ofs = 0x10 + region_num * 4; + } + + pci_config_writel(d, ofs, addr); + r = &d->io_regions[region_num]; + + /* enable memory mappings */ + cmd = pci_config_readw(d, PCI_COMMAND); + if ( region_num == PCI_ROM_SLOT ) + cmd |= 2; + else if (r->type & PCI_ADDRESS_SPACE_IO) + cmd |= 1; + else + cmd |= 2; + pci_config_writew(d, PCI_COMMAND, cmd); +} + +static void pci_bios_init_device(PCIDevice *d) +{ + int class; + PCIIORegion *r; + uint32_t *paddr; + int i, pin, pic_irq, vendor_id, device_id; + + class = pci_config_readw(d, PCI_CLASS_DEVICE); + vendor_id = pci_config_readw(d, PCI_VENDOR_ID); + device_id = pci_config_readw(d, PCI_DEVICE_ID); + switch(class) { + case 0x0101: + if (vendor_id == 0x8086 && device_id == 0x7010) { + /* PIIX3 IDE */ + pci_config_writew(d, 0x40, 0x8000); // enable IDE0 + pci_config_writew(d, 0x42, 0x8000); // enable IDE1 + goto default_map; + } else { + /* IDE: we map it as in ISA mode */ + pci_set_io_region_addr(d, 0, 0x1f0); + pci_set_io_region_addr(d, 1, 0x3f4); + pci_set_io_region_addr(d, 2, 0x170); + pci_set_io_region_addr(d, 3, 0x374); + } + break; + case 0x0680: + if (vendor_id == 0x8086 && device_id == 0x7113) { + /* + * PIIX4 ACPI PM. + * Special device with special PCI config space. No ordinary BARs. 
+ */ + pci_config_writew(d, 0x20, 0x0000); // No smb bus IO enable + pci_config_writew(d, 0x22, 0x0000); + pci_config_writew(d, 0x3c, 0x0009); // Hardcoded IRQ9 + pci_config_writew(d, 0x3d, 0x0001); + } + break; + case 0x0300: + if (vendor_id != 0x1234) + goto default_map; + /* VGA: map frame buffer to default Bochs VBE address */ + pci_set_io_region_addr(d, 0, 0xE0000000); + break; + case 0x0800: + /* PIC */ + vendor_id = pci_config_readw(d, PCI_VENDOR_ID); + device_id = pci_config_readw(d, PCI_DEVICE_ID); + if (vendor_id == 0x1014) { + /* IBM */ + if (device_id == 0x0046 || device_id == 0xFFFF) { + /* MPIC & MPIC2 */ + pci_set_io_region_addr(d, 0, 0x80800000 + 0x00040000); + } + } + break; + case 0xff00: + if (vendor_id == 0x0106b && + (device_id == 0x0017 || device_id == 0x0022)) { + /* macio bridge */ + pci_set_io_region_addr(d, 0, 0x80800000); + } + break; + default: + default_map: + /* default memory mappings */ + for(i = 0; i < PCI_NUM_REGIONS; i++) { + r = &d->io_regions[i]; + if (r->size) { + if (r->type & PCI_ADDRESS_SPACE_IO) + paddr = &pci_bios_io_addr; + else + paddr = &pci_bios_mem_addr; + *paddr = (*paddr + r->size - 1) & ~(r->size - 1); + pci_set_io_region_addr(d, i, *paddr); + *paddr += r->size; + } + } + break; + } + + /* map the interrupt */ + pin = pci_config_readb(d, PCI_INTERRUPT_PIN); + if (pin != 0) { + pin = pci_slot_get_pirq(d, pin - 1); + pic_irq = pci_irqs[pin]; + pci_config_writeb(d, PCI_INTERRUPT_LINE, pic_irq); + } +} + +/* + * This function initializes the PCI devices as a normal PCI BIOS + * would do. It is provided just in case the BIOS has no support for + * PCI. 
+ */ +void pci_setup(void) +{ + int i, irq; + uint8_t elcr[2]; + + pci_bios_io_addr = 0xc000; + pci_bios_mem_addr = HVM_BELOW_4G_MMIO_START; + + /* activate IRQ mappings */ + elcr[0] = 0x00; + elcr[1] = 0x00; + for(i = 0; i < 4; i++) { + irq = pci_irqs[i]; + /* set to trigger level */ + elcr[irq >> 3] |= (1 << (irq & 7)); + /* activate irq remapping in PIIX */ + pci_config_writeb(piix3_dev, 0x60 + i, irq); + } + isa_outb(elcr[0], 0x4d0); + isa_outb(elcr[1], 0x4d1); + + pci_for_each_device(pci_bios_init_device); +} diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/vl.c --- a/tools/ioemu/vl.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/ioemu/vl.c Wed Dec 13 22:52:02 2006 +0800 @@ -4441,6 +4441,11 @@ int qemu_loadvm(const char *filename) qemu_fseek(f, cur_pos + record_len, SEEK_SET); } fclose(f); + + /* del tmp file */ + if (unlink(filename) == -1) + fprintf(stderr, "delete tmp qemu state file failed.\n"); + ret = 0; the_end: if (saved_vm_running) @@ -5027,6 +5032,7 @@ static QEMUResetEntry *first_reset_entry static QEMUResetEntry *first_reset_entry; int reset_requested; int shutdown_requested; +int suspend_requested; static int powerdown_requested; void qemu_register_reset(QEMUResetHandler *func, void *opaque) @@ -5806,6 +5812,14 @@ int set_mm_mapping(int xc_handle, uint32 } return 0; +} + +void suspend(int sig) +{ + fprintf(logfile, "suspend sig handler called with requested=%d!\n", suspend_requested); + if (sig != SIGUSR1) + fprintf(logfile, "suspend signal dismatch, get sig=%d!\n", sig); + suspend_requested = 1; } #if defined(__i386__) || defined(__x86_64__) @@ -6709,8 +6723,12 @@ int main(int argc, char **argv) } } else #endif - if (loadvm) + if (loadvm) { + /*XXX: ugly, since pci_bios_init are moved to hvmloader*/ + extern void pci_setup(void); + pci_setup(); qemu_loadvm(loadvm); + } { /* XXX: simplify init */ @@ -6719,6 +6737,26 @@ int main(int argc, char **argv) vm_start(); } } + + /* register signal for the suspend request when save */ + { + struct sigaction 
act; + sigset_t set; + act.sa_handler = suspend; + act.sa_flags = SA_RESTART; + sigemptyset(&act.sa_mask); + + sigaction(SIGUSR1, &act, NULL); + + /* control panel mask some signals when spawn qemu, need unmask here*/ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGTERM); + if (sigprocmask(SIG_UNBLOCK, &set, NULL) == -1) + fprintf(stderr, "unblock signal fail, possible issue for HVM save!\n"); + + } + main_loop(); quit_timers(); return 0; diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/Makefile --- a/tools/libxc/Makefile Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/libxc/Makefile Wed Dec 13 22:52:02 2006 +0800 @@ -27,7 +27,7 @@ GUEST_SRCS-$(CONFIG_X86) += xc_linux_bui GUEST_SRCS-$(CONFIG_X86) += xc_linux_build.c GUEST_SRCS-$(CONFIG_IA64) += xc_linux_build.c GUEST_SRCS-$(CONFIG_MIGRATE) += xc_linux_restore.c xc_linux_save.c -GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c +GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_restore.c xc_hvm_save.c -include $(XEN_TARGET_ARCH)/Makefile diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/libxc/xc_domain.c Wed Dec 13 22:52:02 2006 +0800 @@ -233,6 +233,50 @@ int xc_domain_getinfolist(int xc_handle, unlock_pages(info, max_domains*sizeof(xc_domaininfo_t)); return ret; +} + +/* get info from hvm guest for save */ +int xc_domain_hvm_getcontext(int xc_handle, + uint32_t domid, + hvm_domain_context_t *hvm_ctxt) +{ + int rc; + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_gethvmcontext; + domctl.domain = (domid_t)domid; + set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt); + + if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 ) + return rc; + + rc = do_domctl(xc_handle, &domctl); + + safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt)); + + return rc; +} + +/* set info to hvm guest for restore */ +int xc_domain_hvm_setcontext(int xc_handle, + uint32_t domid, + hvm_domain_context_t *hvm_ctxt) +{ + int rc; + DECLARE_DOMCTL; + + domctl.cmd = 
XEN_DOMCTL_sethvmcontext; + domctl.domain = domid; + set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt); + + if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 ) + return rc; + + rc = do_domctl(xc_handle, &domctl); + + safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt)); + + return rc; } int xc_vcpu_getcontext(int xc_handle, diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_build.c --- a/tools/libxc/xc_hvm_build.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/libxc/xc_hvm_build.c Wed Dec 13 22:52:02 2006 +0800 @@ -86,7 +86,7 @@ static void build_e820map(void *e820_pag /* 0x0-0x9F000: Ordinary RAM. */ e820entry[nr_map].addr = 0x0; - e820entry[nr_map].size = 0x9F000; + e820entry[nr_map].size = 0x90000; e820entry[nr_map].type = E820_RAM; nr_map++; @@ -96,7 +96,7 @@ static void build_e820map(void *e820_pag * TODO: SMBIOS tables should be moved higher (>=0xE0000). * They are unusually low in our memory map: could cause problems? */ - e820entry[nr_map].addr = 0x9F000; + e820entry[nr_map].addr = 0x90000; e820entry[nr_map].size = 0x1000; e820entry[nr_map].type = E820_RESERVED; nr_map++; diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_linux_save.c --- a/tools/libxc/xc_linux_save.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/libxc/xc_linux_save.c Wed Dec 13 22:52:02 2006 +0800 @@ -261,15 +261,6 @@ static int ratewrite(int io_fd, void *bu #endif -static inline ssize_t write_exact(int fd, void *buf, size_t count) -{ - if(write(fd, buf, count) != count) - return 0; - return 1; -} - - - static int print_stats(int xc_handle, uint32_t domid, int pages_sent, xc_shadow_op_stats_t *stats, int print) { @@ -356,7 +347,7 @@ static int analysis_phase(int xc_handle, } -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, +int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, int dom, xc_dominfo_t *info, vcpu_guest_context_t *ctxt) { diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Fri Sep 15 17:05:38 2006 +0800 
+++ b/tools/libxc/xenctrl.h Wed Dec 13 22:52:02 2006 +0800 @@ -313,6 +313,30 @@ int xc_domain_getinfolist(int xc_handle, xc_domaininfo_t *info); /** + * This function returns information about the context of a hvm domain + * @parm xc_handle a handle to an open hypervisor interface + * @parm domid the domain to get information from + * @parm hvm_ctxt a pointer to a structure to store the execution context of the + * hvm domain + * @return 0 on success, -1 on failure + */ +int xc_domain_hvm_getcontext(int xc_handle, + uint32_t domid, + hvm_domain_context_t *hvm_ctxt); + +/** + * This function will set the context for hvm domain + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm domid the domain to set the hvm domain context for + * @parm hvm_ctxt pointer to the hvm context with the values to set + * @return 0 on success, -1 on failure + */ +int xc_domain_hvm_setcontext(int xc_handle, + uint32_t domid, + hvm_domain_context_t *hvm_ctxt); + +/** * This function returns information about the execution context of a * particular vcpu of a domain. * diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/libxc/xenguest.h Wed Dec 13 22:52:02 2006 +0800 @@ -11,6 +11,7 @@ #define XCFLAGS_LIVE 1 #define XCFLAGS_DEBUG 2 +#define XCFLAGS_HVM 4 /** @@ -25,6 +26,13 @@ int xc_linux_save(int xc_handle, int io_ uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */, int (*suspend)(int domid)); +/** + * This function will save a hvm domain running unmodified guest. + * @return 0 on success, -1 on failure + */ +int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, + uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */, + int (*suspend)(int domid)); /** * This function will restore a saved domain running Linux. 
@@ -41,6 +49,18 @@ int xc_linux_restore(int xc_handle, int unsigned long nr_pfns, unsigned int store_evtchn, unsigned long *store_mfn, unsigned int console_evtchn, unsigned long *console_mfn); + +/** + * This function will restore a saved hvm domain running unmodified guest. + * + * @parm store_mfn pass mem size & returned with the mfn of the store page + * @return 0 on success, -1 on failure + */ +int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom, + unsigned long nr_pfns, unsigned int store_evtchn, + unsigned long *store_mfn, unsigned int console_evtchn, + unsigned long *console_mfn, + unsigned int pae, unsigned int apic); /** * This function will create a domain for a paravirtualized Linux diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xg_save_restore.h --- a/tools/libxc/xg_save_restore.h Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/libxc/xg_save_restore.h Wed Dec 13 22:52:02 2006 +0800 @@ -65,6 +65,16 @@ static int get_platform_info(int xc_hand return 1; } +static inline ssize_t write_exact(int fd, void *buf, size_t count) +{ + if(write(fd, buf, count) != count) + return 0; + return 1; +} + +extern int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, + int dom, xc_dominfo_t *info, + vcpu_guest_context_t *ctxt); /* ** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables. 
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/python/xen/lowlevel/xc/xc.c Wed Dec 13 22:52:02 2006 +0800 @@ -158,6 +158,20 @@ static PyObject *pyxc_domain_destroy(XcO static PyObject *pyxc_domain_destroy(XcObject *self, PyObject *args) { return dom_op(self, args, xc_domain_destroy); +} + +static PyObject *pyxc_domain_shutdown(XcObject *self, PyObject *args) +{ + uint32_t dom, reason; + + if (!PyArg_ParseTuple(args, "ii", &dom, &reason)) + return NULL; + + if (xc_domain_shutdown(self->xc_handle, dom, reason) != 0) + return pyxc_error_to_exception(); + + Py_INCREF(zero); + return zero; } @@ -969,6 +983,14 @@ static PyMethodDef pyxc_methods[] = { METH_VARARGS, "\n" "Destroy a domain.\n" " dom [int]: Identifier of domain to be destroyed.\n\n" + "Returns: [int] 0 on success; -1 on error.\n" }, + + { "domain_shutdown", + (PyCFunction)pyxc_domain_shutdown, + METH_VARARGS, "\n" + "Shutdown a domain.\n" + " dom [int, 0]: Domain identifier to use.\n" + " reason [int, 0]: Reason for shutdown.\n" "Returns: [int] 0 on success; -1 on error.\n" }, { "vcpu_setaffinity", diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/XendCheckpoint.py --- a/tools/python/xen/xend/XendCheckpoint.py Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/python/xen/xend/XendCheckpoint.py Wed Dec 13 22:52:02 2006 +0800 @@ -22,11 +22,14 @@ from xen.xend.XendConstants import * from xen.xend.XendConstants import * SIGNATURE = "LinuxGuestRecord" +QEMU_SIGNATURE = "QemuDeviceModelRecord" +dm_batch = 512 XC_SAVE = "xc_save" XC_RESTORE = "xc_restore" sizeof_int = calcsize("i") +sizeof_unsigned_int = calcsize("I") sizeof_unsigned_long = calcsize("L") @@ -69,6 +72,11 @@ def save(fd, dominfo, network, live, dst "could not write guest state file: config len") write_exact(fd, config, "could not write guest state file: config") + image_cfg = dominfo.info.get('image', {}) + hvm = image_cfg.has_key('hvm') + + if 
hvm: + log.info("save hvm domain") # xc_save takes three customization parameters: maxit, max_f, and # flags the last controls whether or not save is 'live', while the # first two further customize behaviour when 'live' save is @@ -76,7 +84,7 @@ def save(fd, dominfo, network, live, dst # libxenguest; see the comments and/or code in xc_linux_save() for # more information. cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd), - str(dominfo.getDomid()), "0", "0", str(int(live)) ] + str(dominfo.getDomid()), "0", "0", str(int(live) | (int(hvm) << 2)) ] log.debug("[xc_save]: %s", string.join(cmd)) def saveInputHandler(line, tochild): @@ -90,11 +98,28 @@ def save(fd, dominfo, network, live, dst log.info("Domain %d suspended.", dominfo.getDomid()) dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP3, domain_name) + #send signal to device model for save + if hvm: + log.info("release_devices for hvm domain") + dominfo._releaseDevices(True) tochild.write("done\n") tochild.flush() log.debug('Written done') forkHelper(cmd, fd, saveInputHandler, False) + + # put qemu device model state + if hvm: + write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature") + qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(), os.O_RDONLY) + while True: + buf = os.read(qemu_fd, dm_batch) + if len(buf): + write_exact(fd, buf, "could not write device model state") + else: + break + os.close(qemu_fd) + os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid()) dominfo.destroyDomain() try: @@ -147,19 +172,38 @@ def restore(xd, fd, dominfo = None, paus assert store_port assert console_port + #if hvm, pass mem size to calculate the store_mfn + hvm = 0 + apic = 0 + pae = 0 + image_cfg = dominfo.info.get('image', {}) + hvm = image_cfg.has_key('hvm') + if hvm: + #the 'memory' in config has been removed + hvm = dominfo.info['memory_static_min'] + apic = dominfo.info['image']['hvm'].get('apic', 0) + pae = dominfo.info['image']['hvm'].get('pae', 0) + log.info("restore hvm domain %d, mem=%d, apic=%d, 
pae=%d", dominfo.domid, hvm, apic, pae) + try: - l = read_exact(fd, sizeof_unsigned_long, - "not a valid guest state file: pfn count read") - nr_pfns = unpack("L", l)[0] # native sizeof long + if hvm: + l = read_exact(fd, sizeof_unsigned_int, + "not a valid hvm guest state file: pfn count read") + nr_pfns = unpack("I", l)[0] # native sizeof int + else: + l = read_exact(fd, sizeof_unsigned_long, + "not a valid guest state file: pfn count read") + nr_pfns = unpack("L", l)[0] # native sizeof long if nr_pfns > 16*1024*1024: # XXX raise XendError( "not a valid guest state file: pfn count out of range") balloon.free(xc.pages_to_kib(nr_pfns)) + log.info("HVM restore:balloon free 0x%x pages.", nr_pfns) cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE), fd, dominfo.getDomid(), nr_pfns, - store_port, console_port]) + store_port, console_port, hvm, pae, apic]) log.debug("[xc_restore]: %s", string.join(cmd)) handler = RestoreInputHandler() @@ -169,10 +213,29 @@ def restore(xd, fd, dominfo = None, paus if handler.store_mfn is None or handler.console_mfn is None: raise XendError('Could not read store/console MFN') - os.read(fd, 1) # Wait for source to close connection dominfo.waitForDevices() # Wait for backends to set up if not paused: dominfo.unpause() + + # get qemu state and create a tmp file for dm restore + if hvm: + qemu_signature = read_exact(fd, len(QEMU_SIGNATURE), + "not a valid device model state: signature read") + if qemu_signature != QEMU_SIGNATURE: + raise XendError("not a valid device model state: found '%s'" % + qemu_signature) + qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(), + os.O_WRONLY | os.O_CREAT | os.O_TRUNC) + while True: + buf = os.read(fd, dm_batch) + if len(buf): + write_exact(qemu_fd, buf, "could not write dm state to tmp file") + else: + break + os.close(qemu_fd) + + os.read(fd, 1) # Wait for source to close connection + dominfo.completeRestore(handler.store_mfn, handler.console_mfn) diff -r 7c0030214af1 -r 3c0bd8907fd9 
tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/python/xen/xend/XendDomainInfo.py Wed Dec 13 22:52:02 2006 +0800 @@ -488,6 +488,16 @@ class XendDomainInfo: self._removeVm('xend/previous_restart_time') self.storeDom("control/shutdown", reason) + ## shutdown hypercall for hvm domain besides xenstore write + image_cfg = self.info.get('image', {}) + hvm = image_cfg.has_key('hvm') + if hvm: + for code in DOMAIN_SHUTDOWN_REASONS.keys(): + if DOMAIN_SHUTDOWN_REASONS[code] == reason: + break + xc.domain_shutdown(self.domid, code) + + def pause(self): """Pause domain @@ -1203,8 +1213,11 @@ class XendDomainInfo: if self.image: self.image.createDeviceModel() - def _releaseDevices(self): + def _releaseDevices(self, suspend = False): """Release all domain's devices. Nothrow guarantee.""" + if suspend and self.image: + self.image.destroy(suspend) + return while True: t = xstransact("%s/device" % self.dompath) @@ -1473,6 +1486,16 @@ class XendDomainInfo: self.console_mfn = console_mfn self._introduceDomain() + image_cfg = self.info.get('image', {}) + hvm = image_cfg.has_key('hvm') + if hvm: + self.image = image.create(self, + self.info, + self.info['image'], + self.info['devices']) + if self.image: + self.image.createDeviceModel(True) + self.image.register_shutdown_watch() self._storeDomDetails() self._registerWatches() self.refreshShutdown() diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/python/xen/xend/image.py Wed Dec 13 22:52:02 2006 +0800 @@ -157,7 +157,7 @@ class ImageHandler: """Build the domain. 
Define in subclass.""" raise NotImplementedError() - def createDeviceModel(self): + def createDeviceModel(self, restore = False): """Create device model for the domain (define in subclass if needed).""" pass @@ -405,7 +405,7 @@ class HVMImageHandler(ImageHandler): return ret - def createDeviceModel(self): + def createDeviceModel(self, restore = False): if self.pid: return # Execute device model. @@ -414,6 +414,8 @@ class HVMImageHandler(ImageHandler): args = args + ([ "-d", "%d" % self.vm.getDomid(), "-m", "%s" % (self.getRequiredInitialReservation() / 1024)]) args = args + self.dmargs + if restore: + args = args + ([ "-loadvm", "/tmp/xen.qemu-dm.%d" % self.vm.getDomid() ]) env = dict(os.environ) if self.display: env['DISPLAY'] = self.display @@ -432,12 +434,16 @@ class HVMImageHandler(ImageHandler): self.register_reboot_feature_watch() self.pid = self.vm.gatherDom(('image/device-model-pid', int)) - def destroy(self): + def destroy(self, suspend = False): self.unregister_shutdown_watch() self.unregister_reboot_feature_watch(); if self.pid: try: - os.kill(self.pid, signal.SIGKILL) + sig = signal.SIGKILL + if suspend: + log.info("use sigusr1 to signal qemu %d", self.pid) + sig = signal.SIGUSR1 + os.kill(self.pid, sig) except OSError, exn: log.exception(exn) try: diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_restore.c --- a/tools/xcutils/xc_restore.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/xcutils/xc_restore.c Wed Dec 13 22:52:02 2006 +0800 @@ -19,12 +19,13 @@ main(int argc, char **argv) main(int argc, char **argv) { unsigned int xc_fd, io_fd, domid, nr_pfns, store_evtchn, console_evtchn; + unsigned int hvm, pae, apic; int ret; unsigned long store_mfn, console_mfn; - if (argc != 6) + if (argc != 9) errx(1, - "usage: %s iofd domid nr_pfns store_evtchn console_evtchn", + "usage: %s iofd domid nr_pfns store_evtchn console_evtchn hvm pae apic", argv[0]); xc_fd = xc_interface_open(); @@ -36,9 +37,19 @@ main(int argc, char **argv) nr_pfns = atoi(argv[3]); 
store_evtchn = atoi(argv[4]); console_evtchn = atoi(argv[5]); + hvm = atoi(argv[6]); + pae = atoi(argv[7]); + apic = atoi(argv[8]); - ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn, - &store_mfn, console_evtchn, &console_mfn); + if (hvm) { + /* pass the memsize to xc_hvm_restore to find the store_mfn */ + store_mfn = hvm; + ret = xc_hvm_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn, + &store_mfn, console_evtchn, &console_mfn, pae, apic); + } else + ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn, + &store_mfn, console_evtchn, &console_mfn); + if (ret == 0) { printf("store-mfn %li\n", store_mfn); printf("console-mfn %li\n", console_mfn); diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_save.c --- a/tools/xcutils/xc_save.c Fri Sep 15 17:05:38 2006 +0800 +++ b/tools/xcutils/xc_save.c Wed Dec 13 22:52:02 2006 +0800 @@ -51,7 +51,10 @@ main(int argc, char **argv) max_f = atoi(argv[4]); flags = atoi(argv[5]); - ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend); + if (flags & XCFLAGS_HVM) + ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend); + else + ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend); xc_interface_close(xc_fd); diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/domain.c Wed Dec 13 22:52:02 2006 +0800 @@ -330,6 +330,7 @@ int arch_set_info_guest( else { hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs); + hvm_load_cpu_context(v, &v->arch.guest_context.hvmcpu_ctxt); } if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/domctl.c Wed Dec 13 22:52:02 2006 +0800 @@ -297,6 +297,7 @@ void arch_getdomaininfo_ctxt( if ( is_hvm_vcpu(v) ) { hvm_store_cpu_guest_regs(v, &c->user_regs, c->ctrlreg); + hvm_save_cpu_context(v, 
&c->hvmcpu_ctxt); } else { @@ -314,6 +315,22 @@ void arch_getdomaininfo_ctxt( c->ctrlreg[3] = xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table)); c->vm_assist = v->domain->vm_assist; +} + +int arch_gethvm_ctxt( + struct vcpu *v, struct hvm_domain_context *c) +{ + if ( !is_hvm_vcpu(v) ) + return -1; + + return hvm_save(v, c); + +} + +int arch_sethvm_ctxt( + struct vcpu *v, struct hvm_domain_context *c) +{ + return hvm_load(v, c); } /* diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/hvm/hvm.c Wed Dec 13 22:52:02 2006 +0800 @@ -182,9 +182,18 @@ int hvm_domain_initialise(struct domain void hvm_domain_destroy(struct domain *d) { + HVMStateEntry *se, *dse; kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer); rtc_deinit(d); pmtimer_deinit(d); + + se = d->arch.hvm_domain.first_se; + while (se) { + dse = se; + se = se->next; + xfree(dse); + } + if ( d->arch.hvm_domain.shared_page_va ) unmap_domain_page_global( @@ -225,6 +234,9 @@ int hvm_vcpu_initialise(struct vcpu *v) pit_init(v, cpu_khz); rtc_init(v, RTC_PORT(0), RTC_IRQ); pmtimer_init(v, ACPI_PM_TMR_BLK_ADDRESS); + + /* init hvm sharepage */ + shpage_init(v->domain, get_sp(v->domain)); /* Init guest TSC to start from zero. 
*/ hvm_set_guest_time(v, 0); diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/i8254.c --- a/xen/arch/x86/hvm/i8254.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/hvm/i8254.c Wed Dec 13 22:52:02 2006 +0800 @@ -203,11 +203,11 @@ static inline void pit_load_count(PITCha switch (s->mode) { case 2: /* create periodic time */ - s->pt = create_periodic_time (period, 0, 0, pit_time_fired, s); + s->pt = create_periodic_time (current->domain, period, 0, 0, pit_time_fired, s); break; case 1: /* create one shot time */ - s->pt = create_periodic_time (period, 0, 1, pit_time_fired, s); + s->pt = create_periodic_time (current->domain, period, 0, 1, pit_time_fired, s); #ifdef DEBUG_PIT printk("HVM_PIT: create one shot time.\n"); #endif @@ -345,6 +345,152 @@ static uint32_t pit_ioport_read(void *op return ret; } +#ifdef HVM_DEBUG_SUSPEND +static void pit_info(PITState *pit) +{ + PITChannelState *s; + int i; + + for(i = 0; i < 3; i++) { + printk("*****pit channel %d's state:*****\n", i); + s = &pit->channels[i]; + printk("pit 0x%x.\n", s->count); + printk("pit 0x%x.\n", s->latched_count); + printk("pit 0x%x.\n", s->count_latched); + printk("pit 0x%x.\n", s->status_latched); + printk("pit 0x%x.\n", s->status); + printk("pit 0x%x.\n", s->read_state); + printk("pit 0x%x.\n", s->write_state); + printk("pit 0x%x.\n", s->write_latch); + printk("pit 0x%x.\n", s->rw_mode); + printk("pit 0x%x.\n", s->mode); + printk("pit 0x%x.\n", s->bcd); + printk("pit 0x%x.\n", s->gate); + printk("pit %"PRId64"\n", s->count_load_time); + + if (s->pt) { + struct periodic_time *pt = s->pt; + printk("pit channel %d has a periodic timer:\n", i); + printk("pt %d.\n", pt->enabled); + printk("pt %d.\n", pt->one_shot); + printk("pt %d.\n", pt->irq); + printk("pt %d.\n", pt->first_injected); + + printk("pt %d.\n", pt->pending_intr_nr); + printk("pt %d.\n", pt->period); + printk("pt %"PRId64"\n", pt->period_cycles); + printk("pt %"PRId64"\n", pt->last_plt_gtime); + } + } + +} +#else +static void 
pit_info(PITState *pit) +{ +} +#endif + +static void pit_save(hvm_domain_context_t *h, void *opaque) +{ + struct domain *d = opaque; + PITState *pit = &d->arch.hvm_domain.pl_time.vpit; + PITChannelState *s; + struct periodic_time *pt; + int i, pti = -1; + + pit_info(pit); + + for(i = 0; i < 3; i++) { + s = &pit->channels[i]; + hvm_put_32u(h, s->count); + hvm_put_16u(h, s->latched_count); + hvm_put_8u(h, s->count_latched); + hvm_put_8u(h, s->status_latched); + hvm_put_8u(h, s->status); + hvm_put_8u(h, s->read_state); + hvm_put_8u(h, s->write_state); + hvm_put_8u(h, s->write_latch); + hvm_put_8u(h, s->rw_mode); + hvm_put_8u(h, s->mode); + hvm_put_8u(h, s->bcd); + hvm_put_8u(h, s->gate); + hvm_put_64u(h, s->count_load_time); + + if (s->pt && pti == -1) + pti = i; + } + + /* save guest time */ + pt = pit->channels[pti].pt; + hvm_put_8u(h, pti); + hvm_put_8u(h, pt->first_injected); + hvm_put_32u(h, pt->pending_intr_nr); + hvm_put_64u(h, pt->last_plt_gtime); + +} + +static int pit_load(hvm_domain_context_t *h, void *opaque, int version_id) +{ + struct domain *d = opaque; + PITState *pit = &d->arch.hvm_domain.pl_time.vpit; + PITChannelState *s; + int i, pti; + u32 period; + + if (version_id != 1) + return -EINVAL; + + for(i = 0; i < 3; i++) { + s = &pit->channels[i]; + s->count = hvm_get_32u(h); + s->latched_count = hvm_get_16u(h); + s->count_latched = hvm_get_8u(h); + s->status_latched = hvm_get_8u(h); + s->status = hvm_get_8u(h); + s->read_state = hvm_get_8u(h); + s->write_state = hvm_get_8u(h); + s->write_latch = hvm_get_8u(h); + s->rw_mode = hvm_get_8u(h); + s->mode = hvm_get_8u(h); + s->bcd = hvm_get_8u(h); + s->gate = hvm_get_8u(h); + s->count_load_time = hvm_get_64u(h); + } + + pti = hvm_get_8u(h); + if ( pti < 0 || pti > 2) { + printk("pit load get a wrong channel %d when HVM resume.\n", pti); + return -EINVAL; + } + + s = &pit->channels[pti]; + period = DIV_ROUND((s->count * 1000000000ULL), PIT_FREQ); + + printk("recreate periodic timer %d in mode %d, 
freq=%d.\n", pti, s->mode, period); + switch (s->mode) { + case 2: + /* create periodic time */ + s->pt = create_periodic_time (d, period, 0, 0, pit_time_fired, s); + s->pt->first_injected = hvm_get_8u(h); + s->pt->pending_intr_nr = hvm_get_32u(h); + s->pt->last_plt_gtime = hvm_get_64u(h); + break; + case 1: + /* create one shot time */ + s->pt = create_periodic_time (d, period, 0, 1, pit_time_fired, s); + break; + default: + printk("pit mode %"PRId8" should not use periodic timer!\n", s->mode); + return -EINVAL; + } + + /*XXX: need set_guest_time here or do this when post_inject? */ + + pit_info(pit); + + return 0; +} + static void pit_reset(void *opaque) { PITState *pit = opaque; @@ -373,6 +519,8 @@ void pit_init(struct vcpu *v, unsigned l s->vcpu = v; s++; s->vcpu = v; s++; s->vcpu = v; + + hvm_register_savevm(v->domain, "xen_hvm_i8254", PIT_BASE, 1, pit_save, pit_load, v->domain); register_portio_handler(v->domain, PIT_BASE, 4, handle_pit_io); /* register the speaker port */ diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/intercept.c --- a/xen/arch/x86/hvm/intercept.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/hvm/intercept.c Wed Dec 13 22:52:02 2006 +0800 @@ -29,6 +29,8 @@ #include #include #include +#include +#include extern struct hvm_mmio_handler vlapic_mmio_handler; @@ -314,13 +316,14 @@ void pickup_deactive_ticks(struct period * period: fire frequency in ns. 
*/ struct periodic_time * create_periodic_time( + struct domain *d, u32 period, char irq, char one_shot, time_cb *cb, void *data) { - struct periodic_time *pt = &(current->domain->arch.hvm_domain.pl_time.periodic_tm); + struct periodic_time *pt = &(d->arch.hvm_domain.pl_time.periodic_tm); if ( pt->enabled ) { stop_timer (&pt->timer); pt->enabled = 0; @@ -353,6 +356,278 @@ void destroy_periodic_time(struct period stop_timer(&pt->timer); pt->enabled = 0; } +} + +/* save/restore support */ +#define HVM_FILE_MAGIC 0x54381286 +#define HVM_FILE_VERSION 0x00000001 + +int hvm_register_savevm(struct domain *d, + const char *idstr, + int instance_id, + int version_id, + SaveStateHandler *save_state, + LoadStateHandler *load_state, + void *opaque) +{ + HVMStateEntry *se, **pse; + + if ( (se = xmalloc(struct HVMStateEntry)) == NULL ){ + printk("allocat hvmstate entry fail.\n"); + return -1; + } + + strncpy(se->idstr, idstr, HVM_SE_IDSTR_LEN); + + se->instance_id = instance_id; + se->version_id = version_id; + se->save_state = save_state; + se->load_state = load_state; + se->opaque = opaque; + se->next = NULL; + + /* add at the end of list */ + pse = &d->arch.hvm_domain.first_se; + while (*pse != NULL) + pse = &(*pse)->next; + *pse = se; + return 0; +} + +int hvm_save(struct vcpu *v, hvm_domain_context_t *h) +{ + uint32_t len, len_pos, cur_pos; + uint32_t eax, ebx, ecx, edx; + HVMStateEntry *se; + char *chgset; + + if (!is_hvm_vcpu(v)) { + printk("hvm_save only for hvm guest!\n"); + return -1; + } + + memset(h, 0, sizeof(hvm_domain_context_t)); + hvm_put_32u(h, HVM_FILE_MAGIC); + hvm_put_32u(h, HVM_FILE_VERSION); + + /* save xen changeset */ + chgset = strrchr(XEN_CHANGESET, ' ') + 1; + + len = strlen(chgset); + hvm_put_8u(h, len); + hvm_put_buffer(h, chgset, len); + + /* save cpuid */ + cpuid(1, &eax, &ebx, &ecx, &edx); + hvm_put_32u(h, eax); + + for(se = v->domain->arch.hvm_domain.first_se; se != NULL; se = se->next) { + /* ID string */ + len = strnlen(se->idstr, 
HVM_SE_IDSTR_LEN); + hvm_put_8u(h, len); + hvm_put_buffer(h, se->idstr, len); + + hvm_put_32u(h, se->instance_id); + hvm_put_32u(h, se->version_id); + + /* record size */ + len_pos = hvm_ctxt_tell(h); + hvm_put_32u(h, 0); + + se->save_state(h, se->opaque); + + cur_pos = hvm_ctxt_tell(h); + len = cur_pos - len_pos - 4; + hvm_ctxt_seek(h, len_pos); + hvm_put_32u(h, len); + hvm_ctxt_seek(h, cur_pos); + + } + + h->size = hvm_ctxt_tell(h); + hvm_ctxt_seek(h, 0); + + if (h->size >= HVM_CTXT_SIZE) { + printk("hvm_domain_context overflow when hvm_save! need %"PRId32" bytes for use.\n", h->size); + return -1; + } + + return 0; + +} + +static HVMStateEntry *find_se(struct domain *d, const char *idstr, int instance_id) +{ + HVMStateEntry *se; + + for(se = d->arch.hvm_domain.first_se; se != NULL; se = se->next) { + if (!strncmp(se->idstr, idstr, HVM_SE_IDSTR_LEN) && + instance_id == se->instance_id){ + return se; + } + } + return NULL; +} + +int hvm_load(struct vcpu *v, hvm_domain_context_t *h) +{ + uint32_t len, rec_len, rec_pos, magic, instance_id, version_id; + uint32_t eax, ebx, ecx, edx; + HVMStateEntry *se; + char idstr[HVM_SE_IDSTR_LEN]; + xen_changeset_info_t chgset; + char *cur_chgset; + int ret; + + if (!is_hvm_vcpu(v)) { + printk("hvm_load only for hvm guest!\n"); + return -1; + } + + if (h->size >= HVM_CTXT_SIZE) { + printk("hvm_load fail! seems hvm_domain_context overflow when hvm_save! 
need %"PRId32" bytes.\n", h->size); + return -1; + } + + hvm_ctxt_seek(h, 0); + + magic = hvm_get_32u(h); + if (magic != HVM_FILE_MAGIC) { + printk("HVM restore magic dismatch!\n"); + return -1; + } + + magic = hvm_get_32u(h); + if (magic != HVM_FILE_VERSION) { + printk("HVM restore version dismatch!\n"); + return -1; + } + + /* check xen change set */ + cur_chgset = strrchr(XEN_CHANGESET, ' ') + 1; + + len = hvm_get_8u(h); + if (len > 20) { /*typical length is 18 -- "revision number:changeset id" */ + printk("wrong change set length %d when hvm restore!\n", len); + return -1; + } + + hvm_get_buffer(h, chgset, len); + chgset[len] = '\0'; + if (strncmp(cur_chgset, chgset, len + 1)) + printk("warnings: try to restore hvm guest(%s) on a different changeset %s.\n", + chgset, cur_chgset); + + /* check cpuid */ + cpuid(1, &eax, &ebx, &ecx, &edx); + ebx = hvm_get_32u(h); + /*TODO: need difine how big difference is acceptable */ + if (ebx != eax) + printk("warnings: try to restore hvm guest(0x%"PRIx32") " + "on a different type processor(0x%"PRIx32").\n", + ebx, + eax); + + while(1) { + if (hvm_ctxt_end(h)) { + break; + } + + /* ID string */ + len = hvm_get_8u(h); + if (len > HVM_SE_IDSTR_LEN) { + printk("wrong HVM save entry idstr len %d!", len); + return -1; + } + + hvm_get_buffer(h, idstr, len); + idstr[len] = '\0'; + + instance_id = hvm_get_32u(h); + version_id = hvm_get_32u(h); + + rec_len = hvm_get_32u(h); + rec_pos = hvm_ctxt_tell(h); + + se = find_se(v->domain, idstr, instance_id); + if (se == NULL) { + printk("warnings: hvm load can't find device %s's instance %d!\n", + idstr, instance_id); + } else { + ret = se->load_state(h, se->opaque, version_id); + if (ret < 0) + printk("warnings: loading state fail for device %s instance %d!\n", + idstr, instance_id); + } + + + /* make sure to jump end of record */ + if ( hvm_ctxt_tell(h) - rec_pos != rec_len) { + printk("wrong hvm record size, maybe some dismatch between save&restore handler!\n"); + } + hvm_ctxt_seek(h, 
rec_pos + rec_len); + } + + return 0; +} + +#ifdef HVM_DEBUG_SUSPEND +static void shpage_info(shared_iopage_t *sh) +{ + + vcpu_iodata_t *p = &sh->vcpu_iodata[0]; + ioreq_t *req = &p->vp_ioreq; + printk("*****sharepage_info******!\n"); + printk("vp_eport=%d\n", p->vp_eport); + printk("io packet: " + "state:%x, pvalid: %x, dir:%x, port: %"PRIx64", " + "data: %"PRIx64", count: %"PRIx64", size: %"PRIx64"\n", + req->state, req->data_is_ptr, req->dir, req->addr, + req->data, req->count, req->size); +} +#else +static void shpage_info(shared_iopage_t *sh) +{ +} +#endif + +static void shpage_save(hvm_domain_context_t *h, void *opaque) +{ + /* XXX:no action required for shpage save/restore, since it's in guest memory + * keep it for debug purpose only */ + +#if 0 + struct shared_iopage *s = opaque; + /* XXX:smp */ + struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq; + + shpage_info(s); + + hvm_put_buffer(h, (char*)req, sizeof(struct ioreq)); +#endif +} + +static int shpage_load(hvm_domain_context_t *h, void *opaque, int version_id) +{ + struct shared_iopage *s = opaque; +#if 0 + /* XXX:smp */ + struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq; + + if (version_id != 1) + return -EINVAL; + + hvm_get_buffer(h, (char*)req, sizeof(struct ioreq)); + + +#endif + shpage_info(s); + return 0; +} + +void shpage_init(struct domain *d, shared_iopage_t *sp) +{ + hvm_register_savevm(d, "xen_hvm_shpage", 0x10, 1, shpage_save, shpage_load, sp); } /* diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vioapic.c --- a/xen/arch/x86/hvm/vioapic.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/hvm/vioapic.c Wed Dec 13 22:52:02 2006 +0800 @@ -466,10 +466,138 @@ void vioapic_update_EOI(struct domain *d spin_unlock(&hvm_irq->lock); } +#ifdef HVM_DEBUG_SUSPEND +static void ioapic_info(struct vioapic *s) +{ + int i; + printk("*****ioapic state:*****\n"); + printk("ioapic 0x%x.\n", s->ioregsel); + printk("ioapic 0x%x.\n", s->id); + printk("ioapic 0x%lx.\n", s->base_address); + for (i = 0; i < 
VIOAPIC_NUM_PINS; i++) { + printk("ioapic redirtbl[%d]:0x%"PRIx64"\n", i, s->redirtbl[i].bits); + } + +} +static void hvmirq_info(struct hvm_irq *hvm_irq) +{ + int i; + printk("*****hvmirq state:*****\n"); + for (i = 0; i < BITS_TO_LONGS(32*4); i++) + printk("hvmirq pci_intx[%d]:0x%lx.\n", i, hvm_irq->pci_intx[i]); + + for (i = 0; i < BITS_TO_LONGS(16); i++) + printk("hvmirq isa_irq[%d]:0x%lx.\n", i, hvm_irq->isa_irq[i]); + + for (i = 0; i < BITS_TO_LONGS(1); i++) + printk("hvmirq callback_irq_wire[%d]:0x%lx.\n", i, hvm_irq->callback_irq_wire[i]); + + printk("hvmirq callback_gsi:0x%x.\n", hvm_irq->callback_gsi); + + for (i = 0; i < 4; i++) + printk("hvmirq pci_link_route[%d]:0x%"PRIx8".\n", i, hvm_irq->pci_link_route[i]); + + for (i = 0; i < 4; i++) + printk("hvmirq pci_link_assert_count[%d]:0x%"PRIx8".\n", i, hvm_irq->pci_link_assert_count[i]); + + for (i = 0; i < 4; i++) + printk("hvmirq gsi_assert_count[%d]:0x%"PRIx8".\n", i, hvm_irq->gsi_assert_count[i]); + + printk("hvmirq round_robin_prev_vcpu:0x%"PRIx8".\n", hvm_irq->round_robin_prev_vcpu); +} +#else +static void ioapic_info(struct vioapic *s) +{ +} +static void hvmirq_info(struct hvm_irq *hvm_irq) +{ +} +#endif + +static void ioapic_save(hvm_domain_context_t *h, void *opaque) +{ + int i; + struct domain *d = opaque; + struct vioapic *s = domain_vioapic(d); + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; + + ioapic_info(s); + hvmirq_info(hvm_irq); + + /* save iopaic state*/ + hvm_put_32u(h, s->ioregsel); + hvm_put_32u(h, s->id); + hvm_put_64u(h, s->base_address); + for (i = 0; i < VIOAPIC_NUM_PINS; i++) { + hvm_put_64u(h, s->redirtbl[i].bits); + } + + /* save hvm irq state */ + hvm_put_buffer(h, (char*)hvm_irq->pci_intx, 16); + hvm_put_buffer(h, (char*)hvm_irq->isa_irq, 2); + hvm_put_buffer(h, (char*)hvm_irq->callback_irq_wire, 1); + hvm_put_32u(h, hvm_irq->callback_gsi); + + for (i = 0; i < 4; i++) + hvm_put_8u(h, hvm_irq->pci_link_route[i]); + + for (i = 0; i < 4; i++) + hvm_put_8u(h, 
hvm_irq->pci_link_assert_count[i]); + + for (i = 0; i < VIOAPIC_NUM_PINS; i++) + hvm_put_8u(h, hvm_irq->gsi_assert_count[i]); + + hvm_put_8u(h, hvm_irq->round_robin_prev_vcpu); + +} + +static int ioapic_load(hvm_domain_context_t *h, void *opaque, int version_id) +{ + int i; + struct domain *d = opaque; + struct vioapic *s = domain_vioapic(d); + struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; + + if (version_id != 1) + return -EINVAL; + + /* restore ioapic state */ + s->ioregsel = hvm_get_32u(h); + s->id = hvm_get_32u(h); + s->base_address = hvm_get_64u(h); + for (i = 0; i < VIOAPIC_NUM_PINS; i++) { + s->redirtbl[i].bits = hvm_get_64u(h); + } + + /* restore irq state */ + hvm_get_buffer(h, (char*)hvm_irq->pci_intx, 16); + hvm_get_buffer(h, (char*)hvm_irq->isa_irq, 2); + hvm_get_buffer(h, (char*)hvm_irq->callback_irq_wire, 1); + hvm_irq->callback_gsi = hvm_get_32u(h); + + for (i = 0; i < 4; i++) + hvm_irq->pci_link_route[i] = hvm_get_8u(h); + + for (i = 0; i < 4; i++) + hvm_irq->pci_link_assert_count[i] = hvm_get_8u(h); + + for (i = 0; i < VIOAPIC_NUM_PINS; i++) + hvm_irq->gsi_assert_count[i] = hvm_get_8u(h); + + hvm_irq->round_robin_prev_vcpu = hvm_get_8u(h); + + ioapic_info(s); + hvmirq_info(hvm_irq); + + return 0; +} + void vioapic_init(struct domain *d) { struct vioapic *vioapic = domain_vioapic(d); int i; + + hvm_register_savevm(d, "xen_hvm_ioapic", 0, 1, ioapic_save, ioapic_load, d); memset(vioapic, 0, sizeof(*vioapic)); for ( i = 0; i < VIOAPIC_NUM_PINS; i++ ) diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vlapic.c --- a/xen/arch/x86/hvm/vlapic.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/hvm/vlapic.c Wed Dec 13 22:52:02 2006 +0800 @@ -921,6 +921,82 @@ static int vlapic_reset(struct vlapic *v return 1; } +#ifdef HVM_DEBUG_SUSPEND +static void lapic_info(struct vlapic *s) +{ + printk("*****lapic state:*****\n"); + printk("lapic 0x%"PRIx64".\n", s->apic_base_msr); + printk("lapic 0x%x.\n", s->disabled); + printk("lapic 0x%x.\n", 
s->timer_divisor); + printk("lapic 0x%x.\n", s->timer_pending_count); +} +#else +static void lapic_info(struct vlapic *s) +{ +} +#endif + +static void lapic_save(hvm_domain_context_t *h, void *opaque) +{ + struct vlapic *s = opaque; + + lapic_info(s); + + hvm_put_64u(h, s->apic_base_msr); + hvm_put_32u(h, s->disabled); + hvm_put_32u(h, s->timer_divisor); + + /*XXX: need this?*/ + hvm_put_32u(h, s->timer_pending_count); + + hvm_put_buffer(h, (char*)s->regs, 0x3f0); + +} + +static int lapic_load(hvm_domain_context_t *h, void *opaque, int version_id) +{ + struct vlapic *s = opaque; + uint32_t tmict; + + if (version_id != 1) + return -EINVAL; + + s->apic_base_msr = hvm_get_64u(h); + s->disabled = hvm_get_32u(h); + s->timer_divisor = hvm_get_32u(h); + + /*XXX: need this?*/ + s->timer_pending_count = hvm_get_32u(h); + + hvm_get_buffer(h, (char*)s->regs, 0x3f0); + + /* rearm the actiemr if needed */ + tmict = vlapic_get_reg(s, APIC_TMICT); + if (tmict > 0) { + s_time_t now = NOW(), offset; + stop_timer(&s->vlapic_timer); + vlapic_set_reg(s, APIC_TMCCT, tmict); + s->timer_last_update = now; + + offset = APIC_BUS_CYCLE_NS * s->timer_divisor * tmict; + + set_timer(&s->vlapic_timer, now + offset); + + printk("lapic_load to rearm the actimer:" + "bus cycle is %"PRId64"ns, now 0x%016"PRIx64", " + "timer initial count 0x%x, offset 0x%016"PRIx64", " + "expire @ 0x%016"PRIx64".", + APIC_BUS_CYCLE_NS, now, + vlapic_get_reg(s, APIC_TMICT), + offset, now + offset); + } + + + lapic_info(s); + + return 0; +} + int vlapic_init(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); @@ -939,6 +1015,7 @@ int vlapic_init(struct vcpu *v) vlapic->regs = map_domain_page_global(page_to_mfn(vlapic->regs_page)); memset(vlapic->regs, 0, PAGE_SIZE); + hvm_register_savevm(v->domain, "xen_hvm_lapic", v->vcpu_id, 1, lapic_save, lapic_load, vlapic); vlapic_reset(vlapic); vlapic->apic_base_msr = MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; diff -r 7c0030214af1 -r 3c0bd8907fd9 
xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Dec 13 22:52:02 2006 +0800 @@ -426,6 +426,319 @@ static void vmx_store_cpu_guest_regs( vmx_vmcs_exit(v); } +static int __get_instruction_length(void); +int vmx_vmcs_save(struct vcpu *v, struct vmcs_data *c) +{ + unsigned long inst_len; + + inst_len = __get_instruction_length(); + c->eip = __vmread(GUEST_RIP); + +#ifdef HVM_DEBUG_SUSPEND + printk("vmx_vmcs_save: inst_len=0x%lx, eip=0x%"PRIx64".\n", + inst_len, c->eip); +#endif + + c->esp = __vmread(GUEST_RSP); + c->eflags = __vmread(GUEST_RFLAGS); + + c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0; + c->cr3 = v->arch.hvm_vmx.cpu_cr3; + c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4; + +#ifdef HVM_DEBUG_SUSPEND + printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n", + c->cr3, + c->cr0, + c->cr4); +#endif + + c->idtr_limit = __vmread(GUEST_IDTR_LIMIT); + c->idtr_base = __vmread(GUEST_IDTR_BASE); + + c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT); + c->gdtr_base = __vmread(GUEST_GDTR_BASE); + + c->cs_sel = __vmread(GUEST_CS_SELECTOR); + c->cs_limit = __vmread(GUEST_CS_LIMIT); + c->cs_base = __vmread(GUEST_CS_BASE); + c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES); + + c->ds_sel = __vmread(GUEST_DS_SELECTOR); + c->ds_limit = __vmread(GUEST_DS_LIMIT); + c->ds_base = __vmread(GUEST_DS_BASE); + c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES); + + c->es_sel = __vmread(GUEST_ES_SELECTOR); + c->es_limit = __vmread(GUEST_ES_LIMIT); + c->es_base = __vmread(GUEST_ES_BASE); + c->es_arbytes = __vmread(GUEST_ES_AR_BYTES); + + c->ss_sel = __vmread(GUEST_SS_SELECTOR); + c->ss_limit = __vmread(GUEST_SS_LIMIT); + c->ss_base = __vmread(GUEST_SS_BASE); + c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES); + + c->fs_sel = __vmread(GUEST_FS_SELECTOR); + c->fs_limit = __vmread(GUEST_FS_LIMIT); + c->fs_base = __vmread(GUEST_FS_BASE); + c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES); + + c->gs_sel = __vmread(GUEST_GS_SELECTOR); 
+ c->gs_limit = __vmread(GUEST_GS_LIMIT); + c->gs_base = __vmread(GUEST_GS_BASE); + c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES); + + c->tr_sel = __vmread(GUEST_TR_SELECTOR); + c->tr_limit = __vmread(GUEST_TR_LIMIT); + c->tr_base = __vmread(GUEST_TR_BASE); + c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES); + + c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR); + c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT); + c->ldtr_base = __vmread(GUEST_LDTR_BASE); + c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES); + + c->sysenter_cs = __vmread(GUEST_SYSENTER_CS); + c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP); + c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP); + + return 1; +} + +int vmx_vmcs_restore(struct vcpu *v, struct vmcs_data *c) +{ + unsigned long mfn, old_cr4, old_base_mfn; + int error = 0; + + __vmwrite(GUEST_RIP, c->eip); + __vmwrite(GUEST_RSP, c->esp); + __vmwrite(GUEST_RFLAGS, c->eflags); + + v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0; + __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0); + + old_cr4 = __vmread(CR4_READ_SHADOW); + __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK)); + + v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4; + __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4); + +#ifdef HVM_DEBUG_SUSPEND + printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n", + c->cr3, + c->cr0, + c->cr4); +#endif + + if (!vmx_paging_enabled(v)) { + HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table"); + __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table)); + goto skip_cr3; + } + + if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) { + /* + * This is simple TLB flush, implying the guest has + * removed some translation or changed page attributes. + * We simply invalidate the shadow. 
+ */ + mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT); + if (mfn != pagetable_get_pfn(v->arch.guest_table)) { + printk("Invalid CR3 value=%"PRIx64"", c->cr3); + domain_crash(v->domain); + return 0; + } + } else { + /* + * If different, make a shadow. Check if the PDBR is valid + * first. + */ + HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3); + if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) { + printk("Invalid CR3 value=%"PRIx64"", c->cr3); + domain_crash(v->domain); + return 0; + } + + /* current!=vcpu as not called by arch_vmx_do_launch */ + mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT); + if(!get_page(mfn_to_page(mfn), v->domain)) { + struct page_info *page = mfn_to_page(mfn); + printk("get_page for mfn failed. CR3 value=%"PRIx64", count_info=0x%"PRIx32", type_info=0x%lx, owner=%d.\n", c->cr3, + page->count_info, + page->u.inuse.type_info, + page->u.inuse._domain); + return 0; + } + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = pagetable_from_pfn(mfn); + if (old_base_mfn) + put_page(mfn_to_page(old_base_mfn)); + /* + * arch.shadow_table should now hold the next CR3 for shadow + */ + v->arch.hvm_vmx.cpu_cr3 = c->cr3; + } + + skip_cr3: +#if defined(__x86_64__) + if (vmx_long_mode_enabled(v)) { + unsigned long vm_entry_value; + vm_entry_value = __vmread(VM_ENTRY_CONTROLS); + vm_entry_value |= VM_ENTRY_IA32E_MODE; + __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value); + } +#endif + + shadow_update_paging_modes(v); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); + + __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit); + __vmwrite(GUEST_IDTR_BASE, c->idtr_base); + + __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit); + __vmwrite(GUEST_GDTR_BASE, c->gdtr_base); + + __vmwrite(GUEST_CS_SELECTOR, c->cs_sel); + __vmwrite(GUEST_CS_LIMIT, c->cs_limit); + __vmwrite(GUEST_CS_BASE, c->cs_base); + __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes); + + __vmwrite(GUEST_DS_SELECTOR, c->ds_sel); + __vmwrite(GUEST_DS_LIMIT, c->ds_limit); + 
__vmwrite(GUEST_DS_BASE, c->ds_base); + __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes); + + __vmwrite(GUEST_ES_SELECTOR, c->es_sel); + __vmwrite(GUEST_ES_LIMIT, c->es_limit); + __vmwrite(GUEST_ES_BASE, c->es_base); + __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes); + + __vmwrite(GUEST_SS_SELECTOR, c->ss_sel); + __vmwrite(GUEST_SS_LIMIT, c->ss_limit); + __vmwrite(GUEST_SS_BASE, c->ss_base); + __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes); + + __vmwrite(GUEST_FS_SELECTOR, c->fs_sel); + __vmwrite(GUEST_FS_LIMIT, c->fs_limit); + __vmwrite(GUEST_FS_BASE, c->fs_base); + __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes); + + __vmwrite(GUEST_GS_SELECTOR, c->gs_sel); + __vmwrite(GUEST_GS_LIMIT, c->gs_limit); + __vmwrite(GUEST_GS_BASE, c->gs_base); + __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes); + + __vmwrite(GUEST_TR_SELECTOR, c->tr_sel); + __vmwrite(GUEST_TR_LIMIT, c->tr_limit); + __vmwrite(GUEST_TR_BASE, c->tr_base); + __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes); + + __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel); + __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit); + __vmwrite(GUEST_LDTR_BASE, c->ldtr_base); + __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes); + + __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs); + __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp); + __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip); + + return !error; +} + +#ifdef HVM_DEBUG_SUSPEND +static void dump_msr_state(struct vmx_msr_state *m) +{ + int i = 0; + printk("**** msr state ****\n"); + printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags); + for (i = 0; i < VMX_MSR_COUNT; i++) + printk("0x%lx,", m->msrs[i]); + printk("\n"); +} +#else +static void dump_msr_state(struct vmx_msr_state *m) +{ +} +#endif + +void vmx_save_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt) +{ + struct vmcs_data *data = &ctxt->data; + struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state; + unsigned long guest_flags = guest_state->flags; + int i = 0; + + data->shadow_gs = guest_state->shadow_gs; + 
data->vmxassist_enabled = v->arch.hvm_vmx.vmxassist_enabled; + /* save msrs */ + data->flags = guest_flags; + for (i = 0; i < VMX_MSR_COUNT; i++) + data->msr_items[i] = guest_state->msrs[i]; + + dump_msr_state(guest_state); +} + +void vmx_load_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt) +{ + int i = 0; + struct vmcs_data *data = &ctxt->data; + struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state; + + /* restore msrs */ + guest_state->flags = data->flags; + for (i = 0; i < VMX_MSR_COUNT; i++) + guest_state->msrs[i] = data->msr_items[i]; + + guest_state->shadow_gs = data->shadow_gs; + + /*XXX:no need to restore msrs, current!=vcpu as not called by arch_vmx_do_launch */ +/* vmx_restore_guest_msrs(v);*/ + + v->arch.hvm_vmx.vmxassist_enabled = data->vmxassist_enabled; + + dump_msr_state(guest_state); +} + +void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt) +{ + struct vmcs_data *data = &ctxt->data; + + /* set valid flag to recover whole vmcs when restore */ + ctxt->valid = 1; + + vmx_save_cpu_state(v, ctxt); + + vmx_vmcs_enter(v); + + vmx_vmcs_save(v, data); + + vmx_vmcs_exit(v); + +} + +void vmx_load_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt) +{ + if (!ctxt->valid) + return; + + vmx_load_cpu_state(v, ctxt); + + vmx_vmcs_enter(v); + + if (!vmx_vmcs_restore(v, &ctxt->data)) { + printk("vmx_vmcs restore failed!\n"); + domain_crash(v->domain); + } + + /* only load vmcs once */ + ctxt->valid = 0; + + vmx_vmcs_exit(v); + +} + /* * The VMX spec (section 4.3.1.2, Checks on Guest Segment * Registers) says that virtual-8086 mode guests' segment @@ -737,6 +1050,9 @@ static void vmx_setup_hvm_funcs(void) hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs; hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs; + + hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt; + hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt; hvm_funcs.paging_enabled = vmx_paging_enabled; hvm_funcs.long_mode_enabled = vmx_long_mode_enabled; diff -r 
7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vpic.c --- a/xen/arch/x86/hvm/vpic.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/hvm/vpic.c Wed Dec 13 22:52:02 2006 +0800 @@ -378,6 +378,87 @@ static int vpic_intercept_elcr_io(ioreq_ return 1; } +#ifdef HVM_DEBUG_SUSPEND +static void vpic_info(struct vpic *s) +{ + printk("*****pic state:*****\n"); + printk("pic 0x%x.\n", s->irr); + printk("pic 0x%x.\n", s->imr); + printk("pic 0x%x.\n", s->isr); + printk("pic 0x%x.\n", s->irq_base); + printk("pic 0x%x.\n", s->init_state); + printk("pic 0x%x.\n", s->priority_add); + printk("pic 0x%x.\n", s->readsel_isr); + printk("pic 0x%x.\n", s->poll); + printk("pic 0x%x.\n", s->auto_eoi); + printk("pic 0x%x.\n", s->rotate_on_auto_eoi); + printk("pic 0x%x.\n", s->special_fully_nested_mode); + printk("pic 0x%x.\n", s->special_mask_mode); + printk("pic 0x%x.\n", s->elcr); + printk("pic 0x%x.\n", s->int_output); + printk("pic 0x%x.\n", s->is_master); +} +#else +static void vpic_info(struct vpic *s) +{ +} +#endif + +static void vpic_save(hvm_domain_context_t *h, void *opaque) +{ + struct vpic *s = opaque; + + vpic_info(s); + + hvm_put_8u(h, s->irr); + hvm_put_8u(h, s->imr); + hvm_put_8u(h, s->isr); + hvm_put_8u(h, s->irq_base); + hvm_put_8u(h, s->init_state); + hvm_put_8u(h, s->priority_add); + hvm_put_8u(h, s->readsel_isr); + + hvm_put_8u(h, s->poll); + hvm_put_8u(h, s->auto_eoi); + + hvm_put_8u(h, s->rotate_on_auto_eoi); + hvm_put_8u(h, s->special_fully_nested_mode); + hvm_put_8u(h, s->special_mask_mode); + + hvm_put_8u(h, s->elcr); + hvm_put_8u(h, s->int_output); +} + +static int vpic_load(hvm_domain_context_t *h, void *opaque, int version_id) +{ + struct vpic *s = opaque; + + if (version_id != 1) + return -EINVAL; + + s->irr = hvm_get_8u(h); + s->imr = hvm_get_8u(h); + s->isr = hvm_get_8u(h); + s->irq_base = hvm_get_8u(h); + s->init_state = hvm_get_8u(h); + s->priority_add = hvm_get_8u(h); + s->readsel_isr = hvm_get_8u(h); + + s->poll = hvm_get_8u(h); + s->auto_eoi = 
hvm_get_8u(h); + + s->rotate_on_auto_eoi = hvm_get_8u(h); + s->special_fully_nested_mode = hvm_get_8u(h); + s->special_mask_mode = hvm_get_8u(h); + + s->elcr = hvm_get_8u(h); + s->int_output = hvm_get_8u(h); + + vpic_info(s); + + return 0; +} + void vpic_init(struct domain *d) { struct vpic *vpic; @@ -387,12 +468,14 @@ void vpic_init(struct domain *d) memset(vpic, 0, sizeof(*vpic)); vpic->is_master = 1; vpic->elcr = 1 << 2; + hvm_register_savevm(d, "xen_hvm_i8259", 0x20, 1, vpic_save, vpic_load, vpic); register_portio_handler(d, 0x20, 2, vpic_intercept_pic_io); register_portio_handler(d, 0x4d0, 1, vpic_intercept_elcr_io); /* Slave PIC. */ vpic++; memset(vpic, 0, sizeof(*vpic)); + hvm_register_savevm(d, "xen_hvm_i8259", 0xa0, 1, vpic_save, vpic_load, vpic); register_portio_handler(d, 0xa0, 2, vpic_intercept_pic_io); register_portio_handler(d, 0x4d1, 1, vpic_intercept_elcr_io); } diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/common.c --- a/xen/arch/x86/mm/shadow/common.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/mm/shadow/common.c Wed Dec 13 22:52:02 2006 +0800 @@ -2145,7 +2145,7 @@ int shadow_remove_all_mappings(struct vc /* Don't complain if we're in HVM and there's one extra mapping: * The qemu helper process has an untyped mapping of this dom's RAM */ if ( !(shadow_mode_external(v->domain) - && (page->count_info & PGC_count_mask) <= 2 + && (page->count_info & PGC_count_mask) <= 3 /* vmx restore add one extra mapping*/ && (page->u.inuse.type_info & PGT_count_mask) == 0) ) { SHADOW_ERROR("can't find all mappings of mfn %lx: " diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/arch/x86/mm/shadow/multi.c Wed Dec 13 22:52:02 2006 +0800 @@ -1613,6 +1613,14 @@ sh_make_shadow(struct vcpu *v, mfn_t gmf } } + { + struct page_info *page = mfn_to_page(gmfn); + /* XXX: add it to emulate a touched page */ + if ((page->u.inuse.type_info & PGT_type_mask) == 
PGT_none){ + page->u.inuse.type_info |= (PGT_writable_page | PGT_validated); + } + } + shadow_promote(v, gmfn, shadow_type); set_shadow_status(v, gmfn, shadow_type, smfn); diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domain.c --- a/xen/common/domain.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/common/domain.c Wed Dec 13 22:52:02 2006 +0800 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -454,8 +455,14 @@ int set_info_guest(struct domain *d, domain_pause(d); rc = -EFAULT; - if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 ) + if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 ) { rc = arch_set_info_guest(v, c); + if ( rc == 0 && v->vcpu_id != 0 && + is_hvm_vcpu(v) && + test_and_clear_bit(_VCPUF_down, &v->vcpu_flags) ) { + vcpu_wake(v); + } + } domain_unpause(d); diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domctl.c --- a/xen/common/domctl.c Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/common/domctl.c Wed Dec 13 22:52:02 2006 +0800 @@ -26,6 +26,10 @@ extern long arch_do_domctl( struct xen_domctl *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl); extern void arch_getdomaininfo_ctxt( struct vcpu *, struct vcpu_guest_context *); +extern int arch_gethvm_ctxt( + struct vcpu *, struct hvm_domain_context *); +extern int arch_sethvm_ctxt( + struct vcpu *, struct hvm_domain_context *); void cpumask_to_xenctl_cpumap( struct xenctl_cpumap *xenctl_cpumap, cpumask_t *cpumask) @@ -205,6 +209,37 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc } break; + case XEN_DOMCTL_sethvmcontext: + { + struct hvm_domain_context *c; + struct domain *d; + struct vcpu *v; + + ret = -ESRCH; + if ( (d = find_domain_by_id(op->domain)) == NULL ) + break; + + ret = -ENOMEM; + if ( (c = xmalloc(struct hvm_domain_context)) == NULL ) + goto sethvmcontext_out; + + /*XXX: need check input vcpu when smp */ + ret = -ESRCH; + if ( (v = d->vcpu[0]) == NULL ) + goto sethvmcontext_out; + + ret = -EFAULT; + if ( copy_from_guest(c, op->u.hvmcontext.ctxt, 1) != 0 ) + goto sethvmcontext_out; + + ret = arch_sethvm_ctxt(v, c); + + sethvmcontext_out: +
xfree(c); /* reached on every exit path; c is NULL when xmalloc failed */ + put_domain(d); + } + break; + case XEN_DOMCTL_pausedomain: { struct domain *d = find_domain_by_id(op->domain); @@ -489,6 +524,44 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc getvcpucontext_out: put_domain(d); + } + break; + + case XEN_DOMCTL_gethvmcontext: + { + struct hvm_domain_context *c; + struct domain *d; + struct vcpu *v; + + ret = -ESRCH; + if ( (d = find_domain_by_id(op->domain)) == NULL ) + break; + + ret = -ENOMEM; + if ( (c = xmalloc(struct hvm_domain_context)) == NULL ) + goto gethvmcontext_out; + + v = d->vcpu[0]; + + ret = -ENODATA; + if ( (v == NULL) || !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + goto gethvmcontext_out; + + ret = 0; + if (arch_gethvm_ctxt(v, c) == -1) + ret = -EFAULT; + + /* c comes straight from xmalloc(); only copy it out once it was filled. */ + if ( (ret == 0) && copy_to_guest(op->u.hvmcontext.ctxt, c, 1) ) + ret = -EFAULT; + + if ( copy_to_guest(u_domctl, op, 1) ) + ret = -EFAULT; + + gethvmcontext_out: + xfree(c); + put_domain(d); + + } break; diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/domain.h --- a/xen/include/asm-x86/hvm/domain.h Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/include/asm-x86/hvm/domain.h Wed Dec 13 22:52:02 2006 +0800 @@ -27,6 +27,20 @@ #include #include +typedef void SaveStateHandler(hvm_domain_context_t *h, void *opaque); +typedef int LoadStateHandler(hvm_domain_context_t *h, void *opaque, int version_id); + +#define HVM_SE_IDSTR_LEN 32 +typedef struct HVMStateEntry { + char idstr[HVM_SE_IDSTR_LEN]; + int instance_id; + int version_id; + SaveStateHandler *save_state; + LoadStateHandler *load_state; + void *opaque; + struct HVMStateEntry *next; +} HVMStateEntry; + struct hvm_domain { unsigned long shared_page_va; unsigned long buffered_io_va; @@ -44,6 +58,9 @@ struct hvm_domain { spinlock_t pbuf_lock; uint64_t params[HVM_NR_PARAMS]; + + struct hvm_domain_context *hvm_ctxt; + HVMStateEntry *first_se; }; #endif /* __ASM_X86_HVM_DOMAIN_H__ */ diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/hvm.h --- a/xen/include/asm-x86/hvm/hvm.h Fri Sep 15 17:05:38 2006 +0800 +++
b/xen/include/asm-x86/hvm/hvm.h Wed Dec 13 22:52:02 2006 +0800 @@ -79,6 +79,13 @@ struct hvm_function_table { struct vcpu *v, struct cpu_user_regs *r, unsigned long *crs); void (*load_cpu_guest_regs)( struct vcpu *v, struct cpu_user_regs *r); + + /* save and load hvm guest cpu context for save/restore */ + void (*save_cpu_ctxt)( + struct vcpu *v, struct hvmcpu_context *ctxt); + void (*load_cpu_ctxt)( + struct vcpu *v, struct hvmcpu_context *ctxt); + /* * Examine specifics of the guest state: * 1) determine whether paging is enabled, @@ -152,6 +159,20 @@ hvm_load_cpu_guest_regs(struct vcpu *v, hvm_funcs.load_cpu_guest_regs(v, r); } +static inline void +hvm_save_cpu_context( + struct vcpu *v, struct hvmcpu_context *ctxt) +{ + hvm_funcs.save_cpu_ctxt(v, ctxt); +} + +static inline void +hvm_load_cpu_context( + struct vcpu *v, struct hvmcpu_context *ctxt) +{ + hvm_funcs.load_cpu_ctxt(v, ctxt); +} + static inline int hvm_paging_enabled(struct vcpu *v) { diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/support.h --- a/xen/include/asm-x86/hvm/support.h Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/include/asm-x86/hvm/support.h Wed Dec 13 22:52:02 2006 +0800 @@ -121,6 +121,143 @@ extern unsigned int opt_hvm_debug_level; #define TRACE_VMEXIT(index, value) \ current->arch.hvm_vcpu.hvm_trace_values[index] = (value) +/* save/restore support */ + +//#define HVM_DEBUG_SUSPEND + +extern int hvm_register_savevm(struct domain *d, + const char *idstr, + int instance_id, + int version_id, + SaveStateHandler *save_state, + LoadStateHandler *load_state, + void *opaque); + +static inline void hvm_ctxt_seek(hvm_domain_context_t *h, unsigned int pos) +{ + h->cur = pos; +} + +static inline uint32_t hvm_ctxt_tell(hvm_domain_context_t *h) +{ + return h->cur; +} + +static inline int hvm_ctxt_end(hvm_domain_context_t *h) +{ + return (h->cur >= h->size || h->cur >= HVM_CTXT_SIZE); +} + +static inline void hvm_put_byte(hvm_domain_context_t *h, unsigned int i) +{ + if (h->cur >=
HVM_CTXT_SIZE) { + h->cur++; + return; + } + h->data[h->cur++] = (char)i; +} + +static inline void hvm_put_8u(hvm_domain_context_t *h, uint8_t b) +{ + hvm_put_byte(h, b); +} + +static inline void hvm_put_16u(hvm_domain_context_t *h, uint16_t b) +{ + hvm_put_8u(h, b >> 8); + hvm_put_8u(h, b); +} + +static inline void hvm_put_32u(hvm_domain_context_t *h, uint32_t b) +{ + hvm_put_16u(h, b >> 16); + hvm_put_16u(h, b); +} + +static inline void hvm_put_64u(hvm_domain_context_t *h, uint64_t b) +{ + hvm_put_32u(h, b >> 32); + hvm_put_32u(h, b); +} + +static inline void hvm_put_buffer(hvm_domain_context_t *h, const char *buf, int len) +{ + if (len <= 0) + return; + /* Like hvm_put_byte: on overflow only advance the cursor, write nothing. */ + if (h->cur < HVM_CTXT_SIZE && (uint32_t)len <= HVM_CTXT_SIZE - h->cur) + memcpy(&h->data[h->cur], buf, len); + h->cur += len; +} + + +static inline char hvm_get_byte(hvm_domain_context_t *h) +{ + if (h->cur >= HVM_CTXT_SIZE) { + printk("hvm_get_byte overflow.\n"); + return -1; + } + + if (h->cur >= h->size) { + printk("hvm_get_byte exceed data area.\n"); + return -1; + } + + return h->data[h->cur++]; +} + +static inline uint8_t hvm_get_8u(hvm_domain_context_t *h) +{ + return hvm_get_byte(h); +} + +static inline uint16_t hvm_get_16u(hvm_domain_context_t *h) +{ + uint16_t v; + v = hvm_get_8u(h) << 8; + v |= hvm_get_8u(h); + + return v; +} + +static inline uint32_t hvm_get_32u(hvm_domain_context_t *h) +{ + uint32_t v; + v = (uint32_t)hvm_get_16u(h) << 16; + v |= hvm_get_16u(h); + + return v; +} + +static inline uint64_t hvm_get_64u(hvm_domain_context_t *h) +{ + uint64_t v; + v = (uint64_t)hvm_get_32u(h) << 32; + v |= hvm_get_32u(h); + + return v; +} + +static inline void hvm_get_buffer(hvm_domain_context_t *h, char *buf, int len) +{ + uint32_t end = (h->size < HVM_CTXT_SIZE) ? h->size : HVM_CTXT_SIZE; + + if (len <= 0) + return; + /* Refuse to read past the valid data, as hvm_get_byte does; buf is left untouched. */ + if (h->cur >= end || (uint32_t)len > end - h->cur) { + printk("hvm_get_buffer exceed data area.\n"); + return; + } + memcpy(buf, &h->data[h->cur], len); + h->cur += len; +} + +extern int hvm_save(struct vcpu*, hvm_domain_context_t *h); +extern int hvm_load(struct vcpu*, hvm_domain_context_t *h); + +extern void shpage_init(struct domain *d, shared_iopage_t *sp); + extern int hvm_enabled; int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size); diff -r 7c0030214af1 -r 3c0bd8907fd9
xen/include/asm-x86/hvm/vpt.h --- a/xen/include/asm-x86/hvm/vpt.h Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/include/asm-x86/hvm/vpt.h Wed Dec 13 22:52:02 2006 +0800 @@ -123,7 +123,7 @@ extern void hvm_hooks_assist(struct vcpu extern void hvm_hooks_assist(struct vcpu *v); extern void pickup_deactive_ticks(struct periodic_time *vpit); extern struct periodic_time *create_periodic_time( - u32 period, char irq, char one_shot, time_cb *cb, void *data); + struct domain* d, u32 period, char irq, char one_shot, time_cb *cb, void *data); extern void destroy_periodic_time(struct periodic_time *pt); void pit_init(struct vcpu *v, unsigned long cpu_khz); void rtc_init(struct vcpu *v, int base, int irq); diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_32.h --- a/xen/include/public/arch-x86_32.h Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/include/public/arch-x86_32.h Wed Dec 13 22:52:02 2006 +0800 @@ -181,6 +181,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t) DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ + +#include "vmcs_data.h" + +struct hvmcpu_context { + uint32_t valid; + struct vmcs_data data; +}; /* * The following is all CPU context.
Note that the fpu_ctxt block is filled @@ -210,6 +217,7 @@ struct vcpu_guest_context { unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ unsigned long failsafe_callback_eip; unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ + struct hvmcpu_context hvmcpu_ctxt; /* whole vmcs region */ }; typedef struct vcpu_guest_context vcpu_guest_context_t; DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t); diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_64.h --- a/xen/include/public/arch-x86_64.h Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/include/public/arch-x86_64.h Wed Dec 13 22:52:02 2006 +0800 @@ -255,6 +255,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t) typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ +#include "vmcs_data.h" + +struct hvmcpu_context { + uint32_t valid; + struct vmcs_data data; +}; + /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. @@ -288,6 +295,7 @@ struct vcpu_guest_context { uint64_t fs_base; uint64_t gs_base_kernel; uint64_t gs_base_user; + struct hvmcpu_context hvmcpu_ctxt; /* whole vmcs region */ }; typedef struct vcpu_guest_context vcpu_guest_context_t; DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t); diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/domctl.h --- a/xen/include/public/domctl.h Fri Sep 15 17:05:38 2006 +0800 +++ b/xen/include/public/domctl.h Wed Dec 13 22:52:02 2006 +0800 @@ -384,6 +384,21 @@ struct xen_domctl_settimeoffset { }; typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t); + +#define HVM_CTXT_SIZE 6144 +typedef struct hvm_domain_context { + uint32_t cur; + uint32_t size; + uint8_t data[HVM_CTXT_SIZE]; +} hvm_domain_context_t; +DEFINE_XEN_GUEST_HANDLE(hvm_domain_context_t); + +#define XEN_DOMCTL_gethvmcontext 33 +#define XEN_DOMCTL_sethvmcontext 34 +typedef struct xen_domctl_hvmcontext { +
XEN_GUEST_HANDLE(hvm_domain_context_t) ctxt; /* IN/OUT */ +} xen_domctl_hvmcontext_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t); struct xen_domctl { uint32_t cmd; @@ -410,6 +425,7 @@ struct xen_domctl { struct xen_domctl_hypercall_init hypercall_init; struct xen_domctl_arch_setup arch_setup; struct xen_domctl_settimeoffset settimeoffset; + struct xen_domctl_hvmcontext hvmcontext; uint8_t pad[128]; } u; }; diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_restore.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_hvm_restore.c Wed Dec 13 22:52:02 2006 +0800 @@ -0,0 +1,280 @@ +/****************************************************************************** + * xc_hvm_restore.c + * + * Restore the state of a HVM guest. + * + * Copyright (c) 2003, K A Fraser. + * Copyright (c) 2006 Intel Corporation + * rewritten for hvm guest by Zhai Edwin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * + */ + +#include +#include + +#include "xg_private.h" +#include "xg_save_restore.h" + +#include +#include +#include + +/* max mfn of the whole machine */ +static unsigned long max_mfn; + +/* virtual starting address of the hypervisor */ +static unsigned long hvirt_start; + +/* #levels of page tables used by the current guest */ +static unsigned int pt_levels; + +/* total number of pages used by the current guest */ +static unsigned long max_pfn; + +/* A table mapping each PFN to its new MFN. */ +static xen_pfn_t *p2m = NULL; + +static ssize_t +read_exact(int fd, void *buf, size_t count) +{ + int r = 0, s; + unsigned char *b = buf; + + while (r < count) { + s = read(fd, &b[r], count - r); + if ((s == -1) && (errno == EINTR)) + continue; + if (s <= 0) { + break; + } + r += s; + } + + return (r == count) ? 1 : 0; +} + +int xc_hvm_restore(int xc_handle, int io_fd, + uint32_t dom, unsigned long nr_pfns, + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn, + unsigned int pae, unsigned int apic) +{ + DECLARE_DOMCTL; + + /* The new domain's shared-info frame number. */ + unsigned long shared_info_frame; + + /* A copy of the CPU context of the guest.
*/ + vcpu_guest_context_t ctxt; + + char *region_base; + + xc_mmu_t *mmu = NULL; + + xc_dominfo_t info; + unsigned int rc = 1, i; + uint32_t rec_len, nr_vcpus; + hvm_domain_context_t hvm_ctxt; + unsigned long long v_end, memsize; + unsigned long shared_page_nr; + + /* hvm guest mem size (Mb) */ + memsize = (unsigned long long)*store_mfn; + v_end = memsize << 20; + + DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d, *store_mfn=%ld, console_evtchn=%d, *console_mfn=%ld, pae=%u, apic=%u.\n", + dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, *console_mfn, pae, apic); + + + + /*XXX: calculate the VGA hole, it's better derived from memsize*/ + max_pfn = nr_pfns + 0x20; + + if(!get_platform_info(xc_handle, dom, + &max_mfn, &hvirt_start, &pt_levels)) { + ERROR("Unable to get platform info."); + return 1; + } + + DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, hvirt_start=%lx, pt_levels=%d\n", + max_pfn, + max_mfn, + hvirt_start, + pt_levels); + + if (mlock(&ctxt, sizeof(ctxt))) { + /* needed for build dom0 op, but might as well do early */ + ERROR("Unable to mlock ctxt"); + return 1; + } + + + p2m = malloc(max_pfn * sizeof(xen_pfn_t)); + + if (p2m == NULL) { + ERROR("memory alloc failed"); + errno = ENOMEM; + goto out; + } + + /* Get the domain's shared-info frame. */ + domctl.cmd = XEN_DOMCTL_getdomaininfo; + domctl.domain = (domid_t)dom; + if (xc_domctl(xc_handle, &domctl) < 0) { + ERROR("Could not get information on new domain"); + goto out; + } + shared_info_frame = domctl.u.getdomaininfo.shared_info_frame; + + if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) { + errno = ENOMEM; + goto out; + } + + for ( i = 0; i < max_pfn; i++ ) + p2m[i] = i; + for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < max_pfn; i++ ) + p2m[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT; + + /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */ + rc = xc_domain_memory_populate_physmap( + xc_handle, dom, (max_pfn > 0xa0) ?
0xa0 : max_pfn, + 0, 0, &p2m[0x00]); + if ( (rc == 0) && (max_pfn > 0xc0) ) + rc = xc_domain_memory_populate_physmap( + xc_handle, dom, max_pfn - 0xc0, 0, 0, &p2m[0xc0]); + if ( rc != 0 ) + { + PERROR("Could not allocate memory for HVM guest.\n"); + goto out; + } + + rc = 1; /* populate left rc == 0; later error exits must still report failure */ + /**********XXXXXXXXXXXXXXXX******************/ + if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) { + ERROR("Could not get domain info"); + goto out; + } + + domctl.cmd = XEN_DOMCTL_getdomaininfo; + domctl.domain = (domid_t)dom; + if (xc_domctl(xc_handle, &domctl) < 0) { + ERROR("Could not get information on new domain"); + goto out; + } + + for ( i = 0; i < max_pfn; i++) + p2m[i] = i; + + /* restore memory */ + if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | PROT_WRITE, p2m, max_pfn) ) == 0) { + ERROR("HVM:map page_array failed!\n"); + goto out; + } + + for (i = 0; i < max_pfn; i++) { + void *zpage = region_base + i * PAGE_SIZE; + if ( p2m[i] == (~0UL)) { /*invalid mfn*/ + continue; + } + if (i >= 0xa0 && i < 0xc0) { + continue; + } + + if (!read_exact(io_fd, zpage, PAGE_SIZE)) { + ERROR("HVM:read page %d failed!\n", i); + munmap(region_base, max_pfn*PAGE_SIZE); + goto out; + } + } + + (void)munmap(region_base, max_pfn*PAGE_SIZE); + +/* xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic);*/ + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae); + + if ( v_end > HVM_BELOW_4G_RAM_END ) + shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1; + else + shared_page_nr = (v_end >> PAGE_SHIFT) - 1; + + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2); + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr); + + /* calculate the store_mfn , wrong val cause hang when introduceDomain */ + *store_mfn = p2m[(v_end >> PAGE_SHIFT) - 2]; + DPRINTF("hvm restore:calculate new store_mfn=0x%lx,v_end=0x%llx..\n", *store_mfn, v_end); + + /* restore hvm context including pic/pit/shpage */ + if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) { + ERROR("error read hvm
context size!\n"); + goto out; + } + if (rec_len != sizeof(hvm_ctxt)) { + ERROR("hvm context size dismatch!\n"); + goto out; + } + + if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) { + ERROR("error read hvm context!\n"); + goto out; + } + + if (xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt)) { + ERROR("error set hvm context!\n"); + goto out; + } + + if (!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) { + ERROR("error read nr vcpu !\n"); + goto out; + } + DPRINTF("hvm restore:get nr_vcpus=%d.\n", nr_vcpus); + + for (i =0; i < nr_vcpus; i++) { + if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) { + ERROR("error read vcpu context size!\n"); + goto out; + } + if (rec_len != sizeof(ctxt)) { + ERROR("vcpu context size dismatch!\n"); + goto out; + } + + if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) { + ERROR("error read vcpu context.\n"); + goto out; + } + + if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) ) { + ERROR("Could not set vcpu context, rc=%d", rc); + goto out; + } + rc = 1; /* stay in the failure state until every vcpu is restored */ + } + + rc = 0; + + out: + if ( (rc != 0) && (dom != 0) ) + xc_domain_destroy(xc_handle, dom); + free(mmu); + free(p2m); + + DPRINTF("Restore exit with rc=%d\n", rc); + + return rc; +} diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_save.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_hvm_save.c Wed Dec 13 22:52:02 2006 +0800 @@ -0,0 +1,248 @@ +/****************************************************************************** + * xc_hvm_save.c + * + * Save the state of a running HVM guest. + * + * Copyright (c) 2003, K A Fraser. + * Copyright (c) 2006 Intel Corporation + * rewritten for hvm guest by Zhai Edwin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation.
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include +#include +#include +#include +#include + +#include "xc_private.h" +#include "xg_private.h" +#include "xg_save_restore.h" + +#define DEF_MAX_ITERS (4 - 1) /* limit us to 4 times round loop */ +#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */ + +/* max mfn of the whole machine */ +static unsigned long max_mfn; + +/* virtual starting address of the hypervisor */ +static unsigned long hvirt_start; + +/* #levels of page tables used by the current guest */ +static unsigned int pt_levels; + +/* total number of pages used by the current guest */ +static unsigned long max_pfn; + +int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, + uint32_t max_factor, uint32_t flags, int (*suspend)(int)) +{ + xc_dominfo_t info; + + int rc = 1, i; + int live = (flags & XCFLAGS_LIVE); + int debug = (flags & XCFLAGS_DEBUG); + + /* The new domain's shared-info frame number. */ + unsigned long shared_info_frame; + + /* A copy of the CPU context of the guest.
*/ + vcpu_guest_context_t ctxt; + + /* A copy of hvm domain context */ + hvm_domain_context_t hvm_ctxt; + + /* Live mapping of shared info structure */ + shared_info_t *live_shinfo = NULL; + + /* base of the region in which domain memory is mapped */ + unsigned char *region_base = NULL; + + uint32_t nr_pfns, max_pfns = 0, rec_size, nr_vcpus; + unsigned long *page_array = NULL; + + DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x.\n", + dom, max_iters, max_factor, flags); + + /* If no explicit control parameters given, use defaults */ + if(!max_iters) + max_iters = DEF_MAX_ITERS; + if(!max_factor) + max_factor = DEF_MAX_FACTOR; + + if(!get_platform_info(xc_handle, dom, + &max_mfn, &hvirt_start, &pt_levels)) { + ERROR("HVM:Unable to get platform info."); + return 1; + } + + if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) { + ERROR("HVM:Could not get domain info"); + return 1; + } + nr_vcpus = info.nr_online_vcpus; + + if (mlock(&ctxt, sizeof(ctxt))) { + ERROR("HVM:Unable to mlock ctxt"); + return 1; + } + + /* Only have to worry about vcpu 0 even for SMP */ + if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) { + ERROR("HVM:Could not get vcpu context"); + goto out; + } + shared_info_frame = info.shared_info_frame; + + /* A cheesy test to see whether the domain contains valid state.
*/ + if (ctxt.ctrlreg[3] == 0) + { + ERROR("Domain is not in a valid HVM guest state"); + goto out; + } + + /* cheesy sanity check */ + if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) { + ERROR("Invalid HVM state record -- pfn count out of range: %lu", + (info.max_memkb >> (PAGE_SHIFT - 10))); + goto out; + } + + /* Map the shared info frame */ + if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, + PROT_READ, shared_info_frame))) { + ERROR("HVM:Couldn't map live_shinfo"); + goto out; + } + + max_pfn = live_shinfo->arch.max_pfn; + + DPRINTF("saved hvm domain info:max_memkb=0x%lx, max_mfn=0x%lx, nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages); + + if (live) { + ERROR("hvm domain doesn't support live migration now.\n"); + if (debug) + ERROR("hvm domain debug on.\n"); + goto out; + } + + /* suspend hvm domain */ + if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) { + ERROR("HVM Domain appears not to have suspended"); + goto out; + } + + nr_pfns = info.nr_pages; + DPRINTF("after suspend hvm domain nr_pages=0x%x, max_memkb=0x%lx.\n", nr_pfns, info.max_memkb); + + /*XXX: calculate the VGA hole*/ + max_pfns = nr_pfns + 0x20; + + /* get all the HVM domain pfns */ + if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) * max_pfns)) == NULL) { + ERROR("HVM:malloc fail!\n"); + goto out; + } + + for ( i = 0; i < max_pfns; i++) + page_array[i] = i; + + if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | PROT_WRITE, page_array, max_pfns) ) == 0) { + ERROR("HVM domain map pages failed!\n"); + goto out; + } + + /* Start writing out the saved-domain record.
begin with mem */ + if (!write_exact(io_fd, &nr_pfns, sizeof(unsigned int))) { + ERROR("write: nr_pfns"); + goto out; + } + + for (i = 0; i < max_pfns; i++) { + void *zpage = region_base + i * PAGE_SIZE; + if ( page_array[i] == (~0UL)) { + continue; + } + if (i >= 0xa0 && i < 0xc0) { + continue; + } + + if (!write_exact(io_fd, zpage, PAGE_SIZE)) { + ERROR("HVM:write page %d failed, mfn=0x%lx.\n", i, page_array[i]); + goto out; + } + } + + /* save hvm hypervisor state including pic/pit/shpage */ + if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) { + ERROR("Unable to mlock ctxt"); + goto out; + } + + if (xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt)){ + ERROR("HVM:Could not get hvm context"); + goto out; + } + + rec_size = sizeof(hvm_ctxt); + if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) { + ERROR("error write hvm ctxt size"); + goto out; + } + + if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) { + ERROR("write HVM info failed!\n"); + goto out; + } + + /* save vcpu/vmcs context */ + if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) { + ERROR("error write nr vcpus"); + goto out; + } + + /*XXX: need a online map to exclude down cpu */ + for (i = 0; i < nr_vcpus; i++) { + if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) { + ERROR("HVM:Could not get vcpu context"); + goto out; + } + + rec_size = sizeof(ctxt); + DPRINTF("write %d vcpucontext of total %d.\n", i, nr_vcpus); + if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) { + ERROR("error write vcpu ctxt size"); + goto out; + } + + if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) { + ERROR("write vmcs failed!\n"); + goto out; + } + } + + /* Success!
*/ + rc = 0; + + out: + /* Release everything acquired above; all three pointers start out NULL. */ + if (region_base) + munmap(region_base, (size_t)max_pfns * PAGE_SIZE); + if (live_shinfo) + munmap(live_shinfo, PAGE_SIZE); + free(page_array); + return !!rc; +} diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/vmcs_data.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/public/vmcs_data.h Wed Dec 13 22:52:02 2006 +0800 @@ -0,0 +1,70 @@ +/****************************************************************************** + * vmcs_data.h + * + * Copyright (c) 2006 Intel Corporation + * + */ + +#ifndef __XEN_PUBLIC_VMCS_DATA_H__ +#define __XEN_PUBLIC_VMCS_DATA_H__ + +/* + * World vmcs state + * NOTE(review): a uint32_t directly followed by a uint64_t may be padded differently + * by the i386 and x86_64 ABIs; confirm this layout is never shared between the two. + */ +struct vmcs_data { + uint64_t eip; /* execution pointer */ + uint64_t esp; /* stack pointer */ + uint64_t eflags; /* flags register */ + uint64_t cr0; + uint64_t cr3; /* page table directory */ + uint64_t cr4; + uint32_t idtr_limit; /* idt */ + uint64_t idtr_base; + uint32_t gdtr_limit; /* gdt */ + uint64_t gdtr_base; + uint32_t cs_sel; /* cs selector */ + uint32_t cs_limit; + uint64_t cs_base; + uint32_t cs_arbytes; + uint32_t ds_sel; /* ds selector */ + uint32_t ds_limit; + uint64_t ds_base; + uint32_t ds_arbytes; + uint32_t es_sel; /* es selector */ + uint32_t es_limit; + uint64_t es_base; + uint32_t es_arbytes; + uint32_t ss_sel; /* ss selector */ + uint32_t ss_limit; + uint64_t ss_base; + uint32_t ss_arbytes; + uint32_t fs_sel; /* fs selector */ + uint32_t fs_limit; + uint64_t fs_base; + uint32_t fs_arbytes; + uint32_t gs_sel; /* gs selector */ + uint32_t gs_limit; + uint64_t gs_base; + uint32_t gs_arbytes; + uint32_t tr_sel; /* task selector */ + uint32_t tr_limit; + uint64_t tr_base; + uint32_t tr_arbytes; + uint32_t ldtr_sel; /* ldtr selector */ + uint32_t ldtr_limit; + uint64_t ldtr_base; + uint32_t ldtr_arbytes; + uint32_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + /* msr for em64t */ + uint64_t shadow_gs; + uint64_t flags; + /* same size as VMX_MSR_COUNT */ + uint64_t msr_items[6]; + uint64_t vmxassist_enabled; +}; +typedef struct vmcs_data vmcs_data_t; +#endif