WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
xen-ppc-devel

[XenPPC] [xenppc-unstable] merge

# HG changeset patch
# User Hollis Blanchard <hollisb@xxxxxxxxxx>
# Node ID 156a0963a1aed529e5c5517e7153b0ad64d99276
# Parent  d3e181fa238b93c616bd010edd45f707c359cf99
# Parent  c191c649cdb387e7ec573d218c9581c639c87700
merge
---
 linux-2.6-xen-sparse/arch/i386/mm/init-xen.c                     |   14 
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c              |    7 
 linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c                   |   27 
 linux-2.6-xen-sparse/drivers/xen/netback/netback.c               |  288 ++++++++--
 linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c                |   26 
 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c             |  173 ++++--
 linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c               |   14 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h |    7 
 linux-2.6-xen-sparse/include/xen/public/privcmd.h                |   16 
 tools/debugger/libxendebug/xendebug.c                            |    2 
 tools/firmware/vmxassist/vm86.c                                  |   65 ++
 tools/ioemu/hw/cirrus_vga.c                                      |   12 
 tools/ioemu/vl.c                                                 |   15 
 tools/libxc/xc_core.c                                            |    8 
 tools/libxc/xc_domain.c                                          |   10 
 tools/libxc/xc_hvm_build.c                                       |    6 
 tools/libxc/xc_ia64_stubs.c                                      |   12 
 tools/libxc/xc_linux.c                                           |    2 
 tools/libxc/xc_linux_build.c                                     |   58 +-
 tools/libxc/xc_linux_restore.c                                   |  210 ++++++-
 tools/libxc/xc_linux_save.c                                      |   51 +
 tools/libxc/xc_load_aout9.c                                      |    4 
 tools/libxc/xc_load_bin.c                                        |    4 
 tools/libxc/xc_load_elf.c                                        |   19 
 tools/libxc/xc_private.c                                         |   62 +-
 tools/libxc/xenctrl.h                                            |   19 
 tools/libxc/xg_private.h                                         |    7 
 tools/libxc/xg_save_restore.h                                    |   12 
 tools/tests/test_x86_emulator.c                                  |  131 ++--
 xen/arch/x86/domain.c                                            |   21 
 xen/arch/x86/domain_build.c                                      |    3 
 xen/arch/x86/hvm/vmx/vmx.c                                       |   22 
 xen/arch/x86/hvm/vmx/x86_32/exits.S                              |   35 -
 xen/arch/x86/hvm/vmx/x86_64/exits.S                              |   71 +-
 xen/arch/x86/mm.c                                                |   15 
 xen/arch/x86/x86_32/asm-offsets.c                                |    2 
 xen/arch/x86/x86_32/entry.S                                      |    5 
 xen/arch/x86/x86_32/traps.c                                      |    6 
 xen/arch/x86/x86_64/asm-offsets.c                                |    3 
 xen/arch/x86/x86_64/entry.S                                      |   10 
 xen/arch/x86/x86_64/traps.c                                      |   12 
 xen/arch/x86/x86_emulate.c                                       |    4 
 xen/common/kernel.c                                              |    5 
 xen/common/keyhandler.c                                          |    5 
 xen/common/memory.c                                              |   20 
 xen/include/public/arch-ia64.h                                   |    3 
 xen/include/public/arch-x86_32.h                                 |   19 
 xen/include/public/arch-x86_64.h                                 |   21 
 xen/include/public/callback.h                                    |   15 
 xen/include/public/dom0_ops.h                                    |   56 -
 xen/include/public/grant_table.h                                 |    2 
 xen/include/public/io/netif.h                                    |    4 
 xen/include/public/io/ring.h                                     |   16 
 xen/include/public/memory.h                                      |   10 
 xen/include/public/xen.h                                         |   22 
 55 files changed, 1228 insertions(+), 460 deletions(-)

diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/arch/i386/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Tue Jun 06 13:25:31 2006 -0500
@@ -558,15 +558,11 @@ void __init paging_init(void)
 
        kmap_init();
 
-       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
-           xen_start_info->shared_info >= xen_start_info->nr_pages) {
-               /* Switch to the real shared_info page, and clear the
-                * dummy page. */
-               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
+       /* Switch to the real shared_info page, and clear the
+        * dummy page. */
+       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+       memset(empty_zero_page, 0, sizeof(empty_zero_page));
 
        /* Setup mapping of lower 1st MB */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Tue Jun 06 13:25:31 2006 -0500
@@ -665,13 +665,6 @@ void __init setup_arch(char **cmdline_p)
 
        setup_xen_features();
 
-       if (xen_feature(XENFEAT_auto_translated_physmap) &&
-           xen_start_info->shared_info < xen_start_info->nr_pages) {
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)__va(xen_start_info->shared_info);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
-
        HYPERVISOR_vm_assist(VMASST_CMD_enable,
                             VMASST_TYPE_writable_pagetables);
 
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c    Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c    Tue Jun 06 13:25:31 2006 -0500
@@ -666,7 +666,18 @@ void __meminit init_memory_mapping(unsig
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
        }
 
-       BUG_ON(!after_bootmem && start_pfn != table_end);
+       if (!after_bootmem) {
+               BUG_ON(start_pfn != table_end);
+               /*
+                * Destroy the temporary mappings created above. Prevents
+                * overlap with modules area (if init mapping is very big).
+                */
+               start = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+               end   = __START_KERNEL_map + (table_end   << PAGE_SHIFT);
+               for (; start < end; start += PAGE_SIZE)
+                       WARN_ON(HYPERVISOR_update_va_mapping(
+                               start, __pte_ma(0), 0));
+       }
 
        __flush_tlb_all();
 }
@@ -752,15 +763,11 @@ void __init paging_init(void)
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
 
-       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
-           xen_start_info->shared_info >= xen_start_info->nr_pages) {
-               /* Switch to the real shared_info page, and clear the
-                * dummy page. */
-               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
+       /* Switch to the real shared_info page, and clear the
+        * dummy page. */
+       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+       memset(empty_zero_page, 0, sizeof(empty_zero_page));
 
        init_mm.context.pinned = 1;
 
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/netback/netback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Tue Jun 06 13:25:31 2006 -0500
@@ -458,6 +458,9 @@ inline static void net_tx_action_dealloc
        dc = dealloc_cons;
        dp = dealloc_prod;
 
+       /* Ensure we see all indexes enqueued by netif_idx_release(). */
+       smp_rmb();
+
        /*
         * Free up any grants we have finished using
         */
@@ -487,6 +490,177 @@ inline static void net_tx_action_dealloc
        }
 }
 
+static void netbk_tx_err(netif_t *netif, RING_IDX end)
+{
+       RING_IDX cons = netif->tx.req_cons;
+
+       do {
+               netif_tx_request_t *txp = RING_GET_REQUEST(&netif->tx, cons);
+               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
+       } while (++cons < end);
+       netif->tx.req_cons = cons;
+       netif_schedule_work(netif);
+       netif_put(netif);
+}
+
+static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp,
+                               int work_to_do)
+{
+       netif_tx_request_t *first = txp;
+       RING_IDX cons = netif->tx.req_cons;
+       int frags = 1;
+
+       while (txp->flags & NETTXF_more_data) {
+               if (frags >= work_to_do) {
+                       DPRINTK("Need more frags\n");
+                       return -frags;
+               }
+
+               txp = RING_GET_REQUEST(&netif->tx, cons + frags);
+               if (txp->size > first->size) {
+                       DPRINTK("Frags galore\n");
+                       return -frags;
+               }
+
+               first->size -= txp->size;
+               frags++;
+
+               if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+                       DPRINTK("txp->offset: %x, size: %u\n",
+                               txp->offset, txp->size);
+                       return -frags;
+               }
+       }
+
+       return frags;
+}
+
+static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
+                                                 struct sk_buff *skb,
+                                                 gnttab_map_grant_ref_t *mop)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       skb_frag_t *frags = shinfo->frags;
+       netif_tx_request_t *txp;
+       unsigned long pending_idx = *((u16 *)skb->data);
+       RING_IDX cons = netif->tx.req_cons + 1;
+       int i, start;
+
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+       for (i = start; i < shinfo->nr_frags; i++) {
+               txp = RING_GET_REQUEST(&netif->tx, cons++);
+               pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
+
+               gnttab_set_map_op(mop++, MMAP_VADDR(pending_idx),
+                                 GNTMAP_host_map | GNTMAP_readonly,
+                                 txp->gref, netif->domid);
+
+               memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+               netif_get(netif);
+               pending_tx_info[pending_idx].netif = netif;
+               frags[i].page = (void *)pending_idx;
+       }
+
+       return mop;
+}
+
+static int netbk_tx_check_mop(struct sk_buff *skb,
+                              gnttab_map_grant_ref_t **mopp)
+{
+       gnttab_map_grant_ref_t *mop = *mopp;
+       int pending_idx = *((u16 *)skb->data);
+       netif_t *netif = pending_tx_info[pending_idx].netif;
+       netif_tx_request_t *txp;
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int nr_frags = shinfo->nr_frags;
+       int i, err, start;
+
+       /* Check status of header. */
+       err = mop->status;
+       if (unlikely(err)) {
+               txp = &pending_tx_info[pending_idx].req;
+               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
+               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+               netif_put(netif);
+       } else {
+               set_phys_to_machine(
+                       __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
+                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
+               grant_tx_handle[pending_idx] = mop->handle;
+       }
+
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+       for (i = start; i < nr_frags; i++) {
+               int j, newerr;
+
+               pending_idx = (unsigned long)shinfo->frags[i].page;
+
+               /* Check error status: if okay then remember grant handle. */
+               newerr = (++mop)->status;
+               if (likely(!newerr)) {
+                       set_phys_to_machine(
+                               __pa(MMAP_VADDR(pending_idx))>>PAGE_SHIFT,
+                               FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
+                       grant_tx_handle[pending_idx] = mop->handle;
+                       /* Had a previous error? Invalidate this fragment. */
+                       if (unlikely(err))
+                               netif_idx_release(pending_idx);
+                       continue;
+               }
+
+               /* Error on this fragment: respond to client with an error. */
+               txp = &pending_tx_info[pending_idx].req;
+               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
+               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+               netif_put(netif);
+
+               /* Not the first error? Preceding frags already invalidated. */
+               if (err)
+                       continue;
+
+               /* First error: invalidate header and preceding fragments. */
+               pending_idx = *((u16 *)skb->data);
+               netif_idx_release(pending_idx);
+               for (j = start; j < i; j++) {
+                       pending_idx = (unsigned long)shinfo->frags[i].page;
+                       netif_idx_release(pending_idx);
+               }
+
+               /* Remember the error: invalidate all subsequent fragments. */
+               err = newerr;
+       }
+
+       *mopp = mop + 1;
+       return err;
+}
+
+static void netbk_fill_frags(struct sk_buff *skb)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int nr_frags = shinfo->nr_frags;
+       int i;
+
+       for (i = 0; i < nr_frags; i++) {
+               skb_frag_t *frag = shinfo->frags + i;
+               netif_tx_request_t *txp;
+               unsigned long pending_idx;
+
+               pending_idx = (unsigned long)frag->page;
+               txp = &pending_tx_info[pending_idx].req;
+               frag->page = virt_to_page(MMAP_VADDR(pending_idx));
+               frag->size = txp->size;
+               frag->page_offset = txp->offset;
+
+               skb->len += txp->size;
+               skb->data_len += txp->size;
+               skb->truesize += txp->size;
+       }
+}
+
 /* Called after netfront has transmitted */
 static void net_tx_action(unsigned long unused)
 {
@@ -504,7 +678,7 @@ static void net_tx_action(unsigned long 
                net_tx_action_dealloc();
 
        mop = tx_map_ops;
-       while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
+       while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
                !list_empty(&net_schedule_list)) {
                /* Get a netif from the list with work to do. */
                ent = net_schedule_list.next;
@@ -552,38 +726,44 @@ static void net_tx_action(unsigned long 
                }
                netif->remaining_credit -= txreq.size;
 
-               netif->tx.req_cons++;
-
-               netif_schedule_work(netif);
-
-               if (unlikely(txreq.size < ETH_HLEN) || 
-                   unlikely(txreq.size > ETH_FRAME_LEN)) {
+               ret = netbk_count_requests(netif, &txreq, work_to_do);
+               if (unlikely(ret < 0)) {
+                       netbk_tx_err(netif, i - ret);
+                       continue;
+               }
+               i += ret;
+
+               if (unlikely(ret > MAX_SKB_FRAGS + 1)) {
+                       DPRINTK("Too many frags\n");
+                       netbk_tx_err(netif, i);
+                       continue;
+               }
+
+               if (unlikely(txreq.size < ETH_HLEN)) {
                        DPRINTK("Bad packet size: %d\n", txreq.size);
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       netbk_tx_err(netif, i);
                        continue; 
                }
 
                /* No crossing a page as the payload mustn't fragment. */
-               if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) {
+               if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
                        DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
                                txreq.offset, txreq.size, 
                                (txreq.offset &~PAGE_MASK) + txreq.size);
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       netbk_tx_err(netif, i);
                        continue;
                }
 
                pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
 
-               data_len = (txreq.size > PKT_PROT_LEN) ?
+               data_len = (txreq.size > PKT_PROT_LEN &&
+                           ret < MAX_SKB_FRAGS + 1) ?
                        PKT_PROT_LEN : txreq.size;
 
                skb = alloc_skb(data_len+16, GFP_ATOMIC);
                if (unlikely(skb == NULL)) {
                        DPRINTK("Can't allocate a skb in start_xmit.\n");
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       netbk_tx_err(netif, i);
                        break;
                }
 
@@ -600,9 +780,23 @@ static void net_tx_action(unsigned long 
                pending_tx_info[pending_idx].netif = netif;
                *((u16 *)skb->data) = pending_idx;
 
+               __skb_put(skb, data_len);
+
+               skb_shinfo(skb)->nr_frags = ret - 1;
+               if (data_len < txreq.size) {
+                       skb_shinfo(skb)->nr_frags++;
+                       skb_shinfo(skb)->frags[0].page =
+                               (void *)(unsigned long)pending_idx;
+               }
+
                __skb_queue_tail(&tx_queue, skb);
 
                pending_cons++;
+
+               mop = netbk_get_requests(netif, skb, mop);
+
+               netif->tx.req_cons = i;
+               netif_schedule_work(netif);
 
                if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
                        break;
@@ -617,75 +811,56 @@ static void net_tx_action(unsigned long 
 
        mop = tx_map_ops;
        while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
+               netif_tx_request_t *txp;
+
                pending_idx = *((u16 *)skb->data);
                netif       = pending_tx_info[pending_idx].netif;
-               memcpy(&txreq, &pending_tx_info[pending_idx].req,
-                      sizeof(txreq));
+               txp         = &pending_tx_info[pending_idx].req;
 
                /* Check the remap error code. */
-               if (unlikely(mop->status)) {
+               if (unlikely(netbk_tx_check_mop(skb, &mop))) {
                        printk(KERN_ALERT "#### netback grant fails\n");
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       skb_shinfo(skb)->nr_frags = 0;
                        kfree_skb(skb);
-                       mop++;
-                       pending_ring[MASK_PEND_IDX(pending_prod++)] =
-                               pending_idx;
                        continue;
                }
-               set_phys_to_machine(
-                       __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
-                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-               grant_tx_handle[pending_idx] = mop->handle;
-
-               data_len = (txreq.size > PKT_PROT_LEN) ?
-                       PKT_PROT_LEN : txreq.size;
-
-               __skb_put(skb, data_len);
+
+               data_len = skb->len;
                memcpy(skb->data, 
-                      (void *)(MMAP_VADDR(pending_idx)|txreq.offset),
+                      (void *)(MMAP_VADDR(pending_idx)|txp->offset),
                       data_len);
-               if (data_len < txreq.size) {
+               if (data_len < txp->size) {
                        /* Append the packet payload as a fragment. */
-                       skb_shinfo(skb)->frags[0].page        = 
-                               virt_to_page(MMAP_VADDR(pending_idx));
-                       skb_shinfo(skb)->frags[0].size        =
-                               txreq.size - data_len;
-                       skb_shinfo(skb)->frags[0].page_offset = 
-                               txreq.offset + data_len;
-                       skb_shinfo(skb)->nr_frags = 1;
+                       txp->offset += data_len;
+                       txp->size -= data_len;
                } else {
                        /* Schedule a response immediately. */
                        netif_idx_release(pending_idx);
                }
-
-               skb->data_len  = txreq.size - data_len;
-               skb->len      += skb->data_len;
-               skb->truesize += skb->data_len;
-
-               skb->dev      = netif->dev;
-               skb->protocol = eth_type_trans(skb, skb->dev);
 
                /*
                 * Old frontends do not assert data_validated but we
                 * can infer it from csum_blank so test both flags.
                 */
-               if (txreq.flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
+               if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
                        skb->proto_data_valid = 1;
                } else {
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->proto_data_valid = 0;
                }
-               skb->proto_csum_blank = !!(txreq.flags & NETTXF_csum_blank);
-
-               netif->stats.rx_bytes += txreq.size;
+               skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
+
+               netbk_fill_frags(skb);
+
+               skb->dev      = netif->dev;
+               skb->protocol = eth_type_trans(skb, skb->dev);
+
+               netif->stats.rx_bytes += skb->len;
                netif->stats.rx_packets++;
 
                netif_rx(skb);
                netif->dev->last_rx = jiffies;
-
-               mop++;
        }
 }
 
@@ -695,7 +870,10 @@ static void netif_idx_release(u16 pendin
        unsigned long flags;
 
        spin_lock_irqsave(&_lock, flags);
-       dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
+       dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
+       /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
+       smp_wmb();
+       dealloc_prod++;
        spin_unlock_irqrestore(&_lock, flags);
 
        tasklet_schedule(&net_tx_tasklet);
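
The netif_idx_release() change above pairs with the smp_rmb() added earlier in net_tx_action_dealloc(): the index must be written into dealloc_ring before dealloc_prod is advanced, and the consumer must observe the new producer index before it reads the ring slots. A minimal user-space sketch of that publish/consume ordering, using C11 release/acquire in place of the kernel's smp_wmb()/smp_rmb() (the ring size and names here are illustrative, and the real code additionally holds a spinlock on the producer side):

/* Single-producer/single-consumer index ring; ordering only. */
#include <stdatomic.h>
#include <stdint.h>

#define RING_SIZE 256u
#define MASK(i)   ((i) & (RING_SIZE - 1))

static uint16_t ring[RING_SIZE];
static atomic_uint prod;        /* published producer index */
static unsigned int cons;       /* consumer-private index   */

static void produce(uint16_t idx)
{
        unsigned int p = atomic_load_explicit(&prod, memory_order_relaxed);

        ring[MASK(p)] = idx;                         /* 1. fill the slot   */
        atomic_store_explicit(&prod, p + 1,
                              memory_order_release); /* 2. then publish    */
}

static int consume(uint16_t *idx)
{
        unsigned int p = atomic_load_explicit(&prod,
                                              memory_order_acquire);
        if (cons == p)
                return 0;                            /* ring empty         */
        *idx = ring[MASK(cons++)];                   /* slot now visible   */
        return 1;
}
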
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Tue Jun 06 13:25:31 2006 -0500
@@ -69,6 +69,8 @@ static int netback_probe(struct xenbus_d
 static int netback_probe(struct xenbus_device *dev,
                         const struct xenbus_device_id *id)
 {
+       const char *message;
+       xenbus_transaction_t xbt;
        int err;
        struct backend_info *be = kzalloc(sizeof(struct backend_info),
                                          GFP_KERNEL);
@@ -86,6 +88,27 @@ static int netback_probe(struct xenbus_d
        if (err)
                goto fail;
 
+       do {
+               err = xenbus_transaction_start(&xbt);
+               if (err) {
+                       xenbus_dev_fatal(dev, err, "starting transaction");
+                       goto fail;
+               }
+
+               err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
+               if (err) {
+                       message = "writing feature-sg";
+                       goto abort_transaction;
+               }
+
+               err = xenbus_transaction_end(xbt, 0);
+       } while (err == -EAGAIN);
+
+       if (err) {
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto fail;
+       }
+
        err = xenbus_switch_state(dev, XenbusStateInitWait);
        if (err) {
                goto fail;
@@ -93,6 +116,9 @@ static int netback_probe(struct xenbus_d
 
        return 0;
 
+abort_transaction:
+       xenbus_transaction_end(xbt, 1);
+       xenbus_dev_fatal(dev, err, "%s", message);
 fail:
        DPRINTK("failed");
        netback_remove(dev);
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Tue Jun 06 13:25:31 2006 -0500
@@ -45,6 +45,7 @@
 #include <linux/bitops.h>
 #include <linux/ethtool.h>
 #include <linux/in.h>
+#include <linux/if_ether.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/arp.h>
@@ -173,6 +174,11 @@ static void xennet_sysfs_delif(struct ne
 #define xennet_sysfs_delif(dev) do { } while(0)
 #endif
 
+static inline int xennet_can_sg(struct net_device *dev)
+{
+       return dev->features & NETIF_F_SG;
+}
+
 /**
  * Entry point to this code when a new device is created.  Allocate the basic
  * structures and the ring buffers for communication with the backend, and
@@ -307,8 +313,6 @@ again:
                goto destroy_ring;
        }
 
-       xenbus_switch_state(dev, XenbusStateConnected);
-
        return 0;
 
  abort_transaction:
@@ -370,12 +374,9 @@ static int setup_device(struct xenbus_de
                goto fail;
 
        memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
-       network_connect(netdev);
        info->irq = bind_evtchn_to_irqhandler(
                info->evtchn, netif_int, SA_SAMPLE_RANDOM, netdev->name,
                netdev);
-       (void)send_fake_arp(netdev);
-       show_device(info);
 
        return 0;
 
@@ -391,15 +392,24 @@ static void backend_changed(struct xenbu
 static void backend_changed(struct xenbus_device *dev,
                            enum xenbus_state backend_state)
 {
+       struct netfront_info *np = dev->data;
+       struct net_device *netdev = np->netdev;
+
        DPRINTK("\n");
 
        switch (backend_state) {
        case XenbusStateInitialising:
-       case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateConnected:
        case XenbusStateUnknown:
        case XenbusStateClosed:
+               break;
+
+       case XenbusStateInitWait:
+               network_connect(netdev);
+               xenbus_switch_state(dev, XenbusStateConnected);
+               (void)send_fake_arp(netdev);
+               show_device(np);
                break;
 
        case XenbusStateClosing:
@@ -452,13 +462,17 @@ static int network_open(struct net_devic
        return 0;
 }
 
+static inline int netfront_tx_slot_available(struct netfront_info *np)
+{
+       return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 1;
+}
+
 static inline void network_maybe_wake_tx(struct net_device *dev)
 {
        struct netfront_info *np = netdev_priv(dev);
 
        if (unlikely(netif_queue_stopped(dev)) &&
-           !RING_FULL(&np->tx) &&
-           !gnttab_empty_grant_references(&np->gref_tx_head) &&
+           netfront_tx_slot_available(np) &&
            likely(netif_running(dev)))
                netif_wake_queue(dev);
 }
@@ -485,7 +499,7 @@ static void network_tx_buf_gc(struct net
                                printk(KERN_ALERT "network_tx_buf_gc: warning "
                                       "-- grant still in use by backend "
                                       "domain.\n");
-                               break; /* bail immediately */
+                               BUG();
                        }
                        gnttab_end_foreign_access_ref(
                                np->grant_tx_ref[id], GNTMAP_readonly);
@@ -638,36 +652,95 @@ static void network_alloc_rx_buffers(str
        RING_PUSH_REQUESTS(&np->rx);
 }
 
+static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
+                             struct netif_tx_request *tx)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       char *data = skb->data;
+       unsigned long mfn;
+       RING_IDX prod = np->tx.req_prod_pvt;
+       int frags = skb_shinfo(skb)->nr_frags;
+       unsigned int offset = offset_in_page(data);
+       unsigned int len = skb_headlen(skb);
+       unsigned int id;
+       grant_ref_t ref;
+       int i;
+
+       while (len > PAGE_SIZE - offset) {
+               tx->size = PAGE_SIZE - offset;
+               tx->flags |= NETTXF_more_data;
+               len -= tx->size;
+               data += tx->size;
+               offset = 0;
+
+               id = get_id_from_freelist(np->tx_skbs);
+               np->tx_skbs[id] = skb_get(skb);
+               tx = RING_GET_REQUEST(&np->tx, prod++);
+               tx->id = id;
+               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+               BUG_ON((signed short)ref < 0);
+
+               mfn = virt_to_mfn(data);
+               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+                                               mfn, GNTMAP_readonly);
+
+               tx->gref = np->grant_tx_ref[id] = ref;
+               tx->offset = offset;
+               tx->size = len;
+               tx->flags = 0;
+       }
+
+       for (i = 0; i < frags; i++) {
+               skb_frag_t *frag = skb_shinfo(skb)->frags + i;
+
+               tx->flags |= NETTXF_more_data;
+
+               id = get_id_from_freelist(np->tx_skbs);
+               np->tx_skbs[id] = skb_get(skb);
+               tx = RING_GET_REQUEST(&np->tx, prod++);
+               tx->id = id;
+               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+               BUG_ON((signed short)ref < 0);
+
+               mfn = pfn_to_mfn(page_to_pfn(frag->page));
+               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+                                               mfn, GNTMAP_readonly);
+
+               tx->gref = np->grant_tx_ref[id] = ref;
+               tx->offset = frag->page_offset;
+               tx->size = frag->size;
+               tx->flags = 0;
+       }
+
+       np->tx.req_prod_pvt = prod;
+}
 
 static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        unsigned short id;
        struct netfront_info *np = netdev_priv(dev);
        struct netif_tx_request *tx;
+       char *data = skb->data;
        RING_IDX i;
        grant_ref_t ref;
        unsigned long mfn;
        int notify;
-
-       if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
-                    PAGE_SIZE)) {
-               struct sk_buff *nskb;
-               nskb = __dev_alloc_skb(skb->len, GFP_ATOMIC|__GFP_NOWARN);
-               if (unlikely(nskb == NULL))
-                       goto drop;
-               skb_put(nskb, skb->len);
-               memcpy(nskb->data, skb->data, skb->len);
-               /* Copy only the header fields we use in this driver. */
-               nskb->dev = skb->dev;
-               nskb->ip_summed = skb->ip_summed;
-               nskb->proto_data_valid = skb->proto_data_valid;
-               dev_kfree_skb(skb);
-               skb = nskb;
+       int frags = skb_shinfo(skb)->nr_frags;
+       unsigned int offset = offset_in_page(data);
+       unsigned int len = skb_headlen(skb);
+
+       frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
+       if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
+               printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
+                      frags);
+               dump_stack();
+               goto drop;
        }
 
        spin_lock_irq(&np->tx_lock);
 
-       if (unlikely(!netif_carrier_ok(dev))) {
+       if (unlikely(!netif_carrier_ok(dev) ||
+                    (frags > 1 && !xennet_can_sg(dev)))) {
                spin_unlock_irq(&np->tx_lock);
                goto drop;
        }
@@ -682,12 +755,12 @@ static int network_start_xmit(struct sk_
        tx->id   = id;
        ref = gnttab_claim_grant_reference(&np->gref_tx_head);
        BUG_ON((signed short)ref < 0);
-       mfn = virt_to_mfn(skb->data);
+       mfn = virt_to_mfn(data);
        gnttab_grant_foreign_access_ref(
                ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
        tx->gref = np->grant_tx_ref[id] = ref;
-       tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
-       tx->size = skb->len;
+       tx->offset = offset;
+       tx->size = len;
 
        tx->flags = 0;
        if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
@@ -696,14 +769,17 @@ static int network_start_xmit(struct sk_
                tx->flags |= NETTXF_data_validated;
 
        np->tx.req_prod_pvt = i + 1;
+
+       xennet_make_frags(skb, dev, tx);
+       tx->size = skb->len;
+
        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
        if (notify)
                notify_remote_via_irq(np->irq);
 
        network_tx_buf_gc(dev);
 
-       if (RING_FULL(&np->tx) ||
-           gnttab_empty_grant_references(&np->gref_tx_head))
+       if (!netfront_tx_slot_available(np))
                netif_stop_queue(dev);
 
        spin_unlock_irq(&np->tx_lock);
@@ -963,12 +1039,46 @@ static struct net_device_stats *network_
        return &np->stats;
 }
 
+static int xennet_change_mtu(struct net_device *dev, int mtu)
+{
+       int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+       if (mtu > max)
+               return -EINVAL;
+       dev->mtu = mtu;
+       return 0;
+}
+
+static int xennet_set_sg(struct net_device *dev, u32 data)
+{
+       if (data) {
+               struct netfront_info *np = netdev_priv(dev);
+               int val;
+
+               if (xenbus_scanf(XBT_NULL, np->xbdev->otherend, "feature-sg",
+                                "%d", &val) < 0)
+                       val = 0;
+               if (!val)
+                       return -ENOSYS;
+       } else if (dev->mtu > ETH_DATA_LEN)
+               dev->mtu = ETH_DATA_LEN;
+
+       return ethtool_op_set_sg(dev, data);
+}
+
+static void xennet_set_features(struct net_device *dev)
+{
+       xennet_set_sg(dev, 1);
+}
+
 static void network_connect(struct net_device *dev)
 {
        struct netfront_info *np;
        int i, requeue_idx;
        struct netif_tx_request *tx;
        struct sk_buff *skb;
+
+       xennet_set_features(dev);
 
        np = netdev_priv(dev);
        spin_lock_irq(&np->tx_lock);
@@ -1081,6 +1191,8 @@ static struct ethtool_ops network_ethtoo
 {
        .get_tx_csum = ethtool_op_get_tx_csum,
        .set_tx_csum = ethtool_op_set_tx_csum,
+       .get_sg = ethtool_op_get_sg,
+       .set_sg = xennet_set_sg,
 };
 
 #ifdef CONFIG_SYSFS
@@ -1297,6 +1409,7 @@ static struct net_device * __devinit cre
        netdev->poll            = netif_poll;
        netdev->set_multicast_list = network_set_multicast_list;
        netdev->uninit          = netif_uninit;
+       netdev->change_mtu      = xennet_change_mtu;
        netdev->weight          = 64;
        netdev->features        = NETIF_F_IP_CSUM;
 
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c
--- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c        Tue Jun 06 13:25:31 2006 -0500
@@ -61,11 +61,11 @@ static int privcmd_ioctl(struct inode *i
                __asm__ __volatile__ (
                        "pushl %%ebx; pushl %%ecx; pushl %%edx; "
                        "pushl %%esi; pushl %%edi; "
-                       "movl  4(%%eax),%%ebx ;"
-                       "movl  8(%%eax),%%ecx ;"
-                       "movl 12(%%eax),%%edx ;"
-                       "movl 16(%%eax),%%esi ;"
-                       "movl 20(%%eax),%%edi ;"
+                       "movl  8(%%eax),%%ebx ;"
+                       "movl 16(%%eax),%%ecx ;"
+                       "movl 24(%%eax),%%edx ;"
+                       "movl 32(%%eax),%%esi ;"
+                       "movl 40(%%eax),%%edi ;"
                        "movl   (%%eax),%%eax ;"
                        "shll $5,%%eax ;"
                        "addl $hypercall_page,%%eax ;"
@@ -161,7 +161,7 @@ static int privcmd_ioctl(struct inode *i
        case IOCTL_PRIVCMD_MMAPBATCH: {
                privcmd_mmapbatch_t m;
                struct vm_area_struct *vma = NULL;
-               unsigned long __user *p;
+               xen_pfn_t __user *p;
                unsigned long addr, mfn; 
                int i;
 
@@ -210,7 +210,7 @@ static int privcmd_ioctl(struct inode *i
        batch_err:
                printk("batch_err ret=%d vma=%p addr=%lx "
                       "num=%d arr=%p %lx-%lx\n", 
-                      ret, vma, m.addr, m.num, m.arr,
+                      ret, vma, (unsigned long)m.addr, m.num, m.arr,
                       vma ? vma->vm_start : 0, vma ? vma->vm_end : 0);
                break;
        }
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h  Tue Jun 06 13:25:31 2006 -0500
@@ -61,13 +61,6 @@ static void __init machine_specific_arch
                .address = { __KERNEL_CS, (unsigned long)nmi },
        };
 
-       if (xen_feature(XENFEAT_auto_translated_physmap) &&
-           xen_start_info->shared_info < xen_start_info->nr_pages) {
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)__va(xen_start_info->shared_info);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
-
        ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
        if (ret == 0)
                ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/include/xen/public/privcmd.h
--- a/linux-2.6-xen-sparse/include/xen/public/privcmd.h Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/include/xen/public/privcmd.h Tue Jun 06 13:25:31 2006 -0500
@@ -33,20 +33,22 @@
 #ifndef __LINUX_PUBLIC_PRIVCMD_H__
 #define __LINUX_PUBLIC_PRIVCMD_H__
 
+#include <linux/types.h>
+
 #ifndef __user
 #define __user
 #endif
 
 typedef struct privcmd_hypercall
 {
-       unsigned long op;
-       unsigned long arg[5];
+       __u64 op;
+       __u64 arg[5];
 } privcmd_hypercall_t;
 
 typedef struct privcmd_mmap_entry {
-       unsigned long va;
-       unsigned long mfn;
-       unsigned long npages;
+       __u64 va;
+       __u64 mfn;
+       __u64 npages;
 } privcmd_mmap_entry_t; 
 
 typedef struct privcmd_mmap {
@@ -58,8 +60,8 @@ typedef struct privcmd_mmapbatch {
 typedef struct privcmd_mmapbatch {
        int num;     /* number of pages to populate */
        domid_t dom; /* target domain */
-       unsigned long addr;  /* virtual address */
-       unsigned long __user *arr; /* array of mfns - top nibble set on err */
+       __u64 addr;  /* virtual address */
+       xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
 } privcmd_mmapbatch_t; 
 
 /*
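
The privcmd changes above, both the 8-byte argument offsets in the 32-bit hypercall trampoline and the move to __u64/xen_pfn_t fields here, make the user/kernel ioctl layout independent of sizeof(unsigned long), so 32-bit tools keep working against a 64-bit kernel. A small compile-time sketch of the difference (the struct names are illustrative, and __u64 is stood in for by uint64_t):

#include <stdint.h>

struct old_hypercall { unsigned long op; unsigned long arg[5]; };
struct new_hypercall { uint64_t      op; uint64_t      arg[5]; };

/* With 'unsigned long' the structure is 24 bytes on an ILP32 build and
 * 48 bytes on an LP64 build, so the same ioctl number would describe two
 * different layouts.  With fixed-width fields it is 48 bytes on both,
 * which is also why the i386 trampoline now loads args at 8-byte strides. */
_Static_assert(sizeof(struct new_hypercall) == 48,
               "fixed-width layout is the same on 32- and 64-bit builds");

int main(void)
{
        return 0;
}
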
diff -r d3e181fa238b -r 156a0963a1ae tools/debugger/libxendebug/xendebug.c
--- a/tools/debugger/libxendebug/xendebug.c     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/debugger/libxendebug/xendebug.c     Tue Jun 06 13:25:31 2006 -0500
@@ -57,7 +57,7 @@ typedef struct domain_context           
     vcpu_guest_context_t context[MAX_VIRT_CPUS];
 
     long            total_pages;
-    unsigned long  *page_array;
+    xen_pfn_t      *page_array;
 
     unsigned long   cr3_phys[MAX_VIRT_CPUS];
     unsigned long  *cr3_virt[MAX_VIRT_CPUS];
diff -r d3e181fa238b -r 156a0963a1ae tools/firmware/vmxassist/vm86.c
--- a/tools/firmware/vmxassist/vm86.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/firmware/vmxassist/vm86.c   Tue Jun 06 13:25:31 2006 -0500
@@ -36,6 +36,8 @@
 
 static unsigned prev_eip = 0;
 enum vm86_mode mode = 0;
+
+static struct regs saved_rm_regs;
 
 #ifdef DEBUG
 int traceset = 0;
@@ -795,6 +797,8 @@ protected_mode(struct regs *regs)
        oldctx.esp = regs->uesp;
        oldctx.eflags = regs->eflags;
 
+       memset(&saved_rm_regs, 0, sizeof(struct regs));
+
        /* reload all segment registers */
        if (!load_seg(regs->cs, &oldctx.cs_base,
                                &oldctx.cs_limit, &oldctx.cs_arbytes))
@@ -808,6 +812,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.es_base,
                            &oldctx.es_limit, &oldctx.es_arbytes);
                oldctx.es_sel = 0;
+               saved_rm_regs.ves = regs->ves;
        }
 
        if (load_seg(regs->uss, &oldctx.ss_base,
@@ -817,6 +822,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.ss_base,
                            &oldctx.ss_limit, &oldctx.ss_arbytes);
                oldctx.ss_sel = 0;
+               saved_rm_regs.uss = regs->uss;
        }
 
        if (load_seg(regs->vds, &oldctx.ds_base,
@@ -826,6 +832,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.ds_base,
                            &oldctx.ds_limit, &oldctx.ds_arbytes);
                oldctx.ds_sel = 0;
+               saved_rm_regs.vds = regs->vds;
        }
 
        if (load_seg(regs->vfs, &oldctx.fs_base,
@@ -835,6 +842,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.fs_base,
                            &oldctx.fs_limit, &oldctx.fs_arbytes);
                oldctx.fs_sel = 0;
+               saved_rm_regs.vfs = regs->vfs;
        }
 
        if (load_seg(regs->vgs, &oldctx.gs_base,
@@ -844,6 +852,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.gs_base,
                            &oldctx.gs_limit, &oldctx.gs_arbytes);
                oldctx.gs_sel = 0;
+               saved_rm_regs.vgs = regs->vgs;
        }
 
        /* initialize jump environment to warp back to protected mode */
@@ -880,16 +889,22 @@ real_mode(struct regs *regs)
                if (regs->uss >= HIGHMEM)
                        panic("%%ss 0x%lx higher than 1MB", regs->uss);
                regs->uss = address(regs, regs->uss, 0) >> 4;
+       } else {
+         regs->uss = saved_rm_regs.uss;
        }
        if (regs->vds != 0) {
                if (regs->vds >= HIGHMEM)
                        panic("%%ds 0x%lx higher than 1MB", regs->vds);
                regs->vds = address(regs, regs->vds, 0) >> 4;
+       } else {
+         regs->vds = saved_rm_regs.vds;
        }
        if (regs->ves != 0) {
                if (regs->ves >= HIGHMEM)
                        panic("%%es 0x%lx higher than 1MB", regs->ves);
                regs->ves = address(regs, regs->ves, 0) >> 4;
+       } else {
+         regs->ves = saved_rm_regs.ves;
        }
 
        /* this should get us into 16-bit mode */
@@ -971,6 +986,39 @@ jmpl(struct regs *regs, int prefix)
        } else if (mode == VM86_PROTECTED_TO_REAL) { /* jump to real mode */
                eip = (prefix & DATA32) ? fetch32(regs) : fetch16(regs);
                cs = fetch16(regs);
+
+               TRACE((regs, (regs->eip - n) + 1, "jmpl 0x%x:0x%x", cs, eip));
+
+                regs->cs = cs;
+                regs->eip = eip;
+               set_mode(regs, VM86_REAL);
+       } else
+               panic("jmpl");
+}
+
+static void
+jmpl_indirect(struct regs *regs, int prefix, unsigned modrm)
+{
+       unsigned n = regs->eip;
+       unsigned cs, eip;
+       unsigned addr;
+
+       addr  = operand(prefix, regs, modrm);
+
+       if (mode == VM86_REAL_TO_PROTECTED) { /* jump to protected mode */
+               eip = (prefix & DATA32) ? read32(addr) : read16(addr);
+               addr += (prefix & DATA32) ? 4 : 2;
+               cs = read16(addr);
+
+               TRACE((regs, (regs->eip - n) + 1, "jmpl 0x%x:0x%x", cs, eip));
+
+                regs->cs = cs;
+                regs->eip = eip;
+               set_mode(regs, VM86_PROTECTED);
+       } else if (mode == VM86_PROTECTED_TO_REAL) { /* jump to real mode */
+               eip = (prefix & DATA32) ? read32(addr) : read16(addr);
+               addr += (prefix & DATA32) ? 4 : 2;
+               cs = read16(addr);
 
                TRACE((regs, (regs->eip - n) + 1, "jmpl 0x%x:0x%x", cs, eip));
 
@@ -1306,6 +1354,23 @@ opcode(struct regs *regs)
                        }
                        goto invalid;
 
+               case 0xFF: /* jmpl (indirect) */
+                       if ((mode == VM86_REAL_TO_PROTECTED) ||
+                           (mode == VM86_PROTECTED_TO_REAL)) {
+                               unsigned modrm = fetch8(regs);
+                               
+                               switch((modrm >> 3) & 7) {
+                               case 5:
+                                 jmpl_indirect(regs, prefix, modrm);
+                                 return OPC_INVALID;
+
+                               default:
+                                 break;
+                               }
+
+                       }
+                       goto invalid;
+
                case 0xEB: /* short jump */
                        if ((mode == VM86_REAL_TO_PROTECTED) ||
                            (mode == VM86_PROTECTED_TO_REAL)) {
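
The new 0xFF case above dispatches on the ModR/M reg field and handles only /5, the far indirect jump (jmp m16:16 / m16:32); every other FF /r encoding still falls through to the invalid-opcode path. A tiny decoder sketch of that reg-field dispatch (the table and test byte are illustrative, not vmxassist code):

#include <stdio.h>

static const char *ff_group(unsigned char modrm)
{
        switch ((modrm >> 3) & 7) {
        case 2:  return "CALL near indirect";
        case 3:  return "CALL far indirect";
        case 4:  return "JMP near indirect";
        case 5:  return "JMP far indirect";   /* handled by jmpl_indirect() */
        default: return "other FF /r encoding";
        }
}

int main(void)
{
        /* 0x2d: mod=00, reg=101 (5), r/m=101 -> a far indirect jump. */
        printf("%s\n", ff_group(0x2d));       /* prints: JMP far indirect */
        return 0;
}
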
diff -r d3e181fa238b -r 156a0963a1ae tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/ioemu/hw/cirrus_vga.c       Tue Jun 06 13:25:31 2006 -0500
@@ -2462,7 +2462,7 @@ extern FILE *logfile;
 extern FILE *logfile;
 static void * set_vram_mapping(unsigned long begin, unsigned long end)
 {
-    unsigned long * extent_start = NULL;
+    xen_pfn_t *extent_start = NULL;
     unsigned long nr_extents;
     void *vram_pointer = NULL;
     int i;
@@ -2473,14 +2473,14 @@ static void * set_vram_mapping(unsigned 
     end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
     nr_extents = (end - begin) >> TARGET_PAGE_BITS;
 
-    extent_start = malloc(sizeof(unsigned long) * nr_extents );
+    extent_start = malloc(sizeof(xen_pfn_t) * nr_extents );
     if (extent_start == NULL)
     {
         fprintf(stderr, "Failed malloc on set_vram_mapping\n");
         return NULL;
     }
 
-    memset(extent_start, 0, sizeof(unsigned long) * nr_extents);
+    memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
 
     for (i = 0; i < nr_extents; i++)
     {
@@ -2508,7 +2508,7 @@ static void * set_vram_mapping(unsigned 
 
 static int unset_vram_mapping(unsigned long begin, unsigned long end)
 {
-    unsigned long * extent_start = NULL;
+    xen_pfn_t *extent_start = NULL;
     unsigned long nr_extents;
     int i;
 
@@ -2519,7 +2519,7 @@ static int unset_vram_mapping(unsigned l
     end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
     nr_extents = (end - begin) >> TARGET_PAGE_BITS;
 
-    extent_start = malloc(sizeof(unsigned long) * nr_extents );
+    extent_start = malloc(sizeof(xen_pfn_t) * nr_extents );
 
     if (extent_start == NULL)
     {
@@ -2527,7 +2527,7 @@ static int unset_vram_mapping(unsigned l
         return -1;
     }
 
-    memset(extent_start, 0, sizeof(unsigned long) * nr_extents);
+    memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
 
     for (i = 0; i < nr_extents; i++)
         extent_start[i] = (begin + (i * TARGET_PAGE_SIZE)) >> TARGET_PAGE_BITS;
diff -r d3e181fa238b -r 156a0963a1ae tools/ioemu/vl.c
--- a/tools/ioemu/vl.c  Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/ioemu/vl.c  Tue Jun 06 13:25:31 2006 -0500
@@ -2458,7 +2458,7 @@ int unset_mm_mapping(int xc_handle,
                      uint32_t domid,
                      unsigned long nr_pages,
                      unsigned int address_bits,
-                     unsigned long *extent_start)
+                     xen_pfn_t *extent_start)
 {
     int err = 0;
     xc_dominfo_t info;
@@ -2491,7 +2491,7 @@ int set_mm_mapping(int xc_handle,
                     uint32_t domid,
                     unsigned long nr_pages,
                     unsigned int address_bits,
-                    unsigned long *extent_start)
+                    xen_pfn_t *extent_start)
 {
     xc_dominfo_t info;
     int err = 0;
@@ -2557,7 +2557,8 @@ int main(int argc, char **argv)
     int serial_device_index;
     char qemu_dm_logfilename[64];
     const char *loadvm = NULL;
-    unsigned long nr_pages, *page_array;
+    unsigned long nr_pages;
+    xen_pfn_t *page_array;
     extern void *shared_page;
 
 #if !defined(CONFIG_SOFTMMU)
@@ -3023,8 +3024,8 @@ int main(int argc, char **argv)
 
     xc_handle = xc_interface_open();
 
-    if ( (page_array = (unsigned long *)
-                        malloc(nr_pages * sizeof(unsigned long))) == NULL)
+    if ( (page_array = (xen_pfn_t *)
+                        malloc(nr_pages * sizeof(xen_pfn_t))) == NULL)
     {
         fprintf(logfile, "malloc returned error %d\n", errno);
         exit(-1);
@@ -3079,8 +3080,8 @@ int main(int argc, char **argv)
                                        page_array[0]);
 #endif
 
-    fprintf(logfile, "shared page at pfn:%lx, mfn: %lx\n", (nr_pages-1),
-           (page_array[nr_pages - 1]));
+    fprintf(logfile, "shared page at pfn:%lx, mfn: %"PRIx64"\n", (nr_pages-1),
+           (uint64_t)(page_array[nr_pages - 1]));
 
     /* we always create the cdrom drive, even if no disk is there */
     bdrv_init();
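
The log-message fix above shows the portable way to print a xen_pfn_t, which may be wider than unsigned long in a 32-bit build of the tools: cast to uint64_t and use the PRIx64 format macro instead of %lx. A minimal standalone sketch (the typedef is only an assumption for illustration; the real xen_pfn_t comes from the Xen public headers):

#include <inttypes.h>
#include <stdio.h>

typedef uint64_t xen_pfn_t;     /* assumption: 64-bit frame numbers */

int main(void)
{
        xen_pfn_t mfn = 0x12345678abcdULL;

        /* %lx would be wrong for a 64-bit value on a 32-bit build;
         * the cast plus PRIx64 is correct on both. */
        printf("shared page mfn: %"PRIx64"\n", (uint64_t)mfn);
        return 0;
}
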
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_core.c
--- a/tools/libxc/xc_core.c     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_core.c     Tue Jun 06 13:25:31 2006 -0500
@@ -28,7 +28,7 @@ xc_domain_dumpcore_via_callback(int xc_h
                                 dumpcore_rtn_t dump_rtn)
 {
     unsigned long nr_pages;
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     xc_dominfo_t info;
     int i, nr_vcpus = 0;
     char *dump_mem, *dump_mem_start = NULL;
@@ -70,7 +70,7 @@ xc_domain_dumpcore_via_callback(int xc_h
         sizeof(vcpu_guest_context_t)*nr_vcpus;
     dummy_len = (sizeof(struct xc_core_header) +
                  (sizeof(vcpu_guest_context_t) * nr_vcpus) +
-                 (nr_pages * sizeof(unsigned long)));
+                 (nr_pages * sizeof(xen_pfn_t)));
     header.xch_pages_offset = round_pgup(dummy_len);
 
     sts = dump_rtn(args, (char *)&header, sizeof(struct xc_core_header));
@@ -81,7 +81,7 @@ xc_domain_dumpcore_via_callback(int xc_h
     if ( sts != 0 )
         goto error_out;
 
-    if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
     {
         printf("Could not allocate memory\n");
         goto error_out;
@@ -91,7 +91,7 @@ xc_domain_dumpcore_via_callback(int xc_h
         printf("Could not get the page frame list\n");
         goto error_out;
     }
-    sts = dump_rtn(args, (char *)page_array, nr_pages * sizeof(unsigned long));
+    sts = dump_rtn(args, (char *)page_array, nr_pages * sizeof(xen_pfn_t));
     if ( sts != 0 )
         goto error_out;
 
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_domain.c   Tue Jun 06 13:25:31 2006 -0500
@@ -291,7 +291,7 @@ int xc_domain_memory_increase_reservatio
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
                                           unsigned int address_bits,
-                                          unsigned long *extent_start)
+                                          xen_pfn_t *extent_start)
 {
     int err;
     struct xen_memory_reservation reservation = {
@@ -324,7 +324,7 @@ int xc_domain_memory_decrease_reservatio
                                           uint32_t domid,
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
-                                          unsigned long *extent_start)
+                                          xen_pfn_t *extent_start)
 {
     int err;
     struct xen_memory_reservation reservation = {
@@ -363,7 +363,7 @@ int xc_domain_memory_populate_physmap(in
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
                                           unsigned int address_bits,
-                                          unsigned long *extent_start)
+                                          xen_pfn_t *extent_start)
 {
     int err;
     struct xen_memory_reservation reservation = {
@@ -392,8 +392,8 @@ int xc_domain_translate_gpfn_list(int xc
 int xc_domain_translate_gpfn_list(int xc_handle,
                                   uint32_t domid,
                                   unsigned long nr_gpfns,
-                                  unsigned long *gpfn_list,
-                                  unsigned long *mfn_list)
+                                  xen_pfn_t *gpfn_list,
+                                  xen_pfn_t *mfn_list)
 {
     struct xen_translate_gpfn_list op = {
         .domid        = domid,
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_hvm_build.c        Tue Jun 06 13:25:31 2006 -0500
@@ -135,7 +135,7 @@ static void set_hvm_info_checksum(struct
  * hvmloader will use this info to set BIOS accordingly
  */
 static int set_hvm_info(int xc_handle, uint32_t dom,
-                        unsigned long *pfn_list, unsigned int vcpus,
+                        xen_pfn_t *pfn_list, unsigned int vcpus,
                         unsigned int pae, unsigned int acpi, unsigned int apic)
 {
     char *va_map;
@@ -178,7 +178,7 @@ static int setup_guest(int xc_handle,
                        unsigned int store_evtchn,
                        unsigned long *store_mfn)
 {
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     unsigned long count, i;
     unsigned long long ptr;
     xc_mmu_t *mmu = NULL;
@@ -223,7 +223,7 @@ static int setup_guest(int xc_handle,
         goto error_out;
     }
 
-    if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory.\n");
         goto error_out;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_ia64_stubs.c
--- a/tools/libxc/xc_ia64_stubs.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_ia64_stubs.c       Tue Jun 06 13:25:31 2006 -0500
@@ -57,7 +57,7 @@ xc_plan9_build(int xc_handle,
 
 int xc_ia64_get_pfn_list(int xc_handle,
                          uint32_t domid,
-                         unsigned long *pfn_buf,
+                         xen_pfn_t *pfn_buf,
                          unsigned int start_page,
                          unsigned int nr_pages)
 {
@@ -65,7 +65,7 @@ int xc_ia64_get_pfn_list(int xc_handle,
     int num_pfns,ret;
     unsigned int __start_page, __nr_pages;
     unsigned long max_pfns;
-    unsigned long *__pfn_buf;
+    xen_pfn_t *__pfn_buf;
 
     __start_page = start_page;
     __nr_pages = nr_pages;
@@ -80,7 +80,7 @@ int xc_ia64_get_pfn_list(int xc_handle,
         set_xen_guest_handle(op.u.getmemlist.buffer, __pfn_buf);
 
         if ( (max_pfns != -1UL)
-            && mlock(__pfn_buf, __nr_pages * sizeof(unsigned long)) != 0 )
+            && mlock(__pfn_buf, __nr_pages * sizeof(xen_pfn_t)) != 0 )
         {
             PERROR("Could not lock pfn list buffer");
             return -1;
@@ -89,7 +89,7 @@ int xc_ia64_get_pfn_list(int xc_handle,
         ret = do_dom0_op(xc_handle, &op);
 
         if (max_pfns != -1UL)
-            (void)munlock(__pfn_buf, __nr_pages * sizeof(unsigned long));
+            (void)munlock(__pfn_buf, __nr_pages * sizeof(xen_pfn_t));
 
         if (max_pfns == -1UL)
             return 0;
@@ -122,10 +122,10 @@ int xc_ia64_copy_to_domain_pages(int xc_
 {
     // N.B. gva should be page aligned
 
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     int i;
 
-    if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL ){
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ){
         PERROR("Could not allocate memory");
         goto error_out;
     }
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux.c
--- a/tools/libxc/xc_linux.c    Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux.c    Tue Jun 06 13:25:31 2006 -0500
@@ -28,7 +28,7 @@ int xc_interface_close(int xc_handle)
 }
 
 void *xc_map_foreign_batch(int xc_handle, uint32_t dom, int prot,
-                           unsigned long *arr, int num)
+                           xen_pfn_t *arr, int num)
 {
     privcmd_mmapbatch_t ioctlx;
     void *addr;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c      Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux_build.c      Tue Jun 06 13:25:31 2006 -0500
@@ -10,6 +10,7 @@
 #include "xc_aout9.h"
 #include <stdlib.h>
 #include <unistd.h>
+#include <inttypes.h>
 #include <zlib.h>
 
 #if defined(__i386__)
@@ -136,7 +137,7 @@ int load_initrd(int xc_handle, domid_t d
 int load_initrd(int xc_handle, domid_t dom,
                 struct initrd_info *initrd,
                 unsigned long physbase,
-                unsigned long *phys_to_mach)
+                xen_pfn_t *phys_to_mach)
 {
     char page[PAGE_SIZE];
     unsigned long pfn_start, pfn, nr_pages;
@@ -189,7 +190,7 @@ static int setup_pg_tables(int xc_handle
                            vcpu_guest_context_t *ctxt,
                            unsigned long dsi_v_start,
                            unsigned long v_end,
-                           unsigned long *page_array,
+                           xen_pfn_t *page_array,
                            unsigned long vpt_start,
                            unsigned long vpt_end,
                            unsigned shadow_mode_enabled)
@@ -251,19 +252,35 @@ static int setup_pg_tables_pae(int xc_ha
                                vcpu_guest_context_t *ctxt,
                                unsigned long dsi_v_start,
                                unsigned long v_end,
-                               unsigned long *page_array,
+                               xen_pfn_t *page_array,
                                unsigned long vpt_start,
                                unsigned long vpt_end,
-                               unsigned shadow_mode_enabled)
+                               unsigned shadow_mode_enabled,
+                               unsigned pae_mode)
 {
     l1_pgentry_64_t *vl1tab = NULL, *vl1e = NULL;
     l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
     l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
     uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
-    unsigned long ppt_alloc, count;
+    unsigned long ppt_alloc, count, nmfn;
 
     /* First allocate page for page dir. */
     ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
+
+    if ( pae_mode == PAEKERN_extended_cr3 )
+    {
+        ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3);
+    }
+    else if ( page_array[ppt_alloc] > 0xfffff )
+    {
+        nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
+        if ( nmfn == 0 )
+        {
+            fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
+            goto error_out;
+        }
+        page_array[ppt_alloc] = nmfn;
+    }
 
     alloc_pt(l3tab, vl3tab, pl3tab);
     vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
@@ -340,7 +357,7 @@ static int setup_pg_tables_64(int xc_han
                               vcpu_guest_context_t *ctxt,
                               unsigned long dsi_v_start,
                               unsigned long v_end,
-                              unsigned long *page_array,
+                              xen_pfn_t *page_array,
                               unsigned long vpt_start,
                               unsigned long vpt_end,
                               int shadow_mode_enabled)
@@ -451,7 +468,7 @@ static int setup_guest(int xc_handle,
                        unsigned int console_evtchn, unsigned long *console_mfn,
                        uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     struct load_funcs load_funcs;
     struct domain_setup_info dsi;
     unsigned long vinitrd_start;
@@ -478,7 +495,7 @@ static int setup_guest(int xc_handle,
 
     start_page = dsi.v_start >> PAGE_SHIFT;
     pgnr = (v_end - dsi.v_start) >> PAGE_SHIFT;
-    if ( (page_array = malloc(pgnr * sizeof(unsigned long))) == NULL )
+    if ( (page_array = malloc(pgnr * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory");
         goto error_out;
@@ -579,11 +596,11 @@ static int compat_check(int xc_handle, s
     }
 
     if (strstr(xen_caps, "xen-3.0-x86_32p")) {
-        if (!dsi->pae_kernel) {
+        if (dsi->pae_kernel == PAEKERN_no) {
             ERROR("Non PAE-kernel on PAE host.");
             return 0;
         }
-    } else if (dsi->pae_kernel) {
+    } else if (dsi->pae_kernel != PAEKERN_no) {
         ERROR("PAE-kernel on non-PAE host.");
         return 0;
     }
@@ -606,7 +623,7 @@ static int setup_guest(int xc_handle,
                        unsigned int console_evtchn, unsigned long *console_mfn,
                        uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     unsigned long count, i, hypercall_pfn;
     start_info_t *start_info;
     shared_info_t *shared_info;
@@ -617,7 +634,7 @@ static int setup_guest(int xc_handle,
 
     unsigned long nr_pt_pages;
     unsigned long physmap_pfn;
-    unsigned long *physmap, *physmap_e;
+    xen_pfn_t *physmap, *physmap_e;
 
     struct load_funcs load_funcs;
     struct domain_setup_info dsi;
@@ -673,7 +690,8 @@ static int setup_guest(int xc_handle,
 
     for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
     {
-        if ( (supported_features[i]&required_features[i]) != required_features[i] )
+        if ( (supported_features[i] & required_features[i]) !=
+             required_features[i] )
         {
             ERROR("Guest kernel does not support a required feature.");
             goto error_out;
@@ -719,7 +737,7 @@ static int setup_guest(int xc_handle,
     (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
     ((_l) & ~((1UL<<(_s))-1))) >> (_s))
 #if defined(__i386__)
-        if ( dsi.pae_kernel )
+        if ( dsi.pae_kernel != PAEKERN_no )
         {
             if ( (1 + /* # L3 */
                   NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT_PAE) + /* # L2 */
@@ -797,11 +815,11 @@ static int setup_guest(int xc_handle,
 
     /* setup page tables */
 #if defined(__i386__)
-    if (dsi.pae_kernel)
+    if (dsi.pae_kernel != PAEKERN_no)
         rc = setup_pg_tables_pae(xc_handle, dom, ctxt,
                                  dsi.v_start, v_end,
                                  page_array, vpt_start, vpt_end,
-                                 shadow_mode_enabled);
+                                 shadow_mode_enabled, dsi.pae_kernel);
     else
         rc = setup_pg_tables(xc_handle, dom, ctxt,
                              dsi.v_start, v_end,
@@ -824,7 +842,7 @@ static int setup_guest(int xc_handle,
      */
     if ( !shadow_mode_enabled )
     {
-        if ( dsi.pae_kernel )
+        if ( dsi.pae_kernel != PAEKERN_no )
         {
             if ( pin_table(xc_handle, MMUEXT_PIN_L3_TABLE,
                            xen_cr3_to_pfn(ctxt->ctrlreg[3]), dom) )
@@ -865,8 +883,8 @@ static int setup_guest(int xc_handle,
             ((uint64_t)page_array[count] << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
             count) )
         {
-            fprintf(stderr,"m2p update failure p=%lx m=%lx\n",
-                    count, page_array[count]);
+            fprintf(stderr,"m2p update failure p=%lx m=%"PRIx64"\n",
+                    count, (uint64_t)page_array[count]);
             munmap(physmap, PAGE_SIZE);
             goto error_out;
         }
@@ -958,7 +976,7 @@ static int setup_guest(int xc_handle,
     rc = xc_version(xc_handle, XENVER_version, NULL);
     sprintf(start_info->magic, "xen-%i.%i-x86_%d%s",
             rc >> 16, rc & (0xFFFF), (unsigned int)sizeof(long)*8,
-            dsi.pae_kernel ? "p" : "");
+            (dsi.pae_kernel != PAEKERN_no) ? "p" : "");
     start_info->nr_pages     = nr_pages;
     start_info->shared_info  = guest_shared_info_mfn << PAGE_SHIFT;
     start_info->flags        = flags;
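
A note on the PAE path added above: the test page_array[ppt_alloc] > 0xfffff is the 4GB boundary expressed as a frame number. Assuming the usual 4KB frames (PAGE_SHIFT of 12), 0xfffff is the last MFN whose machine address still fits below 2^32 (0xfffff << 12 = 0xfffff000), so a larger MFN cannot be installed in a legacy PAE CR3. Guests that advertise extended-cr3 simply get the VMASST_TYPE_pae_extended_cr3 assist; for everyone else the prospective page directory is exchanged for a frame below 4GB via xc_make_page_below_4G(), which this changeset adds to tools/libxc/xc_private.c.
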
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux_restore.c
--- a/tools/libxc/xc_linux_restore.c    Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux_restore.c    Tue Jun 06 13:25:31 2006 -0500
@@ -25,10 +25,10 @@ static unsigned long max_pfn;
 static unsigned long max_pfn;
 
 /* Live mapping of the table mapping each PFN to its current MFN. */
-static unsigned long *live_p2m = NULL;
+static xen_pfn_t *live_p2m = NULL;
 
 /* A table mapping each PFN to its new MFN. */
-static unsigned long *p2m = NULL;
+static xen_pfn_t *p2m = NULL;
 
 
 static ssize_t
@@ -108,7 +108,7 @@ int xc_linux_restore(int xc_handle, int 
                      unsigned int console_evtchn, unsigned long *console_mfn)
 {
     DECLARE_DOM0_OP;
-    int rc = 1, i, n;
+    int rc = 1, i, n, pae_extended_cr3 = 0;
     unsigned long mfn, pfn;
     unsigned int prev_pc, this_pc;
     int verify = 0;
@@ -126,7 +126,7 @@ int xc_linux_restore(int xc_handle, int 
     unsigned long *pfn_type = NULL;
 
     /* A table of MFNs to map in the current region */
-    unsigned long *region_mfn = NULL;
+    xen_pfn_t *region_mfn = NULL;
 
     /* Types of the pfns in the current region */
     unsigned long region_pfn_type[MAX_BATCH_SIZE];
@@ -135,7 +135,7 @@ int xc_linux_restore(int xc_handle, int 
     unsigned long *page = NULL;
 
     /* A copy of the pfn-to-mfn table frame list. */
-    unsigned long *p2m_frame_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
 
     /* A temporary mapping of the guest's start_info page. */
     start_info_t *start_info;
@@ -162,30 +162,88 @@ int xc_linux_restore(int xc_handle, int 
         return 1;
     }
 
-
     if (mlock(&ctxt, sizeof(ctxt))) {
         /* needed for build dom0 op, but might as well do early */
         ERR("Unable to mlock ctxt");
         return 1;
     }
 
-
-    /* Read the saved P2M frame list */
-    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
+    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
         ERR("Couldn't allocate p2m_frame_list array");
         goto out;
     }
 
-    if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
+    /* Read first entry of P2M list, or extended-info signature (~0UL). */
+    if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
+        ERR("read extended-info signature failed");
+        goto out;
+    }
+
+    if (p2m_frame_list[0] == ~0UL) {
+        uint32_t tot_bytes;
+
+        /* Next 4 bytes: total size of following extended info. */
+        if (!read_exact(io_fd, &tot_bytes, sizeof(tot_bytes))) {
+            ERR("read extended-info size failed");
+            goto out;
+        }
+
+        while (tot_bytes) {
+            uint32_t chunk_bytes;
+            char     chunk_sig[4];
+
+            /* 4-character chunk signature + 4-byte remaining chunk size. */
+            if (!read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
+                !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes))) {
+                ERR("read extended-info chunk signature failed");
+                goto out;
+            }
+            tot_bytes -= 8;
+
+            /* VCPU context structure? */
+            if (!strncmp(chunk_sig, "vcpu", 4)) {
+                if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
+                    ERR("read extended-info vcpu context failed");
+                    goto out;
+                }
+                tot_bytes   -= sizeof(struct vcpu_guest_context);
+                chunk_bytes -= sizeof(struct vcpu_guest_context);
+
+                if (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))
+                    pae_extended_cr3 = 1;
+            }
+
+            /* Any remaining bytes of this chunk: read and discard. */
+            while (chunk_bytes) {
+                unsigned long sz = chunk_bytes;
+                if ( sz > P2M_FL_SIZE )
+                    sz = P2M_FL_SIZE;
+                if (!read_exact(io_fd, p2m_frame_list, sz)) {
+                    ERR("read-and-discard extended-info chunk bytes failed");
+                    goto out;
+                }
+                chunk_bytes -= sz;
+                tot_bytes   -= sz;
+            }
+        }
+
+        /* Now read the real first entry of P2M list. */
+        if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
+            ERR("read first entry of p2m_frame_list failed");
+            goto out;
+        }
+    }
+
+    /* First entry is already read into the p2m array. */
+    if (!read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long))) {
         ERR("read p2m_frame_list failed");
         goto out;
     }
 
-
     /* We want zeroed memory so use calloc rather than malloc. */
-    p2m        = calloc(max_pfn, sizeof(unsigned long));
+    p2m        = calloc(max_pfn, sizeof(xen_pfn_t));
     pfn_type   = calloc(max_pfn, sizeof(unsigned long));
-    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
+    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
 
     if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
         ERR("memory alloc failed");
@@ -193,7 +251,7 @@ int xc_linux_restore(int xc_handle, int 
         goto out;
     }
 
-    if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
+    if (mlock(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
         ERR("Could not mlock region_mfn");
         goto out;
     }
@@ -331,17 +389,27 @@ int xc_linux_restore(int xc_handle, int 
                 ** A page table page - need to 'uncanonicalize' it, i.e.
                 ** replace all the references to pfns with the corresponding
                 ** mfns for the new domain.
+                **
+                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+                ** so we may need to update the p2m after the main loop.
+                ** Hence we defer canonicalization of L1s until then.
                 */
-                if(!uncanonicalize_pagetable(pagetype, page)) {
-                    /*
-                    ** Failing to uncanonicalize a page table can be ok
-                    ** under live migration since the pages type may have
-                    ** changed by now (and we'll get an update later).
-                    */
-                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
-                            pagetype >> 28, pfn, mfn);
-                    nraces++;
-                    continue;
+                if ((pt_levels != 3) ||
+                    pae_extended_cr3 ||
+                    (pagetype != L1TAB)) {
+
+                    if (!uncanonicalize_pagetable(pagetype, page)) {
+                        /*
+                        ** Failing to uncanonicalize a page table can be ok
+                        ** under live migration since the pages type may have
+                        ** changed by now (and we'll get an update later).
+                        */
+                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                                pagetype >> 28, pfn, mfn);
+                        nraces++;
+                        continue;
+                    }
+
                 }
 
             } else if(pagetype != NOTAB) {
@@ -389,6 +457,100 @@ int xc_linux_restore(int xc_handle, int 
     }
 
     DPRINTF("Received all pages (%d races)\n", nraces);
+
+    if ((pt_levels == 3) && !pae_extended_cr3) {
+
+        /*
+        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
+        ** is a little awkward and involves (a) finding all such PGDs and
+        ** replacing them with 'lowmem' versions; (b) upating the p2m[]
+        ** with the new info; and (c) canonicalizing all the L1s using the
+        ** (potentially updated) p2m[].
+        **
+        ** This is relatively slow (and currently involves two passes through
+        ** the pfn_type[] array), but at least seems to be correct. May wish
+        ** to consider more complex approaches to optimize this later.
+        */
+
+        int j, k;
+
+        /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
+        for (i = 0; i < max_pfn; i++) {
+
+            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
+
+                unsigned long new_mfn;
+                uint64_t l3ptes[4];
+                uint64_t *l3tab;
+
+                l3tab = (uint64_t *)
+                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                         PROT_READ, p2m[i]);
+
+                for(j = 0; j < 4; j++)
+                    l3ptes[j] = l3tab[j];
+
+                munmap(l3tab, PAGE_SIZE);
+
+                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
+                    ERR("Couldn't get a page below 4GB :-(");
+                    goto out;
+                }
+
+                p2m[i] = new_mfn;
+                if (xc_add_mmu_update(xc_handle, mmu,
+                                      (((unsigned long long)new_mfn)
+                                       << PAGE_SHIFT) |
+                                      MMU_MACHPHYS_UPDATE, i)) {
+                    ERR("Couldn't m2p on PAE root pgdir");
+                    goto out;
+                }
+
+                l3tab = (uint64_t *)
+                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                         PROT_READ | PROT_WRITE, p2m[i]);
+
+                for(j = 0; j < 4; j++)
+                    l3tab[j] = l3ptes[j];
+
+                munmap(l3tab, PAGE_SIZE);
+
+            }
+        }
+
+        /* Second pass: find all L1TABs and uncanonicalize them */
+        j = 0;
+
+        for(i = 0; i < max_pfn; i++) {
+
+            if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
+                region_mfn[j] = p2m[i];
+                j++;
+            }
+
+            if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
+
+                if (!(region_base = xc_map_foreign_batch(
+                          xc_handle, dom, PROT_READ | PROT_WRITE,
+                          region_mfn, j))) {
+                    ERR("map batch failed");
+                    goto out;
+                }
+
+                for(k = 0; k < j; k++) {
+                    if(!uncanonicalize_pagetable(L1TAB,
+                                                 region_base + k*PAGE_SIZE)) {
+                        ERR("failed uncanonicalize pt!");
+                        goto out;
+                    }
+                }
+
+                munmap(region_base, j*PAGE_SIZE);
+                j = 0;
+            }
+        }
+
+    }
 
 
     if (xc_finish_mmu_updates(xc_handle, mmu)) {
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux_save.c       Tue Jun 06 13:25:31 2006 -0500
@@ -40,10 +40,10 @@ static unsigned long max_pfn;
 static unsigned long max_pfn;
 
 /* Live mapping of the table mapping each PFN to its current MFN. */
-static unsigned long *live_p2m = NULL;
+static xen_pfn_t *live_p2m = NULL;
 
 /* Live mapping of system MFN to PFN table. */
-static unsigned long *live_m2p = NULL;
+static xen_pfn_t *live_m2p = NULL;
 
 /* grep fodder: machine_to_phys */
 
@@ -501,22 +501,22 @@ void canonicalize_pagetable(unsigned lon
 
 
 
-static unsigned long *xc_map_m2p(int xc_handle,
+static xen_pfn_t *xc_map_m2p(int xc_handle,
                                  unsigned long max_mfn,
                                  int prot)
 {
     struct xen_machphys_mfn_list xmml;
     privcmd_mmap_entry_t *entries;
     unsigned long m2p_chunks, m2p_size;
-    unsigned long *m2p;
-    unsigned long *extent_start;
+    xen_pfn_t *m2p;
+    xen_pfn_t *extent_start;
     int i, rc;
 
     m2p_size   = M2P_SIZE(max_mfn);
     m2p_chunks = M2P_CHUNKS(max_mfn);
 
     xmml.max_extents = m2p_chunks;
-    if (!(extent_start = malloc(m2p_chunks * sizeof(unsigned long)))) {
+    if (!(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t)))) {
         ERR("failed to allocate space for m2p mfns");
         return NULL;
     }
@@ -583,11 +583,11 @@ int xc_linux_save(int xc_handle, int io_
     char page[PAGE_SIZE];
 
     /* Double and single indirect references to the live P2M table */
-    unsigned long *live_p2m_frame_list_list = NULL;
-    unsigned long *live_p2m_frame_list = NULL;
+    xen_pfn_t *live_p2m_frame_list_list = NULL;
+    xen_pfn_t *live_p2m_frame_list = NULL;
 
     /* A copy of the pfn-to-mfn table frame list. */
-    unsigned long *p2m_frame_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
 
     /* Live mapping of shared info structure */
     shared_info_t *live_shinfo = NULL;
@@ -712,11 +712,11 @@ int xc_linux_save(int xc_handle, int io_
     memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
 
     /* Canonicalise the pfn-to-mfn table frame-number list. */
-    for (i = 0; i < max_pfn; i += ulpp) {
-        if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) {
+    for (i = 0; i < max_pfn; i += fpp) {
+        if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) {
             ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
-            ERR("entry %d: p2m_frame_list[%ld] is 0x%lx", i, i/ulpp,
-                p2m_frame_list[i/ulpp]);
+            ERR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
+                (uint64_t)p2m_frame_list[i/fpp]);
             goto out;
         }
     }
@@ -818,12 +818,33 @@ int xc_linux_save(int xc_handle, int io_
 
     /* Start writing out the saved-domain record. */
 
-    if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
+    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
         ERR("write: max_pfn");
         goto out;
     }
 
-    if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
+    /*
+     * Write an extended-info structure to inform the restore code that
+     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
+     * slow paths in the restore code.
+     */
+    if ((pt_levels == 3) &&
+        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
+        unsigned long signature = ~0UL;
+        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
+        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
+        char chunk_sig[]  = "vcpu";
+        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
+            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
+            !write_exact(io_fd, &chunk_sig, 4) ||
+            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
+            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
+            ERR("write: extended info");
+            goto out;
+        }
+    }
+
+    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
         ERR("write: p2m_frame_list");
         goto out;
     }
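
For readers following the stream format: the optional extended-info record written above sits between max_pfn and the P2M frame list, and the restore side earlier in this changeset skips any chunk signature it does not recognise. A sketch of the layout, with illustrative field names only (the stream is written field by field, not as a struct):

    unsigned long        signature;     /* ~0UL, the extended-info marker      */
    uint32_t             tot_sz;        /* bytes following; here               */
                                        /*   sizeof(vcpu_guest_context) + 8    */
    char                 chunk_sig[4];  /* "vcpu"                              */
    uint32_t             chunk_sz;      /* sizeof(vcpu_guest_context)          */
    vcpu_guest_context_t ctxt;          /* saved VCPU context, incl. vm_assist */

tot_sz covers the 8-byte chunk header plus the chunk body, which is what balances the restore loop's "tot_bytes -= 8" followed by its subtraction of the context size.
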
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_load_aout9.c
--- a/tools/libxc/xc_load_aout9.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_load_aout9.c       Tue Jun 06 13:25:31 2006 -0500
@@ -17,7 +17,7 @@
 #define KOFFSET(_p)       ((_p)&~KZERO)
 
 static int parseaout9image(const char *, unsigned long, struct domain_setup_info *);
-static int loadaout9image(const char *, unsigned long, int, uint32_t, unsigned long *, struct domain_setup_info *);
+static int loadaout9image(const char *, unsigned long, int, uint32_t, xen_pfn_t *, struct domain_setup_info *);
 static void copyout(int, uint32_t, unsigned long *, unsigned long, const char *, int);
 struct Exec *get_header(const char *, unsigned long, struct Exec *);
 
@@ -79,7 +79,7 @@ loadaout9image(
     const char *image,
     unsigned long image_size,
     int xch, uint32_t dom,
-    unsigned long *parray,
+    xen_pfn_t *parray,
     struct domain_setup_info *dsi)
 {
     struct Exec ehdr;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_load_bin.c
--- a/tools/libxc/xc_load_bin.c Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_load_bin.c Tue Jun 06 13:25:31 2006 -0500
@@ -107,7 +107,7 @@ static int
 static int
 loadbinimage(
     const char *image, unsigned long image_size, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi);
+    xen_pfn_t *parray, struct domain_setup_info *dsi);
 
 int probe_bin(const char *image,
               unsigned long image_size,
@@ -235,7 +235,7 @@ static int
 static int
 loadbinimage(
     const char *image, unsigned long image_size, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi)
+    xen_pfn_t *parray, struct domain_setup_info *dsi)
 {
     unsigned long size;
     char         *va;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_load_elf.c
--- a/tools/libxc/xc_load_elf.c Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_load_elf.c Tue Jun 06 13:25:31 2006 -0500
@@ -17,10 +17,10 @@ static int
 static int
 loadelfimage(
     const char *image, unsigned long image_size, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi);
+    xen_pfn_t *parray, struct domain_setup_info *dsi);
 static int
 loadelfsymtab(
-    const char *image, int xch, uint32_t dom, unsigned long *parray,
+    const char *image, int xch, uint32_t dom, xen_pfn_t *parray,
     struct domain_setup_info *dsi);
 
 int probe_elf(const char *image,
@@ -138,8 +138,15 @@ static int parseelfimage(const char *ima
             ERROR("Actually saw: '%s'", guestinfo);
             return -EINVAL;
         }
-        if ( (strstr(guestinfo, "PAE=yes") != NULL) )
-            dsi->pae_kernel = 1;
+
+        dsi->pae_kernel = PAEKERN_no;
+        p = strstr(guestinfo, "PAE=yes");
+        if ( p != NULL )
+        {
+            dsi->pae_kernel = PAEKERN_yes;
+            if ( !strncmp(p+7, "[extended-cr3]", 14) )
+                dsi->pae_kernel = PAEKERN_extended_cr3;
+        }
 
         break;
     }
@@ -220,7 +227,7 @@ static int
 static int
 loadelfimage(
     const char *image, unsigned long elfsize, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi)
+    xen_pfn_t *parray, struct domain_setup_info *dsi)
 {
     Elf_Ehdr *ehdr = (Elf_Ehdr *)image;
     Elf_Phdr *phdr;
@@ -274,7 +281,7 @@ loadelfimage(
 
 static int
 loadelfsymtab(
-    const char *image, int xch, uint32_t dom, unsigned long *parray,
+    const char *image, int xch, uint32_t dom, xen_pfn_t *parray,
     struct domain_setup_info *dsi)
 {
     Elf_Ehdr *ehdr = (Elf_Ehdr *)image, *sym_ehdr;
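
The __xen_guest parsing change above boils down to a three-way classification of the PAE note. A minimal standalone sketch (classify_pae() is an invented name, not the libxc function; the PAEKERN_* values are the ones added to xg_private.h below):

    #include <stdio.h>
    #include <string.h>

    #define PAEKERN_no           0
    #define PAEKERN_yes          1
    #define PAEKERN_extended_cr3 2

    /* Classify a __xen_guest info string the same way parseelfimage()
     * now does: "PAE=yes[extended-cr3]" upgrades plain "PAE=yes". */
    static int classify_pae(const char *guestinfo)
    {
        const char *p = strstr(guestinfo, "PAE=yes");
        if (p == NULL)
            return PAEKERN_no;
        if (!strncmp(p + 7, "[extended-cr3]", 14))
            return PAEKERN_extended_cr3;
        return PAEKERN_yes;
    }

    int main(void)
    {
        /* Prints "0 1 2". */
        printf("%d %d %d\n",
               classify_pae("GUEST_OS=linux,PAE=no"),
               classify_pae("GUEST_OS=linux,PAE=yes"),
               classify_pae("GUEST_OS=linux,PAE=yes[extended-cr3]"));
        return 0;
    }
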
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_private.c
--- a/tools/libxc/xc_private.c  Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_private.c  Tue Jun 06 13:25:31 2006 -0500
@@ -4,6 +4,7 @@
  * Helper functions for the rest of the library.
  */
 
+#include <inttypes.h>
 #include "xc_private.h"
 
 /* NB: arr must be mlock'ed */
@@ -134,9 +135,9 @@ int xc_memory_op(int xc_handle,
     struct xen_memory_reservation *reservation = arg;
     struct xen_machphys_mfn_list *xmml = arg;
     struct xen_translate_gpfn_list *trans = arg;
-    unsigned long *extent_start;
-    unsigned long *gpfn_list;
-    unsigned long *mfn_list;
+    xen_pfn_t *extent_start;
+    xen_pfn_t *gpfn_list;
+    xen_pfn_t *mfn_list;
     long ret = -EINVAL;
 
     hypercall.op     = __HYPERVISOR_memory_op;
@@ -156,7 +157,7 @@ int xc_memory_op(int xc_handle,
         get_xen_guest_handle(extent_start, reservation->extent_start);
         if ( (extent_start != NULL) &&
              (mlock(extent_start,
-                    reservation->nr_extents * sizeof(unsigned long)) != 0) )
+                    reservation->nr_extents * sizeof(xen_pfn_t)) != 0) )
         {
             PERROR("Could not mlock");
             safe_munlock(reservation, sizeof(*reservation));
@@ -171,7 +172,7 @@ int xc_memory_op(int xc_handle,
         }
         get_xen_guest_handle(extent_start, xmml->extent_start);
         if ( mlock(extent_start,
-                   xmml->max_extents * sizeof(unsigned long)) != 0 )
+                   xmml->max_extents * sizeof(xen_pfn_t)) != 0 )
         {
             PERROR("Could not mlock");
             safe_munlock(xmml, sizeof(*xmml));
@@ -192,17 +193,17 @@ int xc_memory_op(int xc_handle,
             goto out1;
         }
         get_xen_guest_handle(gpfn_list, trans->gpfn_list);
-        if ( mlock(gpfn_list, trans->nr_gpfns * sizeof(long)) != 0 )
+        if ( mlock(gpfn_list, trans->nr_gpfns * sizeof(xen_pfn_t)) != 0 )
         {
             PERROR("Could not mlock");
             safe_munlock(trans, sizeof(*trans));
             goto out1;
         }
         get_xen_guest_handle(mfn_list, trans->mfn_list);
-        if ( mlock(mfn_list, trans->nr_gpfns * sizeof(long)) != 0 )
-        {
-            PERROR("Could not mlock");
-            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(long));
+        if ( mlock(mfn_list, trans->nr_gpfns * sizeof(xen_pfn_t)) != 0 )
+        {
+            PERROR("Could not mlock");
+            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(xen_pfn_t));
             safe_munlock(trans, sizeof(*trans));
             goto out1;
         }
@@ -220,22 +221,22 @@ int xc_memory_op(int xc_handle,
         get_xen_guest_handle(extent_start, reservation->extent_start);
         if ( extent_start != NULL )
             safe_munlock(extent_start,
-                         reservation->nr_extents * sizeof(unsigned long));
+                         reservation->nr_extents * sizeof(xen_pfn_t));
         break;
     case XENMEM_machphys_mfn_list:
         safe_munlock(xmml, sizeof(*xmml));
         get_xen_guest_handle(extent_start, xmml->extent_start);
         safe_munlock(extent_start,
-                     xmml->max_extents * sizeof(unsigned long));
+                     xmml->max_extents * sizeof(xen_pfn_t));
         break;
     case XENMEM_add_to_physmap:
         safe_munlock(arg, sizeof(struct xen_add_to_physmap));
         break;
     case XENMEM_translate_gpfn_list:
             get_xen_guest_handle(mfn_list, trans->mfn_list);
-            safe_munlock(mfn_list, trans->nr_gpfns * sizeof(long));
+            safe_munlock(mfn_list, trans->nr_gpfns * sizeof(xen_pfn_t));
             get_xen_guest_handle(gpfn_list, trans->gpfn_list);
-            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(long));
+            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(xen_pfn_t));
             safe_munlock(trans, sizeof(*trans));
         break;
     }
@@ -263,7 +264,7 @@ long long xc_domain_get_cpu_usage( int x
 
 int xc_get_pfn_list(int xc_handle,
                     uint32_t domid,
-                    unsigned long *pfn_buf,
+                    xen_pfn_t *pfn_buf,
                     unsigned long max_pfns)
 {
     DECLARE_DOM0_OP;
@@ -274,10 +275,10 @@ int xc_get_pfn_list(int xc_handle,
     set_xen_guest_handle(op.u.getmemlist.buffer, pfn_buf);
 
 #ifdef VALGRIND
-    memset(pfn_buf, 0, max_pfns * sizeof(unsigned long));
+    memset(pfn_buf, 0, max_pfns * sizeof(xen_pfn_t));
 #endif
 
-    if ( mlock(pfn_buf, max_pfns * sizeof(unsigned long)) != 0 )
+    if ( mlock(pfn_buf, max_pfns * sizeof(xen_pfn_t)) != 0 )
     {
         PERROR("xc_get_pfn_list: pfn_buf mlock failed");
         return -1;
@@ -285,7 +286,7 @@ int xc_get_pfn_list(int xc_handle,
 
     ret = do_dom0_op(xc_handle, &op);
 
-    safe_munlock(pfn_buf, max_pfns * sizeof(unsigned long));
+    safe_munlock(pfn_buf, max_pfns * sizeof(xen_pfn_t));
 
 #if 0
 #ifdef DEBUG
@@ -364,7 +365,7 @@ unsigned long xc_get_filesz(int fd)
 }
 
 void xc_map_memcpy(unsigned long dst, const char *src, unsigned long size,
-                   int xch, uint32_t dom, unsigned long *parray,
+                   int xch, uint32_t dom, xen_pfn_t *parray,
                    unsigned long vstart)
 {
     char *va;
@@ -428,6 +429,29 @@ int xc_version(int xc_handle, int cmd, v
         safe_munlock(arg, argsize);
 
     return rc;
+}
+
+unsigned long xc_make_page_below_4G(
+    int xc_handle, uint32_t domid, unsigned long mfn)
+{
+    xen_pfn_t old_mfn = mfn;
+    xen_pfn_t new_mfn;
+
+    if ( xc_domain_memory_decrease_reservation(
+        xc_handle, domid, 1, 0, &old_mfn) != 0 )
+    {
+        fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn);
+        return 0;
+    }
+
+    if ( xc_domain_memory_increase_reservation(
+        xc_handle, domid, 1, 0, 32, &new_mfn) != 0 )
+    {
+        fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn);
+        return 0;
+    }
+
+    return new_mfn;
 }
 
 /*
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xenctrl.h     Tue Jun 06 13:25:31 2006 -0500
@@ -420,26 +420,26 @@ int xc_domain_memory_increase_reservatio
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
                                           unsigned int address_bits,
-                                          unsigned long *extent_start);
+                                          xen_pfn_t *extent_start);
 
 int xc_domain_memory_decrease_reservation(int xc_handle,
                                           uint32_t domid,
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
-                                          unsigned long *extent_start);
+                                          xen_pfn_t *extent_start);
 
 int xc_domain_memory_populate_physmap(int xc_handle,
                                       uint32_t domid,
                                       unsigned long nr_extents,
                                       unsigned int extent_order,
                                       unsigned int address_bits,
-                                      unsigned long *extent_start);
+                                      xen_pfn_t *extent_start);
 
 int xc_domain_translate_gpfn_list(int xc_handle,
                                   uint32_t domid,
                                   unsigned long nr_gpfns,
-                                  unsigned long *gpfn_list,
-                                  unsigned long *mfn_list);
+                                  xen_pfn_t *gpfn_list,
+                                  xen_pfn_t *mfn_list);
 
 int xc_domain_ioport_permission(int xc_handle,
                                 uint32_t domid,
@@ -458,6 +458,9 @@ int xc_domain_iomem_permission(int xc_ha
                                unsigned long nr_mfns,
                                uint8_t allow_access);
 
+unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid,
+                                    unsigned long mfn);
+
 typedef dom0_perfc_desc_t xc_perfc_desc_t;
 /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */
 int xc_perfc_control(int xc_handle,
@@ -489,7 +492,7 @@ void *xc_map_foreign_range(int xc_handle
                             unsigned long mfn );
 
 void *xc_map_foreign_batch(int xc_handle, uint32_t dom, int prot,
-                           unsigned long *arr, int num );
+                           xen_pfn_t *arr, int num );
 
 /**
  * Translates a virtual address in the context of a given domain and
@@ -504,11 +507,11 @@ unsigned long xc_translate_foreign_addre
 unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom,
                                            int vcpu, unsigned long long virt);
 
-int xc_get_pfn_list(int xc_handle, uint32_t domid, unsigned long *pfn_buf,
+int xc_get_pfn_list(int xc_handle, uint32_t domid, xen_pfn_t *pfn_buf,
                     unsigned long max_pfns);
 
 int xc_ia64_get_pfn_list(int xc_handle, uint32_t domid,
-                         unsigned long *pfn_buf,
+                         xen_pfn_t *pfn_buf,
                          unsigned int start_page, unsigned int nr_pages);
 
 int xc_copy_to_domain_page(int xc_handle, uint32_t domid,
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xg_private.h
--- a/tools/libxc/xg_private.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xg_private.h  Tue Jun 06 13:25:31 2006 -0500
@@ -156,6 +156,9 @@ struct domain_setup_info
 
     unsigned long elf_paddr_offset;
 
+#define PAEKERN_no           0
+#define PAEKERN_yes          1
+#define PAEKERN_extended_cr3 2
     unsigned int  pae_kernel;
 
     unsigned int  load_symtab;
@@ -170,7 +173,7 @@ typedef int (*parseimagefunc)(const char
                               struct domain_setup_info *dsi);
 typedef int (*loadimagefunc)(const char *image, unsigned long image_size,
                              int xch,
-                             uint32_t dom, unsigned long *parray,
+                             uint32_t dom, xen_pfn_t *parray,
                              struct domain_setup_info *dsi);
 
 struct load_funcs
@@ -198,7 +201,7 @@ unsigned long xc_get_filesz(int fd);
 unsigned long xc_get_filesz(int fd);
 
 void xc_map_memcpy(unsigned long dst, const char *src, unsigned long size,
-                   int xch, uint32_t dom, unsigned long *parray,
+                   int xch, uint32_t dom, xen_pfn_t *parray,
                    unsigned long vstart);
 
 int pin_table(int xc_handle, unsigned int type, unsigned long mfn,
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xg_save_restore.h     Tue Jun 06 13:25:31 2006 -0500
@@ -105,23 +105,23 @@ static int get_platform_info(int xc_hand
 */
 #define M2P_SHIFT       L2_PAGETABLE_SHIFT_PAE
 #define M2P_CHUNK_SIZE  (1 << M2P_SHIFT)
-#define M2P_SIZE(_m)    ROUNDUP(((_m) * sizeof(unsigned long)), M2P_SHIFT)
+#define M2P_SIZE(_m)    ROUNDUP(((_m) * sizeof(xen_pfn_t)), M2P_SHIFT)
 #define M2P_CHUNKS(_m)  (M2P_SIZE((_m)) >> M2P_SHIFT)
 
 /* Size in bytes of the P2M (rounded up to the nearest PAGE_SIZE bytes) */
-#define P2M_SIZE        ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT)
+#define P2M_SIZE        ROUNDUP((max_pfn * sizeof(xen_pfn_t)), PAGE_SHIFT)
 
-/* Number of unsigned longs in a page */
-#define ulpp            (PAGE_SIZE/sizeof(unsigned long))
+/* Number of xen_pfn_t in a page */
+#define fpp             (PAGE_SIZE/sizeof(xen_pfn_t))
 
 /* Number of entries in the pfn_to_mfn_frame_list */
-#define P2M_FL_ENTRIES  (((max_pfn)+ulpp-1)/ulpp)
+#define P2M_FL_ENTRIES  (((max_pfn)+fpp-1)/fpp)
 
 /* Size in bytes of the pfn_to_mfn_frame_list     */
 #define P2M_FL_SIZE     ((P2M_FL_ENTRIES)*sizeof(unsigned long))
 
 /* Number of entries in the pfn_to_mfn_frame_list_list */
-#define P2M_FLL_ENTRIES (((max_pfn)+(ulpp*ulpp)-1)/(ulpp*ulpp))
+#define P2M_FLL_ENTRIES (((max_pfn)+(fpp*fpp)-1)/(fpp*fpp))
 
 /* Current guests allow 8MB 'slack' in their P2M */
 #define NR_SLACK_ENTRIES   ((8 * 1024 * 1024) / PAGE_SIZE)
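
To put numbers on the renamed macros: assuming 4096-byte pages and a 4-byte xen_pfn_t (a 32-bit tools build; both values are assumptions, not part of this hunk), fpp = 4096/4 = 1024 PFNs per frame, so a 1GB guest with max_pfn = 262144 needs P2M_FL_ENTRIES = 256 entries in the frame list. P2M_FL_SIZE itself still uses sizeof(unsigned long), which matches the save and restore code above reading and writing the frame list in sizeof(long) sized pieces.
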
diff -r d3e181fa238b -r 156a0963a1ae tools/tests/test_x86_emulator.c
--- a/tools/tests/test_x86_emulator.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/tests/test_x86_emulator.c   Tue Jun 06 13:25:31 2006 -0500
@@ -13,6 +13,7 @@ typedef int64_t            s64;
 typedef int64_t            s64;
 #include <public/xen.h>
 #include <asm-x86/x86_emulate.h>
+#include <sys/mman.h>
 
 static int read_any(
     unsigned long addr,
@@ -85,23 +86,30 @@ int main(int argc, char **argv)
     struct x86_emulate_ctxt ctxt;
     struct cpu_user_regs regs;
     char instr[20] = { 0x01, 0x08 }; /* add %ecx,(%eax) */
-    unsigned int res = 0x7FFFFFFF;
-    u32 cmpxchg8b_res[2] = { 0x12345678, 0x87654321 };
+    unsigned int *res;
     int rc;
 
     ctxt.regs = &regs;
     ctxt.mode = X86EMUL_MODE_PROT32;
 
+    res = mmap((void *)0x100000, 0x1000, PROT_READ|PROT_WRITE,
+               MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+    if ( res == MAP_FAILED )
+    {
+        fprintf(stderr, "mmap to low address failed\n");
+        exit(1);
+    }
+
     printf("%-40s", "Testing addl %%ecx,(%%eax)...");
     instr[0] = 0x01; instr[1] = 0x08;
     regs.eflags = 0x200;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    res         = 0x7FFFFFFF;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x92345677) || 
+    ctxt.cr2    = (unsigned long)res;
+    *res        = 0x7FFFFFFF;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x92345677) || 
          (regs.eflags != 0xa94) ||
          (regs.eip != (unsigned long)&instr[2]) )
         goto fail;
@@ -116,11 +124,25 @@ int main(int argc, char **argv)
 #else
     regs.ecx    = 0x12345678UL;
 #endif
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x92345677) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x92345677) || 
          (regs.ecx != 0x8000000FUL) ||
+         (regs.eip != (unsigned long)&instr[2]) )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing movl (%%eax),%%ecx...");
+    instr[0] = 0x8b; instr[1] = 0x08;
+    regs.eflags = 0x200;
+    regs.eip    = (unsigned long)&instr[0];
+    regs.ecx    = ~0UL;
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x92345677) || 
+         (regs.ecx != 0x92345677UL) ||
          (regs.eip != (unsigned long)&instr[2]) )
         goto fail;
     printf("okay\n");
@@ -131,10 +153,10 @@ int main(int argc, char **argv)
     regs.eip    = (unsigned long)&instr[0];
     regs.eax    = 0x92345677UL;
     regs.ecx    = 0xAA;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x923456AA) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x923456AA) || 
          (regs.eflags != 0x244) ||
          (regs.eax != 0x92345677UL) ||
          (regs.eip != (unsigned long)&instr[4]) )
@@ -147,10 +169,10 @@ int main(int argc, char **argv)
     regs.eip    = (unsigned long)&instr[0];
     regs.eax    = 0xAABBCC77UL;
     regs.ecx    = 0xFF;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x923456AA) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x923456AA) || 
          ((regs.eflags&0x240) != 0x200) ||
          (regs.eax != 0xAABBCCAA) ||
          (regs.ecx != 0xFF) ||
@@ -163,10 +185,10 @@ int main(int argc, char **argv)
     regs.eflags = 0x200;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x12345678) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x12345678) || 
          (regs.eflags != 0x200) ||
          (regs.ecx != 0x923456AA) ||
          (regs.eip != (unsigned long)&instr[2]) )
@@ -176,14 +198,14 @@ int main(int argc, char **argv)
     printf("%-40s", "Testing lock cmpxchgl %%ecx,(%%eax)...");
     instr[0] = 0xf0; instr[1] = 0x0f; instr[2] = 0xb1; instr[3] = 0x08;
     regs.eflags = 0x200;
-    res         = 0x923456AA;
+    *res        = 0x923456AA;
     regs.eip    = (unsigned long)&instr[0];
     regs.eax    = 0x923456AAUL;
     regs.ecx    = 0xDDEEFF00L;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0xDDEEFF00) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0xDDEEFF00) || 
          (regs.eflags != 0x244) ||
          (regs.eax != 0x923456AAUL) ||
          (regs.eip != (unsigned long)&instr[4]) )
@@ -192,54 +214,57 @@ int main(int argc, char **argv)
 
     printf("%-40s", "Testing rep movsw...");
     instr[0] = 0xf3; instr[1] = 0x66; instr[2] = 0xa5;
-    res         = 0x22334455;
+    *res        = 0x22334455;
     regs.eflags = 0x200;
     regs.ecx    = 23;
     regs.eip    = (unsigned long)&instr[0];
-    regs.esi    = (unsigned long)&res + 0;
-    regs.edi    = (unsigned long)&res + 2;
+    regs.esi    = (unsigned long)res + 0;
+    regs.edi    = (unsigned long)res + 2;
     regs.error_code = 0; /* read fault */
     ctxt.cr2    = regs.esi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (res != 0x44554455) ||
+         (*res != 0x44554455) ||
          (regs.eflags != 0x200) ||
          (regs.ecx != 22) || 
-         (regs.esi != ((unsigned long)&res + 2)) ||
-         (regs.edi != ((unsigned long)&res + 4)) ||
+         (regs.esi != ((unsigned long)res + 2)) ||
+         (regs.edi != ((unsigned long)res + 4)) ||
          (regs.eip != (unsigned long)&instr[0]) )
         goto fail;
     printf("okay\n");
 
     printf("%-40s", "Testing btrl $0x1,(%edi)...");
     instr[0] = 0x0f; instr[1] = 0xba; instr[2] = 0x37; instr[3] = 0x01;
-    res         = 0x2233445F;
-    regs.eflags = 0x200;
-    regs.eip    = (unsigned long)&instr[0];
-    regs.edi    = (unsigned long)&res;
+    *res        = 0x2233445F;
+    regs.eflags = 0x200;
+    regs.eip    = (unsigned long)&instr[0];
+    regs.edi    = (unsigned long)res;
     ctxt.cr2    = regs.edi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (res != 0x2233445D) ||
+         (*res != 0x2233445D) ||
          ((regs.eflags&0x201) != 0x201) ||
          (regs.eip != (unsigned long)&instr[4]) )
         goto fail;
     printf("okay\n");
+
+    res[0] = 0x12345678;
+    res[1] = 0x87654321;
 
     printf("%-40s", "Testing cmpxchg8b (%edi) [succeeding]...");
     instr[0] = 0x0f; instr[1] = 0xc7; instr[2] = 0x0f;
     regs.eflags = 0x200;
-    regs.eax    = cmpxchg8b_res[0];
-    regs.edx    = cmpxchg8b_res[1];
+    regs.eax    = res[0];
+    regs.edx    = res[1];
     regs.ebx    = 0x9999AAAA;
     regs.ecx    = 0xCCCCFFFF;
     regs.eip    = (unsigned long)&instr[0];
-    regs.edi    = (unsigned long)cmpxchg8b_res;
+    regs.edi    = (unsigned long)res;
     ctxt.cr2    = regs.edi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (cmpxchg8b_res[0] != 0x9999AAAA) ||
-         (cmpxchg8b_res[1] != 0xCCCCFFFF) ||
+         (res[0] != 0x9999AAAA) ||
+         (res[1] != 0xCCCCFFFF) ||
          ((regs.eflags&0x240) != 0x240) ||
          (regs.eip != (unsigned long)&instr[3]) )
         goto fail;
@@ -248,12 +273,12 @@ int main(int argc, char **argv)
     printf("%-40s", "Testing cmpxchg8b (%edi) [failing]...");
     instr[0] = 0x0f; instr[1] = 0xc7; instr[2] = 0x0f;
     regs.eip    = (unsigned long)&instr[0];
-    regs.edi    = (unsigned long)cmpxchg8b_res;
+    regs.edi    = (unsigned long)res;
     ctxt.cr2    = regs.edi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (cmpxchg8b_res[0] != 0x9999AAAA) ||
-         (cmpxchg8b_res[1] != 0xCCCCFFFF) ||
+         (res[0] != 0x9999AAAA) ||
+         (res[1] != 0xCCCCFFFF) ||
          (regs.eax != 0x9999AAAA) ||
          (regs.edx != 0xCCCCFFFF) ||
          ((regs.eflags&0x240) != 0x200) ||
@@ -265,11 +290,11 @@ int main(int argc, char **argv)
     instr[0] = 0x0f; instr[1] = 0xbe; instr[2] = 0x08;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    res         = 0x82;
+    ctxt.cr2    = (unsigned long)res;
+    *res        = 0x82;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) ||
-         (res != 0x82) ||
+         (*res != 0x82) ||
          (regs.ecx != 0xFFFFFF82) ||
          ((regs.eflags&0x240) != 0x200) ||
          (regs.eip != (unsigned long)&instr[3]) )
@@ -280,11 +305,11 @@ int main(int argc, char **argv)
     instr[0] = 0x0f; instr[1] = 0xb7; instr[2] = 0x08;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    res         = 0x1234aa82;
+    ctxt.cr2    = (unsigned long)res;
+    *res        = 0x1234aa82;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) ||
-         (res != 0x1234aa82) ||
+         (*res != 0x1234aa82) ||
          (regs.ecx != 0xaa82) ||
          ((regs.eflags&0x240) != 0x200) ||
          (regs.eip != (unsigned long)&instr[3]) )
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/domain.c     Tue Jun 06 13:25:31 2006 -0500
@@ -259,7 +259,7 @@ int arch_set_info_guest(
     struct vcpu *v, struct vcpu_guest_context *c)
 {
     struct domain *d = v->domain;
-    unsigned long cr3_pfn;
+    unsigned long cr3_pfn = INVALID_MFN;
     int i, rc;
 
     if ( !(c->flags & VGCF_HVM_GUEST) )
@@ -524,20 +524,29 @@ static void load_segments(struct vcpu *n
     if ( unlikely(!all_segs_okay) )
     {
         struct cpu_user_regs *regs = guest_cpu_user_regs();
-        unsigned long   *rsp =
+        unsigned long *rsp =
             (n->arch.flags & TF_kernel_mode) ?
             (unsigned long *)regs->rsp :
             (unsigned long *)nctxt->kernel_sp;
+        unsigned long cs_and_mask, rflags;
 
         if ( !(n->arch.flags & TF_kernel_mode) )
             toggle_guest_mode(n);
         else
             regs->cs &= ~3;
 
+        /* CS longword also contains full evtchn_upcall_mask. */
+        cs_and_mask = (unsigned long)regs->cs |
+            ((unsigned long)n->vcpu_info->evtchn_upcall_mask << 32);
+
+        /* Fold upcall mask into RFLAGS.IF. */
+        rflags  = regs->rflags & ~X86_EFLAGS_IF;
+        rflags |= !n->vcpu_info->evtchn_upcall_mask << 9;
+
         if ( put_user(regs->ss,            rsp- 1) |
              put_user(regs->rsp,           rsp- 2) |
-             put_user(regs->rflags,        rsp- 3) |
-             put_user(regs->cs,            rsp- 4) |
+             put_user(rflags,              rsp- 3) |
+             put_user(cs_and_mask,         rsp- 4) |
              put_user(regs->rip,           rsp- 5) |
              put_user(nctxt->user_regs.gs, rsp- 6) |
              put_user(nctxt->user_regs.fs, rsp- 7) |
@@ -549,6 +558,10 @@ static void load_segments(struct vcpu *n
             DPRINTK("Error while creating failsafe callback frame.\n");
             domain_crash(n->domain);
         }
+
+        if ( test_bit(_VGCF_failsafe_disables_events,
+                      &n->arch.guest_context.flags) )
+            n->vcpu_info->evtchn_upcall_mask = 1;
 
         regs->entry_vector  = TRAP_syscall;
         regs->rflags       &= 0xFFFCBEFFUL;
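
The cs_and_mask/rflags juggling above packs the event-channel upcall mask into bits 32 and up of the CS slot of the 64-bit failsafe callback frame and mirrors it into RFLAGS.IF. A small self-contained illustration (the values are invented; only the bit manipulation mirrors the code above):

    #include <stdio.h>
    #include <stdint.h>

    #define X86_EFLAGS_IF (1ULL << 9)

    int main(void)
    {
        uint64_t cs          = 0xe033;  /* example guest CS        */
        uint8_t  upcall_mask = 1;       /* event delivery disabled */
        uint64_t rflags      = 0x246;   /* example RFLAGS, IF set  */

        /* CS longword also carries the full evtchn_upcall_mask. */
        uint64_t cs_and_mask = cs | ((uint64_t)upcall_mask << 32);

        /* Fold the upcall mask into RFLAGS.IF: IF is set iff mask == 0. */
        rflags = (rflags & ~X86_EFLAGS_IF) | ((uint64_t)!upcall_mask << 9);

        /* Prints: cs_and_mask=0x10000e033 rflags=0x46 */
        printf("cs_and_mask=%#llx rflags=%#llx\n",
               (unsigned long long)cs_and_mask,
               (unsigned long long)rflags);
        return 0;
    }
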
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/domain_build.c       Tue Jun 06 13:25:31 2006 -0500
@@ -301,6 +301,9 @@ int construct_dom0(struct domain *d,
                xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
         return -EINVAL;
     }
+
+    if ( xen_pae && !!strstr(dsi.xen_section_string, "PAE=yes[extended-cr3]") )
+        set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
 
     if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
     {
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Tue Jun 06 13:25:31 2006 -0500
@@ -1970,7 +1970,6 @@ static inline void vmx_vmexit_do_extint(
         __hvm_bug(regs);
 
     vector &= INTR_INFO_VECTOR_MASK;
-    local_irq_disable();
     TRACE_VMEXIT(1,vector);
 
     switch(vector) {
@@ -2065,30 +2064,33 @@ asmlinkage void vmx_vmexit_handler(struc
     struct vcpu *v = current;
     int error;
 
-    if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
-        __hvm_bug(&regs);
+    error = __vmread(VM_EXIT_REASON, &exit_reason);
+    BUG_ON(error);
 
     perfc_incra(vmexits, exit_reason);
 
-    /* don't bother H/W interrutps */
-    if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
-        exit_reason != EXIT_REASON_VMCALL &&
-        exit_reason != EXIT_REASON_IO_INSTRUCTION) 
+    if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
+         (exit_reason != EXIT_REASON_VMCALL) &&
+         (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
         HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
 
-    if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+    if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
+        local_irq_enable();
+
+    if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
+    {
         printk("Failed vm entry (reason 0x%x)\n", exit_reason);
         printk("*********** VMCS Area **************\n");
         vmcs_dump_vcpu();
         printk("**************************************\n");
         domain_crash_synchronous();
-        return;
     }
 
     __vmread(GUEST_RIP, &eip);
     TRACE_VMEXIT(0,exit_reason);
 
-    switch (exit_reason) {
+    switch ( exit_reason )
+    {
     case EXIT_REASON_EXCEPTION_NMI:
     {
         /*
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/hvm/vmx/x86_32/exits.S
--- a/xen/arch/x86/hvm/vmx/x86_32/exits.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/hvm/vmx/x86_32/exits.S       Tue Jun 06 13:25:31 2006 -0500
@@ -55,29 +55,26 @@
  * domain pointer, DS, ES, FS, GS. Therefore, we effectively skip 6 registers.
  */
 
-#define HVM_MONITOR_EFLAGS     0x202 /* IF on */
 #define NR_SKIPPED_REGS        6       /* See the above explanation */
-#define HVM_SAVE_ALL_NOSEGREGS \
-        pushl $HVM_MONITOR_EFLAGS; \
-        popf; \
-        subl $(NR_SKIPPED_REGS*4), %esp; \
+#define HVM_SAVE_ALL_NOSEGREGS                                              \
+        subl $(NR_SKIPPED_REGS*4), %esp;                                    \
         movl $0, 0xc(%esp);  /* XXX why do we need to force eflags==0 ?? */ \
-        pushl %eax; \
-        pushl %ebp; \
-        pushl %edi; \
-        pushl %esi; \
-        pushl %edx; \
-        pushl %ecx; \
+        pushl %eax;                                                         \
+        pushl %ebp;                                                         \
+        pushl %edi;                                                         \
+        pushl %esi;                                                         \
+        pushl %edx;                                                         \
+        pushl %ecx;                                                         \
         pushl %ebx;
 
-#define HVM_RESTORE_ALL_NOSEGREGS   \
-        popl %ebx;  \
-        popl %ecx;  \
-        popl %edx;  \
-        popl %esi;  \
-        popl %edi;  \
-        popl %ebp;  \
-        popl %eax;  \
+#define HVM_RESTORE_ALL_NOSEGREGS               \
+        popl %ebx;                              \
+        popl %ecx;                              \
+        popl %edx;                              \
+        popl %esi;                              \
+        popl %edi;                              \
+        popl %ebp;                              \
+        popl %eax;                              \
         addl $(NR_SKIPPED_REGS*4), %esp
 
         ALIGN
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/hvm/vmx/x86_64/exits.S
--- a/xen/arch/x86/hvm/vmx/x86_64/exits.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/hvm/vmx/x86_64/exits.S       Tue Jun 06 13:25:31 2006 -0500
@@ -51,45 +51,42 @@
  * (2/1)  u32 entry_vector;
  * (1/1)  u32 error_code;
  */
-#define HVM_MONITOR_RFLAGS     0x202 /* IF on */
 #define NR_SKIPPED_REGS        6       /* See the above explanation */
-#define HVM_SAVE_ALL_NOSEGREGS \
-        pushq $HVM_MONITOR_RFLAGS; \
-        popfq; \
-        subq $(NR_SKIPPED_REGS*8), %rsp; \
-        pushq %rdi; \
-        pushq %rsi; \
-        pushq %rdx; \
-        pushq %rcx; \
-        pushq %rax; \
-        pushq %r8;  \
-        pushq %r9;  \
-        pushq %r10; \
-        pushq %r11; \
-        pushq %rbx; \
-        pushq %rbp; \
-        pushq %r12; \
-        pushq %r13; \
-        pushq %r14; \
-        pushq %r15; \
+#define HVM_SAVE_ALL_NOSEGREGS                  \
+        subq $(NR_SKIPPED_REGS*8), %rsp;        \
+        pushq %rdi;                             \
+        pushq %rsi;                             \
+        pushq %rdx;                             \
+        pushq %rcx;                             \
+        pushq %rax;                             \
+        pushq %r8;                              \
+        pushq %r9;                              \
+        pushq %r10;                             \
+        pushq %r11;                             \
+        pushq %rbx;                             \
+        pushq %rbp;                             \
+        pushq %r12;                             \
+        pushq %r13;                             \
+        pushq %r14;                             \
+        pushq %r15;
 
-#define HVM_RESTORE_ALL_NOSEGREGS \
-        popq %r15; \
-        popq %r14; \
-        popq %r13; \
-        popq %r12; \
-        popq %rbp; \
-        popq %rbx; \
-        popq %r11; \
-        popq %r10; \
-        popq %r9;  \
-        popq %r8;  \
-        popq %rax; \
-        popq %rcx; \
-        popq %rdx; \
-        popq %rsi; \
-        popq %rdi; \
-        addq $(NR_SKIPPED_REGS*8), %rsp; \
+#define HVM_RESTORE_ALL_NOSEGREGS               \
+        popq %r15;                              \
+        popq %r14;                              \
+        popq %r13;                              \
+        popq %r12;                              \
+        popq %rbp;                              \
+        popq %rbx;                              \
+        popq %r11;                              \
+        popq %r10;                              \
+        popq %r9;                               \
+        popq %r8;                               \
+        popq %rax;                              \
+        popq %rcx;                              \
+        popq %rdx;                              \
+        popq %rsi;                              \
+        popq %rdi;                              \
+        addq $(NR_SKIPPED_REGS*8), %rsp;
 
 ENTRY(vmx_asm_vmexit_handler)
         /* selectors are restored/saved by VMX */
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/mm.c Tue Jun 06 13:25:31 2006 -0500
@@ -996,6 +996,21 @@ static int alloc_l3_table(struct page_in
     int            i;
 
     ASSERT(!shadow_mode_refcounts(d));
+
+#ifdef CONFIG_X86_PAE
+    /*
+     * PAE pgdirs above 4GB are unacceptable if the guest does not understand
+     * the weird 'extended cr3' format for dealing with high-order address
+     * bits. We cut some slack for control tools (before vcpu0 is initialised).
+     */
+    if ( (pfn >= 0x100000) &&
+         unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
+         d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
+    {
+        MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
+        return 0;
+    }
+#endif
 
     pl3e = map_domain_page(pfn);
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
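
Purely illustrative arithmetic for the 0x100000 cut-off used in the check
above (assuming 4 KiB frames, which is all this path handles): frame numbers
at or above 0x100000 correspond to byte addresses at or above 4 GiB, which a
plain 32-bit PAE cr3 cannot encode without the extended-cr3 format.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t pfn  = 0x100000;             /* first frame at or above 4 GiB */
    uint64_t addr = pfn << 12;            /* 4 KiB page size               */

    printf("frame 0x%llx starts at 0x%llx (%llu GiB)\n",
           (unsigned long long)pfn, (unsigned long long)addr,
           (unsigned long long)(addr >> 30));
    return 0;
}
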
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_32/asm-offsets.c Tue Jun 06 13:25:31 2006 -0500
@@ -64,11 +64,13 @@ void __dummy__(void)
            arch.guest_context.kernel_ss);
     OFFSET(VCPU_kernel_sp, struct vcpu,
            arch.guest_context.kernel_sp);
+    OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_arch_guest_fpu_ctxt, struct vcpu, arch.guest_context.fpu_ctxt);
     OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
     OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
     DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
     DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
+    DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
     BLANK();
 
     OFFSET(TSS_ss0, struct tss_struct, ss0);
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_32/entry.S       Tue Jun 06 13:25:31 2006 -0500
@@ -130,7 +130,10 @@ failsafe_callback:
         movl  VCPU_failsafe_sel(%ebx),%eax
         movw  %ax,TRAPBOUNCE_cs(%edx)
         movw  $TBF_FAILSAFE,TRAPBOUNCE_flags(%edx)
-        call  create_bounce_frame
+        bt    $_VGCF_failsafe_disables_events,VCPU_guest_context_flags(%ebx)
+        jnc   1f
+        orw   $TBF_INTERRUPT,TRAPBOUNCE_flags(%edx)
+1:      call  create_bounce_frame
         xorl  %eax,%eax
         movl  %eax,UREGS_ds(%esp)
         movl  %eax,UREGS_es(%esp)
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_32/traps.c       Tue Jun 06 13:25:31 2006 -0500
@@ -346,6 +346,12 @@ static long register_guest_callback(stru
     case CALLBACKTYPE_failsafe:
         v->arch.guest_context.failsafe_callback_cs  = reg->address.cs;
         v->arch.guest_context.failsafe_callback_eip = reg->address.eip;
+        if ( reg->flags & CALLBACKF_mask_events )
+            set_bit(_VGCF_failsafe_disables_events,
+                    &v->arch.guest_context.flags);
+        else
+            clear_bit(_VGCF_failsafe_disables_events,
+                      &v->arch.guest_context.flags);
         break;
 
 #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_64/asm-offsets.c Tue Jun 06 13:25:31 2006 -0500
@@ -64,11 +64,14 @@ void __dummy__(void)
            arch.guest_context.syscall_callback_eip);
     OFFSET(VCPU_kernel_sp, struct vcpu,
            arch.guest_context.kernel_sp);
+    OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_arch_guest_fpu_ctxt, struct vcpu, arch.guest_context.fpu_ctxt);
     OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
     OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
     DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
     DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
+    DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
+    DEFINE(_VGCF_syscall_disables_events,  _VGCF_syscall_disables_events);
     BLANK();
 
     OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_64/entry.S
--- a/xen/arch/x86/x86_64/entry.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_64/entry.S       Tue Jun 06 13:25:31 2006 -0500
@@ -30,7 +30,10 @@ switch_to_kernel:
         movq  VCPU_syscall_addr(%rbx),%rax
         movq  %rax,TRAPBOUNCE_eip(%rdx)
         movw  $0,TRAPBOUNCE_flags(%rdx)
-        call  create_bounce_frame
+        bt    $_VGCF_syscall_disables_events,VCPU_guest_context_flags(%rbx)
+        jnc   1f
+        orw   $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx)
+1:      call  create_bounce_frame
         jmp   test_all_events
 
 /* %rbx: struct vcpu, interrupts disabled */
@@ -77,7 +80,10 @@ failsafe_callback:
         movq  VCPU_failsafe_addr(%rbx),%rax
         movq  %rax,TRAPBOUNCE_eip(%rdx)
         movw  $TBF_FAILSAFE,TRAPBOUNCE_flags(%rdx)
-        call  create_bounce_frame
+        bt    $_VGCF_failsafe_disables_events,VCPU_guest_context_flags(%rbx)
+        jnc   1f
+        orw   $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx)
+1:      call  create_bounce_frame
         jmp   test_all_events
 .previous
 .section __pre_ex_table,"a"
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_64/traps.c       Tue Jun 06 13:25:31 2006 -0500
@@ -334,10 +334,22 @@ static long register_guest_callback(stru
 
     case CALLBACKTYPE_failsafe:
         v->arch.guest_context.failsafe_callback_eip = reg->address;
+        if ( reg->flags & CALLBACKF_mask_events )
+            set_bit(_VGCF_failsafe_disables_events,
+                    &v->arch.guest_context.flags);
+        else
+            clear_bit(_VGCF_failsafe_disables_events,
+                      &v->arch.guest_context.flags);
         break;
 
     case CALLBACKTYPE_syscall:
         v->arch.guest_context.syscall_callback_eip  = reg->address;
+        if ( reg->flags & CALLBACKF_mask_events )
+            set_bit(_VGCF_syscall_disables_events,
+                    &v->arch.guest_context.flags);
+        else
+            clear_bit(_VGCF_syscall_disables_events,
+                      &v->arch.guest_context.flags);
         break;
 
     case CALLBACKTYPE_nmi:
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_emulate.c
--- a/xen/arch/x86/x86_emulate.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_emulate.c        Tue Jun 06 13:25:31 2006 -0500
@@ -100,8 +100,8 @@ static uint8_t opcode_table[256] = {
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     /* 0x88 - 0x8F */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstMem|SrcReg|ModRM|Mov, DstMem|SrcReg|ModRM|Mov,
+    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     0, 0, 0, DstMem|SrcNone|ModRM|Mov,
     /* 0x90 - 0x9F */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff -r d3e181fa238b -r 156a0963a1ae xen/common/kernel.c
--- a/xen/common/kernel.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/common/kernel.c       Tue Jun 06 13:25:31 2006 -0500
@@ -184,6 +184,7 @@ long do_xen_version(int cmd, XEN_GUEST_H
     case XENVER_get_features:
     {
         xen_feature_info_t fi;
+        struct domain *d = current->domain;
 
         if ( copy_from_guest(&fi, arg, 1) )
             return -EFAULT;
@@ -191,7 +192,9 @@ long do_xen_version(int cmd, XEN_GUEST_H
         switch ( fi.submap_idx )
         {
         case 0:
-            fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb);
+            fi.submap = 0;
+            if ( VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3) )
+                fi.submap |= (1U << XENFEAT_pae_pgdir_above_4gb);
             if ( shadow_mode_translate(current->domain) )
                 fi.submap |= 
                     (1U << XENFEAT_writable_page_tables) |
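
A hedged guest-side sketch of how the new assist type and the feature report
fit together: the feature bit is now only advertised once the guest has
enabled VMASST_TYPE_pae_extended_cr3. The hypercall wrappers
(HYPERVISOR_vm_assist, HYPERVISOR_xen_version) and the include paths are
assumed from the Linux xen-sparse tree; they are not part of this hunk.

#include <xen/interface/xen.h>        /* VMASST_*                        */
#include <xen/interface/version.h>    /* XENVER_get_features             */
#include <xen/interface/features.h>   /* XENFEAT_pae_pgdir_above_4gb     */
#include <asm/hypercall.h>

static int enable_extended_cr3(void)
{
    xen_feature_info_t fi = { .submap_idx = 0 };

    /* Ask Xen to accept the extended cr3 format for PAE pgdirs above 4GB. */
    if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
                             VMASST_TYPE_pae_extended_cr3) != 0)
        return -1;

    /* Re-query submap 0: with this change the bit only appears once the
       assist has been enabled for the calling domain. */
    if (HYPERVISOR_xen_version(XENVER_get_features, &fi) != 0)
        return -1;

    return (fi.submap & (1U << XENFEAT_pae_pgdir_above_4gb)) ? 0 : -1;
}
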
diff -r d3e181fa238b -r 156a0963a1ae xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/common/keyhandler.c   Tue Jun 06 13:25:31 2006 -0500
@@ -128,11 +128,12 @@ static void dump_domains(unsigned char k
                d->domain_flags, atomic_read(&d->refcnt),
                d->tot_pages, d->xenheap_pages, cpuset);
         printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
-               "%02x%02x-%02x%02x%02x%02x%02x%02x\n",
+               "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
                d->handle[ 0], d->handle[ 1], d->handle[ 2], d->handle[ 3],
                d->handle[ 4], d->handle[ 5], d->handle[ 6], d->handle[ 7],
                d->handle[ 8], d->handle[ 9], d->handle[10], d->handle[11],
-               d->handle[12], d->handle[13], d->handle[14], d->handle[15]);
+               d->handle[12], d->handle[13], d->handle[14], d->handle[15],
+               d->vm_assist);
 
         arch_dump_domain_info(d);
 
diff -r d3e181fa238b -r 156a0963a1ae xen/common/memory.c
--- a/xen/common/memory.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/common/memory.c       Tue Jun 06 13:25:31 2006 -0500
@@ -31,14 +31,15 @@ static long
 static long
 increase_reservation(
     struct domain *d, 
-    XEN_GUEST_HANDLE(ulong) extent_list,
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
     int           *preempted)
 {
     struct page_info *page;
-    unsigned long     i, mfn;
+    unsigned long i;
+    xen_pfn_t mfn;
 
     if ( !guest_handle_is_null(extent_list) &&
          !guest_handle_okay(extent_list, nr_extents) )
@@ -80,14 +81,16 @@ static long
 static long
 populate_physmap(
     struct domain *d, 
-    XEN_GUEST_HANDLE(ulong) extent_list,
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_list,
     unsigned int  nr_extents,
     unsigned int  extent_order,
     unsigned int  flags,
     int          *preempted)
 {
     struct page_info *page;
-    unsigned long    i, j, gpfn, mfn;
+    unsigned long i, j;
+    xen_pfn_t gpfn;
+    xen_pfn_t mfn;
 
     if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
@@ -177,13 +180,14 @@ static long
 static long
 decrease_reservation(
     struct domain *d,
-    XEN_GUEST_HANDLE(ulong) extent_list,
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
     int           *preempted)
 {
-    unsigned long    i, j, gmfn;
+    unsigned long i, j;
+    xen_pfn_t gmfn;
 
     if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
@@ -214,7 +218,9 @@ translate_gpfn_list(
     XEN_GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress)
 {
     struct xen_translate_gpfn_list op;
-    unsigned long i, gpfn, mfn;
+    unsigned long i;
+    xen_pfn_t gpfn;
+    xen_pfn_t mfn;
     struct domain *d;
 
     if ( copy_from_guest(&op, uop, 1) )
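
An aside on the type change threading through this file and the public
headers below: xen_pfn_t gives frame numbers a single named type at the
hypercall boundary instead of a bare unsigned long, whose width follows the
build (32-bit tools versus 64-bit hypervisor), while the tools-facing counts
move to explicit uint64_t. A trivial, purely illustrative width check:

#include <stdint.h>
#include <stdio.h>

typedef unsigned long xen_pfn_t;      /* as in the arch-* headers below */

int main(void)
{
    printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
    printf("sizeof(xen_pfn_t)     = %zu\n", sizeof(xen_pfn_t));
    printf("sizeof(uint64_t)      = %zu\n", sizeof(uint64_t));
    return 0;
}
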
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/arch-ia64.h
--- a/xen/include/public/arch-ia64.h    Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/arch-ia64.h    Tue Jun 06 13:25:31 2006 -0500
@@ -26,6 +26,9 @@ DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
 DEFINE_XEN_GUEST_HANDLE(void);
+
+typedef unsigned long xen_pfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #endif
 
 /* Arch specific VIRQs definition */
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/arch-x86_32.h  Tue Jun 06 13:25:31 2006 -0500
@@ -28,6 +28,9 @@ DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
 DEFINE_XEN_GUEST_HANDLE(void);
+
+typedef unsigned long xen_pfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #endif
 
 /*
@@ -138,9 +141,17 @@ struct vcpu_guest_context {
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
     struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST  (1<<1)
-#define VGCF_IN_KERNEL  (1<<2)
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_HVM_GUEST                 (1<<1)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_hvm_guest                1
+#define VGCF_hvm_guest                 (1<<_VGCF_hvm_guest)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
     unsigned long flags;                    /* VGCF_* flags                 */
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
@@ -169,7 +180,7 @@ struct arch_shared_info {
 struct arch_shared_info {
     unsigned long max_pfn;                  /* max pfn that appears in table */
     /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
+    xen_pfn_t     pfn_to_mfn_frame_list_list;
     unsigned long nmi_reason;
 };
 typedef struct arch_shared_info arch_shared_info_t;
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/arch-x86_64.h  Tue Jun 06 13:25:31 2006 -0500
@@ -28,6 +28,9 @@ DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
 DEFINE_XEN_GUEST_HANDLE(void);
+
+typedef unsigned long xen_pfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #endif
 
 /*
@@ -211,9 +214,19 @@ struct vcpu_guest_context {
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
     struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST  (1<<1)
-#define VGCF_IN_KERNEL  (1<<2)
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_HVM_GUEST                 (1<<1)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_hvm_guest                1
+#define VGCF_hvm_guest                 (1<<_VGCF_hvm_guest)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events  4
+#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
     unsigned long flags;                    /* VGCF_* flags                 */
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
@@ -240,7 +253,7 @@ struct arch_shared_info {
 struct arch_shared_info {
     unsigned long max_pfn;                  /* max pfn that appears in table */
     /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
+    xen_pfn_t     pfn_to_mfn_frame_list_list;
     unsigned long nmi_reason;
 };
 typedef struct arch_shared_info arch_shared_info_t;
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/callback.h
--- a/xen/include/public/callback.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/callback.h     Tue Jun 06 13:25:31 2006 -0500
@@ -29,12 +29,20 @@
 #define CALLBACKTYPE_nmi                   4
 
 /*
+ * Disable event delivery during callback? This flag is ignored for event and
+ * NMI callbacks: event delivery is unconditionally disabled.
+ */
+#define _CALLBACKF_mask_events             0
+#define CALLBACKF_mask_events              (1U << _CALLBACKF_mask_events)
+
+/*
  * Register a callback.
  */
 #define CALLBACKOP_register                0
 struct callback_register {
-     int type;
-     xen_callback_t address;
+    uint16_t type;
+    uint16_t flags;
+    xen_callback_t address;
 };
 typedef struct callback_register callback_register_t;
 DEFINE_XEN_GUEST_HANDLE(callback_register_t);
@@ -47,7 +55,8 @@ DEFINE_XEN_GUEST_HANDLE(callback_registe
  */
 #define CALLBACKOP_unregister              1
 struct callback_unregister {
-     int type;
+    uint16_t type;
+    uint16_t _unused;
 };
 typedef struct callback_unregister callback_unregister_t;
 DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
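
A hedged sketch (x86_64 guest, illustrative entry-point name) of registering
a failsafe callback with the new flags field so that event delivery is masked
on entry. It assumes the usual HYPERVISOR_callback_op wrapper from the guest
kernel, which is not part of this hunk; on x86_32 the address field would be
a cs/eip pair instead.

#include <xen/interface/callback.h>
#include <asm/hypercall.h>

extern void failsafe_callback_entry(void);   /* assumed guest entry stub */

static int register_masked_failsafe(void)
{
    struct callback_register cb = {
        .type    = CALLBACKTYPE_failsafe,
        .flags   = CALLBACKF_mask_events,
        .address = (unsigned long)failsafe_callback_entry,
    };

    /* Xen records this as _VGCF_failsafe_disables_events, and the entry.S
       changes above then OR TBF_INTERRUPT into the bounce frame flags so
       the callback runs with events masked. */
    return HYPERVISOR_callback_op(CALLBACKOP_register, &cb);
}
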
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/dom0_ops.h     Tue Jun 06 13:25:31 2006 -0500
@@ -19,7 +19,7 @@
  * This makes sure that old versions of dom0 tools will stop working in a
  * well-defined way (rather than crashing the machine, for instance).
  */
-#define DOM0_INTERFACE_VERSION   0x03000000
+#define DOM0_INTERFACE_VERSION   0x03000001
 
 /************************************************************************/
 
@@ -27,10 +27,10 @@ struct dom0_getmemlist {
 struct dom0_getmemlist {
     /* IN variables. */
     domid_t       domain;
-    unsigned long max_pfns;
-    XEN_GUEST_HANDLE(ulong) buffer;
-    /* OUT variables. */
-    unsigned long num_pfns;
+    uint64_t max_pfns;
+    XEN_GUEST_HANDLE(xen_pfn_t) buffer;
+    /* OUT variables. */
+    uint64_t num_pfns;
 };
 typedef struct dom0_getmemlist dom0_getmemlist_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_getmemlist_t);
@@ -96,9 +96,9 @@ struct dom0_getdomaininfo {
 #define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code.  */
 #define DOMFLAGS_SHUTDOWNSHIFT 16
     uint32_t flags;
-    unsigned long tot_pages;
-    unsigned long max_pages;
-    unsigned long shared_info_frame;       /* MFN of shared_info struct */
+    uint64_t tot_pages;
+    uint64_t max_pages;
+    xen_pfn_t shared_info_frame;  /* MFN of shared_info struct */
     uint64_t cpu_time;
     uint32_t nr_online_vcpus;     /* Number of VCPUs currently online. */
     uint32_t max_vcpu_id;         /* Maximum VCPUID in use by this domain. */
@@ -162,7 +162,7 @@ DEFINE_XEN_GUEST_HANDLE(dom0_settime_t);
 
 struct dom0_getpageframeinfo {
     /* IN variables. */
-    unsigned long mfn;     /* Machine page frame number to query.       */
+    xen_pfn_t mfn;         /* Machine page frame number to query.       */
     domid_t domain;        /* To which domain does the frame belong?    */
     /* OUT variables. */
     /* Is the page PINNED to a type? */
@@ -213,7 +213,7 @@ struct dom0_tbufcontrol {
     cpumap_t      cpu_mask;
     uint32_t      evt_mask;
     /* OUT variables */
-    unsigned long buffer_mfn;
+    xen_pfn_t buffer_mfn;
     uint32_t size;
 };
 typedef struct dom0_tbufcontrol dom0_tbufcontrol_t;
@@ -229,8 +229,8 @@ struct dom0_physinfo {
     uint32_t sockets_per_node;
     uint32_t nr_nodes;
     uint32_t cpu_khz;
-    unsigned long total_pages;
-    unsigned long free_pages;
+    uint64_t total_pages;
+    uint64_t free_pages;
     uint32_t hw_cap[8];
 };
 typedef struct dom0_physinfo dom0_physinfo_t;
@@ -276,7 +276,7 @@ struct dom0_shadow_control {
     uint32_t       op;
     XEN_GUEST_HANDLE(ulong) dirty_bitmap;
     /* IN/OUT variables. */
-    unsigned long  pages;        /* size of buffer, updated with actual size */
+    uint64_t       pages;        /* size of buffer, updated with actual size */
     /* OUT variables. */
     struct dom0_shadow_control_stats stats;
 };
@@ -286,8 +286,8 @@ DEFINE_XEN_GUEST_HANDLE(dom0_shadow_cont
 #define DOM0_SETDOMAINMAXMEM   28
 struct dom0_setdomainmaxmem {
     /* IN variables. */
-    domid_t       domain;
-    unsigned long max_memkb;
+    domid_t  domain;
+    uint64_t max_memkb;
 };
 typedef struct dom0_setdomainmaxmem dom0_setdomainmaxmem_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_setdomainmaxmem_t);
@@ -295,8 +295,8 @@ DEFINE_XEN_GUEST_HANDLE(dom0_setdomainma
 #define DOM0_GETPAGEFRAMEINFO2 29   /* batched interface */
 struct dom0_getpageframeinfo2 {
     /* IN variables. */
-    domid_t        domain;
-    unsigned long  num;
+    domid_t  domain;
+    uint64_t num;
     /* IN/OUT variables. */
     XEN_GUEST_HANDLE(ulong) array;
 };
@@ -313,12 +313,12 @@ DEFINE_XEN_GUEST_HANDLE(dom0_getpagefram
 #define DOM0_ADD_MEMTYPE         31
 struct dom0_add_memtype {
     /* IN variables. */
-    unsigned long mfn;
-    unsigned long nr_mfns;
-    uint32_t      type;
-    /* OUT variables. */
-    uint32_t      handle;
-    uint32_t      reg;
+    xen_pfn_t mfn;
+    uint64_t nr_mfns;
+    uint32_t type;
+    /* OUT variables. */
+    uint32_t handle;
+    uint32_t reg;
 };
 typedef struct dom0_add_memtype dom0_add_memtype_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_add_memtype_t);
@@ -345,8 +345,8 @@ struct dom0_read_memtype {
     /* IN variables. */
     uint32_t reg;
     /* OUT variables. */
-    unsigned long mfn;
-    unsigned long nr_mfns;
+    xen_pfn_t mfn;
+    uint64_t nr_mfns;
     uint32_t type;
 };
 typedef struct dom0_read_memtype dom0_read_memtype_t;
@@ -499,8 +499,8 @@ DEFINE_XEN_GUEST_HANDLE(dom0_irq_permiss
 #define DOM0_IOMEM_PERMISSION 47
 struct dom0_iomem_permission {
     domid_t  domain;          /* domain to be affected */
-    unsigned long first_mfn;  /* first page (physical page number) in range */
-    unsigned long nr_mfns;    /* number of pages in range (>0) */
+    xen_pfn_t first_mfn;      /* first page (physical page number) in range */
+    uint64_t nr_mfns;         /* number of pages in range (>0) */
     uint8_t allow_access;     /* allow (!0) or deny (0) access to range? */
 };
 typedef struct dom0_iomem_permission dom0_iomem_permission_t;
@@ -509,7 +509,7 @@ DEFINE_XEN_GUEST_HANDLE(dom0_iomem_permi
 #define DOM0_HYPERCALL_INIT   48
 struct dom0_hypercall_init {
     domid_t  domain;          /* domain to be affected */
-    unsigned long mfn;        /* machine frame to be initialised */
+    xen_pfn_t mfn;            /* machine frame to be initialised */
 };
 typedef struct dom0_hypercall_init dom0_hypercall_init_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_hypercall_init_t);
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/grant_table.h
--- a/xen/include/public/grant_table.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/grant_table.h  Tue Jun 06 13:25:31 2006 -0500
@@ -244,7 +244,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_dump_tabl
 #define GNTTABOP_transfer                4
 struct gnttab_transfer {
     /* IN parameters. */
-    unsigned long mfn;
+    xen_pfn_t     mfn;
     domid_t       domid;
     grant_ref_t   ref;
     /* OUT parameters. */
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/io/netif.h
--- a/xen/include/public/io/netif.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/io/netif.h     Tue Jun 06 13:25:31 2006 -0500
@@ -26,6 +26,10 @@
 /* Packet data has been validated against protocol checksum. */
 #define _NETTXF_data_validated (1)
 #define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
+
+/* Packet continues in the next request. */
+#define _NETTXF_more_data      (2)
+#define  NETTXF_more_data      (1U<<_NETTXF_more_data)
 
 struct netif_tx_request {
     grant_ref_t gref;      /* Reference to buffer page */
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/io/ring.h
--- a/xen/include/public/io/ring.h      Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/io/ring.h      Tue Jun 06 13:25:31 2006 -0500
@@ -151,19 +151,27 @@ typedef struct __name##_back_ring __name
 #define RING_SIZE(_r)                                                   \
     ((_r)->nr_ents)
 
+/* Number of free requests (for use on front side only). */
+#define RING_FREE_REQUESTS(_r)                                         \
+    (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
+
 /* Test if there is an empty slot available on the front ring.
  * (This is only meaningful from the front. )
  */
 #define RING_FULL(_r)                                                   \
-    (((_r)->req_prod_pvt - (_r)->rsp_cons) == RING_SIZE(_r))
+    (RING_FREE_REQUESTS(_r) == 0)
 
 /* Test if there are outstanding messages to be processed on a ring. */
 #define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
-    ((_r)->rsp_cons != (_r)->sring->rsp_prod)
+    ((_r)->sring->rsp_prod - (_r)->rsp_cons)
 
 #define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
-    (((_r)->req_cons != (_r)->sring->req_prod) &&                       \
-     (((_r)->req_cons - (_r)->rsp_prod_pvt) != RING_SIZE(_r)))
+    ({                                                                 \
+       unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;      \
+       unsigned int rsp = RING_SIZE(_r) -                              \
+                          ((_r)->req_cons - (_r)->rsp_prod_pvt);       \
+       req < rsp ? req : rsp;                                          \
+    })
 
 /* Direct access to individual ring elements, by index. */
 #define RING_GET_REQUEST(_r, _idx)                                      \
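
A hedged frontend-side sketch of what RING_FREE_REQUESTS is for, taken
together with the NETTXF_more_data flag added in the netif.h hunk above: a
fragmented packet consumes one request slot per fragment, so the frontend
must check that enough slots are free before producing any of the chain,
rather than testing RING_FULL once. The surrounding driver scaffolding and
the fragment bookkeeping are assumed, not shown.

#include <xen/interface/io/netif.h>   /* also pulls in io/ring.h */

static int queue_fragmented_packet(struct netif_tx_front_ring *ring,
                                   unsigned int nr_frags)
{
    unsigned int i;

    if (RING_FREE_REQUESTS(ring) < nr_frags)
        return -1;                    /* not enough slots: back off */

    for (i = 0; i < nr_frags; i++) {
        struct netif_tx_request *req =
            RING_GET_REQUEST(ring, ring->req_prod_pvt++);

        /* gref/offset/size would be filled in from the fragment here. */
        req->flags = (i + 1 < nr_frags) ? NETTXF_more_data : 0;
    }

    RING_PUSH_REQUESTS(ring);         /* make the whole chain visible */
    return 0;
}
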
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/memory.h
--- a/xen/include/public/memory.h       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/memory.h       Tue Jun 06 13:25:31 2006 -0500
@@ -29,7 +29,7 @@ struct xen_memory_reservation {
      *   OUT: GMFN bases of extents that were allocated
      *   (NB. This command also updates the mach_to_phys translation table)
      */
-    XEN_GUEST_HANDLE(ulong) extent_start;
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
 
     /* Number of extents, and size/alignment of each (2^extent_order pages). */
     unsigned long  nr_extents;
@@ -87,7 +87,7 @@ struct xen_machphys_mfn_list {
      * any large discontiguities in the machine address space, 2MB gaps in
      * the machphys table will be represented by an MFN base of zero.
      */
-    XEN_GUEST_HANDLE(ulong) extent_start;
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
 
     /*
      * Number of extents written to the above array. This will be smaller
@@ -117,7 +117,7 @@ struct xen_add_to_physmap {
     unsigned long idx;
 
     /* GPFN where the source mapping page should appear. */
-    unsigned long gpfn;
+    xen_pfn_t     gpfn;
 };
 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
@@ -135,13 +135,13 @@ struct xen_translate_gpfn_list {
     unsigned long nr_gpfns;
 
     /* List of GPFNs to translate. */
-    XEN_GUEST_HANDLE(ulong) gpfn_list;
+    XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
 
     /*
      * Output list to contain MFN translations. May be the same as the input
      * list (in which case each input GPFN is overwritten with the output MFN).
      */
-    XEN_GUEST_HANDLE(ulong) mfn_list;
+    XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
 };
 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
 DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/xen.h
--- a/xen/include/public/xen.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/xen.h  Tue Jun 06 13:25:31 2006 -0500
@@ -199,7 +199,7 @@ struct mmuext_op {
     unsigned int cmd;
     union {
         /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
-        unsigned long mfn;
+        xen_pfn_t     mfn;
         /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
         unsigned long linear_addr;
     } arg1;
@@ -236,10 +236,24 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
  */
 #define VMASST_CMD_enable                0
 #define VMASST_CMD_disable               1
+
+/* x86/32 guests: simulate full 4GB segment limits. */
 #define VMASST_TYPE_4gb_segments         0
+
+/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
 #define VMASST_TYPE_4gb_segments_notify  1
+
+/*
+ * x86 guests: support writes to bottom-level PTEs.
+ * NB1. Page-directory entries cannot be written.
+ * NB2. Guest must continue to remove all writable mappings of PTEs.
+ */
 #define VMASST_TYPE_writable_pagetables  2
-#define MAX_VMASST_TYPE 2
+
+/* x86/PAE guests: support PDPTs above 4GB. */
+#define VMASST_TYPE_pae_extended_cr3     3
+
+#define MAX_VMASST_TYPE                  3
 
 #ifndef __ASSEMBLY__
 
@@ -449,9 +463,9 @@ struct start_info {
     unsigned long nr_pages;     /* Total pages allocated to this domain.  */
     unsigned long shared_info;  /* MACHINE address of shared info struct. */
     uint32_t flags;             /* SIF_xxx flags.                         */
-    unsigned long store_mfn;    /* MACHINE page number of shared page.    */
+    xen_pfn_t store_mfn;        /* MACHINE page number of shared page.    */
     uint32_t store_evtchn;      /* Event channel for store communication. */
-    unsigned long console_mfn;  /* MACHINE address of console page.       */
+    xen_pfn_t console_mfn;      /* MACHINE page number of console page.   */
     uint32_t console_evtchn;    /* Event channel for console messages.    */
     /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
     unsigned long pt_base;      /* VIRTUAL address of page directory.     */

_______________________________________________
Xen-ppc-devel mailing list
Xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ppc-devel
