These structures (the MSI-X table and the pending bit array) are used by
Xen, and hence guests must not be able to fiddle with them.

qemu-dm currently plays with the MSI-X table, requiring Dom0 to still have
write access. This is broken (it explicitly allows the guest write access
to the mask bit) and should be fixed in qemu-dm, at which point Dom0 won't
need any special casing anymore.

The changes are made under the assumption that p2m_mmio_direct will
only ever be used for order 0 pages.
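
This matters because the read-only decision is taken per MFN via
rangeset_contains_singleton(), so it can only cover 4k (order 0) mappings.
For reference, a condensed, purely illustrative restatement of the
get_page_from_l1e() hunk below (not extra code to apply):

    /* Illustrative only: a writable mapping of a frame listed in
     * mmio_ro_ranges is not rejected outright, but demoted to r/o. */
    if ( (l1f & _PAGE_RW) && !IS_PRIV(pg_owner) &&
         rangeset_contains_singleton(mmio_ro_ranges, mfn) )
        return -1; /* caller strips _PAGE_RW via l1e_remove_flags() */
    return 1;
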
An open question is whether dealing with PV guests (including the
IOMMU-less case) is necessary at all, as handling mappings a domain may
already have in place by the time the first MSI-X interrupt gets set up
would require scanning all of the guest's L1 page table pages. As it
stands, a hole therefore remains that allows PV guests to map these ranges
writably before any MSI-X vector gets set up for the device; a sketch of
what such a scan would involve follows below.
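
For illustration only, such a scan would roughly look like the following
(hypothetical and simplified: no locking, no TLB flushing, entries are
modified in place rather than via the normal update paths, and d stands
for the PV domain being processed; this is not part of the patch):

    /* Hypothetical sketch: demote existing writable PV mappings of
     * frames that sit in mmio_ro_ranges. */
    struct page_info *page;

    page_list_for_each ( page, &d->page_list )
    {
        l1_pgentry_t *pl1e;
        unsigned int i;

        if ( (page->u.inuse.type_info & PGT_type_mask) !=
             PGT_l1_page_table )
            continue;
        pl1e = map_domain_page(page_to_mfn(page));
        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
            if ( (l1e_get_flags(pl1e[i]) & (_PAGE_PRESENT|_PAGE_RW)) ==
                 (_PAGE_PRESENT|_PAGE_RW) &&
                 rangeset_contains_singleton(mmio_ro_ranges,
                                             l1e_get_pfn(pl1e[i])) )
                l1e_remove_flags(pl1e[i], _PAGE_RW);
        unmap_domain_page(pl1e);
    }
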
An alternative would be to determine the address ranges earlier and insert
them into mmio_ro_ranges at that point, but that would require a hook into
PCI config space writes, which is particularly problematic when MMCONFIG
accesses are being used.

A second alternative would be to require Dom0 to report all devices (or at
least all MSI-X-capable ones) regardless of whether they are going to be
used by that domain, and to do so only after resources have been
determined/assigned for them (i.e. a second notification, later than the
one currently issued from the PCI bus scan, would be needed).

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>
Acked-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
--- 2010-09-20.orig/xen/arch/x86/mm.c 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm.c 2010-09-20 10:00:07.000000000 +0200
@@ -822,7 +822,13 @@ get_page_from_l1e(
return 0;
}
- return 1;
+ if ( !(l1f & _PAGE_RW) || IS_PRIV(pg_owner) ||
+ !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
+ return 1;
+ dprintk(XENLOG_G_WARNING,
+ "d%d: Forcing read-only access to MFN %lx\n",
+ l1e_owner->domain_id, mfn);
+ return -1;
}
if ( unlikely(real_pg_owner != pg_owner) )
@@ -1178,9 +1184,15 @@ static int alloc_l1_table(struct page_in
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
- if ( is_guest_l1_slot(i) &&
- unlikely(!get_page_from_l1e(pl1e[i], d, d)) )
- goto fail;
+ if ( is_guest_l1_slot(i) )
+ switch ( get_page_from_l1e(pl1e[i], d, d) )
+ {
+ case 0:
+ goto fail;
+ case -1:
+ l1e_remove_flags(pl1e[i], _PAGE_RW);
+ break;
+ }
adjust_guest_l1e(pl1e[i], d);
}
@@ -1764,8 +1776,14 @@ static int mod_l1_entry(l1_pgentry_t *pl
return rc;
}
- if ( unlikely(!get_page_from_l1e(nl1e, pt_dom, pg_dom)) )
+ switch ( get_page_from_l1e(nl1e, pt_dom, pg_dom) )
+ {
+ case 0:
return 0;
+ case -1:
+ l1e_remove_flags(nl1e, _PAGE_RW);
+ break;
+ }
adjust_guest_l1e(nl1e, pt_dom);
if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
@@ -4919,8 +4937,9 @@ static int ptwr_emulated_update(
/* Check the new PTE. */
nl1e = l1e_from_intpte(val);
- if ( unlikely(!get_page_from_l1e(nl1e, d, d)) )
+ switch ( get_page_from_l1e(nl1e, d, d) )
{
+ case 0:
if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
!do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
{
@@ -4939,6 +4958,10 @@ static int ptwr_emulated_update(
MEM_LOG("ptwr_emulate: could not get_page_from_l1e()");
return X86EMUL_UNHANDLEABLE;
}
+ break;
+ case -1:
+ l1e_remove_flags(nl1e, _PAGE_RW);
+ break;
}
adjust_guest_l1e(nl1e, d);
--- 2010-09-20.orig/xen/arch/x86/mm/hap/p2m-ept.c 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/hap/p2m-ept.c 2010-09-20 10:00:07.000000000 +0200
@@ -72,9 +72,13 @@ static void ept_p2m_type_to_flags(ept_en
entry->r = entry->w = entry->x = 0;
return;
case p2m_ram_rw:
- case p2m_mmio_direct:
entry->r = entry->w = entry->x = 1;
return;
+ case p2m_mmio_direct:
+ entry->r = entry->x = 1;
+ entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
+ entry->mfn);
+ return;
case p2m_ram_logdirty:
case p2m_ram_ro:
case p2m_ram_shared:
@@ -722,6 +726,9 @@ static void ept_change_entry_type_global
if ( ept_get_asr(d) == 0 )
return;
+ BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+ BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
ept_sync_domain(d);
--- 2010-09-20.orig/xen/arch/x86/mm/p2m.c 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/p2m.c 2010-09-20 10:00:07.000000000 +0200
@@ -72,7 +72,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
#define SUPERPAGE_PAGES (1UL << 9)
#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
-static unsigned long p2m_type_to_flags(p2m_type_t t)
+static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
{
unsigned long flags;
#ifdef __x86_64__
@@ -101,7 +101,9 @@ static unsigned long p2m_type_to_flags(p
case p2m_mmio_dm:
return flags;
case p2m_mmio_direct:
- return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
+ if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
+ flags |= _PAGE_RW;
+ return flags | P2M_BASE_FLAGS | _PAGE_PCD;
case p2m_populate_on_demand:
return flags;
}
@@ -1299,8 +1301,10 @@ p2m_set_entry(struct p2m_domain *p2m, un
domain_crash(p2m->domain);
goto out;
}
+ ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
l3e_content = mfn_valid(mfn)
- ? l3e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt) | _PAGE_PSE)
+ ? l3e_from_pfn(mfn_x(mfn),
+ p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
: l3e_empty();
entry_content.l1 = l3e_content.l3;
paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
@@ -1334,7 +1338,8 @@ p2m_set_entry(struct p2m_domain *p2m, un
ASSERT(p2m_entry);
if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
- entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+ entry_content = l1e_from_pfn(mfn_x(mfn),
+ p2m_type_to_flags(p2mt, mfn));
else
entry_content = l1e_empty();
@@ -1358,9 +1363,11 @@ p2m_set_entry(struct p2m_domain *p2m, un
goto out;
}
+ ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
l2e_content = l2e_from_pfn(mfn_x(mfn),
- p2m_type_to_flags(p2mt) | _PAGE_PSE);
+ p2m_type_to_flags(p2mt, mfn) |
+ _PAGE_PSE);
else
l2e_content = l2e_empty();
@@ -2437,6 +2444,7 @@ void p2m_change_type_global(struct p2m_d
#endif /* CONFIG_PAGING_LEVELS == 4 */
BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+ BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
if ( !paging_mode_translate(p2m->domain) )
return;
@@ -2478,7 +2486,7 @@ void p2m_change_type_global(struct p2m_d
continue;
mfn = l3e_get_pfn(l3e[i3]);
gfn = get_gpfn_from_mfn(mfn);
- flags = p2m_type_to_flags(nt);
+ flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
paging_write_p2m_entry(p2m->domain, gfn,
(l1_pgentry_t *)&l3e[i3],
@@ -2509,7 +2517,7 @@ void p2m_change_type_global(struct p2m_d
#endif
)
* L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
- flags = p2m_type_to_flags(nt);
+ flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
paging_write_p2m_entry(p2m->domain, gfn,
(l1_pgentry_t *)&l2e[i2],
@@ -2533,7 +2541,7 @@ void p2m_change_type_global(struct p2m_d
)
* L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
/* create a new 1le entry with the new type */
- flags = p2m_type_to_flags(nt);
+ flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags);
paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
l1mfn, l1e_content, 1);
--- 2010-09-20.orig/xen/arch/x86/mm/shadow/multi.c 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/mm/shadow/multi.c 2010-09-20 10:00:07.000000000 +0200
@@ -674,7 +674,9 @@ _sh_propagate(struct vcpu *v,
}
/* Read-only memory */
- if ( p2m_is_readonly(p2mt) )
+ if ( p2m_is_readonly(p2mt) ||
+ (p2mt == p2m_mmio_direct &&
+ rangeset_contains_singleton(mmio_ro_ranges, mfn_x(target_mfn))) )
sflags &= ~_PAGE_RW;
// protect guest page tables
@@ -1225,15 +1227,19 @@ static int shadow_set_l1e(struct vcpu *v
/* About to install a new reference */
if ( shadow_mode_refcounts(d) ) {
TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
- if ( shadow_get_page_from_l1e(new_sl1e, d, new_type) == 0 )
+ switch ( shadow_get_page_from_l1e(new_sl1e, d, new_type) )
{
+ case 0:
/* Doesn't look like a pagetable. */
flags |= SHADOW_SET_ERROR;
new_sl1e = shadow_l1e_empty();
- }
- else
- {
+ break;
+ case -1:
+ shadow_l1e_remove_flags(new_sl1e, _PAGE_RW);
+ /* fall through */
+ default:
shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
+ break;
}
}
}
--- 2010-09-20.orig/xen/arch/x86/msi.c 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/arch/x86/msi.c 2010-09-20 14:29:34.000000000 +0200
@@ -16,12 +16,14 @@
#include <xen/errno.h>
#include <xen/pci.h>
#include <xen/pci_regs.h>
+#include <xen/iocap.h>
#include <xen/keyhandler.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/desc.h>
#include <asm/msi.h>
#include <asm/fixmap.h>
+#include <asm/p2m.h>
#include <mach_apic.h>
#include <io_ports.h>
#include <public/physdev.h>
@@ -520,6 +522,43 @@ static int msi_capability_init(struct pc
return 0;
}
+static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir)
+{
+ u8 limit;
+ u32 addr;
+
+ switch ( pci_conf_read8(bus, slot, func, PCI_HEADER_TYPE) )
+ {
+ case PCI_HEADER_TYPE_NORMAL:
+ limit = 6;
+ break;
+ case PCI_HEADER_TYPE_BRIDGE:
+ limit = 2;
+ break;
+ case PCI_HEADER_TYPE_CARDBUS:
+ limit = 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if ( bir >= limit )
+ return 0;
+ addr = pci_conf_read32(bus, slot, func, PCI_BASE_ADDRESS_0 + bir * 4);
+ if ( (addr & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO )
+ return 0;
+ if ( (addr & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
+      PCI_BASE_ADDRESS_MEM_TYPE_64 )
+ {
+ addr &= ~PCI_BASE_ADDRESS_MEM_MASK;
+ if ( ++bir >= limit )
+ return 0;
+ return addr |
+ ((u64)pci_conf_read32(bus, slot, func,
+ PCI_BASE_ADDRESS_0 + bir * 4) << 32);
+ }
+ return addr & ~PCI_BASE_ADDRESS_MEM_MASK;
+}
+
/**
* msix_capability_init - configure device's MSI-X capability
* @dev: pointer to the pci_dev data structure of MSI-X device function
@@ -532,7 +571,8 @@ static int msi_capability_init(struct pc
**/
static int msix_capability_init(struct pci_dev *dev,
struct msi_info *msi,
- struct msi_desc **desc)
+ struct msi_desc **desc,
+ unsigned int nr_entries)
{
struct msi_desc *entry;
int pos;
@@ -587,6 +627,66 @@ static int msix_capability_init(struct p
list_add_tail(&entry->list, &dev->msi_list);
+ if ( !dev->msix_nr_entries )
+ {
+ u64 pba_paddr;
+ u32 pba_offset;
+
+ ASSERT(!dev->msix_used_entries);
+ WARN_ON(msi->table_base != read_pci_mem_bar(bus, slot, func, bir));
+
+ dev->msix_nr_entries = nr_entries;
+ dev->msix_table.first = PFN_DOWN(table_paddr);
+ dev->msix_table.last = PFN_DOWN(table_paddr +
+ nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
+ WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_table.first,
+ dev->msix_table.last));
+
+ pba_offset = pci_conf_read32(bus, slot, func,
+ msix_pba_offset_reg(pos));
+ bir = (u8)(pba_offset & PCI_MSIX_BIRMASK);
+ pba_paddr = read_pci_mem_bar(bus, slot, func, bir);
+ WARN_ON(!pba_paddr);
+ pba_paddr += pba_offset & ~PCI_MSIX_BIRMASK;
+
+ dev->msix_pba.first = PFN_DOWN(pba_paddr);
+ dev->msix_pba.last = PFN_DOWN(pba_paddr +
+ BITS_TO_LONGS(nr_entries) - 1);
+ WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
+ dev->msix_pba.last));
+
+ if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
+ dev->msix_table.last) )
+ WARN();
+ if ( rangeset_add_range(mmio_ro_ranges, dev->msix_pba.first,
+ dev->msix_pba.last) )
+ WARN();
+
+ if ( dev->domain )
+ p2m_change_entry_type_global(p2m_get_hostp2m(dev->domain),
+ p2m_mmio_direct, p2m_mmio_direct);
+ if ( !dev->domain || !paging_mode_translate(dev->domain) )
+ {
+ struct domain *d = dev->domain;
+
+ if ( !d )
+ for_each_domain(d)
+ if ( !paging_mode_translate(d) &&
+ (iomem_access_permitted(d, dev->msix_table.first,
+ dev->msix_table.last) ||
+ iomem_access_permitted(d, dev->msix_pba.first,
+ dev->msix_pba.last)) )
+ break;
+ if ( d )
+ {
+ /* XXX How to deal with existing mappings? */
+ }
+ }
+ }
+ WARN_ON(dev->msix_nr_entries != nr_entries);
+ WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
+ ++dev->msix_used_entries;
+
/* Mask interrupt here */
writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
@@ -717,7 +817,7 @@ static int __pci_enable_msix(struct msi_
}
- status = msix_capability_init(pdev, msi, desc);
+ status = msix_capability_init(pdev, msi, desc, nr_entries);
return status;
}
@@ -742,6 +842,16 @@ static void __pci_disable_msix(struct ms
writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
+
+ if ( !--dev->msix_used_entries )
+ {
+ if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
+ dev->msix_table.last) )
+ WARN();
+ if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
+ dev->msix_pba.last) )
+ WARN();
+ }
}
/*
--- 2010-09-20.orig/xen/drivers/passthrough/io.c 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/drivers/passthrough/io.c 2010-09-20 10:00:07.000000000 +0200
@@ -26,6 +26,8 @@
#include <xen/hvm/irq.h>
#include <xen/tasklet.h>
+struct rangeset *__read_mostly mmio_ro_ranges;
+
static void hvm_dirq_assist(unsigned long _d);
bool_t pt_irq_need_timer(uint32_t flags)
@@ -565,3 +567,11 @@ void hvm_dpci_eoi(struct domain *d, unsi
unlock:
spin_unlock(&d->event_lock);
}
+
+static int __init setup_mmio_ro_ranges(void)
+{
+ mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
+ RANGESETF_prettyprint_hex);
+ return 0;
+}
+__initcall(setup_mmio_ro_ranges);
--- 2010-09-20.orig/xen/include/xen/iommu.h 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/include/xen/iommu.h 2010-09-20 10:00:07.000000000 +0200
@@ -31,6 +31,8 @@ extern bool_t force_iommu, iommu_verbose
extern bool_t iommu_workaround_bios_bug, iommu_passthrough;
extern bool_t iommu_snoop, iommu_qinval, iommu_intremap;
+extern struct rangeset *mmio_ro_ranges;
+
#define domain_hvm_iommu(d) (&d->arch.hvm_domain.hvm_iommu)
#define MAX_IOMMUS 32
--- 2010-09-20.orig/xen/include/xen/pci.h 2010-09-20 14:29:04.000000000 +0200
+++ 2010-09-20/xen/include/xen/pci.h 2010-09-20 10:00:07.000000000 +0200
@@ -45,6 +45,10 @@ struct pci_dev {
struct list_head domain_list;
struct list_head msi_list;
+ unsigned int msix_nr_entries, msix_used_entries;
+ struct {
+ unsigned long first, last;
+ } msix_table, msix_pba;
int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
int msix_table_idx[MAX_MSIX_TABLE_PAGES];
spinlock_t msix_table_lock;