WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH V3 07/10] Introduce Xen PCI Passthrough, qdevice (1/3

To: QEMU-devel <qemu-devel@xxxxxxxxxx>, Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH V3 07/10] Introduce Xen PCI Passthrough, qdevice (1/3)
From: Anthony PERARD <anthony.perard@xxxxxxxxxx>
Date: Fri, 28 Oct 2011 16:07:33 +0100
Cc: Anthony PERARD <anthony.perard@xxxxxxxxxx>, Guy Zana <guy@xxxxxxxxxxxx>, Xen Devel <xen-devel@xxxxxxxxxxxxxxxxxxx>, Allen Kay <allen.m.kay@xxxxxxxxx>
Delivery-date: Fri, 28 Oct 2011 08:25:12 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <1319814456-8158-1-git-send-email-anthony.perard@xxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <1319814456-8158-1-git-send-email-anthony.perard@xxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
From: Allen Kay <allen.m.kay@xxxxxxxxx>

Signed-off-by: Allen Kay <allen.m.kay@xxxxxxxxx>
Signed-off-by: Guy Zana <guy@xxxxxxxxxxxx>
Signed-off-by: Anthony PERARD <anthony.perard@xxxxxxxxxx>
---
 Makefile.target                  |    2 +
 hw/xen_pci_passthrough.c         |  838 ++++++++++++++++++++++++++++++++++++++
 hw/xen_pci_passthrough.h         |  223 ++++++++++
 hw/xen_pci_passthrough_helpers.c |   46 ++
 4 files changed, 1109 insertions(+), 0 deletions(-)
 create mode 100644 hw/xen_pci_passthrough.c
 create mode 100644 hw/xen_pci_passthrough.h
 create mode 100644 hw/xen_pci_passthrough_helpers.c

diff --git a/Makefile.target b/Makefile.target
index 243f9f2..36ea47d 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -217,6 +217,8 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
 
 # Xen PCI Passthrough
 obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_helpers.o
 
 # Inter-VM PCI shared memory
 CONFIG_IVSHMEM =
diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
new file mode 100644
index 0000000..b97c5b6
--- /dev/null
+++ b/hw/xen_pci_passthrough.c
@@ -0,0 +1,838 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Alex Novik <alex@xxxxxxxxxxxx>
+ * Allen Kay <allen.m.kay@xxxxxxxxx>
+ * Guy Zana <guy@xxxxxxxxxxxx>
+ *
+ * This file implements direct PCI assignment to a HVM guest
+ */
+
+/*
+ * Interrupt Disable policy:
+ *
+ * INTx interrupt:
+ *   Initialize(register_real_device)
+ *     Map INTx(xc_physdev_map_pirq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Set machine_irq and assigned_device->machine_irq to '0'.
+ *         * Don't bind INTx.
+ *
+ *     Bind INTx(xc_domain_bind_pt_pci_irq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Unmap INTx.
+ *         - Decrement mapped_machine_irq[machine_irq]
+ *         - Set assigned_device->machine_irq to '0'.
+ *
+ *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
+ *     Write '0'
+ *       <ptdev->msi_trans_en is false>
+ *         - Set real bit to '0' if assigned_device->machine_irq isn't '0'.
+ *
+ *     Write '1'
+ *       <ptdev->msi_trans_en is false>
+ *         - Set real bit to '1'.
+ *
+ * MSI-INTx translation.
+ *   Initialize(xc_physdev_map_pirq_msi/pt_msi_setup)
+ *     Bind MSI-INTx(xc_domain_bind_pt_irq)
+ *       <fail>
+ *         - Unmap MSI.
+ *           <success>
+ *             - Set dev->msi->pirq to '-1'.
+ *           <fail>
+ *             - Do nothing.
+ *
+ *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
+ *     Write '0'
+ *       <ptdev->msi_trans_en is true>
+ *         - Set MSI Enable bit to '1'.
+ *
+ *     Write '1'
+ *       <ptdev->msi_trans_en is true>
+ *         - Set MSI Enable bit to '0'.
+ *
+ * MSI interrupt:
+ *   Initialize MSI register(pt_msi_setup, pt_msi_update)
+ *     Bind MSI(xc_domain_update_msi_irq)
+ *       <fail>
+ *         - Unmap MSI.
+ *         - Set dev->msi->pirq to '-1'.
+ *
+ * MSI-X interrupt:
+ *   Initialize MSI-X register(pt_msix_update_one)
+ *     Bind MSI-X(xc_domain_update_msi_irq)
+ *       <fail>
+ *         - Unmap MSI-X.
+ *         - Set entry->pirq to '-1'.
+ */
+
+#include <sys/ioctl.h>
+
+#include "pci.h"
+#include "xen.h"
+#include "xen_backend.h"
+#include "xen_pci_passthrough.h"
+
+#define PCI_BAR_ENTRIES (6)
+
+#define PT_NR_IRQS          (256)
+char mapped_machine_irq[PT_NR_IRQS] = {0};
+
+/* Config Space */
+static int pt_pci_config_access_check(PCIDevice *d, uint32_t address, int len)
+{
+    /* check offset range */
+    if (address >= 0xFF) {
+        PT_LOG("Error: Failed to access register with offset exceeding FFh. "
+               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               address, len);
+        return -1;
+    }
+
+    /* check read size */
+    if ((len != 1) && (len != 2) && (len != 4)) {
+        PT_LOG("Error: Failed to access register with invalid access length. "
+               "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               address, len);
+        return -1;
+    }
+
+    /* check offset alignment */
+    if (address & (len - 1)) {
+        PT_LOG("Error: Failed to access register with invalid access size "
+            "alignment. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+            pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+            address, len);
+        return -1;
+    }
+
+    return 0;
+}
+
+int pt_bar_offset_to_index(uint32_t offset)
+{
+    int index = 0;
+
+    /* check Exp ROM BAR */
+    if (offset == PCI_ROM_ADDRESS) {
+        return PCI_ROM_SLOT;
+    }
+
+    /* calculate BAR index */
+    index = (offset - PCI_BASE_ADDRESS_0) >> 2;
+    if (index >= PCI_NUM_REGIONS) {
+        return -1;
+    }
+
+    return index;
+}
+
+static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    uint32_t val = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    int rc = 0;
+    int emul_len = 0;
+    uint32_t find_addr = address;
+
+    if (pt_pci_config_access_check(d, address, len)) {
+        goto exit;
+    }
+
+    /* check power state transition flags */
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        /* can't accept until previous power state transition is completed.
+         * so finished previous request here.
+         */
+        PT_LOG("Warning: guest want to write durring power state 
transition\n");
+        goto exit;
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, address);
+    if (reg_grp_entry) {
+        /* check 0 Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* no need to emulate, just return 0 */
+            val = 0;
+            goto exit;
+        }
+    }
+
+    /* read I/O device register value */
+    rc = host_pci_get_block(s->real_device, address, (uint8_t *)&val, len);
+    if (!rc) {
+        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
+        memset(&val, 0xff, len);
+    }
+
+    /* just return the I/O device register value for
+     * passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto exit;
+    }
+
+    /* adjust the read value to appropriate CFC-CFF window */
+    val <<= (address & 3) << 3;
+    emul_len = len;
+
+    /* loop Guest request size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            XenPTRegInfo *reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation depend on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.read) {
+                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.read) {
+                    rc = reg->u.w.read(s, reg_entry,
+                                       (uint16_t *)ptr_val, valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.read) {
+                    rc = reg->u.dw.read(s, reg_entry,
+                                        (uint32_t *)ptr_val, valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                hw_error("Internal error: Invalid read emulation "
+                         "return value[%d]. I/O emulator exit.\n", rc);
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before returning them to pci bus emulator */
+    val >>= ((address & 3) << 3);
+
+exit:
+    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
+                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+                  address, val, len);
+    return val;
+}
+
+static void pt_pci_write_config(PCIDevice *d, uint32_t address,
+                                uint32_t val, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    int index = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    int rc = 0;
+    uint32_t read_val = 0;
+    int emul_len = 0;
+    XenPTReg *reg_entry = NULL;
+    uint32_t find_addr = address;
+    XenPTRegInfo *reg = NULL;
+
+    if (pt_pci_config_access_check(d, address, len)) {
+        return;
+    }
+
+    PT_LOG_CONFIG("[%02x:%02x.%x]: address=%04x val=0x%08x len=%d\n",
+                  pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+                  address, val, len);
+
+    /* check unused BAR register */
+    index = pt_bar_offset_to_index(address);
+    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
+        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
+        PT_LOG("Warning: Guest attempt to set address to unused Base Address "
+               "Register. [%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+               pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+               address, len);
+    }
+
+    /* check power state transition flags */
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        /* can't accept untill previous power state transition is completed.
+         * so finished previous request here.
+         */
+        PT_LOG("Warning: guest want to write durring power state 
transition\n");
+        return;
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, address);
+    if (reg_grp_entry) {
+        /* check 0 Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* ignore silently */
+            PT_LOG("Warning: Access to 0 Hardwired register. "
+                   "[%02x:%02x.%x][Offset:%02xh][Length:%d]\n",
+                   pci_bus_num(d->bus), PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
+                   address, len);
+            return;
+        }
+    }
+
+    /* read I/O device register value */
+    rc = host_pci_get_block(s->real_device, address,
+                             (uint8_t *)&read_val, len);
+    if (!rc) {
+        PT_LOG("Error: pci_read_block failed. return value[%d].\n", rc);
+        memset(&read_val, 0xff, len);
+    }
+
+    /* pass directly to libpci for passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto out;
+    }
+
+    /* adjust the read and write value to appropriate CFC-CFF window */
+    read_val <<= (address & 3) << 3;
+    val <<= (address & 3) << 3;
+    emul_len = len;
+
+    /* loop Guest request size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation depend on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.write) {
+                    rc = reg->u.b.write(s, reg_entry, ptr_val,
+                                        read_val >> ((real_offset & 3) << 3),
+                                        valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.write) {
+                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
+                                        (read_val >> ((real_offset & 3) << 3)),
+                                        valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.write) {
+                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
+                                         (read_val >> ((real_offset & 3) << 
3)),
+                                         valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                hw_error("Internal error: Invalid write emulation "
+                         "return value[%d]. I/O emulator exit.\n", rc);
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before passing them to libpci */
+    val >>= (address & 3) << 3;
+
+out:
+    if (!(reg && reg->no_wb)) {
+        /* unknown regs are passed through */
+        rc = host_pci_set_block(s->real_device, address, (uint8_t *)&val, len);
+
+        if (!rc) {
+            PT_LOG("Error: pci_write_block failed. return value[%d].\n", rc);
+        }
+    }
+
+    if (s->pm_state != NULL && s->pm_state->flags & PT_FLAG_TRANSITING) {
+        qemu_mod_timer(s->pm_state->pm_timer,
+                       qemu_get_clock_ms(rt_clock) + s->pm_state->pm_delay);
+    }
+}
+
+/* ioport/iomem space*/
+static void pt_iomem_map(XenPCIPassthroughState *s, int i,
+                         pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int ret = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG("e_phys=%#"PRIx64" maddr=%#"PRIx64" type=%%d"
+           " len=%#"PRIx64" index=%d first_map=%d\n",
+           e_phys, s->bases[i].access.maddr, /*type,*/
+           e_size, i, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != -1) {
+        /* Remove old mapping */
+        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
+                               old_ebase >> XC_PAGE_SHIFT,
+                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
+                               DPCI_REMOVE_MAPPING);
+        if (ret != 0) {
+            PT_LOG("Error: remove old mapping failed!\n");
+            return;
+        }
+    }
+
+    /* map only valid guest address */
+    if (e_phys != -1) {
+        /* Create new mapping */
+        ret = xc_domain_memory_mapping(xen_xc, xen_domid,
+                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
+                                   DPCI_ADD_MAPPING);
+
+        if (ret != 0) {
+            PT_LOG("Error: create new mapping failed!\n");
+        }
+    }
+}
+
+static void pt_ioport_map(XenPCIPassthroughState *s, int i,
+                          pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int ret = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG("e_phys=%#04"PRIx64" pio_base=%#04"PRIx64" len=%"PRId64" index=%d"
+           " first_map=%d\n",
+           e_phys, s->bases[i].access.pio_base, e_size, i, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != -1) {
+        /* Remove old mapping */
+        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
+                                       s->bases[i].access.pio_base, e_size,
+                                       DPCI_REMOVE_MAPPING);
+        if (ret != 0) {
+            PT_LOG("Error: remove old mapping failed!\n");
+            return;
+        }
+    }
+
+    /* map only valid guest address (include 0) */
+    if (e_phys != -1) {
+        /* Create new mapping */
+        ret = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
+                                       s->bases[i].access.pio_base, e_size,
+                                       DPCI_ADD_MAPPING);
+        if (ret != 0) {
+            PT_LOG("Error: create new mapping failed!\n");
+        }
+    }
+
+}
+
+
+/* mapping BAR */
+
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable)
+{
+    PCIDevice *dev = &s->dev;
+    PCIIORegion *r;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    pcibus_t r_size = 0, r_addr = -1;
+    int rc = 0;
+
+    r = &dev->io_regions[bar];
+
+    /* check valid region */
+    if (!r->size) {
+        return;
+    }
+
+    base = &s->bases[bar];
+    /* skip unused BAR or upper 64bit BAR */
+    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
+        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
+           return;
+    }
+
+    /* copy region address to temporary */
+    r_addr = r->addr;
+
+    /* need unmapping in case I/O Space or Memory Space disable */
+    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
+        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
+        r_addr = -1;
+    }
+    if ((bar == PCI_ROM_SLOT) && (r_addr != -1)) {
+        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
+        if (reg_grp_entry) {
+            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
+            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
+                r_addr = -1;
+            }
+        }
+    }
+
+    /* prevent guest software mapping memory resource to 00000000h */
+    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
+        r_addr = -1;
+    }
+
+    r_size = pt_get_emul_size(base->bar_flag, r->size);
+
+    rc = pci_check_bar_overlap(dev, r_addr, r_size, r->type);
+    if (rc > 0) {
+        PT_LOG("Warning: s[%02x:%02x.%x][Region:%d][Address:%"FMT_PCIBUS"h]"
+               "[Size:%"FMT_PCIBUS"h] is overlapped.\n", pci_bus_num(dev->bus),
+               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), bar,
+               r_addr, r_size);
+    }
+
+    /* check whether we need to update the mapping or not */
+    if (r_addr != s->bases[bar].e_physbase) {
+        /* mapping BAR */
+        if (base->bar_flag == PT_BAR_FLAG_IO) {
+            pt_ioport_map(s, bar, r_addr, r_size, r->type);
+        } else {
+            pt_iomem_map(s, bar, r_addr, r_size, r->type);
+        }
+    }
+}
+
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        pt_bar_mapping_one(s, i, io_enable, mem_enable);
+    }
+}
+
+/* register regions */
+static int pt_register_regions(XenPCIPassthroughState *s)
+{
+    int i = 0;
+    uint32_t bar_data = 0;
+    HostPCIDevice *d = s->real_device;
+
+    /* Register PIO/MMIO BARs */
+    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
+        HostPCIIORegion *r = &d->io_regions[i];
+
+        if (r->base_addr) {
+            s->bases[i].e_physbase = r->base_addr;
+            s->bases[i].access.u = r->base_addr;
+
+            /* Register current region */
+            if (r->flags & IORESOURCE_IO) {
+                memory_region_init_io(&s->bar[i], NULL, NULL,
+                                      "xen-pci-pt-bar", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_IO,
+                                 &s->bar[i]);
+            } else if (r->flags & IORESOURCE_PREFETCH) {
+                memory_region_init_io(&s->bar[i], NULL, NULL,
+                                      "xen-pci-pt-bar", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_MEM_PREFETCH,
+                                 &s->bar[i]);
+            } else {
+                memory_region_init_io(&s->bar[i], NULL, NULL,
+                                      "xen-pci-pt-bar", r->size);
+                pci_register_bar(&s->dev, i, PCI_BASE_ADDRESS_SPACE_MEMORY,
+                                 &s->bar[i]);
+            }
+
+            PT_LOG("IO region registered (size=0x%08"PRIx64
+                   " base_addr=0x%08"PRIx64")\n",
+                   r->size, r->base_addr);
+        }
+    }
+
+    /* Register expansion ROM address */
+    if (d->rom.base_addr && d->rom.size) {
+        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
+        bar_data = host_pci_get_long(d, PCI_ROM_ADDRESS);
+        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
+            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
+            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
+        }
+
+        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
+        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
+
+        memory_region_init_rom_device(&s->rom, NULL, NULL, &s->dev.qdev,
+                                      "xen-pci-pt-rom", d->rom.size);
+        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
+                         &s->rom);
+
+        PT_LOG("Expansion ROM registered (size=0x%08"PRIx64
+               " base_addr=0x%08"PRIx64")\n",
+               d->rom.size, d->rom.base_addr);
+    }
+
+    return 0;
+}
+
+static void pt_unregister_regions(XenPCIPassthroughState *s)
+{
+    int i, type, rc;
+    uint32_t e_size;
+    PCIDevice *d = &s->dev;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        e_size = s->bases[i].e_size;
+        if ((e_size == 0) || (s->bases[i].e_physbase == -1)) {
+            continue;
+        }
+
+        type = d->io_regions[i].type;
+
+        if (type == PCI_BASE_ADDRESS_SPACE_MEMORY
+            || type == PCI_BASE_ADDRESS_MEM_PREFETCH) {
+            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                    (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
+                    DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_LOG("Error: remove old mem mapping failed!\n");
+                continue;
+            }
+
+        } else if (type == PCI_BASE_ADDRESS_SPACE_IO) {
+            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
+                        s->bases[i].e_physbase,
+                        s->bases[i].access.pio_base,
+                        e_size,
+                        DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_LOG("Error: remove old io mapping failed!\n");
+                continue;
+            }
+        }
+    }
+}
+
+static int pt_initfn(PCIDevice *pcidev)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
+    int dom, bus;
+    unsigned slot, func;
+    int rc = 0;
+    uint32_t machine_irq;
+    int pirq = -1;
+
+    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
+        fprintf(stderr, "error parse bdf: %s\n", s->hostaddr);
+        return -1;
+    }
+
+    /* register real device */
+    PT_LOG("Assigning real physical device %02x:%02x.%x to devfn %i ...\n",
+           bus, slot, func, s->dev.devfn);
+
+    s->real_device = host_pci_device_get(bus, slot, func);
+    if (!s->real_device) {
+        return -1;
+    }
+
+    s->is_virtfn = s->real_device->is_virtfn;
+    if (s->is_virtfn) {
+        PT_LOG("%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
+               s->real_device->domain, bus, slot, func);
+    }
+
+    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
+    if (host_pci_get_block(s->real_device, 0, pcidev->config,
+                           PCI_CONFIG_SPACE_SIZE) == -1) {
+        return -1;
+    }
+
+    /* Handle real device's MMIO/PIO BARs */
+    pt_register_regions(s);
+
+    /* reinitialize each config register to be emulated */
+    pt_config_init(s);
+
+    /* Bind interrupt */
+    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
+        PT_LOG("no pin interrupt\n");
+        goto out;
+    }
+
+    machine_irq = host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE);
+    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
+
+    if (rc) {
+        PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);
+
+        /* Disable PCI intx assertion (turn on bit10 of devctl) */
+        host_pci_set_word(s->real_device,
+                          PCI_COMMAND,
+                          pci_get_word(s->dev.config + PCI_COMMAND)
+                          | PCI_COMMAND_INTX_DISABLE);
+        machine_irq = 0;
+        s->machine_irq = 0;
+    } else {
+        machine_irq = pirq;
+        s->machine_irq = pirq;
+        mapped_machine_irq[machine_irq]++;
+    }
+
+    /* bind machine_irq to device */
+    if (rc < 0 && machine_irq != 0) {
+        uint8_t e_device = PCI_SLOT(s->dev.devfn);
+        uint8_t e_intx = pci_intx(s);
+
+        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq, 0,
+                                       e_device, e_intx);
+        if (rc < 0) {
+            PT_LOG("Error: Binding of interrupt failed! rc=%d\n", rc);
+
+            /* Disable PCI intx assertion (turn on bit10 of devctl) */
+            host_pci_set_word(s->real_device, PCI_COMMAND,
+                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
+                              | PCI_COMMAND_INTX_DISABLE);
+            mapped_machine_irq[machine_irq]--;
+
+            if (mapped_machine_irq[machine_irq] == 0) {
+                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
+                    PT_LOG("Error: Unmapping of interrupt failed! rc=%d\n",
+                           rc);
+                }
+            }
+            s->machine_irq = 0;
+        }
+    }
+
+out:
+    PT_LOG("Real physical device %02x:%02x.%x registered successfuly!\n"
+           "IRQ type = %s\n", bus, slot, func, "INTx");
+
+    return 0;
+}
+
+static int pt_unregister_device(PCIDevice *pcidev)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, pcidev);
+    uint8_t e_device, e_intx;
+    uint32_t machine_irq;
+    int rc;
+
+    /* Unbind interrupt */
+    e_device = PCI_SLOT(s->dev.devfn);
+    e_intx = pci_intx(s);
+    machine_irq = s->machine_irq;
+
+    if (machine_irq) {
+        rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
+                                     PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 0);
+        if (rc < 0) {
+            PT_LOG("Error: Unbinding of interrupt failed! rc=%d\n", rc);
+        }
+    }
+
+    if (machine_irq) {
+        mapped_machine_irq[machine_irq]--;
+
+        if (mapped_machine_irq[machine_irq] == 0) {
+            rc = xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq);
+
+            if (rc < 0) {
+                PT_LOG("Error: Unmaping of interrupt failed! rc=%d\n", rc);
+            }
+        }
+    }
+
+    /* delete all emulated config registers */
+    pt_config_delete(s);
+
+    /* unregister real device's MMIO/PIO BARs */
+    pt_unregister_regions(s);
+
+    host_pci_device_put(s->real_device);
+
+    return 0;
+}
+
+static PCIDeviceInfo xen_pci_passthrough = {
+    .init = pt_initfn,
+    .exit = pt_unregister_device,
+    .qdev.name = "xen-pci-passthrough",
+    .qdev.desc = "Assign an host pci device with Xen",
+    .qdev.size = sizeof(XenPCIPassthroughState),
+    .config_read = pt_pci_read_config,
+    .config_write = pt_pci_write_config,
+    .is_express = 0,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
+        DEFINE_PROP_BIT("power-mgmt", XenPCIPassthroughState, power_mgmt,
+                        0, false),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void xen_passthrough_register(void)
+{
+    pci_qdev_register(&xen_pci_passthrough);
+}
+
+device_init(xen_passthrough_register);
diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
new file mode 100644
index 0000000..2d1979d
--- /dev/null
+++ b/hw/xen_pci_passthrough.h
@@ -0,0 +1,223 @@
+#ifndef QEMU_HW_XEN_PCI_PASSTHROUGH_H
+#  define QEMU_HW_XEN_PCI_PASSTHROUGH_H
+
+#include "qemu-common.h"
+#include "xen_common.h"
+#include "pci.h"
+#include "host-pci-device.h"
+
+#define PT_LOGGING_ENABLED
+#define PT_DEBUG_PCI_CONFIG_ACCESS
+
+#ifdef PT_LOGGING_ENABLED
+#  define PT_LOG(_f, _a...)   fprintf(stderr, "%s: " _f, __func__, ##_a)
+#else
+#  define PT_LOG(_f, _a...)
+#endif
+
+#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
+#  define PT_LOG_CONFIG(_f, _a...) PT_LOG(_f, ##_a)
+#else
+#  define PT_LOG_CONFIG(_f, _a...)
+#endif
+
+
+typedef struct XenPTRegInfo XenPTRegInfo;
+typedef struct XenPTReg XenPTReg;
+
+typedef struct XenPCIPassthroughState XenPCIPassthroughState;
+
+/* function type for config reg */
+typedef uint32_t (*conf_reg_init)
+    (XenPCIPassthroughState *, XenPTRegInfo *, uint32_t real_offset);
+typedef int (*conf_dword_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t dev_value, uint32_t valid_mask);
+typedef int (*conf_word_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t dev_value, uint16_t valid_mask);
+typedef int (*conf_byte_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t dev_value, uint8_t valid_mask);
+typedef int (*conf_dword_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t valid_mask);
+typedef int (*conf_word_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t valid_mask);
+typedef int (*conf_byte_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t valid_mask);
+typedef int (*conf_dword_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint32_t dev_value, uint32_t *val);
+typedef int (*conf_word_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint16_t dev_value, uint16_t *val);
+typedef int (*conf_byte_restore)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry, uint32_t real_offset,
+     uint8_t dev_value, uint8_t *val);
+
+/* power state transition */
+#define PT_FLAG_TRANSITING 0x0001
+
+
+typedef enum {
+    GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
+    GRP_TYPE_EMU,                               /* emul reg group */
+} RegisterGroupType;
+
+typedef enum {
+    PT_BAR_FLAG_MEM = 0,                        /* Memory type BAR */
+    PT_BAR_FLAG_IO,                             /* I/O type BAR */
+    PT_BAR_FLAG_UPPER,                          /* upper 64bit BAR */
+    PT_BAR_FLAG_UNUSED,                         /* unused BAR */
+} PTBarFlag;
+
+
+typedef struct XenPTRegion {
+    /* Virtual phys base & size */
+    uint32_t e_physbase;
+    uint32_t e_size;
+    /* Index of region in qemu */
+    uint32_t memory_index;
+    /* BAR flag */
+    PTBarFlag bar_flag;
+    /* Translation of the emulated address */
+    union {
+        uint64_t maddr;
+        uint64_t pio_base;
+        uint64_t u;
+    } access;
+} XenPTRegion;
+
+/* XenPTRegInfo declaration
+ * - only for emulated register (either a part or whole bit).
+ * - for passthrough register that need special behavior (like interacting with
+ *   other component), set emu_mask to all 0 and specify r/w func properly.
+ * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
+ */
+
+/* emulated register infomation */
+struct XenPTRegInfo {
+    uint32_t offset;
+    uint32_t size;
+    uint32_t init_val;
+    /* reg read only field mask (ON:RO/ROS, OFF:other) */
+    uint32_t ro_mask;
+    /* reg emulate field mask (ON:emu, OFF:passthrough) */
+    uint32_t emu_mask;
+    /* no write back allowed */
+    uint32_t no_wb;
+    conf_reg_init init;
+    /* read/write/restore function pointer
+     * for double_word/word/byte size */
+    union {
+        struct {
+            conf_dword_write write;
+            conf_dword_read read;
+            conf_dword_restore restore;
+        } dw;
+        struct {
+            conf_word_write write;
+            conf_word_read read;
+            conf_word_restore restore;
+        } w;
+        struct {
+            conf_byte_write write;
+            conf_byte_read read;
+            conf_byte_restore restore;
+        } b;
+    } u;
+};
+
+/* emulated register management */
+struct XenPTReg {
+    QLIST_ENTRY(XenPTReg) entries;
+    XenPTRegInfo *reg;
+    uint32_t data;
+};
+
+typedef struct XenPTRegGroupInfo XenPTRegGroupInfo;
+
+/* emul reg group size initialize method */
+typedef uint8_t (*pt_reg_size_init_fn)
+    (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
+     uint32_t base_offset);
+
+/* emulated register group infomation */
+struct XenPTRegGroupInfo {
+    uint8_t grp_id;
+    RegisterGroupType grp_type;
+    uint8_t grp_size;
+    pt_reg_size_init_fn size_init;
+    XenPTRegInfo *emu_reg_tbl;
+};
+
+/* emul register group management table */
+typedef struct XenPTRegGroup {
+    QLIST_ENTRY(XenPTRegGroup) entries;
+    const XenPTRegGroupInfo *reg_grp;
+    uint32_t base_offset;
+    uint8_t size;
+    QLIST_HEAD(, XenPTReg) reg_tbl_list;
+} XenPTRegGroup;
+
+
+typedef struct XenPTPM {
+    QEMUTimer *pm_timer;  /* QEMUTimer struct */
+    int no_soft_reset;    /* No Soft Reset flags */
+    uint16_t flags;       /* power state transition flags */
+    uint16_t pmc_field;   /* Power Management Capabilities field */
+    int pm_delay;         /* power state transition delay */
+    uint16_t cur_state;   /* current power state */
+    uint16_t req_state;   /* requested power state */
+    uint32_t pm_base;     /* Power Management Capability reg base offset */
+    uint32_t aer_base;    /* AER Capability reg base offset */
+} XenPTPM;
+
+struct XenPCIPassthroughState {
+    PCIDevice dev;
+
+    char *hostaddr;
+    bool is_virtfn;
+    HostPCIDevice *real_device;
+    XenPTRegion bases[PCI_NUM_REGIONS]; /* Access regions */
+    QLIST_HEAD(, XenPTRegGroup) reg_grp_tbl;
+
+    uint32_t machine_irq;
+
+    uint32_t power_mgmt;
+    XenPTPM *pm_state;
+
+    MemoryRegion bar[PCI_NUM_REGIONS - 1];
+    MemoryRegion rom;
+};
+
+void pt_config_init(XenPCIPassthroughState *s);
+void pt_config_delete(XenPCIPassthroughState *s);
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable);
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable);
+XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address);
+XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address);
+int pt_bar_offset_to_index(uint32_t offset);
+
+static inline pcibus_t pt_get_emul_size(PTBarFlag flag, pcibus_t r_size)
+{
+    /* align resource size (memory type only) */
+    if (flag == PT_BAR_FLAG_MEM) {
+        return (r_size + XC_PAGE_SIZE - 1) & XC_PAGE_MASK;
+    } else {
+        return r_size;
+    }
+}
+
+/* INTx */
+static inline uint8_t pci_read_intx(XenPCIPassthroughState *s)
+{
+    return host_pci_get_byte(s->real_device, PCI_INTERRUPT_PIN);
+}
+uint8_t pci_intx(XenPCIPassthroughState *ptdev);
+
+#endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
diff --git a/hw/xen_pci_passthrough_helpers.c b/hw/xen_pci_passthrough_helpers.c
new file mode 100644
index 0000000..192e918
--- /dev/null
+++ b/hw/xen_pci_passthrough_helpers.c
@@ -0,0 +1,46 @@
+#include "xen_pci_passthrough.h"
+
+/* The PCI Local Bus Specification, Rev. 3.0, {
+ * Section 6.2.4 Miscellaneous Registers, pp 223
+ * outlines 5 valid values for the intertupt pin (intx).
+ *  0: For devices (or device functions) that don't use an interrupt in
+ *  1: INTA#
+ *  2: INTB#
+ *  3: INTC#
+ *  4: INTD#
+ *
+ * Xen uses the following 4 values for intx
+ *  0: INTA#
+ *  1: INTB#
+ *  2: INTC#
+ *  3: INTD#
+ *
+ * Observing that these list of values are not the same, pci_read_intx()
+ * uses the following mapping from hw to xen values.
+ * This seems to reflect the current usage within Xen.
+ *
+ * PCI hardware    | Xen | Notes
+ * ----------------+-----+----------------------------------------------------
+ * 0               | 0   | No interrupt
+ * 1               | 0   | INTA#
+ * 2               | 1   | INTB#
+ * 3               | 2   | INTC#
+ * 4               | 3   | INTD#
+ * any other value | 0   | This should never happen, log error message
+}
+ */
+uint8_t pci_intx(XenPCIPassthroughState *ptdev)
+{
+    uint8_t r_val = pci_read_intx(ptdev);
+
+    PT_LOG("intx=%i\n", r_val);
+    if (r_val < 1 || r_val > 4) {
+        PT_LOG("Interrupt pin read from hardware is out of range: "
+               "value=%i, acceptable range is 1 - 4\n", r_val);
+        r_val = 0;
+    } else {
+        r_val -= 1;
+    }
+
+    return r_val;
+}
-- 
Anthony PERARD


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel