WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [Xen-devel] [PATCH] Make the hypercall PHYSDEVOP_alloc_irq_vector hy

To: "Zhang, Xiantao" <xiantao.zhang@xxxxxxxxx>
Subject: Re: [Xen-devel] [PATCH] Make the hypercall PHYSDEVOP_alloc_irq_vector hypercall dummy.
From: Jeremy Fitzhardinge <jeremy@xxxxxxxx>
Date: Wed, 02 Sep 2009 14:46:24 -0700
Cc: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>, Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
Delivery-date: Wed, 02 Sep 2009 14:47:03 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <706158FABBBA044BAD4FE898A02E4BC201C435171B@xxxxxxxxxxxxxxxxxxxxxxxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
References: <706158FABBBA044BAD4FE898A02E4BC201C435171B@xxxxxxxxxxxxxxxxxxxxxxxxxxxx>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.1) Gecko/20090814 Fedora/3.0-2.6.b3.fc11 Lightning/1.0pre Thunderbird/3.0b3
On 08/31/09 19:03, Zhang, Xiantao wrote:
> Hi, Keir
>      This patch tends to make the hypercall PHYSDEVOP_alloc_irq_vector
> dummy, and defer vector allocation to programe ioapic entries by dom0.
> Basically, dom0 shouldn't touch vector namespace which is only used by
> hypervisor for servicing real device's interrupts. And this patch also
> makes broken NetBSD dom0 work again.

This is an interesting approach, and it seems like a less intrusive way
to achieve what my functionally similar patch does.  All it needs is a
corresponding change to replace the paravirtualized IO APIC writes with
a hypercall to directly install the vector for a given GSI (ie, to make
use of Xen's GSI -> chip+pin tables rather than relying on dom0 to do
the discovery for itself).

> # HG changeset patch # User xiantao.zhang@xxxxxxxxx # Date 1251742441
> 14400 # Node ID 3c91a9ef5010ef6188728948d6145346b650414a # Parent
> dd71c509f3ff13326ddbb8fa39f638022aa814ff Dummy
> PHYSDEVOP_alloc_irq_vector hypercall. Make the hypercall
> PHYSDEVOP_alloc_irq_vector dummy, and defer vector allocation to
> programe ioapic entries by dom0. Basically, dom0 shouldn't touch
> vector namespace which is only used by hypervisor for servicing real
> device's interrupts. And this patch also make broken NetBSD dom0 work
> again. Signed-off-by: Xiantao Zhang <xiantao.zhang.intel.com> diff -r
> 3b7cbf32fee9 xen/arch/x86/io_apic.c --- a/xen/arch/x86/io_apic.c Mon
> Aug 31 10:54:32 2009 +0100 +++ b/xen/arch/x86/io_apic.c Tue Sep 01
> 05:58:48 2009 -0400 @@ -85,35 +85,7 @@ static struct irq_pin_list {
> unsigned int next; } *irq_2_pin; -static int *pin_irq_map; - static
> unsigned int irq_2_pin_free_entry; - -/* Use an arry to record
> pin_2_irq_mapping */ -static int get_irq_from_apic_pin(int apic, int
> pin) -{ - int i, pin_base = 0; - - ASSERT(apic < nr_ioapics); - - for
> (i = 0; i < apic; i++) - pin_base += nr_ioapic_registers[i]; - -
> return pin_irq_map[pin_base + pin]; -} - -static void
> set_irq_to_apic_pin(int apic, int pin, int irq) -{ - - int i, pin_base
> = 0; - - ASSERT(apic < nr_ioapics); - - for (i = 0; i < apic; i++) -
> pin_base += nr_ioapic_registers[i]; - - pin_irq_map[pin_base + pin] =
> irq; -} /* * The common case is 1:1 IRQ<->pin mappings. Sometimes
> there are @@ -141,42 +113,6 @@ static void add_pin_to_irq(unsigned int
> } entry->apic = apic; entry->pin = pin; - - set_irq_to_apic_pin(apic,
> pin, irq); -} - -static void remove_pin_at_irq(unsigned int irq, int
> apic, int pin) -{ - struct irq_pin_list *entry, *prev; - - for (entry
> = &irq_2_pin[irq]; ; entry = &irq_2_pin[entry->next]) { - if
> ((entry->apic == apic) && (entry->pin == pin)) - break; - if
> (!entry->next) - BUG(); - } - - entry->pin = entry->apic = -1; - - if
> (entry != &irq_2_pin[irq]) { - /* Removed entry is not at head of
> list. */ - prev = &irq_2_pin[irq]; - while (&irq_2_pin[prev->next] !=
> entry) - prev = &irq_2_pin[prev->next]; - prev->next = entry->next; -
> entry->next = irq_2_pin_free_entry; - irq_2_pin_free_entry = entry -
> irq_2_pin; - } else if (entry->next != 0) { - /* Removed entry is at
> head of multi-item list. */ - prev = entry; - entry =
> &irq_2_pin[entry->next]; - *prev = *entry; - entry->pin = entry->apic
> = -1; - entry->next = irq_2_pin_free_entry; - irq_2_pin_free_entry =
> entry - irq_2_pin; - } - - set_irq_to_apic_pin(apic, pin, -1); } /* @@
> -1138,15 +1074,11 @@ static void __init enable_IO_APIC(void) /*
> Initialise dynamic irq_2_pin free list. */ irq_2_pin =
> xmalloc_array(struct irq_pin_list, PIN_MAP_SIZE); memset(irq_2_pin, 0,
> PIN_MAP_SIZE * sizeof(*irq_2_pin)); - pin_irq_map = xmalloc_array(int,
> nr_irqs_gsi); - memset(pin_irq_map, 0, nr_irqs_gsi * sizeof(int)); for
> (i = 0; i < PIN_MAP_SIZE; i++) irq_2_pin[i].pin = -1; for (i =
> irq_2_pin_free_entry = nr_irqs_gsi; i < PIN_MAP_SIZE; i++)
> irq_2_pin[i].next = i + 1; - for (i = 0; i < nr_irqs_gsi; i++) -
> pin_irq_map[i] = -1; for(apic = 0; apic < nr_ioapics; apic++) { int
> pin; @@ -2165,6 +2097,24 @@ static int ioapic_physbase_to_id(unsigne
> return -EINVAL; } +unsigned apic_gsi_base(int apic); + +int
> apic_pin_2_gsi_irq(int apic, int pin) +{ + int idx, irq; + + if (apic
> < 0) + return -EINVAL; + + irq = apic_gsi_base(apic) + pin; + if (apic
> == 0) { + idx = find_irq_entry(apic, pin, mp_INT); + if (idx >= 0) +
> irq = pin_2_irq(idx, apic, pin); + } + return irq; +} + int
> ioapic_guest_read(unsigned long physbase, unsigned int reg, u32 *pval)
> { int apic; @@ -2182,17 +2132,19 @@ int ioapic_guest_read(unsigned
> long phys #define WARN_BOGUS_WRITE(f, a...) \ dprintk(XENLOG_INFO,
> "\n%s: " \ - "apic=%d, pin=%d, old_irq=%d, new_irq=%d\n" \ - "%s:
> old_entry=%08x, new_entry=%08x\n" \ - "%s: " f, __FUNCTION__, apic,
> pin, old_irq, new_irq, \ - __FUNCTION__, *(u32 *)&old_rte, *(u32
> *)&new_rte, \ + "apic=%d, pin=%d, irq=%d\n" \ + "%s: new_entry=%08x\n"
> \ + "%s: " f, __FUNCTION__, apic, pin, irq, \ + __FUNCTION__, *(u32
> *)&rte, \ __FUNCTION__ , ##a ) int ioapic_guest_write(unsigned long
> physbase, unsigned int reg, u32 val) { - int apic, pin, old_irq = -1,
> new_irq = -1; - struct IO_APIC_route_entry old_rte = { 0 }, new_rte =
> { 0 }; + int apic, pin, irq, ret, vector, pirq; + struct
> IO_APIC_route_entry rte = { 0 }; 

Is this just a gratuitious renaming, or does it serve some other purpose?

> unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc;
> if ( (apic = ioapic_physbase_to_id(physbase)) < 0 ) return apic; @@
> -2204,7 +2156,7 @@ int ioapic_guest_write(unsigned long phy pin = (reg
> - 0x10) >> 1; /* Write first half from guest; second half is target
> info. */ - *(u32 *)&new_rte = val; + *(u32 *)&rte = val; /* * What
> about weird destination types? @@ -2214,7 +2166,7 @@ int
> ioapic_guest_write(unsigned long phy * ExtINT: Ignore? Linux only
> asserts this at start of day. * For now, print a message and return an
> error. We can fix up on demand. */ - if ( new_rte.delivery_mode >
> dest_LowestPrio ) + if ( rte.delivery_mode > dest_LowestPrio ) {
> printk("ERROR: Attempt to write weird IOAPIC destination mode!\n");
> printk(" APIC=%d/%d, lo-reg=%x\n", apic, pin, val); @@ -2225,83
> +2177,65 @@ int ioapic_guest_write(unsigned long phy * The guest does
> not know physical APIC arrangement (flat vs. cluster). * Apply genapic
> conventions for this platform. */ - new_rte.delivery_mode =
> INT_DELIVERY_MODE; - new_rte.dest_mode = INT_DEST_MODE; +
> rte.delivery_mode = INT_DELIVERY_MODE; + rte.dest_mode =
> INT_DEST_MODE; + + irq = apic_pin_2_gsi_irq(apic, pin); + if ( irq < 0
> ) + return irq; + + desc = irq_to_desc(irq); + cfg = desc->chip_data;
> + + /* Since PHYSDEVOP_alloc_irq_vector is dummy, rte.vector is the
> pirq + which corresponds to this ioapic pin, retrieve it for building
> + pirq and irq mapping. + */ + pirq = rte.vector; + if(pirq < 0 ||
> pirq >= dom0->nr_pirqs) + return -EINVAL; + + if ( desc->action ) + {
> + WARN_BOGUS_WRITE("Attempt to modify IO-APIC pin for in-use IRQ!\n");
> + return 0; + } + + if ( cfg->vector <= 0 || cfg->vector >
> LAST_DYNAMIC_VECTOR ) { + + printk("allocated vector for irq:%d\n",
> irq); + + vector = assign_irq_vector(irq); + if ( vector < 0 ) +
> return vector; + + add_pin_to_irq(irq, apic, pin); 


If this is replacing or disabling a previously set "vector" will it
properly clean up the old state?  (I don't fully understand how all the
datastructures relate pirq/vector/apic/pin to each other.)

> + } + spin_lock(&pcidevs_lock); + spin_lock(&dom0->event_lock); + ret
> = map_domain_pirq(dom0, pirq, irq, + MAP_PIRQ_TYPE_GSI, NULL); +
> spin_unlock(&dom0->event_lock); + spin_unlock(&pcidevs_lock); + if (
> ret < 0 ) + return ret; spin_lock_irqsave(&ioapic_lock, flags); - - /*
> Read first (interesting) half of current routing entry. */ - *(u32
> *)&old_rte = io_apic_read(apic, 0x10 + 2 * pin); - - /* No change to
> the first half of the routing entry? Bail quietly. */ - if ( *(u32
> *)&old_rte == *(u32 *)&new_rte ) - { -
> spin_unlock_irqrestore(&ioapic_lock, flags); - return 0; - } - - /*
> Special delivery modes (SMI,NMI,INIT,ExtInt) should have no vector. */
> - if ( (old_rte.delivery_mode > dest_LowestPrio) && (old_rte.vector !=
> 0) ) - { - WARN_BOGUS_WRITE("Special delivery mode %d with non-zero
> vector " - "%02x\n", old_rte.delivery_mode, old_rte.vector); - /*
> Nobble the vector here as it does not relate to a valid irq. */ -
> old_rte.vector = 0; - } - - if ( old_rte.vector >=
> FIRST_DYNAMIC_VECTOR ) - old_irq = get_irq_from_apic_pin(apic, pin); -
> - /* FIXME: dirty hack to support per-cpu vector. */ - new_irq =
> new_rte.vector; - - if ( (old_irq != new_irq) && (old_irq >= 0) &&
> IO_APIC_IRQ(old_irq) ) - { - if ( irq_desc[old_irq].action ) - { -
> WARN_BOGUS_WRITE("Attempt to remove IO-APIC pin of in-use IRQ!\n"); -
> spin_unlock_irqrestore(&ioapic_lock, flags); - return 0; - } - -
> remove_pin_at_irq(old_irq, apic, pin); - } - - if ( (new_irq >= 0) &&
> IO_APIC_IRQ(new_irq) ) - { - if ( irq_desc[new_irq].action ) - { -
> WARN_BOGUS_WRITE("Attempt to %s IO-APIC pin for in-use IRQ!\n", -
> (old_irq != new_irq) ? "add" : "modify"); -
> spin_unlock_irqrestore(&ioapic_lock, flags); - return 0; - } - - /*
> Set the correct irq-handling type. */ - irq_desc[new_irq].handler =
> new_rte.trigger ? - &ioapic_level_type: &ioapic_edge_type; - - if (
> old_irq != new_irq ) - add_pin_to_irq(new_irq, apic, pin); - - /* Mask
> iff level triggered. */ - new_rte.mask = new_rte.trigger; - /* Set the
> vector field to the real vector! */ - new_rte.vector =
> irq_cfg[new_irq].vector; - } - else if ( !new_rte.mask ) - { - /* This
> pin leads nowhere but the guest has not masked it. */ -
> WARN_BOGUS_WRITE("Installing bogus unmasked IO-APIC entry!\n"); -
> new_rte.mask = 1; - } - - new_rte.dest.logical.logical_dest = -
> cpu_mask_to_apicid(irq_cfg[new_irq].domain); - - io_apic_write(apic,
> 0x10 + 2 * pin, *(((int *)&new_rte) + 0)); - io_apic_write(apic, 0x11
> + 2 * pin, *(((int *)&new_rte) + 1)); - + /* Set the correct
> irq-handling type. */ + desc->handler = rte.trigger ? +
> &ioapic_level_type: &ioapic_edge_type; + + /* Mask iff level
> triggered. */ + rte.mask = rte.trigger; + /* Set the vector field to
> the real vector! */ + rte.vector = cfg->vector; + +
> rte.dest.logical.logical_dest = + cpu_mask_to_apicid(cfg->domain); + +
> io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&rte) + 0)); +
> io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&rte) + 1)); +
> spin_unlock_irqrestore(&ioapic_lock, flags); return 0; diff -r
> 3b7cbf32fee9 xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/arch/x86/irq.c Tue Sep 01 05:58:48 2009
> -0400 @@ -1293,9 +1293,9 @@ int map_domain_pirq( if ( (old_irq &&
> (old_irq != irq) ) || (old_pirq && (old_pirq != pirq)) ) { -
> dprintk(XENLOG_G_ERR, "dom%d: pirq %d or irq %d already mapped\n", +
> dprintk(XENLOG_G_WARNING, "dom%d: pirq %d or irq %d already mapped\n",
> d->domain_id, pirq, irq); - return -EINVAL; + return 0; } ret =
> irq_permit_access(d, pirq); diff -r 3b7cbf32fee9
> xen/arch/x86/mpparse.c --- a/xen/arch/x86/mpparse.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/arch/x86/mpparse.c Tue Sep 01 05:58:48
> 2009 -0400 @@ -958,6 +958,10 @@ unsigned highest_gsi(void) return res;
> } +unsigned apic_gsi_base(int apic) +{ + return
> mp_ioapic_routing[apic].gsi_base; +} void __init
> mp_override_legacy_irq ( u8 bus_irq, diff -r 3b7cbf32fee9
> xen/arch/x86/physdev.c --- a/xen/arch/x86/physdev.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/arch/x86/physdev.c Tue Sep 01 06:00:15
> 2009 -0400 @@ -329,7 +329,6 @@ ret_t do_physdev_op(int cmd,
> XEN_GUEST_H case PHYSDEVOP_alloc_irq_vector: { struct physdev_irq
> irq_op; - int vector; ret = -EFAULT; if ( copy_from_guest(&irq_op,
> arg, 1) != 0 ) @@ -343,26 +342,13 @@ ret_t do_physdev_op(int cmd,
> XEN_GUEST_H if ( ret ) break; - irq = irq_op.irq; - ret = -EINVAL; +
> /* Vector is only used by hypervisor, and dom0 shouldn't + touch it in
> its world, return irq_op.irq as the vecotr, + and make this hypercall
> dummy, and also defer the vector + allocation when dom0 tries to
> programe ioapic entry. */ + irq_op.vector = irq_op.irq; + ret = 0; -
> /* FIXME: Once dom0 breaks GSI IRQ limit, it is - a must to eliminate
> the limit here */ - BUG_ON(irq >= 256); - - vector =
> assign_irq_vector(irq); - if (vector >= FIRST_DYNAMIC_VECTOR) -
> irq_op.vector = irq; - else - irq_op.vector = -ENOSPC; - -
> spin_lock(&pcidevs_lock); - spin_lock(&dom0->event_lock); - ret =
> map_domain_pirq(dom0, irq_op.irq, irq, - MAP_PIRQ_TYPE_GSI, NULL); -
> spin_unlock(&dom0->event_lock); - spin_unlock(&pcidevs_lock); - if (
> copy_to_guest(arg, &irq_op, 1) != 0 ) ret = -EFAULT; break; diff -r
> 3b7cbf32fee9 xen/common/domain.c --- a/xen/common/domain.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/common/domain.c Tue Sep 01 05:58:48 2009
> -0400 @@ -197,7 +197,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
> return alloc_vcpu(d, vcpu_id, cpu_id); } -static unsigned int
> extra_dom0_irqs, extra_domU_irqs = 8; +static unsigned int
> extra_dom0_irqs = 256, extra_domU_irqs = 32; static void __init
> parse_extra_guest_irqs(const char *s) { if ( isdigit(*s) ) @@ -253,9
> +253,11 @@ struct domain *domain_create( d->is_paused_by_controller =
> 1; atomic_inc(&d->pause_count); - d->nr_pirqs = (nr_irqs_gsi + -
> (domid ? extra_domU_irqs : - extra_dom0_irqs ?: nr_irqs_gsi)); + if (
> domid ) + d->nr_pirqs = nr_irqs_gsi + extra_domU_irqs; + else +
> d->nr_pirqs = nr_irqs_gsi + extra_dom0_irqs; 


Surely there's a better test for this?  We don't want to be hard-coding
"privileged domain == dom0" into Xen.  What version is this patch
against anyway?  I don't see this "extra_dom0_irqs" stuff in current
Xen-unstable.

> + d->pirq_to_evtchn = xmalloc_array(u16, d->nr_pirqs); d->pirq_mask =
> xmalloc_array( unsigned long, BITS_TO_LONGS(d->nr_pirqs)); 

Thanks,
        J


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>