[Xen-devel] [PATCH 10/12] Add Xen CPU hotplug support

To: <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 10/12] Add Xen CPU hotplug support
From: "Tian, Kevin" <kevin.tian@xxxxxxxxx>
Date: Tue, 15 May 2007 22:22:08 +0800
Delivery-date: Tue, 15 May 2007 07:21:39 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AceW/GqfhK2wnNKoQliL/aUOKseKfg==
Thread-topic: [PATCH 10/12] Add Xen CPU hotplug support
Provide CPU hotplug support to Xen. Note that this hotplug
support is specific to power management (PM), rather than
run-time hotplug of a single CPU, which can be handled as a
separate task. See the embedded comment:

/*
 * XXX: One important thing missed here is to migrate vcpus
 * from dead cpu to other online ones and then put whole
 * system into a stop state. It assures a safe environment
 * for a cpu hotplug/remove at normal running state.
 *
 * However for xen PM case, at this point:
 *     -> All other domains should be notified with PM event,
 *        and then in following states:
 *             * Suspend state, or
 *             * Paused state, which is a force step to all
 *               domains if they do nothing to suspend
 *     -> All vcpus of dom0 (except vcpu0) have already been
 *        hot removed
 * with the net effect that all other cpus only have idle vcpu
 * running. In this special case, we can avoid vcpu migration
 * then and system can be considered in a stop state.
 *
 * So current cpu hotplug is a special version for PM specific
 * usage, and needs more effort later for full cpu hotplug.
 * (ktian1)
 */
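
The suspend path is expected to drive this interface through
disable_nonboot_cpus()/enable_nonboot_cpus(), which the asm-x86/smp.h
hunk below declares and which the patch notes come from
kernel/power/main.c (their bodies are carried in the attached patch,
not shown in this excerpt). As a rough sketch of what they could look
like on top of the new cpu_down()/cpu_up(); the frozen_cpus
bookkeeping here is an assumption, not the patch's actual code:

#include <xen/cpumask.h>
#include <xen/lib.h>

static cpumask_t frozen_cpus;   /* assumed bookkeeping, not from this patch */

void disable_nonboot_cpus(void)
{
        unsigned int cpu;
        int err;

        cpus_clear(frozen_cpus);
        for_each_online_cpu ( cpu )
        {
                if ( cpu == 0 )         /* the boot CPU must stay online */
                        continue;
                err = cpu_down(cpu);
                if ( !err )
                        cpu_set(cpu, frozen_cpus);   /* remember who to revive */
                else
                        printk("Error taking CPU%u down: %d\n", cpu, err);
        }
}

void enable_nonboot_cpus(void)
{
        unsigned int cpu;

        /* Warm-boot every CPU we offlined on the way down. */
        for_each_cpu_mask ( cpu, frozen_cpus )
                if ( cpu_up(cpu) )
                        printk("Error bringing CPU%u back up\n", cpu);
        cpus_clear(frozen_cpus);
}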

Signed-off-by: Kevin Tian <kevin.tian@xxxxxxxxx>

diff -r fe69f7fd1639 xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:52 2007 -0400
@@ -600,9 +600,5 @@ void __cpuinit cpu_uninit(void)
 {
        int cpu = raw_smp_processor_id();
        cpu_clear(cpu, cpu_initialized);
-
-       /* lazy TLB state */
-       per_cpu(cpu_tlbstate, cpu).state = 0;
-       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-#endif
+}
+#endif
diff -r fe69f7fd1639 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/domain.c     Tue Apr 10 13:41:52 2007 -0400
@@ -76,6 +76,7 @@ static void default_idle(void)
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 {
+       __cpu_disable();
        /* This must be done before dead CPU ack */
        cpu_exit_clear();
        wbinvd();
@@ -101,6 +102,8 @@ void idle_loop(void)
 {
     for ( ; ; )
     {
+        if (cpu_is_offline(smp_processor_id()))
+            play_dead();
         page_scrub_schedule_work();
         default_idle();
         do_softirq();
diff -r fe69f7fd1639 xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c      Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/i8259.c      Tue Apr 10 13:41:52 2007 -0400
@@ -396,6 +396,7 @@ void __init init_IRQ(void)
         irq_desc[i].action  = NULL;
         irq_desc[i].depth   = 1;
         spin_lock_init(&irq_desc[i].lock);
+        cpus_setall(irq_desc[i].affinity);
         set_intr_gate(i, interrupt[i]);
     }
 
diff -r fe69f7fd1639 xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c    Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/io_apic.c    Tue Apr 10 13:41:52 2007 -0400
@@ -34,9 +34,6 @@
 #include <asm/desc.h>
 #include <mach_apic.h>
 #include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
 
 /* Different to Linux: our implementation can be simpler. */
 #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
diff -r fe69f7fd1639 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c        Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/irq.c        Tue Apr 10 13:41:52 2007 -0400
@@ -656,7 +656,8 @@ __initcall(setup_dump_irqs);
 __initcall(setup_dump_irqs);
 
 #ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
+#include <asm/mach-generic/mach_apic.h>
+#include <xen/delay.h>
 
 void fixup_irqs(cpumask_t map)
 {
@@ -673,8 +674,8 @@ void fixup_irqs(cpumask_t map)
                        printk("Breaking affinity for irq %i\n", irq);
                        mask = map;
                }
-               if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, mask);
+               if (irq_desc[irq].handler->set_affinity)
+                       irq_desc[irq].handler->set_affinity(irq, mask);
                else if (irq_desc[irq].action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
diff -r fe69f7fd1639 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/smpboot.c    Tue Apr 10 14:30:27 2007 -0400
@@ -109,6 +109,9 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mos
                        { [0 ... NR_CPUS-1] = 0xff };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
 
+static void *stack_base[NR_CPUS] __cacheline_aligned;
+spinlock_t cpu_add_remove_lock;
+
 /*
  * Trampoline 80x86 program as an array.
  */
@@ -121,7 +124,7 @@ static void map_cpu_to_logical_apicid(vo
 static void map_cpu_to_logical_apicid(void);
 
 /* State of each CPU. */
-/*DEFINE_PER_CPU(int, cpu_state) = { 0 };*/
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 /*
  * Currently trivial. Write the real->protected mode
@@ -439,9 +442,11 @@ void __devinit smp_callin(void)
        /*
         *      Synchronize the TSC with the BP
         */
-       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
+       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) {
                synchronize_tsc_ap();
-       calibrate_tsc_ap();
+               /* No sync for same reason as above */
+               calibrate_tsc_ap();
+       }
 }
 
 static int cpucount, booting_cpu;
@@ -508,8 +513,12 @@ static void construct_percpu_idt(unsigne
 {
        unsigned char idt_load[10];
 
-       idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
-       memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t));
+       /* If IDT table exists since last hotplug, reuse it */
+       if (!idt_tables[cpu]) {
+               idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+               memcpy(idt_tables[cpu], idt_table,
+                               IDT_ENTRIES*sizeof(idt_entry_t));
+       }
 
        *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
        *(unsigned long  *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
@@ -571,15 +580,15 @@ void __devinit start_secondary(void *unu
         * lock helps us to not include this cpu in a currently in progress
         * smp_call_function().
         */
-       /*lock_ipi_call_lock();*/
+       lock_ipi_call_lock();
        cpu_set(smp_processor_id(), cpu_online_map);
-       /*unlock_ipi_call_lock();*/
-       /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+       unlock_ipi_call_lock();
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+        init_percpu_time();
 
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
-
-        init_percpu_time();
 
        wmb();
        startup_cpu_idle_loop();
@@ -877,6 +886,22 @@ static inline int alloc_cpu_id(void)
        return cpu;
 }
 
+static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
+{
+       if (idle_vcpu[cpu])
+               return idle_vcpu[cpu];
+
+       return alloc_idle_vcpu(cpu);
+}
+
+static void *prepare_idle_stack(unsigned int cpu)
+{
+       if (!stack_base[cpu])
+               stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER);
+
+       return stack_base[cpu];
+}
+
 static int __devinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -894,7 +919,7 @@ static int __devinit do_boot_cpu(int api
 
        booting_cpu = cpu;
 
-       v = alloc_idle_vcpu(cpu);
+       v = prepare_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
        /* start_eip had better be page-aligned! */
@@ -903,7 +928,7 @@ static int __devinit do_boot_cpu(int api
        /* So we see what's up   */
        printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
 
-       stack_start.esp = alloc_xenheap_pages(STACK_ORDER);
+       stack_start.esp = prepare_idle_stack(cpu);
 
        /* Debug build: detect stack overflow by setting up a guard page. */
        memguard_guard_stack(stack_start.esp);
@@ -980,6 +1005,12 @@ static int __devinit do_boot_cpu(int api
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+static void idle_task_exit(void)
+{
+       /* Give up lazy state borrowed by this idle vcpu */
+       __sync_lazy_execstate();
+}
+
 void cpu_exit_clear(void)
 {
        int cpu = raw_smp_processor_id();
@@ -988,7 +1019,6 @@ void cpu_exit_clear(void)
 
        cpucount --;
        cpu_uninit();
-       irq_ctx_exit(cpu);
 
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
@@ -997,26 +1027,9 @@ void cpu_exit_clear(void)
        unmap_cpu_to_logical_apicid(cpu);
 }
 
-struct warm_boot_cpu_info {
-       struct completion *complete;
-       int apicid;
-       int cpu;
-};
-
-static void __cpuinit do_warm_boot_cpu(void *p)
-{
-       struct warm_boot_cpu_info *info = p;
-       do_boot_cpu(info->apicid, info->cpu);
-       complete(info->complete);
-}
-
 static int __cpuinit __smp_prepare_cpu(int cpu)
 {
-       DECLARE_COMPLETION(done);
-       struct warm_boot_cpu_info info;
-       struct work_struct task;
        int     apicid, ret;
-       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
        apicid = x86_cpu_to_apicid[cpu];
        if (apicid == BAD_APICID) {
@@ -1024,34 +1037,19 @@ static int __cpuinit __smp_prepare_cpu(i
                goto exit;
        }
 
-       /*
-        * the CPU isn't initialized at boot time, allocate gdt table here.
-        * cpu_init will initialize it
-        */
-       if (!cpu_gdt_descr->address) {
-               cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
-               if (!cpu_gdt_descr->address)
-               printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-                       ret = -ENOMEM;
-                       goto exit;
-       }
-
-       info.complete = &done;
-       info.apicid = apicid;
-       info.cpu = cpu;
-       INIT_WORK(&task, do_warm_boot_cpu, &info);
-
        tsc_sync_disabled = 1;
 
        /* init low mem mapping */
-       clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-                       KERNEL_PGD_PTRS);
-       flush_tlb_all();
-       schedule_work(&task);
-       wait_for_completion(&done);
+       init_low_mappings();
+
+       do_boot_cpu(apicid, cpu);
 
        tsc_sync_disabled = 0;
+#ifdef CONFIG_X86_64
        zap_low_mappings();
+#else
+       zap_low_mappings(idle_pg_table_l2);
+#endif
        ret = 0;
 exit:
        return ret;
@@ -1084,6 +1082,8 @@ static void __init smp_boot_cpus(unsigne
 
        boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
        x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+       stack_base[0] = stack_start.esp;
 
        /*current_thread_info()->cpu = 0;*/
        /*smp_tune_scheduling();*/
@@ -1255,7 +1255,8 @@ void __devinit smp_prepare_boot_cpu(void
        cpu_set(smp_processor_id(), cpu_callout_map);
        cpu_set(smp_processor_id(), cpu_present_map);
        cpu_set(smp_processor_id(), cpu_possible_map);
-       /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+       spin_lock_init(&cpu_add_remove_lock);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1278,11 +1279,12 @@ remove_siblinginfo(int cpu)
                cpu_clear(cpu, cpu_sibling_map[sibling]);
        cpus_clear(cpu_sibling_map[cpu]);
        cpus_clear(cpu_core_map[cpu]);
-       c[cpu].phys_proc_id = 0;
-       c[cpu].cpu_core_id = 0;
+       phys_proc_id[cpu] = BAD_APICID;
+       cpu_core_id[cpu] = BAD_APICID;
        cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
+extern void fixup_irqs(cpumask_t map);
 int __cpu_disable(void)
 {
        cpumask_t map = cpu_online_map;
@@ -1299,12 +1301,15 @@ int __cpu_disable(void)
        if (cpu == 0)
                return -EBUSY;
 
+       local_irq_disable();
        clear_local_APIC();
        /* Allow any queued timer interrupts to get serviced */
        local_irq_enable();
        mdelay(1);
        local_irq_disable();
 
+       destroy_percpu_time();
+
        remove_siblinginfo(cpu);
 
        cpu_clear(cpu, map);
@@ -1323,13 +1328,89 @@ void __cpu_die(unsigned int cpu)
                /* They ack this in play_dead by setting CPU_DEAD */
                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
                        printk ("CPU %d is now offline\n", cpu);
-                       if (1 == num_online_cpus())
-                               alternatives_smp_switch(0);
                        return;
                }
-               msleep(100);
+               mdelay(100);
+               mb();
+               process_pending_timers();
        }
        printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+/* 
+ * XXX: One important thing missed here is to migrate vcpus
+ * from dead cpu to other online ones and then put whole
+ * system into a stop state. It assures a safe environment
+ * for a cpu hotplug/remove at normal running state.
+ *
+ * However for xen PM case, at this point:
+ *     -> All other domains should be notified with PM event,
+ *        and then in following states:
+ *             * Suspend state, or
+ *             * Paused state, which is a force step to all
+ *               domains if they do nothing to suspend
+ *     -> All vcpus of dom0 (except vcpu0) have already been
+ *        hot removed
+ * with the net effect that all other cpus only have idle vcpu
+ * running. In this special case, we can avoid vcpu migration
+ * then and system can be considered in a stop state.
+ *
+ * So current cpu hotplug is a special version for PM specific
+ * usage, and needs more effort later for full cpu hotplug.
+ * (ktian1)
+ */
+int cpu_down(unsigned int cpu)
+{
+       int err = 0;
+       cpumask_t mask;
+
+       spin_lock(&cpu_add_remove_lock);
+       if (num_online_cpus() == 1) {
+               err = -EBUSY;
+               goto out;
+       }
+
+       if (!cpu_online(cpu)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       printk("Prepare to bring CPU%d down...\n", cpu);
+       /* Send notification to remote idle vcpu */
+       cpus_clear(mask);
+       cpu_set(cpu, mask);
+       per_cpu(cpu_state, cpu) = CPU_DYING;
+       smp_send_event_check_mask(mask);
+
+       __cpu_die(cpu);
+
+       if (cpu_online(cpu)) {
+               printk("Bad state (DEAD, but in online map) on CPU%d\n", cpu);
+               err = -EBUSY;
+       }
+out:
+       spin_unlock(&cpu_add_remove_lock);
+       return err;
+}
+
+int cpu_up(unsigned int cpu)
+{
+       int err = 0;
+
+       spin_lock(&cpu_add_remove_lock);
+       if (cpu_online(cpu)) {
+               printk("Bring up an online cpu. Bogus!\n");
+               err = -EBUSY;
+               goto out;
+       }
+
+       err = __cpu_up(cpu);
+       if (err < 0)
+               goto out;
+
+out:
+       spin_unlock(&cpu_add_remove_lock);
+       return err;
 }
 
 /* From kernel/power/main.c */
@@ -1390,6 +1471,22 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
+       int ret=0;
+
+       /*
+        * We do warm boot only on cpus that have booted earlier.
+        * Otherwise cold boot is all handled from smp_boot_cpus().
+        * cpu_callin_map is set during the AP kickstart process; it's
+        * reset when a cpu is taken offline in cpu_exit_clear().
+        */
+       if (!cpu_isset(cpu, cpu_callin_map))
+               ret = __smp_prepare_cpu(cpu);
+
+       if (ret)
+               return -EIO;
+#endif
+
        /* In case one didn't come up */
        if (!cpu_isset(cpu, cpu_callin_map)) {
                printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
diff -r fe69f7fd1639 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/time.c       Tue Apr 10 14:33:20 2007 -0400
@@ -963,6 +963,12 @@ void init_percpu_time(void)
     set_timer(&t->calibration_timer, NOW() + EPOCH);
 }
 
+/* Normally all pending timers are fired once the APIC interrupt is
+ * active again, so there is no need to kill them when a cpu goes
+ * down. (Migration may be required for pure cpu hotplug.) However,
+ * the calibration timer is a bit special, and re-initialization is
+ * required after the cpu comes back up.
+ */
 void destroy_percpu_time(void)
 {
     kill_timer(&this_cpu(cpu_time).calibration_timer);
diff -r fe69f7fd1639 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/config.h      Tue Apr 10 13:41:52 2007 -0400
@@ -37,6 +37,8 @@
 #define CONFIG_ACPI_BOOT 1
 
 #define CONFIG_VGA 1
+
+#define CONFIG_HOTPLUG_CPU 1
 
 #define HZ 100
 
diff -r fe69f7fd1639 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/smp.h Tue Apr 10 13:41:52 2007 -0400
@@ -46,14 +46,30 @@ extern void zap_low_mappings(l2_pgentry_
 #endif
 
 extern void init_low_mappings(void);
+
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);
 #define MAX_APICID 256
 extern u8 x86_cpu_to_apicid[];
 
 #define cpu_physical_id(cpu)   x86_cpu_to_apicid[cpu]
 
+/* State of each CPU. */
+#define CPU_ONLINE     0x0002  /* CPU is up */
+#define CPU_DYING      0x0003  /* CPU is requested to die */
+#define CPU_DEAD       0x0004  /* CPU is dead */
+DECLARE_PER_CPU(int, cpu_state);
+
 #ifdef CONFIG_HOTPLUG_CPU
+#define cpu_is_offline(cpu) unlikely(per_cpu(cpu_state,cpu) == CPU_DYING)
+extern int cpu_down(unsigned int cpu);
+extern int cpu_up(unsigned int cpu);
 extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
+extern void disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int cpu_is_offline(int cpu) {return 0;}
 #endif
 
 /*
diff -r fe69f7fd1639 xen/include/asm-x86/system.h
--- a/xen/include/asm-x86/system.h      Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/system.h      Tue Apr 10 13:41:52 2007 -0400
@@ -313,6 +313,8 @@ static always_inline unsigned long long 
 #define __sti()                        __asm__ __volatile__("sti": : :"memory")
 /* used in the idle loop; sti takes one instruction cycle to complete */
 #define safe_halt()            __asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt()                 __asm__ __volatile__("hlt": : :"memory")
 
 /* For spinlocks etc */
 #if defined(__i386__)
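
The control flow the diff adds, end to end: cpu_down() marks the
target CPU_DYING and sends an event-check IPI; the target's
idle_loop() notices via cpu_is_offline() and enters play_dead(), which
now calls __cpu_disable() itself and acks by setting cpu_state to
CPU_DEAD; __cpu_die() polls for that ack. A sketch of how a PM control
path might consume the new interface; the pm_* wrappers are
hypothetical, only cpu_down()/cpu_up() and their error codes come from
the patch above:

#include <xen/errno.h>
#include <xen/lib.h>
#include <asm/smp.h>

/* Hypothetical wrappers; only cpu_down()/cpu_up() come from the patch. */
static int pm_take_cpu_offline(unsigned int cpu)
{
        int err = cpu_down(cpu);

        if ( err == -EBUSY )
                printk("CPU%u is the last online CPU, or refused to die\n", cpu);
        else if ( err == -EINVAL )
                printk("CPU%u is not online\n", cpu);
        /* On success the target is parked in play_dead() with IRQs off. */
        return err;
}

static int pm_bring_cpu_online(unsigned int cpu)
{
        /* Warm boot: __cpu_up() reuses the idle vcpu, stack, and IDT kept
         * around by prepare_idle_vcpu()/prepare_idle_stack() above. */
        return cpu_up(cpu);
}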

Attachment: xen_cpu_hotplug_for_pm.patch
Description: xen_cpu_hotplug_for_pm.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel