[Xen-devel] [PATCH 10/12] Add Xen CPU hotplug support

Provide cpu hotplug support to Xen. Note this hotplug
support is specific to PM, instead of for a run-time
single CPU hotplug which can be a separate task. See
embedded comment:

/*
 * XXX: One important thing missed here is to migrate vcpus
 * from dead cpu to other online ones and then put whole
 * system into a stop state. It assures a safe environment
 * for a cpu hotplug/remove at normal running state.
 *
 * However for xen PM case, at this point:
 *     -> All other domains should be notified with PM event,
 *        and then in following states:
 *             * Suspend state, or
 *             * Paused state, which is a force step to all
 *               domains if they do nothing to suspend
 *     -> All vcpus of dom0 (except vcpu0) have already beem
 *        hot removed
 * with the net effect that all other cpus only have idle vcpu
 * running. In this special case, we can avoid vcpu migration
 * then and system can be considered in a stop state.
 *
 * So current cpu hotplug is a special version for PM specific
 * usage, and need more effort later for full cpu hotplug.
 * (ktian1)
 */

Signed-off-by Kevin Tian <kevin.tian@xxxxxxxxx>

diff -r fe69f7fd1639 xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:52 2007 -0400
@@ -600,9 +600,5 @@ void __cpuinit cpu_uninit(void)
 {
        int cpu = raw_smp_processor_id();
        cpu_clear(cpu, cpu_initialized);
-
-       /* lazy TLB state */
-       per_cpu(cpu_tlbstate, cpu).state = 0;
-       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-#endif
+}
+#endif
diff -r fe69f7fd1639 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/domain.c     Tue Apr 10 13:41:52 2007 -0400
@@ -76,6 +76,7 @@ static void default_idle(void)
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 {
+       __cpu_disable();
        /* This must be done before dead CPU ack */
        cpu_exit_clear();
        wbinvd();
@@ -101,6 +102,8 @@ void idle_loop(void)
 {
     for ( ; ; )
     {
+        if (cpu_is_offline(smp_processor_id()))
+            play_dead();
         page_scrub_schedule_work();
         default_idle();
         do_softirq();
diff -r fe69f7fd1639 xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c      Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/i8259.c      Tue Apr 10 13:41:52 2007 -0400
@@ -396,6 +396,7 @@ void __init init_IRQ(void)
         irq_desc[i].action  = NULL;
         irq_desc[i].depth   = 1;
         spin_lock_init(&irq_desc[i].lock);
+        cpus_setall(irq_desc[i].affinity);
         set_intr_gate(i, interrupt[i]);
     }
 
diff -r fe69f7fd1639 xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c    Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/io_apic.c    Tue Apr 10 13:41:52 2007 -0400
@@ -34,9 +34,6 @@
 #include <asm/desc.h>
 #include <mach_apic.h>
 #include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
 
 /* Different to Linux: our implementation can be simpler. */
 #define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
diff -r fe69f7fd1639 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c        Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/irq.c        Tue Apr 10 13:41:52 2007 -0400
@@ -656,7 +656,8 @@ __initcall(setup_dump_irqs);
 __initcall(setup_dump_irqs);
 
 #ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
+#include <asm/mach-generic/mach_apic.h>
+#include <xen/delay.h>
 
 void fixup_irqs(cpumask_t map)
 {
@@ -673,8 +674,8 @@ void fixup_irqs(cpumask_t map)
                        printk("Breaking affinity for irq %i\n", irq);
                        mask = map;
                }
-               if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, mask);
+               if (irq_desc[irq].handler->set_affinity)
+                       irq_desc[irq].handler->set_affinity(irq, mask);
                else if (irq_desc[irq].action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
diff -r fe69f7fd1639 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/smpboot.c    Tue Apr 10 14:30:27 2007 -0400
@@ -109,6 +109,9 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mos
                        { [0 ... NR_CPUS-1] = 0xff };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
 
+static void *stack_base[NR_CPUS] __cacheline_aligned;
+spinlock_t cpu_add_remove_lock;
+
 /*
  * Trampoline 80x86 program as an array.
  */
@@ -121,7 +124,7 @@ static void map_cpu_to_logical_apicid(vo
 static void map_cpu_to_logical_apicid(void);
 
 /* State of each CPU. */
-/*DEFINE_PER_CPU(int, cpu_state) = { 0 };*/
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 /*
  * Currently trivial. Write the real->protected mode
@@ -439,9 +442,11 @@ void __devinit smp_callin(void)
        /*
         *      Synchronize the TSC with the BP
         */
-       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
+       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) {
                synchronize_tsc_ap();
-       calibrate_tsc_ap();
+               /* No sync for same reason as above */
+               calibrate_tsc_ap();
+       }
 }
 
 static int cpucount, booting_cpu;
@@ -508,8 +513,12 @@ static void construct_percpu_idt(unsigne
 {
        unsigned char idt_load[10];
 
-       idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
-       memcpy(idt_tables[cpu], idt_table,
IDT_ENTRIES*sizeof(idt_entry_t));
+       /* If IDT table exists since last hotplug, reuse it */
+       if (!idt_tables[cpu]) {
+               idt_tables[cpu] = xmalloc_array(idt_entry_t,
IDT_ENTRIES);
+               memcpy(idt_tables[cpu], idt_table,
+                               IDT_ENTRIES*sizeof(idt_entry_t));
+       }
 
        *(unsigned short *)(&idt_load[0]) =
(IDT_ENTRIES*sizeof(idt_entry_t))-1;
        *(unsigned long  *)(&idt_load[2]) = (unsigned
long)idt_tables[cpu];
@@ -571,15 +580,15 @@ void __devinit start_secondary(void *unu
         * lock helps us to not include this cpu in a currently in
progress
         * smp_call_function().
         */
-       /*lock_ipi_call_lock();*/
+       lock_ipi_call_lock();
        cpu_set(smp_processor_id(), cpu_online_map);
-       /*unlock_ipi_call_lock();*/
-       /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+       unlock_ipi_call_lock();
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+        init_percpu_time();
 
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
-
-        init_percpu_time();
 
        wmb();
        startup_cpu_idle_loop();
@@ -877,6 +886,22 @@ static inline int alloc_cpu_id(void)
        return cpu;
 }
 
+static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
+{
+       if (idle_vcpu[cpu])
+               return idle_vcpu[cpu];
+
+       return alloc_idle_vcpu(cpu);
+}
+
+static void *prepare_idle_stack(unsigned int cpu)
+{
+       if (!stack_base[cpu])
+               stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER);
+
+       return stack_base[cpu];
+}
+
 static int __devinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -894,7 +919,7 @@ static int __devinit do_boot_cpu(int api
 
        booting_cpu = cpu;
 
-       v = alloc_idle_vcpu(cpu);
+       v = prepare_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
        /* start_eip had better be page-aligned! */
@@ -903,7 +928,7 @@ static int __devinit do_boot_cpu(int api
        /* So we see what's up   */
        printk("Booting processor %d/%d eip %lx\n", cpu, apicid,
start_eip);
 
-       stack_start.esp = alloc_xenheap_pages(STACK_ORDER);
+       stack_start.esp = prepare_idle_stack(cpu);
 
        /* Debug build: detect stack overflow by setting up a guard
page. */
        memguard_guard_stack(stack_start.esp);
@@ -980,6 +1005,12 @@ static int __devinit do_boot_cpu(int api
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+static void idle_task_exit(void)
+{
+       /* Give up lazy state borrowed by this idle vcpu */
+       __sync_lazy_execstate();
+}
+
 void cpu_exit_clear(void)
 {
        int cpu = raw_smp_processor_id();
@@ -988,7 +1019,6 @@ void cpu_exit_clear(void)
 
        cpucount --;
        cpu_uninit();
-       irq_ctx_exit(cpu);
 
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
@@ -997,26 +1027,9 @@ void cpu_exit_clear(void)
        unmap_cpu_to_logical_apicid(cpu);
 }
 
-struct warm_boot_cpu_info {
-       struct completion *complete;
-       int apicid;
-       int cpu;
-};
-
-static void __cpuinit do_warm_boot_cpu(void *p)
-{
-       struct warm_boot_cpu_info *info = p;
-       do_boot_cpu(info->apicid, info->cpu);
-       complete(info->complete);
-}
-
 static int __cpuinit __smp_prepare_cpu(int cpu)
 {
-       DECLARE_COMPLETION(done);
-       struct warm_boot_cpu_info info;
-       struct work_struct task;
        int     apicid, ret;
-       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr,
cpu);
 
        apicid = x86_cpu_to_apicid[cpu];
        if (apicid == BAD_APICID) {
@@ -1024,34 +1037,19 @@ static int __cpuinit __smp_prepare_cpu(i
                goto exit;
        }
 
-       /*
-        * the CPU isn't initialized at boot time, allocate gdt table
here.
-        * cpu_init will initialize it
-        */
-       if (!cpu_gdt_descr->address) {
-               cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
-               if (!cpu_gdt_descr->address)
-                       printk(KERN_CRIT "CPU%d failed to allocate
GDT\n", cpu);
-                       ret = -ENOMEM;
-                       goto exit;
-       }
-
-       info.complete = &done;
-       info.apicid = apicid;
-       info.cpu = cpu;
-       INIT_WORK(&task, do_warm_boot_cpu, &info);
-
        tsc_sync_disabled = 1;
 
        /* init low mem mapping */
-       clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-                       KERNEL_PGD_PTRS);
-       flush_tlb_all();
-       schedule_work(&task);
-       wait_for_completion(&done);
+       init_low_mappings();
+
+       do_boot_cpu(apicid, cpu);
 
        tsc_sync_disabled = 0;
+#ifdef CONFIG_X86_64
        zap_low_mappings();
+#else
+       zap_low_mappings(idle_pg_table_l2);
+#endif
        ret = 0;
 exit:
        return ret;
@@ -1084,6 +1082,8 @@ static void __init smp_boot_cpus(unsigne
 
        boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
        x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+       stack_base[0] = stack_start.esp;
 
        /*current_thread_info()->cpu = 0;*/
        /*smp_tune_scheduling();*/
@@ -1255,7 +1255,8 @@ void __devinit smp_prepare_boot_cpu(void
        cpu_set(smp_processor_id(), cpu_callout_map);
        cpu_set(smp_processor_id(), cpu_present_map);
        cpu_set(smp_processor_id(), cpu_possible_map);
-       /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+       spin_lock_init(&cpu_add_remove_lock);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1278,11 +1279,12 @@ remove_siblinginfo(int cpu)
                cpu_clear(cpu, cpu_sibling_map[sibling]);
        cpus_clear(cpu_sibling_map[cpu]);
        cpus_clear(cpu_core_map[cpu]);
-       c[cpu].phys_proc_id = 0;
-       c[cpu].cpu_core_id = 0;
+       phys_proc_id[cpu] = BAD_APICID;
+       cpu_core_id[cpu] = BAD_APICID;
        cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
+extern void fixup_irqs(cpumask_t map);
 int __cpu_disable(void)
 {
        cpumask_t map = cpu_online_map;
@@ -1299,12 +1301,15 @@ int __cpu_disable(void)
        if (cpu == 0)
                return -EBUSY;
 
+       local_irq_disable();
        clear_local_APIC();
        /* Allow any queued timer interrupts to get serviced */
        local_irq_enable();
        mdelay(1);
        local_irq_disable();
 
+       destroy_percpu_time();
+
        remove_siblinginfo(cpu);
 
        cpu_clear(cpu, map);
@@ -1323,13 +1328,89 @@ void __cpu_die(unsigned int cpu)
                /* They ack this in play_dead by setting CPU_DEAD */
                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
                        printk ("CPU %d is now offline\n", cpu);
-                       if (1 == num_online_cpus())
-                               alternatives_smp_switch(0);
                        return;
                }
-               msleep(100);
+               mdelay(100);
+               mb();
+               process_pending_timers();
        }
        printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+/* 
+ * XXX: One important thing missed here is to migrate vcpus
+ * from dead cpu to other online ones and then put whole
+ * system into a stop state. It assures a safe environment
+ * for a cpu hotplug/remove at normal running state.
+ *
+ * However for xen PM case, at this point:
+ *     -> All other domains should be notified with PM event,
+ *        and then in following states:
+ *             * Suspend state, or
+ *             * Paused state, which is a force step to all
+ *               domains if they do nothing to suspend
+ *     -> All vcpus of dom0 (except vcpu0) have already beem
+ *        hot removed
+ * with the net effect that all other cpus only have idle vcpu
+ * running. In this special case, we can avoid vcpu migration
+ * then and system can be considered in a stop state.
+ *
+ * So current cpu hotplug is a special version for PM specific
+ * usage, and need more effort later for full cpu hotplug.
+ * (ktian1)
+ */
+int cpu_down(unsigned int cpu)
+{
+       int err = 0;
+       cpumask_t mask;
+
+       spin_lock(&cpu_add_remove_lock);
+       if (num_online_cpus() == 1) {
+               err = -EBUSY;
+               goto out;
+       }
+
+       if (!cpu_online(cpu)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       printk("Prepare to bring CPU%d down...\n", cpu);
+       /* Send notification to remote idle vcpu */
+       cpus_clear(mask);
+       cpu_set(cpu, mask);
+       per_cpu(cpu_state, cpu) = CPU_DYING;
+       smp_send_event_check_mask(mask);
+
+       __cpu_die(cpu);
+
+       if (cpu_online(cpu)) {
+               printk("Bad state (DEAD, but in online map) on CPU%d\n",
cpu);
+               err = -EBUSY;
+       }
+out:
+       spin_unlock(&cpu_add_remove_lock);
+       return err;
+}
+
+int cpu_up(unsigned int cpu)
+{
+       int err = 0;
+
+       spin_lock(&cpu_add_remove_lock);
+       if (cpu_online(cpu)) {
+               printk("Bring up a online cpu. Bogus!\n");
+               err = -EBUSY;
+               goto out;
+       }
+
+       err = __cpu_up(cpu);
+       if (err < 0)
+               goto out;
+
+out:
+       spin_unlock(&cpu_add_remove_lock);
+       return err;
 }
 
 /* From kernel/power/main.c */
@@ -1390,6 +1471,22 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
+       int ret=0;
+
+       /*
+        * We do warm boot only on cpus that had booted earlier
+        * Otherwise cold boot is all handled from smp_boot_cpus().
+        * cpu_callin_map is set during AP kickstart process. Its reset
+        * when a cpu is taken offline from cpu_exit_clear().
+        */
+       if (!cpu_isset(cpu, cpu_callin_map))
+               ret = __smp_prepare_cpu(cpu);
+
+       if (ret)
+               return -EIO;
+#endif
+
        /* In case one didn't come up */
        if (!cpu_isset(cpu, cpu_callin_map)) {
                printk(KERN_DEBUG "skipping cpu%d, didn't come
online\n", cpu);
diff -r fe69f7fd1639 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/time.c       Tue Apr 10 14:33:20 2007 -0400
@@ -963,6 +963,12 @@ void init_percpu_time(void)
     set_timer(&t->calibration_timer, NOW() + EPOCH);
 }
 
+/* Normally all pending timers are fired once APIC interrupt is
+ * active again, and thus no need to kill them when cpu is down.
+ * (Migrate may be required for pure cpu-hotplug). However
+ * calibration timer is a bit special, and re-initialization is
+ * required after cpu is up.
+ */
 void destroy_percpu_time(void)
 {
     kill_timer(&this_cpu(cpu_time).calibration_timer);
diff -r fe69f7fd1639 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/config.h      Tue Apr 10 13:41:52 2007 -0400
@@ -37,6 +37,8 @@
 #define CONFIG_ACPI_BOOT 1
 
 #define CONFIG_VGA 1
+
+#define CONFIG_HOTPLUG_CPU 1
 
 #define HZ 100
 
diff -r fe69f7fd1639 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/smp.h Tue Apr 10 13:41:52 2007 -0400
@@ -46,14 +46,30 @@ extern void zap_low_mappings(l2_pgentry_
 #endif
 
 extern void init_low_mappings(void);
+
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);
 #define MAX_APICID 256
 extern u8 x86_cpu_to_apicid[];
 
 #define cpu_physical_id(cpu)   x86_cpu_to_apicid[cpu]
 
+/* State of each CPU. */
+#define CPU_ONLINE     0x0002  /* CPU is up */
+#define CPU_DYING      0x0003  /* CPU is requested to die */
+#define CPU_DEAD       0x0004  /* CPU is dead */
+DECLARE_PER_CPU(int, cpu_state);
+
 #ifdef CONFIG_HOTPLUG_CPU
+#define cpu_is_offline(cpu) unlikely(per_cpu(cpu_state,cpu) ==
CPU_DYING)
+extern int cpu_down(unsigned int cpu);
+extern int cpu_up(unsigned int cpu);
 extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
+extern void disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int cpu_is_offline(int cpu) {return 0;}
 #endif
 
 /*
diff -r fe69f7fd1639 xen/include/asm-x86/system.h
--- a/xen/include/asm-x86/system.h      Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/system.h      Tue Apr 10 13:41:52 2007 -0400
@@ -313,6 +313,8 @@ static always_inline unsigned long long 
 #define __sti()                        __asm__ __volatile__("sti": :
:"memory")
 /* used in the idle loop; sti takes one instruction cycle to complete
*/
 #define safe_halt()            __asm__ __volatile__("sti; hlt": :
:"memory")
+/* used when interrupts are already enabled or to shutdown the
processor */
+#define halt()                 __asm__ __volatile__("hlt": : :"memory")
 
 /* For spinlocks etc */
 #if defined(__i386__)

xen_cpu_hotplug_for_pm.patch
Description: xen_cpu_hotplug_for_pm.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [PATCH 10/12] Add Xen CPU hotplug support