Provide CPU hotplug support to Xen. Note that this hotplug
support is specific to power management (PM); general run-time
hotplug of a single CPU is not covered here and can be done as
a separate task. See the embedded comment:
/*
* XXX: One important thing missed here is to migrate vcpus
* from dead cpu to other online ones and then put whole
* system into a stop state. It assures a safe environment
* for a cpu hotplug/remove at normal running state.
*
* However for xen PM case, at this point:
* -> All other domains should be notified with PM event,
* and then in following states:
* * Suspend state, or
* * Paused state, which is a force step to all
* domains if they do nothing to suspend
* -> All vcpus of dom0 (except vcpu0) have already been
* hot removed
* with the net effect that all other cpus only have idle vcpu
* running. In this special case, we can avoid vcpu migration
* then and system can be considered in a stop state.
*
* So current cpu hotplug is a special version for PM specific
* usage, and needs more effort later for full cpu hotplug.
* (ktian1)
*/
Signed-off-by: Kevin Tian <kevin.tian@xxxxxxxxx>
diff -r fe69f7fd1639 xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/cpu/common.c Tue Apr 10 13:41:52 2007 -0400
@@ -600,9 +600,5 @@ void __cpuinit cpu_uninit(void)
{
int cpu = raw_smp_processor_id();
cpu_clear(cpu, cpu_initialized);
-
- /* lazy TLB state */
- per_cpu(cpu_tlbstate, cpu).state = 0;
- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-#endif
+}
+#endif
diff -r fe69f7fd1639 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/domain.c Tue Apr 10 13:41:52 2007 -0400
@@ -76,6 +76,7 @@ static void default_idle(void)
/* We don't actually take CPU down, just spin without interrupts. */
static inline void play_dead(void)
{
+ __cpu_disable();
/* This must be done before dead CPU ack */
cpu_exit_clear();
wbinvd();
@@ -101,6 +102,8 @@ void idle_loop(void)
{
for ( ; ; )
{
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
page_scrub_schedule_work();
default_idle();
do_softirq();
diff -r fe69f7fd1639 xen/arch/x86/i8259.c
--- a/xen/arch/x86/i8259.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/i8259.c Tue Apr 10 13:41:52 2007 -0400
@@ -396,6 +396,7 @@ void __init init_IRQ(void)
irq_desc[i].action = NULL;
irq_desc[i].depth = 1;
spin_lock_init(&irq_desc[i].lock);
+ cpus_setall(irq_desc[i].affinity);
set_intr_gate(i, interrupt[i]);
}
diff -r fe69f7fd1639 xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/io_apic.c Tue Apr 10 13:41:52 2007 -0400
@@ -34,9 +34,6 @@
#include <asm/desc.h>
#include <mach_apic.h>
#include <io_ports.h>
-
-#define set_irq_info(irq, mask) ((void)0)
-#define set_native_irq_info(irq, mask) ((void)0)
/* Different to Linux: our implementation can be simpler. */
#define make_8259A_irq(irq) (io_apic_irqs &= ~(1<<(irq)))
diff -r fe69f7fd1639 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/irq.c Tue Apr 10 13:41:52 2007 -0400
@@ -656,7 +656,8 @@ __initcall(setup_dump_irqs);
__initcall(setup_dump_irqs);
#ifdef CONFIG_HOTPLUG_CPU
-#include <mach_apic.h>
+#include <asm/mach-generic/mach_apic.h>
+#include <xen/delay.h>
void fixup_irqs(cpumask_t map)
{
@@ -673,8 +674,8 @@ void fixup_irqs(cpumask_t map)
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
else if (irq_desc[irq].action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
diff -r fe69f7fd1639 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/smpboot.c Tue Apr 10 14:30:27 2007 -0400
@@ -109,6 +109,9 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mos
{ [0 ... NR_CPUS-1] = 0xff };
EXPORT_SYMBOL(x86_cpu_to_apicid);
+static void *stack_base[NR_CPUS] __cacheline_aligned;
+spinlock_t cpu_add_remove_lock;
+
/*
* Trampoline 80x86 program as an array.
*/
@@ -121,7 +124,7 @@ static void map_cpu_to_logical_apicid(vo
static void map_cpu_to_logical_apicid(void);
/* State of each CPU. */
-/*DEFINE_PER_CPU(int, cpu_state) = { 0 };*/
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
/*
* Currently trivial. Write the real->protected mode
@@ -439,9 +442,11 @@ void __devinit smp_callin(void)
/*
* Synchronize the TSC with the BP
*/
- if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
+ if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) {
synchronize_tsc_ap();
- calibrate_tsc_ap();
+ /* No sync for same reason as above */
+ calibrate_tsc_ap();
+ }
}
static int cpucount, booting_cpu;
@@ -508,8 +513,12 @@ static void construct_percpu_idt(unsigne
{
unsigned char idt_load[10];
- idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
- memcpy(idt_tables[cpu], idt_table,
IDT_ENTRIES*sizeof(idt_entry_t));
+ /* If IDT table exists since last hotplug, reuse it */
+ if (!idt_tables[cpu]) {
+ idt_tables[cpu] = xmalloc_array(idt_entry_t,
IDT_ENTRIES);
+ memcpy(idt_tables[cpu], idt_table,
+ IDT_ENTRIES*sizeof(idt_entry_t));
+ }
*(unsigned short *)(&idt_load[0]) =
(IDT_ENTRIES*sizeof(idt_entry_t))-1;
*(unsigned long *)(&idt_load[2]) = (unsigned
long)idt_tables[cpu];
@@ -571,15 +580,15 @@ void __devinit start_secondary(void *unu
* lock helps us to not include this cpu in a currently in
progress
* smp_call_function().
*/
- /*lock_ipi_call_lock();*/
+ lock_ipi_call_lock();
cpu_set(smp_processor_id(), cpu_online_map);
- /*unlock_ipi_call_lock();*/
- /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+ unlock_ipi_call_lock();
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+ init_percpu_time();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
-
- init_percpu_time();
wmb();
startup_cpu_idle_loop();
@@ -877,6 +886,22 @@ static inline int alloc_cpu_id(void)
return cpu;
}
+static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
+{
+ if (idle_vcpu[cpu])
+ return idle_vcpu[cpu];
+
+ return alloc_idle_vcpu(cpu);
+}
+
+static void *prepare_idle_stack(unsigned int cpu)
+{
+ if (!stack_base[cpu])
+ stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER);
+
+ return stack_base[cpu];
+}
+
static int __devinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -894,7 +919,7 @@ static int __devinit do_boot_cpu(int api
booting_cpu = cpu;
- v = alloc_idle_vcpu(cpu);
+ v = prepare_idle_vcpu(cpu);
BUG_ON(v == NULL);
/* start_eip had better be page-aligned! */
@@ -903,7 +928,7 @@ static int __devinit do_boot_cpu(int api
/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid,
start_eip);
- stack_start.esp = alloc_xenheap_pages(STACK_ORDER);
+ stack_start.esp = prepare_idle_stack(cpu);
/* Debug build: detect stack overflow by setting up a guard
page. */
memguard_guard_stack(stack_start.esp);
@@ -980,6 +1005,12 @@ static int __devinit do_boot_cpu(int api
}
#ifdef CONFIG_HOTPLUG_CPU
+static void idle_task_exit(void)
+{
+ /* Give up lazy state borrowed by this idle vcpu */
+ __sync_lazy_execstate();
+}
+
void cpu_exit_clear(void)
{
int cpu = raw_smp_processor_id();
@@ -988,7 +1019,6 @@ void cpu_exit_clear(void)
cpucount --;
cpu_uninit();
- irq_ctx_exit(cpu);
cpu_clear(cpu, cpu_callout_map);
cpu_clear(cpu, cpu_callin_map);
@@ -997,26 +1027,9 @@ void cpu_exit_clear(void)
unmap_cpu_to_logical_apicid(cpu);
}
-struct warm_boot_cpu_info {
- struct completion *complete;
- int apicid;
- int cpu;
-};
-
-static void __cpuinit do_warm_boot_cpu(void *p)
-{
- struct warm_boot_cpu_info *info = p;
- do_boot_cpu(info->apicid, info->cpu);
- complete(info->complete);
-}
-
static int __cpuinit __smp_prepare_cpu(int cpu)
{
- DECLARE_COMPLETION(done);
- struct warm_boot_cpu_info info;
- struct work_struct task;
int apicid, ret;
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr,
cpu);
apicid = x86_cpu_to_apicid[cpu];
if (apicid == BAD_APICID) {
@@ -1024,34 +1037,19 @@ static int __cpuinit __smp_prepare_cpu(i
goto exit;
}
- /*
- * the CPU isn't initialized at boot time, allocate gdt table
here.
- * cpu_init will initialize it
- */
- if (!cpu_gdt_descr->address) {
- cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
- if (!cpu_gdt_descr->address)
- printk(KERN_CRIT "CPU%d failed to allocate
GDT\n", cpu);
- ret = -ENOMEM;
- goto exit;
- }
-
- info.complete = &done;
- info.apicid = apicid;
- info.cpu = cpu;
- INIT_WORK(&task, do_warm_boot_cpu, &info);
-
tsc_sync_disabled = 1;
/* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
- KERNEL_PGD_PTRS);
- flush_tlb_all();
- schedule_work(&task);
- wait_for_completion(&done);
+ init_low_mappings();
+
+ do_boot_cpu(apicid, cpu);
tsc_sync_disabled = 0;
+#ifdef CONFIG_X86_64
zap_low_mappings();
+#else
+ zap_low_mappings(idle_pg_table_l2);
+#endif
ret = 0;
exit:
return ret;
@@ -1084,6 +1082,8 @@ static void __init smp_boot_cpus(unsigne
boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+ stack_base[0] = stack_start.esp;
/*current_thread_info()->cpu = 0;*/
/*smp_tune_scheduling();*/
@@ -1255,7 +1255,8 @@ void __devinit smp_prepare_boot_cpu(void
cpu_set(smp_processor_id(), cpu_callout_map);
cpu_set(smp_processor_id(), cpu_present_map);
cpu_set(smp_processor_id(), cpu_possible_map);
- /*per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;*/
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ spin_lock_init(&cpu_add_remove_lock);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1278,11 +1279,12 @@ remove_siblinginfo(int cpu)
cpu_clear(cpu, cpu_sibling_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
- c[cpu].phys_proc_id = 0;
- c[cpu].cpu_core_id = 0;
+ phys_proc_id[cpu] = BAD_APICID;
+ cpu_core_id[cpu] = BAD_APICID;
cpu_clear(cpu, cpu_sibling_setup_map);
}
+extern void fixup_irqs(cpumask_t map);
int __cpu_disable(void)
{
cpumask_t map = cpu_online_map;
@@ -1299,12 +1301,15 @@ int __cpu_disable(void)
if (cpu == 0)
return -EBUSY;
+ local_irq_disable();
clear_local_APIC();
/* Allow any queued timer interrupts to get serviced */
local_irq_enable();
mdelay(1);
local_irq_disable();
+ destroy_percpu_time();
+
remove_siblinginfo(cpu);
cpu_clear(cpu, map);
@@ -1323,13 +1328,89 @@ void __cpu_die(unsigned int cpu)
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
printk ("CPU %d is now offline\n", cpu);
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
return;
}
- msleep(100);
+ mdelay(100);
+ mb();
+ process_pending_timers();
}
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+/*
+ * XXX: One important thing missed here is to migrate vcpus
+ * from dead cpu to other online ones and then put whole
+ * system into a stop state. It assures a safe environment
+ * for a cpu hotplug/remove at normal running state.
+ *
+ * However for xen PM case, at this point:
+ * -> All other domains should be notified with PM event,
+ * and then in following states:
+ * * Suspend state, or
+ * * Paused state, which is a force step to all
+ * domains if they do nothing to suspend
+ * -> All vcpus of dom0 (except vcpu0) have already been
+ * hot removed
+ * with the net effect that all other cpus only have idle vcpu
+ * running. In this special case, we can avoid vcpu migration
+ * then and system can be considered in a stop state.
+ *
+ * So current cpu hotplug is a special version for PM specific
+ * usage, and needs more effort later for full cpu hotplug.
+ * (ktian1)
+ */
+int cpu_down(unsigned int cpu)
+{
+ int err = 0;
+ cpumask_t mask;
+
+ spin_lock(&cpu_add_remove_lock);
+ if (num_online_cpus() == 1) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (!cpu_online(cpu)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ printk("Prepare to bring CPU%d down...\n", cpu);
+ /* Send notification to remote idle vcpu */
+ cpus_clear(mask);
+ cpu_set(cpu, mask);
+ per_cpu(cpu_state, cpu) = CPU_DYING;
+ smp_send_event_check_mask(mask);
+
+ __cpu_die(cpu);
+
+ if (cpu_online(cpu)) {
+ printk("Bad state (DEAD, but in online map) on CPU%d\n",
cpu);
+ err = -EBUSY;
+ }
+out:
+ spin_unlock(&cpu_add_remove_lock);
+ return err;
+}
+
+int cpu_up(unsigned int cpu)
+{
+ int err = 0;
+
+ spin_lock(&cpu_add_remove_lock);
+ if (cpu_online(cpu)) {
+ printk("Bring up a online cpu. Bogus!\n");
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = __cpu_up(cpu);
+ if (err < 0)
+ goto out;
+
+out:
+ spin_unlock(&cpu_add_remove_lock);
+ return err;
}
/* From kernel/power/main.c */
@@ -1390,6 +1471,22 @@ void __cpu_die(unsigned int cpu)
int __devinit __cpu_up(unsigned int cpu)
{
+#ifdef CONFIG_HOTPLUG_CPU
+ int ret=0;
+
+ /*
+ * We do warm boot only on cpus that had booted earlier
+ * Otherwise cold boot is all handled from smp_boot_cpus().
+ * cpu_callin_map is set during AP kickstart process. It's reset
+ * when a cpu is taken offline from cpu_exit_clear().
+ */
+ if (!cpu_isset(cpu, cpu_callin_map))
+ ret = __smp_prepare_cpu(cpu);
+
+ if (ret)
+ return -EIO;
+#endif
+
/* In case one didn't come up */
if (!cpu_isset(cpu, cpu_callin_map)) {
printk(KERN_DEBUG "skipping cpu%d, didn't come
online\n", cpu);
diff -r fe69f7fd1639 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/arch/x86/time.c Tue Apr 10 14:33:20 2007 -0400
@@ -963,6 +963,12 @@ void init_percpu_time(void)
set_timer(&t->calibration_timer, NOW() + EPOCH);
}
+/* Normally all pending timers are fired once APIC interrupt is
+ * active again, and thus no need to kill them when cpu is down.
+ * (Migrate may be required for pure cpu-hotplug). However
+ * calibration timer is a bit special, and re-initialization is
+ * required after cpu is up.
+ */
void destroy_percpu_time(void)
{
kill_timer(&this_cpu(cpu_time).calibration_timer);
diff -r fe69f7fd1639 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/config.h Tue Apr 10 13:41:52 2007 -0400
@@ -37,6 +37,8 @@
#define CONFIG_ACPI_BOOT 1
#define CONFIG_VGA 1
+
+#define CONFIG_HOTPLUG_CPU 1
#define HZ 100
diff -r fe69f7fd1639 xen/include/asm-x86/smp.h
--- a/xen/include/asm-x86/smp.h Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/smp.h Tue Apr 10 13:41:52 2007 -0400
@@ -46,14 +46,30 @@ extern void zap_low_mappings(l2_pgentry_
#endif
extern void init_low_mappings(void);
+
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);
#define MAX_APICID 256
extern u8 x86_cpu_to_apicid[];
#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
+/* State of each CPU. */
+#define CPU_ONLINE 0x0002 /* CPU is up */
+#define CPU_DYING 0x0003 /* CPU is requested to die */
+#define CPU_DEAD 0x0004 /* CPU is dead */
+DECLARE_PER_CPU(int, cpu_state);
+
#ifdef CONFIG_HOTPLUG_CPU
+#define cpu_is_offline(cpu) unlikely(per_cpu(cpu_state,cpu) ==
CPU_DYING)
+extern int cpu_down(unsigned int cpu);
+extern int cpu_up(unsigned int cpu);
extern void cpu_exit_clear(void);
extern void cpu_uninit(void);
+extern void disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int cpu_is_offline(int cpu) {return 0;}
#endif
/*
diff -r fe69f7fd1639 xen/include/asm-x86/system.h
--- a/xen/include/asm-x86/system.h Tue Apr 10 13:41:45 2007 -0400
+++ b/xen/include/asm-x86/system.h Tue Apr 10 13:41:52 2007 -0400
@@ -313,6 +313,8 @@ static always_inline unsigned long long
#define __sti() __asm__ __volatile__("sti": :
:"memory")
/* used in the idle loop; sti takes one instruction cycle to complete
*/
#define safe_halt() __asm__ __volatile__("sti; hlt": :
:"memory")
+/* used when interrupts are already enabled or to shutdown the
processor */
+#define halt() __asm__ __volatile__("hlt": : :"memory")
/* For spinlocks etc */
#if defined(__i386__)
xen_cpu_hotplug_for_pm.patch
Description: xen_cpu_hotplug_for_pm.patch
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|