[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] x86/cpuidle: split the max_cstate variable


  • To: Jan Beulich <jbeulich@xxxxxxxx>
  • From: Roger Pau Monné <roger.pau@xxxxxxxxxx>
  • Date: Mon, 20 Apr 2026 18:14:20 +0200
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=citrix.com; dmarc=pass action=none header.from=citrix.com; dkim=pass header.d=citrix.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector10001; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=V96pWWCUkchYywXY2ng+krZjeuzCTGROfoobKErAJ+0=; b=B1zJcSyfcRwOzf7Z4MLaW7MTMyD4SWu2a6LtBOhKxQXbxi3j0+IMcXBSQ9XiGX52zTyTHFz6GjXPI1dWLidgFXl1WXQOjBmRMhWvRU/mCEzEnmmmbc8ifY98CPH3epaehHfq7qkHEly7HqBizYcrGbKfm+om95Y7jVtM82ewiaRCKm3CZOK0J5qeX7Qrum8K7xwErgtNhxaJlG1eCsf6Xizx+6kYfTvmpYAgf+Ys+dK/S2gsFfJ60uTLvWkK4XehzGX4nPUg1Z7bb4bdDesUwSPgXgV4a516wjaLh/aPl6EHn5O5gWwHlV9lHGsTE7YBfndyPQ3yW/OCOZpLzvUdhA==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector10001; d=microsoft.com; cv=none; b=XNPEvAT85W49KhV9ycoAZjXXoN4vsefdF3ZBrbu8aCuqd7oIqngJY/b0HdQIKPaFNIROsAAaKK2rgymKfYu8qYCP1AO35aT5tZFhbKMOUxPee7sgjV0FzMnQiQ/1WzfWBEuWjpvON4OYGA5sQpTvh3MVHc1V+Sx+gBN9bhtoPIJuYpSfR49+UIwlRyFAyi4UgYr7tWWXhaRXEa2yqzGEdiA3MUzWcgepNNlTX4eyPkmNfKdSupoNiME+6k954qOA2XKdNfIZii8XJNsfxQyxQhrslpLbz5O1BGEJ7jRiR6TGTpbTkCaD8kqRiEyiaFktnUwUOHlx5sDBY4K+EA2Wow==
  • Authentication-results: eu.smtp.expurgate.cloud; dkim=pass header.s=selector1 header.d=citrix.com header.i="@citrix.com" header.h="From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck"
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=citrix.com;
  • Cc: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Teddy Astie <teddy.astie@xxxxxxxxxx>, Marek Marczykowski <marmarek@xxxxxxxxxxxxxxxxxxxxxx>
  • Delivery-date: Mon, 20 Apr 2026 16:14:35 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On Wed, Apr 08, 2026 at 01:34:43PM +0200, Jan Beulich wrote:
> The admin can control the upper bound wanted not only via command line
> option, but also via XEN_SYSCTL_pm_op_set_max_cstate. While decisions how
> to set up the system are okay this way as long as we deem the command line
> option a strict upper bound, what to do during S3 resume should not be
> based on that potentially varying value. Decisions there need to use
> solely the strict upper bound we may have enforced ourselves (or which was
> forced onto us via command line option).
> 
> Rather than altering pit_broadcast_is_available(), drop the function
> altogether. It's pretty odd for acpi/cpu_idle.c to call into time.c, just
> for that to call into acpi/cpu_idle.c again.
> 
> Fixes: 8d24303023ec ("x86: don't write_tsc() non-zero values on CPUs updating 
> only the lower 32 bits")
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> ---
> cpuidle_disable_deep_cstate(), when called from handle_rtc_once(), is
> still somewhat of a problem: Boot time and resume time runs of
> _disable_pit_irq() may still behave differently because of that.
> 
> If we wanted the sysctl to possibly move the maximum C-state beyond what
> was given on the command line, things would get yet more complicated, as
> we'd then need to re-configure the driver that's in use.
> 
> I wonder how useful the ACPI_STATE_C<n> #define-s really are. Plain 1 is
> used in e.g. probe_c3_errata(), and the plain 7 doesn't even have a
> respective constant (nor would that be suitable, as that's not really an
> ACPI state).
> 
> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -119,7 +119,7 @@ bool lapic_timer_init(void)
>          lapic_timer_off = hpet_broadcast_enter;
>          lapic_timer_on = hpet_broadcast_exit;
>      }
> -    else if ( pit_broadcast_is_available() )
> +    else if ( cpuidle_usable_deep_cstate() )
>      {
>          lapic_timer_off = pit_broadcast_enter;
>          lapic_timer_on = pit_broadcast_exit;
> @@ -131,12 +131,15 @@ bool lapic_timer_init(void)
>  }
>  
>  void (*__read_mostly pm_idle_save)(void);
> -unsigned int max_cstate __read_mostly = UINT_MAX;
> +
> +unsigned int max_usable_cstate __read_mostly = UINT_MAX;
> +unsigned int max_allowed_cstate __read_mostly = UINT_MAX;
>  unsigned int max_csubstate __read_mostly = UINT_MAX;
>  
>  static int __init cf_check parse_cstate(const char *s)
>  {
> -    max_cstate = simple_strtoul(s, &s, 0);
> +    max_allowed_cstate = simple_strtoul(s, &s, 0);
> +    max_usable_cstate = max_allowed_cstate;
>      if ( *s == ',' )
>          max_csubstate = simple_strtoul(s + 1, NULL, 0);
>      return 0;
> @@ -413,10 +416,11 @@ static void cf_check dump_cx(unsigned ch
>      unsigned int cpu;
>  
>      printk("'%c' pressed -> printing ACPI Cx structures\n", key);
> -    if ( max_cstate < UINT_MAX )
> +    if ( max_cstate() < UINT_MAX )
>      {
> -        printk("max state: C%u\n", max_cstate);
> -        if ( max_csubstate < UINT_MAX )
> +        printk("max state: C%u\n", max_cstate());
> +        if ( max_allowed_cstate <= max_usable_cstate &&
> +             max_csubstate < UINT_MAX )
>              printk("max sub-state: %u\n", max_csubstate);
>          else
>              printk("max sub-state: unlimited\n");
> @@ -690,18 +694,18 @@ static void cf_check acpi_processor_idle
>      u32 exp = 0, pred = 0;
>      u32 irq_traced[4] = { 0 };
>  
> -    if ( max_cstate > 0 && power &&
> +    if ( max_cstate() > 0 && power &&
>           (next_state = cpuidle_current_governor->select(power)) > 0 )
>      {
>          unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> -                                                         : max_cstate;
> +                                                         : max_cstate();
>  
>          do {
>              cx = &power->states[next_state];
>          } while ( (cx->type > max_state ||
>                     cx->entry_method == ACPI_CSTATE_EM_NONE ||
>                     (cx->entry_method == ACPI_CSTATE_EM_FFH &&
> -                    cx->type == max_cstate &&
> +                    cx->type == max_allowed_cstate &&

I'm afraid I'm missing why this uses max_allowed_cstate instead of
max_state.

>                      (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
>                    --next_state );
>          if ( next_state )
> @@ -1448,7 +1452,7 @@ static void amd_cpuidle_init(struct acpi
>  
>      for ( i = 0; i < nr; ++i )
>      {
> -        if ( cx[i].type > max_cstate )
> +        if ( cx[i].type > max_cstate() )
>              break;
>          power->states[i + 1] = cx[i];
>          power->states[i + 1].idx = i + 1;
> @@ -1611,21 +1615,22 @@ int pmstat_reset_cx_stat(unsigned int cp
>  
>  void cpuidle_disable_deep_cstate(void)
>  {
> -    if ( max_cstate > ACPI_STATE_C1 )
> +    if ( max_usable_cstate > ACPI_STATE_C1 )
>      {
>          if ( local_apic_timer_c2_ok )
> -            max_cstate = ACPI_STATE_C2;
> +            max_usable_cstate = ACPI_STATE_C2;
>          else
> -            max_cstate = ACPI_STATE_C1;
> +            max_usable_cstate = ACPI_STATE_C1;
>      }
>  
>      hpet_disable_legacy_broadcast();
>  }
>  
> -bool cpuidle_using_deep_cstate(void)
> +bool cpuidle_usable_deep_cstate(void)
>  {
> -    return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? 
> ACPI_STATE_C2
> -                                                               : 
> ACPI_STATE_C1);
> +    return xen_cpuidle &&
> +           max_usable_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
> +                                                       : ACPI_STATE_C1);
>  }
>  
>  static int cf_check cpu_callback(
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -384,12 +384,12 @@ static void probe_c3_errata(const struct
>      };
>  
>      /* Serialized by the AP bringup code. */
> -    if ( max_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
> +    if ( max_usable_cstate > 1 && (c->apicid & (c->x86_num_siblings - 1)) &&
>           x86_match_cpu(models) )
>      {
>          printk(XENLOG_WARNING
>              "Disabling C-states C3 and C6 due to CPU errata\n");
> -        max_cstate = 1;
> +        max_usable_cstate = 1;
>      }
>  }
>  
> --- a/xen/arch/x86/cpu/mwait-idle.c
> +++ b/xen/arch/x86/cpu/mwait-idle.c
> @@ -1045,15 +1045,16 @@ static void cf_check mwait_idle(void)
>       u64 before, after;
>       u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
>  
> -     if (max_cstate > 0 && power &&
> +     if (max_cstate() > 0 && power &&
>           (next_state = cpuidle_current_governor->select(power)) > 0) {
>               unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
> -                                                              : max_cstate;
> +                                                              : max_cstate();
>  
>               do {
>                       cx = &power->states[next_state];
> -             } while ((cx->type > max_state || (cx->type == max_cstate &&
> -                       MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
> +             } while ((cx->type > max_state ||
> +                          (cx->type == max_allowed_cstate &&

Indentation is weird for the above line IMO, you should use hard 3
tabs plus spaces afterwards, like the surrounding indentation?

> +                        MWAIT_HINT2SUBSTATE(cx->address) > max_csubstate)) &&
>                        --next_state);
>               if (!next_state)
>                       cx = NULL;

Seeing max_cstate() is used in multiple places here, you might want to
introduce a local max_cstate variable?

> @@ -1458,7 +1459,7 @@ static void __init sklh_idle_state_table
>       u64 msr;
>  
>       /* if PC10 disabled via cmdline max_cstate=7 or shallower */
> -     if (max_cstate <= 7)
> +     if (max_cstate() <= 7)
>               return;
>  
>       /* if PC10 not present in CPUID.MWAIT.EDX */
> @@ -1623,7 +1624,7 @@ static int __init mwait_idle_probe(void)
>           !mwait_substates)
>               return -ENODEV;
>  
> -     if (!max_cstate || !opt_mwait_idle) {
> +     if (!max_cstate() || !opt_mwait_idle) {
>               pr_debug(PREFIX "disabled\n");
>               return -EPERM;
>       }
> @@ -1714,8 +1715,8 @@ static int cf_check mwait_idle_cpu_init(
>               hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
>               state = MWAIT_HINT2CSTATE(hint) + 1;
>  
> -             if (state > max_cstate) {
> -                     printk(PREFIX "max C-state %u reached\n", max_cstate);
> +             if (state > max_cstate()) {
> +                     printk(PREFIX "max C-state %u reached\n", max_cstate());
>                       break;
>               }
>  
> --- a/xen/arch/x86/include/asm/time.h
> +++ b/xen/arch/x86/include/asm/time.h
> @@ -31,7 +31,6 @@ int cpu_frequency_change(u64 freq);
>  
>  void cf_check pit_broadcast_enter(void);
>  void cf_check pit_broadcast_exit(void);
> -int pit_broadcast_is_available(void);
>  
>  uint64_t cf_check acpi_pm_tick_to_ns(uint64_t ticks);
>  
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -517,7 +517,7 @@ static int64_t __init cf_check init_hpet
>      bool disable_hpet = false;
>  
>      if ( hpet_address && strcmp(opt_clocksource, pts->id) &&
> -         cpuidle_using_deep_cstate() )
> +         cpuidle_usable_deep_cstate() )
>      {
>          if ( pci_conf_read16(PCI_SBDF(0, 0, 0x1f, 0),
>                               PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL )
> @@ -2655,7 +2655,7 @@ static int _disable_pit_irq(bool init)
>       * XXX dom0 may rely on RTC interrupt delivery, so only enable
>       * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
>       */
> -    if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) )
> +    if ( cpuidle_usable_deep_cstate() && !boot_cpu_has(X86_FEATURE_XEN_ARAT) 
> )
>      {
>          init ? hpet_broadcast_init() : hpet_broadcast_resume();
>          if ( !hpet_broadcast_is_available() )
> @@ -2707,11 +2707,6 @@ void cf_check pit_broadcast_exit(void)
>          reprogram_timer(this_cpu(timer_deadline));
>  }
>  
> -int pit_broadcast_is_available(void)
> -{
> -    return cpuidle_using_deep_cstate();
> -}
> -
>  void send_timer_event(struct vcpu *v)
>  {
>      send_guest_vcpu_virq(v, VIRQ_TIMER);
> @@ -2999,7 +2994,7 @@ static void cf_check dump_softtsc(unsign
>      else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
>      {
>          printk("TSC has constant rate, ");
> -        if ( max_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
> +        if ( max_usable_cstate <= ACPI_STATE_C2 && tsc_max_warp == 0 )
>              printk("no deep Cstates, passed warp test, deemed reliable, ");
>          else
>              printk("deep Cstates possible, so not reliable, ");
> --- a/xen/include/xen/acpi.h
> +++ b/xen/include/xen/acpi.h
> @@ -142,30 +142,33 @@ int acpi_gsi_to_irq (u32 gsi, unsigned i
>  
>  #ifdef       CONFIG_ACPI_CSTATE
>  /*
> - * max_cstate sets the highest legal C-state.
> - * max_cstate = 0: C0 okay, but not C1
> - * max_cstate = 1: C1 okay, but not C2
> - * max_cstate = 2: C2 okay, but not C3 etc.
> -
> - * max_csubstate sets the highest legal C-state sub-state. Only applies to 
> the
> - * highest legal C-state.
> - * max_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> - * max_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but not C2
> - * max_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but not C3
> - * max_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but not C3
> + * max_{allowed,usable}_cstate sets the highest allowed / usable C-state, 
> where
> + * "allowed" is command line / sysctl based.

Hm, this is a bit misleading, because max_usable_cstate is also
command line based (plus system errata).  What about:

"max_{allowed,usable}_cstate sets the highest allowed / usable C-state.
max_usable_cstate can only be set from the command line, while
max_allowed_cstate can be set from both command line and systcl."

> + * max_*_cstate = 0: C0 okay, but not C1
> + * max_*_cstate = 1: C1 okay, but not C2
> + * max_*_cstate = 2: C2 okay, but not C3 etc.
> + *
> + * max_csubstate sets the highest allowed C-state sub-state. Only applies to
> + * the highest allowed C-state.
> + * max_allowed_cstate = 1, max_csubstate = 0 ==> C0, C1 okay, but not C1E
> + * max_allowed_cstate = 1, max_csubstate = 1 ==> C0, C1 and C1E okay, but 
> not C2
> + * max_allowed_cstate = 2, max_csubstate = 0 ==> C0, C1, C1E, C2 okay, but 
> not C3
> + * max_allowed_cstate = 2, max_csubstate = 1 ==> C0, C1, C1E, C2 okay, but 
> not C3
>   */
>  
> -extern unsigned int max_cstate;
> +extern unsigned int max_usable_cstate;
> +extern unsigned int max_allowed_cstate;
>  extern unsigned int max_csubstate;
>  
> +#define max_cstate() min(max_usable_cstate, max_allowed_cstate)

I would be tempted to drop the ending parenthesis so that you don't
need to adjust callers, but that's likely misleading, as then it would
need to be uppercase MAX_CSTATE.

> +
>  static inline unsigned int acpi_get_cstate_limit(void)
>  {
> -     return max_cstate;
> +     return max_allowed_cstate;
>  }
>  static inline void acpi_set_cstate_limit(unsigned int new_limit)
>  {
> -     max_cstate = new_limit;
> -     return;
> +     max_allowed_cstate = new_limit;

Do we want to check the new limit doesn't exceed max_usable_cstate and
return -ERANGE or similar on failure?

After this change it's a bit weird to silently ignore invalid values
IMO.

>  }
>  
>  static inline unsigned int acpi_get_csubstate_limit(void)
> --- a/xen/include/xen/cpuidle.h
> +++ b/xen/include/xen/cpuidle.h
> @@ -89,7 +89,7 @@ struct cpuidle_governor
>  extern int8_t xen_cpuidle;
>  extern struct cpuidle_governor *cpuidle_current_governor;
>  
> -bool cpuidle_using_deep_cstate(void);
> +bool cpuidle_usable_deep_cstate(void);
>  void cpuidle_disable_deep_cstate(void);
>  
>  #define CPUIDLE_DRIVER_STATE_START  1



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.