WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

Re: [Xen-devel] [PATCH 7] MCA: simplify mce actoin, and some update of m

To: "Liu, Jinsong" <jinsong.liu@xxxxxxxxx>, Jan Beulich <JBeulich@xxxxxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: Re: [Xen-devel] [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension
From: Keir Fraser <keir@xxxxxxx>
Date: Sat, 14 May 2011 11:01:01 +0100
Cc: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>, "Li, Xin" <xin.li@xxxxxxxxx>
Delivery-date: Sat, 14 May 2011 03:03:25 -0700
Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:sender:user-agent:date:subject:from:to:cc :message-id:thread-topic:thread-index:in-reply-to:mime-version :content-type:content-transfer-encoding; bh=HOFmdf5hjkQMsFTOz2N7qmXIHKg9AHdMUe7xE6m8GF8=; b=g3rkyRACs/SuWP4pxXuPuqMKgTu5Yo/+RXQ0jVkPjpagttC3SE7c+exZOe7npD51AN Exhr9BZkUw2eFC/CypvzoRdgtmdYnkpLkFatnnvDIM4SnjGoCGfxORwWg/xVI0aV+PJl QZhHAapYe32H6SzcZ1qnmnkReL0791KmeKvXg=
Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=sender:user-agent:date:subject:from:to:cc:message-id:thread-topic :thread-index:in-reply-to:mime-version:content-type :content-transfer-encoding; b=Z26U7mbM0zWig9KwAqep9Hxap09r1HiKMKhex6XxdYSS67eKnqAwfLW3d0350jE5fD sbEhOE/2PrRwdO0TxXi2YVf62H3XKeUZAG6BSaFSK0CTucF8eMlYdcnhxCDcyJbPFIWd ycdrKetuzLBL81TJ4qtn/h//iIjEoC2rkNzLo=
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <C9F40DBE.173FE%keir@xxxxxxx>
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcwNclEwEvLQfpHVSmSzkx8mJ5h67gECANbgAChTZIYAAIyFbw==
Thread-topic: [Xen-devel] [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension
User-agent: Microsoft-Entourage/12.23.0.091001
Ah no, I found it! I'll take a look at it on Monday.

 -- Keir

On 14/05/2011 10:45, "Keir Fraser" <keir@xxxxxxx> wrote:

> I don't seem to have this one in my queue. You should resend.
> 
>  -- Keir
> 
> On 13/05/2011 15:34, "Liu, Jinsong" <jinsong.liu@xxxxxxxxx> wrote:
> 
>> Jan and Keir,
>> 
>> Any comments?
>> 
>> Thanks
>> Jinsong
>> 
>> 
>> Liu, Jinsong wrote:
>>> MCA: simplify mce actoin, and some update of mem err handler for
>>> future SRAR extension
>>> 
>>> This patch simplify mce_action(). Basically the 2 set of mce status
>>> flags MCA_... and MCER_... are highly functional similar. This patch
>>> remove the redundant middle-level flag MCA_..., and its related data
>>> structure and function, so that mce_action() logic is more clean and
>>> simpler.    
>>> This patch also update mem err handler. intel_memerr_dhandler() will
>>> be commonly used by SRAO and SRAR, so for the extension of future
>>> SRAR case, this patch remove the default fail flag from
>>> intel_memerr_dhandler() outside to SRAO/SRAR level, since only
>>> SRAO/SRAR handler itself has the knowledge of how to handle the
>>> failure when mem err handler fail.
>>> 
>>> Signed-off-by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>
>>> 
>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
>>> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 13:39:40 2011
>>> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 18:15:13
>>> 2011 +0800 @@ -172,23 +172,14 @@ static unsigned int __read_mostly
>>>  mce_dh static unsigned int __read_mostly mce_dhandler_num;
>>>  static unsigned int __read_mostly mce_uhandler_num;
>>> 
>>> -enum mce_result
>>> -{
>>> -    MCER_NOERROR,
>>> -    MCER_RECOVERED,
>>> -    /* Not recoverd, but can continue */
>>> -    MCER_CONTINUE,
>>> -    MCER_RESET,
>>> -};
>>> -
>>>  /* Maybe called in MCE context, no lock, no printk */
>>>  static enum mce_result mce_action(struct cpu_user_regs *regs,
>>>                        mctelem_cookie_t mctc)
>>>  {
>>>      struct mc_info *local_mi;
>>> -    enum mce_result ret = MCER_NOERROR;
>>> +    enum mce_result bank_result = MCER_NOERROR;
>>> +    enum mce_result worst_result = MCER_NOERROR;
>>>      struct mcinfo_common *mic = NULL;
>>> -    struct mca_handle_result mca_res;
>>>      struct mca_binfo binfo;
>>>      const struct mca_error_handler *handlers = mce_dhandlers;
>>>      unsigned int i, handler_num = mce_dhandler_num;
>>> @@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
>>>      /* Processing bank information */
>>>      x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
>>> 
>>> -    for ( ; ret != MCER_RESET && mic && mic->size;
>>> +    for ( ; bank_result != MCER_RESET && mic && mic->size;
>>>            mic = x86_mcinfo_next(mic) )
>>>      {
>>>          if (mic->type != MC_TYPE_BANK) {
>>> @@ -225,34 +216,20 @@ static enum mce_result mce_action(struct
>>>          }
>>>          binfo.mib = (struct mcinfo_bank*)mic;
>>>          binfo.bank = binfo.mib->mc_bank;
>>> -        memset(&mca_res, 0x0f, sizeof(mca_res));
>>> +        bank_result = MCER_NOERROR;
>>>          for ( i = 0; i < handler_num; i++ ) {
>>>              if (handlers[i].owned_error(binfo.mib->mc_status))
>>>              {
>>> -                handlers[i].recovery_handler(&binfo, &mca_res);
>>> -
>>> -                if (mca_res.result & MCA_OWNER)
>>> -                    binfo.mib->mc_domid = mca_res.owner;
>>> -
>>> -                if (mca_res.result == MCA_NEED_RESET)
>>> -                    ret = MCER_RESET;
>>> -                else if (mca_res.result == MCA_RECOVERED)
>>> -                {
>>> -                    if (ret < MCER_RECOVERED)
>>> -                        ret = MCER_RECOVERED;
>>> -                }
>>> -                else if (mca_res.result == MCA_NO_ACTION)
>>> -                {
>>> -                    if (ret < MCER_CONTINUE)
>>> -                        ret = MCER_CONTINUE;
>>> -                }
>>> +                handlers[i].recovery_handler(&binfo, &bank_result);
>>> +                if (worst_result < bank_result)
>>> +                    worst_result = bank_result;
>>>                  break;
>>>              }
>>>          }
>>>          ASSERT(i != handler_num);
>>>      }
>>> 
>>> -    return ret;
>>> +    return worst_result;
>>>  }
>>> 
>>>  /*
>>> @@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
>>> 
>>>  static void intel_memerr_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      struct mcinfo_bank *bank = binfo->mib;
>>>      struct mcinfo_global *global = binfo->mig;
>>> @@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
>>>      uint64_t mc_status, mc_misc;
>>> 
>>>      mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
>>> -    result->result = MCA_NEED_RESET;
>>> 
>>>      mc_status = bank->mc_status;
>>>      mc_misc = bank->mc_misc;
>>> @@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
>>>          !(mc_status & MCi_STATUS_MISCV) ||
>>>          ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
>>>      {
>>> -        result->result |= MCA_NO_ACTION;
>>>          dprintk(XENLOG_WARNING,
>>>              "No physical address provided for memory error\n");
>>>          return;
>>> @@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
>>> 
>>>      /* This is free page */
>>>      if (status & PG_OFFLINE_OFFLINED)
>>> -        result->result = MCA_RECOVERED;
>>> +        *result = MCER_RECOVERED;
>>>      else if (status & PG_OFFLINE_PENDING) {
>>>          /* This page has owner */
>>>          if (status & PG_OFFLINE_OWNED) {
>>> -            result->result |= MCA_OWNER;
>>> -            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
>>> +            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
>>>              mce_printk(MCE_QUIET, "MCE: This error page is ownded"
>>> -              " by DOM %d\n", result->owner);
>>> -            /* Fill vMCE# injection and vMCE# MSR virtualization "
>>> -             * "related data */
>>> -            bank->mc_domid = result->owner;
>>> +              " by DOM %d\n", bank->mc_domid);
>>>              /* XXX: Cannot handle shared pages yet
>>>               * (this should identify all domains and gfn mapping to
>>>               *  the mfn in question) */
>>> -            BUG_ON( result->owner == DOMID_COW );
>>> -            if ( result->owner != DOMID_XEN ) {
>>> -                d = get_domain_by_id(result->owner);
>>> +            BUG_ON( bank->mc_domid == DOMID_COW );
>>> +            if ( bank->mc_domid != DOMID_XEN ) {
>>> +                d = get_domain_by_id(bank->mc_domid);
>>>                  ASSERT(d);
>>>                  gfn = get_gpfn_from_mfn((bank->mc_addr) >>
>>> PAGE_SHIFT); 
>>> 
>>> @@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
>>>                        global->mc_gstatus) == -1 )
>>>                  {
>>>                      mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d
>>> " -                      "failed\n", result->owner);
>>> +                      "failed\n", bank->mc_domid);
>>>                      goto vmce_failed;
>>>                  }
>>> 
>>> @@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
>>>                   * For xen, it has contained the error and finished
>>>                   * its own recovery job.
>>>                   */
>>> -                result->result = MCA_RECOVERED;
>>> +                *result = MCER_RECOVERED;
>>>                  put_domain(d);
>>> 
>>>                  return;
>>> @@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
>>> 
>>>  static void intel_srao_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>> 
>>>      /* For unknown srao error code, no action required */
>>> +    *result = MCER_CONTINUE;
>>> +
>>>      if ( status & MCi_STATUS_VAL )
>>>      {
>>>          switch ( status & INTEL_MCCOD_MASK )
>>> @@ -744,7 +717,7 @@ static int intel_default_check(uint64_t
>>> 
>>>  static void intel_default_mce_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>>      enum intel_mce_type type;
>>> @@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
>>>      type = intel_check_mce_type(status);
>>> 
>>>      if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
>>> -        result->result = MCA_NEED_RESET;
>>> -    else if (type == intel_mce_ucr_srao)
>>> -        result->result = MCA_NO_ACTION;
>>> +        *result = MCER_RESET;
>>> +    else
>>> +        *result = MCER_CONTINUE;
>>>  }
>>> 
>>>  static const struct mca_error_handler intel_mce_dhandlers[] = {
>>> @@ -764,7 +737,7 @@ static const struct mca_error_handler in
>>> 
>>>  static void intel_default_mce_uhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>>      enum intel_mce_type type;
>>> @@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
>>>      /* Panic if no handler for SRAR error */
>>>      case intel_mce_ucr_srar:
>>>      case intel_mce_fatal:
>>> -        result->result = MCA_NEED_RESET;
>>> +        *result = MCER_RESET;
>>>          break;
>>>      default:
>>> -        result->result = MCA_NO_ACTION;
>>> +        *result = MCER_CONTINUE;
>>>          break;
>>>      }
>>>  }
>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
>>> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 13:39:40 2011 +0800
>>> +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 18:15:13 2011 +0800
>>> @@ -124,36 +124,6 @@ void mcabanks_free(struct mca_banks *ban
>>>  void mcabanks_free(struct mca_banks *banks);
>>>  extern struct mca_banks *mca_allbanks;
>>> 
>>> -/* Below interfaces are defined for MCA internal processing:
>>> - * a. pre_handler will be called early in MCA ISR context, mainly
>>> for early 
>>> - *    need_reset detection for avoiding log missing. Also, it is
>>> used to judge 
>>> - *    impacted DOMAIN if possible.
>>> - * b. mca_error_handler is actually a (error_action_index,
>>> - *    recovery_hanlder pointer) pair. The defined recovery_handler
>>> - *    performs the actual recovery operations such as page_offline,
>>> cpu_offline 
>>> - *    in softIRQ context when the per_bank MCA error matching the
>>> corresponding 
>>> - *    mca_code index. If pre_handler can't judge the impacted domain,
>>> - *    recovery_handler must figure it out.
>>> -*/
>>> -
>>> -/* MCA error has been recovered successfully by the recovery action*/
>>> -#define MCA_RECOVERED (0x1 << 0)
>>> -/* MCA error impact the specified DOMAIN in owner field below */
>>> -#define MCA_OWNER (0x1 << 1)
>>> -/* MCA error can't be recovered and need reset */
>>> -#define MCA_NEED_RESET (0x1 << 2)
>>> -/* MCA error did not have any action yet */
>>> -#define MCA_NO_ACTION (0x1 << 3)
>>> -
>>> -struct mca_handle_result
>>> -{
>>> -    uint32_t result;
>>> -    /* Used one result & MCA_OWNER */
>>> -    domid_t owner;
>>> -    /* Used by mca_error_handler, result & MCA_RECOVRED */
>>> -    struct recovery_action *action;
>>> -};
>>> -
>>>  /*Keep bank so that we can get staus even if mib is NULL */
>>>  struct mca_binfo {
>>>      int bank;
>>> @@ -163,8 +133,14 @@ struct mca_binfo {
>>>      struct cpu_user_regs *regs;
>>>  };
>>> 
>>> -extern void (*mca_prehandler)( struct cpu_user_regs *regs,
>>> -                        struct mca_handle_result *result);
>>> +enum mce_result
>>> +{
>>> +    MCER_NOERROR,
>>> +    MCER_RECOVERED,
>>> +    /* Not recoverd, but can continue */
>>> +    MCER_CONTINUE,
>>> +    MCER_RESET,
>>> +};
>>> 
>>>  struct mca_error_handler
>>>  {
>>> @@ -175,7 +151,7 @@ struct mca_error_handler
>>>      */
>>>      int (*owned_error)(uint64_t status);
>>>      void (*recovery_handler)(struct mca_binfo *binfo,
>>> -                    struct mca_handle_result *result);
>>> +                    enum mce_result *result);
>>>  };
>>> 
>>>  /* Global variables */
>> 
> 
> 



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel