Re: [PATCH v2] powerpc/powernv: Framework to log critical errors on powernv.

Michael Ellerman Tue, 17 Dec 2013 18:44:20 -0800

On Mon, 2013-12-16 at 18:00 +0530, Deepthi Dharwar wrote:
> This patch provides error logging interfaces to report critical
> powernv error to FSP.
> All the required information to dump the error is collected
> at POWERNV level through error log interfaces
> and then pushed on to FSP.
> 
> This also supports synchronous error logging in cases of
> PANIC, by passing OPAL_ERROR_PANIC as severity in
> elog_create() call.


Please make note of the fact that none of this code is currently used but will
be in a subsequent patch. When can we expect those patches?


> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index 0f01308..1c5440a 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -168,6 +168,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
>  #define OPAL_GET_MSG                         85
>  #define OPAL_CHECK_ASYNC_COMPLETION          86
>  #define OPAL_SYNC_HOST_REBOOT                        87
> +#define OPAL_ELOG_SEND                               88
>  
>  #ifndef __ASSEMBLY__
>  
> @@ -260,6 +261,122 @@ enum OpalMessageType {
>       OPAL_MSG_TYPE_MAX,
>  };
>  
> +/* Classification of Error/Events reporting type classification

Standard comment style for block comments is:

/*
 * Classification ...
 */

That applies to almost all of your comments in here.


> + * Platform Events/Errors: Report Machine Check Interrupt

I think these comments would be better inline with the values, eg:

        /* Report Machine Check Interrupt */
        OPAL_PLATFORM,

        /* Report all I/O related events/errors */
        OPAL_INPUT_OUTPUT,

        etc.


Again that applies to most of your comments.

> + * INPUT_OUTPUT: Report all I/O related events/errors
> + * RESOURCE_DEALLOC: Hotplug events and errors
> + * MISC: Miscellanous error
> + * Field: error_events_type

What is this "Field:" thing about?

> + */
> +enum error_events {

If you're going to define an enum you should actually use it in the API, I
can't see anywhere you do?

If you do want to use an enum it should be "opal_error_events".

> +     OPAL_PLATFORM,
> +     OPAL_INPUT_OUTPUT,
> +     OPAL_RESOURCE_DEALLOC,
> +     OPAL_MISC,
> +};
> +
> +/* OPAL Subsystem IDs listed for reporting events/errors
> + * Field: subsystem_id
> + */
> +
> +#define OPAL_PROCESSOR_SUBSYSTEM        0x10
> +#define OPAL_MEMORY_SUBSYSTEM           0x20
> +#define OPAL_IO_SUBSYSTEM               0x30
> +#define OPAL_IO_DEVICES                 0x40
> +#define OPAL_CEC_HARDWARE               0x50
> +#define OPAL_POWER_COOLING              0x60
> +#define OPAL_MISC_SUBSYSTEM             0x70
> +#define OPAL_SURVEILLANCE_ERR           0x7A
> +#define OPAL_PLATFORM_FIRMWARE          0x80
> +#define OPAL_SOFTWARE                   0x90
> +#define OPAL_EXTERNAL_ENV               0xA0
> +
> +/* During reporting an event/error the following represents
> + * how serious the logged event/error is. (Severity)
> + * Field: event_sev
> + */
> +#define OPAL_INFO                                   0x00
> +#define OPAL_RECOVERED_ERR_GENERAL                  0x10
> +
> +/* 0x2X series is to denote set of Predictive Error
> + * 0x20 Generic predictive error
> + * 0x21 Predictive error, degraded performance
> + * 0x22 Predictive error, fault may be corrected after reboot
> + * 0x23 Predictive error, fault may be corrected after reboot,
> + * degraded performance
> + * 0x24 Predictive error, loss of redundancy
> + */
> +#define OPAL_PREDICTIVE_ERR_GENERAL                         0x20
> +#define OPAL_PREDICTIVE_ERR_DEGRADED_PERF                   0x21
> +#define OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_REBOOT            0x22
> +#define OPAL_PREDICTIVE_ERR_FAULT_RECTIFY_BOOT_DEGRADE_PERF 0x23
> +#define OPAL_PREDICTIVE_ERR_LOSS_OF_REDUNDANCY              0x24
> +
> +/* 0x4X series for Unrecoverable Error
> + * 0x40 Generic Unrecoverable error
> + * 0x41 Unrecoverable error bypassed with degraded performance
> + * 0x44 Unrecoverable error bypassed with loss of redundancy
> + * 0x45 Unrecoverable error bypassed with loss of redundancy and performance
> + * 0x48 Unrecoverable error bypassed with loss of function
> + */
> +#define OPAL_UNRECOVERABLE_ERR_GENERAL                      0x40
> +#define OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF                 0x41
> +#define OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY              0x44
> +#define OPAL_UNRECOVERABLE_ERR_LOSS_REDUNDANCY_PERF         0x45
> +#define OPAL_UNRECOVERABLE_ERR_LOSS_OF_FUNCTION             0x48
> +#define OPAL_ERROR_PANIC                                    0x50
> +
> +/* Event Sub-type
> + * This field provides additional information on the non-error
> + * event type
> + * Field: event_subtype
> + */
> +#define OPAL_NA                                         0x00
> +#define OPAL_MISCELLANEOUS_INFO_ONLY                    0x01
> +#define OPAL_PREV_REPORTED_ERR_RECTIFIED                0x10
> +#define OPAL_SYS_RESOURCES_DECONFIG_BY_USER             0x20
> +#define OPAL_SYS_RESOURCE_DECONFIG_PRIOR_ERR            0x21
> +#define OPAL_RESOURCE_DEALLOC_EVENT_NOTIFY              0x22
> +#define OPAL_CONCURRENT_MAINTENANCE_EVENT               0x40
> +#define OPAL_CAPACITY_UPGRADE_EVENT                     0x60
> +#define OPAL_RESOURCE_SPARING_EVENT                     0x70
> +#define OPAL_DYNAMIC_RECONFIG_EVENT                     0x80
> +#define OPAL_NORMAL_SYS_PLATFORM_SHUTDOWN               0xD0
> +#define OPAL_ABNORMAL_POWER_OFF                         0xE0
> +
> +/* Max user dump size is 14K    */
> +#define OPAL_LOG_MAX_DUMP       14336
> +
> +/* Multiple user data sections */
> +struct opal_usr_data_scn {

Just spell it out? opal_user_data_section 

> +     uint32_t tag;
> +     uint16_t size;
> +     uint16_t component_id;
> +     char data_dump[4];
> +};
> +
> +/* All the information regarding an error/event to be reported
> + * needs to populate this structure using pre-defined interfaces
> + * only
> + */
> +struct opal_errorlog {
> +
> +     uint16_t component_id;
> +     uint8_t error_events_type:3;

Bit field?

> +     uint8_t subsystem_id;
> +
> +     uint8_t event_sev;
> +     uint8_t event_subtype;
> +     uint8_t usr_scn_count; /* User section count */

user_section_count;

> +     uint8_t elog_origin;
> +
> +     uint32_t usr_scn_size; /* User section size */

user_section_size;

> +     uint32_t reason_code;
> +     uint32_t additional_info[4];
> +
> +     char usr_data_dump[OPAL_LOG_MAX_DUMP];
> +};

It looks like this goes straight to Opal, should we be using __packed ?


> @@ -853,6 +970,14 @@ int64_t opal_dump_ack(uint32_t dump_id);
>  int64_t opal_get_msg(uint64_t buffer, size_t size);
>  int64_t opal_check_completion(uint64_t buffer, size_t size, uint64_t token);
>  int64_t opal_sync_host_reboot(void);
> +struct opal_errorlog *elog_create(uint8_t err_evt_type, uint16_t 
> component_id,
> +             uint8_t subsystem_id, uint8_t event_sev, uint8_t  event_subtype,
> +             uint32_t reason_code, uint32_t info0, uint32_t info1,
> +             uint32_t info2, uint32_t info3);
> +int update_user_dump(struct opal_errorlog *buf, unsigned char *data,
> +                                     uint32_t tag, uint16_t size);
> +void commit_errorlog_to_fsp(struct opal_errorlog *buf);
> +int opal_commit_log_to_fsp(void *buf);

Are we using "opal_" as a prefix or not?

> diff --git a/arch/powerpc/platforms/powernv/opal-elog.c 
> b/arch/powerpc/platforms/powernv/opal-elog.c
> index 58849d0..ade1e58 100644
> --- a/arch/powerpc/platforms/powernv/opal-elog.c
> +++ b/arch/powerpc/platforms/powernv/opal-elog.c
> @@ -15,6 +15,7 @@
>  #include <linux/sysfs.h>
>  #include <linux/fs.h>
>  #include <linux/vmalloc.h>
> +#include <linux/mm.h>
>  #include <linux/fcntl.h>
>  #include <asm/uaccess.h>
>  #include <asm/opal.h>
> @@ -22,7 +23,9 @@
>  /* Maximum size of a single log on FSP is 16KB */
>  #define OPAL_MAX_ERRLOG_SIZE 16384
>  
> -/* maximu number of records powernv can hold */
> +#define USR_CHAR_ARRAY_FIXED_SIZE      4

What is this?

> +/* Maximum number of records powernv can hold */

That's an unrelated typo fix AFAICS, please send it separately.

>  #define MAX_NUM_RECORD       128
>  
>  struct opal_err_log {
> @@ -272,6 +275,61 @@ static int init_err_log_buffer(void)
>       return 0;
>  }
>  
> +/* Interface to be used by POWERNV to push the logs to FSP via Sapphire */
> +struct opal_errorlog *elog_create(uint8_t err_evt_type, uint16_t 
> component_id,
> +             uint8_t subsystem_id, uint8_t event_sev, uint8_t  event_subtype,
> +             uint32_t reason_code, uint32_t info0, uint32_t info1,
> +             uint32_t info2, uint32_t info3)


A call to this function is going to be just a giant list of integer values, it
will not be easy to see at a glance which value goes in which field.

I think you'd be better off with an elog_alloc() routine, and then you just do
the initialisation explicitly so that it's obvious which value goes where:

        elog->error_events_type = FOO;
        elog->component_id = BAR;
        elog->subsystem_id = ETC;


elog_create(uint8_t err_evt_type, uint16_t component_id,
> +             uint8_t subsystem_id, uint8_t event_sev, uint8_t  event_subtype,
> +             uint32_t reason_code, uint32_t info0, uint32_t info1,
> +             uint32_t info2, uint32_t info3)
> +{
> +     struct opal_errorlog *buf;
> +
> +     buf = kzalloc(sizeof(struct opal_errorlog), GFP_KERNEL);
> +     if (!buf) {
> +             printk(KERN_ERR "ELOG: failed to allocate memory.\n");
> +             return NULL;
> +     }
> +
> +     buf->error_events_type = err_evt_type;
> +     buf->component_id = component_id;
> +     buf->subsystem_id = subsystem_id;
> +     buf->event_sev = event_sev;
> +     buf->event_subtype = event_subtype;
> +     buf->reason_code = reason_code;
> +     buf->additional_info[0] = info0;
> +     buf->additional_info[1] = info1;
> +     buf->additional_info[2] = info2;
> +     buf->additional_info[3] = info3;
> +     return buf;
> +}
> +
> +int update_user_dump(struct opal_errorlog *buf, unsigned char *data,
> +                             uint32_t tag, uint16_t size)
> +{
> +     char *buffer = (char *)buf->usr_data_dump + buf->usr_scn_size;
> +     struct opal_usr_data_scn *tmp;
> +
> +     if ((buf->usr_scn_size + size) > OPAL_LOG_MAX_DUMP) {
> +             printk(KERN_ERR "ELOG: Size of dump data overruns buffer");

Use pr_err() and set pr_fmt() to "opal-error-log" at the top of the file.

> +             return -1;
> +     }
> +
> +     tmp = (struct opal_usr_data_scn *)buffer;
> +     tmp->tag = tag;
> +     tmp->size = size + sizeof(struct opal_usr_data_scn)
> +                             - USR_CHAR_ARRAY_FIXED_SIZE;
> +     memcpy(tmp->data_dump, data, size);
> +
> +     buf->usr_scn_size += tmp->size;
> +     buf->usr_scn_count++;
> +     return 0;
> +}
> +
> +void commit_errorlog_to_fsp(struct opal_errorlog *buf)
> +{
> +     opal_commit_log_to_fsp((void *)(vmalloc_to_pfn(buf) << PAGE_SHIFT));

Can't fail?

> +     kfree(buf);

It's a bit rude to free buf when the caller still has a pointer to it.

> +}


cheers


_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2] powerpc/powernv: Framework to log critical errors on powernv.

Reply via email to