qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[RFC PATCH v2 6/6] cxl/core: add poison injection event handler


From: Shiyang Ruan
Subject: [RFC PATCH v2 6/6] cxl/core: add poison injection event handler
Date: Fri, 29 Mar 2024 14:36:14 +0800

Currently the driver only traces CXL events, so poison injection (for both
vmem and pmem types) on a CXL memdev is silent.  The OS needs to be notified
so that it can handle the poisoned range in time.  Per the CXL spec, a device
error event can be signaled through either the FW-First or OS-First method.

So, add a poison event handler for the OS-First method:
  - qemu:
    - the CXL device reports a POISON event to the OS via MSI by sending a
      GMER (General Media Event Record) after injecting a poison record
  - CXL driver
    a. parse the POISON event from GMER;        <-- this patch
    b. retrieve POISON list from memdev;
    c. translate poisoned DPA to HPA;
    d. enqueue poisoned PFN to memory_failure's work queue;

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
---

Reply to Jonathan's comment on the last version:
> I'm not 100% convinced this is necessary poison causing.  Also
> the text tells us we should see 'an appropriate event'.
> DRAM one seems likely to be chosen by some vendors.
I think it's right to use the DRAM Event Record for a volatile memdev, but
should poison on a persistent memdev use the DRAM Event Record too?  Its
'Physical Address' field does have the 'Volatile' bit, the same as in the
General Media Event Record, so I am a bit confused about this.

---
 drivers/cxl/core/mbox.c | 100 ++++++++++++++++++++++++++++++++++------
 drivers/cxl/cxlmem.h    |   8 ++--
 2 files changed, 91 insertions(+), 17 deletions(-)

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 19b46fb06ed6..97ef45d808b8 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -837,25 +837,99 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
 
-void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
-                           enum cxl_event_log_type type,
-                           enum cxl_event_type event_type,
-                           const uuid_t *uuid, union cxl_event *evt)
+struct cxl_event_poison_context {
+       u64 dpa;
+       u64 length;
+};
+
+static int __cxl_report_poison(struct device *dev, void *arg)
+{
+       struct cxl_event_poison_context *ctx = arg;
+       struct cxl_endpoint_decoder *cxled;
+       struct cxl_memdev *cxlmd;
+
+       cxled = to_cxl_endpoint_decoder(dev);
+       if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res))
+               return 0;
+
+       if (cxled->mode == CXL_DECODER_MIXED) {
+               dev_dbg(dev, "poison list read unsupported in mixed mode\n");
+               return 0;
+       }
+
+       if (ctx->dpa > cxled->dpa_res->end || ctx->dpa < cxled->dpa_res->start)
+               return 0;
+
+       cxlmd = cxled_to_memdev(cxled);
+       cxl_mem_get_poison(cxlmd, ctx->dpa, ctx->length, cxled->cxld.region,
+                          true);
+
+       return 1;
+}
+
+static void cxl_event_handle_poison(struct cxl_memdev *cxlmd,
+                                   struct cxl_event_gen_media *rec)
+{
+       struct cxl_port *port = cxlmd->endpoint;
+       u64 phys_addr = le64_to_cpu(rec->phys_addr);
+       struct cxl_event_poison_context ctx = {
+               .dpa = phys_addr & CXL_DPA_MASK,
+       };
+
+       /* No regions mapped to this memdev, that is to say no HPA is mapped */
+       if (!port || !is_cxl_endpoint(port) ||
+           cxl_num_decoders_committed(port) == 0)
+               return;
+
+       /*
+        * Host Inject Poison may have a range of DPA, but the GMER only has
+        * "Physical Address" field, no such one indicates length.  So it's
+        * better to call cxl_mem_get_poison() to find this poison record.
+        */
+       ctx.length = phys_addr & CXL_DPA_VOLATILE ?
+                       resource_size(&cxlmd->cxlds->ram_res) :
+                       resource_size(&cxlmd->cxlds->pmem_res) - ctx.dpa;
+
+       device_for_each_child(&port->dev, &ctx, __cxl_report_poison);
+}
+
+static void cxl_event_handle_general_media(struct cxl_memdev *cxlmd,
+                                          enum cxl_event_log_type type,
+                                          struct cxl_event_gen_media *rec)
+{
+       if (type == CXL_EVENT_TYPE_FAIL) {
+               switch (rec->transaction_type) {
+               case CXL_EVENT_TRANSACTION_READ:
+               case CXL_EVENT_TRANSACTION_WRITE:
+               case CXL_EVENT_TRANSACTION_INJECT_POISON:
+                       cxl_event_handle_poison(cxlmd, rec);
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+
+void cxl_event_handle_record(struct cxl_memdev *cxlmd,
+                            enum cxl_event_log_type type,
+                            enum cxl_event_type event_type,
+                            const uuid_t *uuid, union cxl_event *evt)
 {
-       if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
+       if (event_type == CXL_CPER_EVENT_GEN_MEDIA) {
                trace_cxl_general_media(cxlmd, type, &evt->gen_media);
-       else if (event_type == CXL_CPER_EVENT_DRAM)
+               cxl_event_handle_general_media(cxlmd, type, &evt->gen_media);
+       } else if (event_type == CXL_CPER_EVENT_DRAM)
                trace_cxl_dram(cxlmd, type, &evt->dram);
        else if (event_type == CXL_CPER_EVENT_MEM_MODULE)
                trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
        else
                trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
 }
-EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
+EXPORT_SYMBOL_NS_GPL(cxl_event_handle_record, CXL);
 
-static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
-                                    enum cxl_event_log_type type,
-                                    struct cxl_event_record_raw *record)
+static void __cxl_event_handle_record(struct cxl_memdev *cxlmd,
+                                     enum cxl_event_log_type type,
+                                     struct cxl_event_record_raw *record)
 {
        enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC;
        const uuid_t *uuid = &record->id;
@@ -867,7 +941,7 @@ static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
        else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID))
                ev_type = CXL_CPER_EVENT_MEM_MODULE;
 
-       cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event);
+       cxl_event_handle_record(cxlmd, type, ev_type, uuid, &record->event);
 }
 
 static int cxl_clear_event_record(struct cxl_memdev_state *mds,
@@ -978,8 +1052,8 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
                        break;
 
                for (i = 0; i < nr_rec; i++)
-                       __cxl_event_trace_record(cxlmd, type,
-                                                &payload->records[i]);
+                       __cxl_event_handle_record(cxlmd, type,
+                                                 &payload->records[i]);
 
                if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
                        trace_cxl_overflow(cxlmd, type, payload);
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 1f03130b9d6a..dfd7bdd0d66a 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -822,10 +822,10 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
 void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
                                  unsigned long *cmds);
 void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
-void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
-                           enum cxl_event_log_type type,
-                           enum cxl_event_type event_type,
-                           const uuid_t *uuid, union cxl_event *evt);
+void cxl_event_handle_record(struct cxl_memdev *cxlmd,
+                            enum cxl_event_log_type type,
+                            enum cxl_event_type event_type,
+                            const uuid_t *uuid, union cxl_event *evt);
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
 void cxl_mem_report_poison(struct cxl_memdev *cxlmd,
-- 
2.34.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]