[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v19 2/8] numa: Extend CLI to provide memory latency and bandwidth
From: |
Tao Xu |
Subject: |
[PATCH v19 2/8] numa: Extend CLI to provide memory latency and bandwidth information |
Date: |
Thu, 28 Nov 2019 16:21:03 +0800 |
From: Liu Jingqi <address@hidden>
Add -numa hmat-lb option to provide System Locality Latency and
Bandwidth Information. These memory attributes help to build
System Locality Latency and Bandwidth Information Structure(s)
in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using
hmat-lb option, enable HMAT with -machine hmat=on.
Signed-off-by: Liu Jingqi <address@hidden>
Signed-off-by: Tao Xu <address@hidden>
---
Changes in v19:
- Add description about the machine property 'hmat' in commit
message (Markus)
Changes in v18:
- Use qapi type uint64 and only nanosecond for latency (Markus)
Changes in v17:
- Add check when user input latency or bandwidth 0, the
lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4,
0 means the corresponding latency or bandwidth information is
not provided.
- Fix the infinite loop when node->latency is 0.
Changes in v16:
- Initialize HMAT_LB_Data lb_data (Igor)
- Remove punctuation from error_setg (Igor)
- Correct some description (Igor)
- Drop statement about max value (Igor)
- Simplify struct HMAT_LB_Info and related code, unify latency
and bandwidth (Igor)
---
hw/core/numa.c | 181 ++++++++++++++++++++++++++++++++++++++++++
include/sysemu/numa.h | 53 +++++++++++++
qapi/machine.json | 93 +++++++++++++++++++++-
qemu-options.hx | 48 ++++++++++-
4 files changed, 372 insertions(+), 3 deletions(-)
diff --git a/hw/core/numa.c b/hw/core/numa.c
index e60da99293..2183c8df1f 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -23,6 +23,7 @@
*/
#include "qemu/osdep.h"
+#include "qemu/units.h"
#include "sysemu/hostmem.h"
#include "sysemu/numa.h"
#include "sysemu/sysemu.h"
@@ -198,6 +199,173 @@ void parse_numa_distance(MachineState *ms,
NumaDistOptions *dist, Error **errp)
ms->numa_state->have_numa_distance = true;
}
+void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
+ Error **errp)
+{
+ int i, first_bit, last_bit;
+ uint64_t max_entry, temp_base_la;
+ NodeInfo *numa_info = numa_state->nodes;
+ HMAT_LB_Info *hmat_lb =
+ numa_state->hmat_lb[node->hierarchy][node->data_type];
+ HMAT_LB_Data lb_data = {};
+ HMAT_LB_Data *lb_temp;
+
+ /* Error checking */
+ if (node->initiator > numa_state->num_nodes) {
+ error_setg(errp, "Invalid initiator=%d, it should be less than %d",
+ node->initiator, numa_state->num_nodes);
+ return;
+ }
+ if (node->target > numa_state->num_nodes) {
+ error_setg(errp, "Invalid target=%d, it should be less than %d",
+ node->target, numa_state->num_nodes);
+ return;
+ }
+ if (!numa_info[node->initiator].has_cpu) {
+ error_setg(errp, "Invalid initiator=%d, it isn't an "
+ "initiator proximity domain", node->initiator);
+ return;
+ }
+ if (!numa_info[node->target].present) {
+ error_setg(errp, "The target=%d should point to an existing node",
+ node->target);
+ return;
+ }
+
+ if (!hmat_lb) {
+ hmat_lb = g_malloc0(sizeof(*hmat_lb));
+ numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
+ hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data));
+ }
+ hmat_lb->hierarchy = node->hierarchy;
+ hmat_lb->data_type = node->data_type;
+ lb_data.initiator = node->initiator;
+ lb_data.target = node->target;
+
+ if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
+ /* Input latency data */
+
+ if (!node->has_latency) {
+ error_setg(errp, "Missing 'latency' option");
+ return;
+ }
+ if (node->has_bandwidth) {
+ error_setg(errp, "Invalid option 'bandwidth' since "
+ "the data type is latency");
+ return;
+ }
+
+ /* Detect duplicate configuration */
+ for (i = 0; i < hmat_lb->list->len; i++) {
+ lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
+
+ if (node->initiator == lb_temp->initiator &&
+ node->target == lb_temp->target) {
+ error_setg(errp, "Duplicate configuration of the latency for "
+ "initiator=%d and target=%d", node->initiator,
+ node->target);
+ return;
+ }
+ }
+
+ hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX;
+
+ if (node->latency) {
+ /* Calculate the temporary base and compressed latency */
+ max_entry = node->latency;
+ temp_base_la = 1;
+ while (QEMU_IS_ALIGNED(max_entry, 10)) {
+ max_entry /= 10;
+ temp_base_la *= 10;
+ }
+
+ /* Calculate the max compressed latency */
+ hmat_lb->base = MIN(hmat_lb->base, temp_base_la);
+ max_entry = node->latency / hmat_lb->base;
+ hmat_lb->range_bitmap = MAX(hmat_lb->range_bitmap, max_entry);
+
+ /*
+ * For latency hmat_lb->range_bitmap record the max compressed
+ * latency which should be less than 0xFFFF (UINT16_MAX)
+ */
+ if (hmat_lb->range_bitmap >= UINT16_MAX) {
+ error_setg(errp, "Latency %" PRIu64 " between initiator=%d and
"
+ "target=%d should not differ from previously entered "
+ "min or max values on more than %d", node->latency,
+ node->initiator, node->target, UINT16_MAX - 1);
+ return;
+ }
+
+ /*
+ * Set lb_info_provided bit 0 as 1,
+ * latency information is provided
+ */
+ numa_info[node->target].lb_info_provided |= BIT(0);
+ }
+ lb_data.data = node->latency;
+ } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) {
+ /* Input bandwidth data */
+
+ if (!node->has_bandwidth) {
+ error_setg(errp, "Missing 'bandwidth' option");
+ return;
+ }
+ if (node->has_latency) {
+ error_setg(errp, "Invalid option 'latency' since "
+ "the data type is bandwidth");
+ return;
+ }
+ if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) {
+ error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
+ "target=%d should be 1MB aligned", node->bandwidth,
+ node->initiator, node->target);
+ return;
+ }
+
+ /* Detect duplicate configuration */
+ for (i = 0; i < hmat_lb->list->len; i++) {
+ lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
+
+ if (node->initiator == lb_temp->initiator &&
+ node->target == lb_temp->target) {
+ error_setg(errp, "Duplicate configuration of the bandwidth for
"
+ "initiator=%d and target=%d", node->initiator,
+ node->target);
+ return;
+ }
+ }
+
+ hmat_lb->range_bitmap |= node->bandwidth;
+ first_bit = ctz64(hmat_lb->range_bitmap);
+ hmat_lb->base = UINT64_C(1) << first_bit;
+ max_entry = node->bandwidth / hmat_lb->base;
+ last_bit = 64 - clz64(hmat_lb->range_bitmap);
+
+ /*
+ * For bandwidth, first_bit record the base unit of bandwidth bits,
+ * last_bit record the last bit of the max bandwidth. The max
compressed
+ * bandwidth should be less than 0xFFFF (UINT16_MAX)
+ */
+ if ((last_bit - first_bit) > UINT16_BITS || max_entry >= UINT16_MAX) {
+ error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
+ "target=%d should not differ from previously entered "
+ "values on more than %d", node->bandwidth,
+ node->initiator, node->target, UINT16_MAX - 1);
+ return;
+ }
+
+ /* Set lb_info_provided bit 1 as 1, bandwidth information is provided
*/
+ if (node->bandwidth) {
+ numa_info[node->target].lb_info_provided |= BIT(1);
+ }
+ lb_data.data = node->bandwidth;
+ } else {
+ assert(0);
+ }
+
+ g_array_append_val(hmat_lb->list, lb_data);
+}
+
void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
{
Error *err = NULL;
@@ -236,6 +404,19 @@ void set_numa_options(MachineState *ms, NumaOptions
*object, Error **errp)
machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
&err);
break;
+ case NUMA_OPTIONS_TYPE_HMAT_LB:
+ if (!ms->numa_state->hmat_enabled) {
+ error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
+ "(HMAT) is disabled, enable it with -machine hmat=on "
+ "before using any of hmat specific options");
+ return;
+ }
+
+ parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err);
+ if (err) {
+ goto end;
+ }
+ break;
default:
abort();
}
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 788cbec7a2..70f93c83d7 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -14,11 +14,34 @@ struct CPUArchId;
#define NUMA_DISTANCE_MAX 254
#define NUMA_DISTANCE_UNREACHABLE 255
+/* the value of AcpiHmatLBInfo flags */
+enum {
+ HMAT_LB_MEM_MEMORY = 0,
+ HMAT_LB_MEM_CACHE_1ST_LEVEL = 1,
+ HMAT_LB_MEM_CACHE_2ND_LEVEL = 2,
+ HMAT_LB_MEM_CACHE_3RD_LEVEL = 3,
+ HMAT_LB_LEVELS /* must be the last entry */
+};
+
+/* the value of AcpiHmatLBInfo data type */
+enum {
+ HMAT_LB_DATA_ACCESS_LATENCY = 0,
+ HMAT_LB_DATA_READ_LATENCY = 1,
+ HMAT_LB_DATA_WRITE_LATENCY = 2,
+ HMAT_LB_DATA_ACCESS_BANDWIDTH = 3,
+ HMAT_LB_DATA_READ_BANDWIDTH = 4,
+ HMAT_LB_DATA_WRITE_BANDWIDTH = 5,
+ HMAT_LB_TYPES /* must be the last entry */
+};
+
+#define UINT16_BITS 16
+
struct NodeInfo {
uint64_t node_mem;
struct HostMemoryBackend *node_memdev;
bool present;
bool has_cpu;
+ uint8_t lb_info_provided;
uint16_t initiator;
uint8_t distance[MAX_NODES];
};
@@ -28,6 +51,31 @@ struct NumaNodeMem {
uint64_t node_plugged_mem;
};
+struct HMAT_LB_Data {
+ uint8_t initiator;
+ uint8_t target;
+ uint64_t data;
+};
+typedef struct HMAT_LB_Data HMAT_LB_Data;
+
+struct HMAT_LB_Info {
+ /* Indicates it's memory or the specified level memory side cache. */
+ uint8_t hierarchy;
+
+ /* Present the type of data, access/read/write latency or bandwidth. */
+ uint8_t data_type;
+
+ /* The range bitmap of bandwidth for calculating common base */
+ uint64_t range_bitmap;
+
+ /* The common base unit for latencies or bandwidths */
+ uint64_t base;
+
+ /* Array to store the latencies or bandwidths */
+ GArray *list;
+};
+typedef struct HMAT_LB_Info HMAT_LB_Info;
+
struct NumaState {
/* Number of NUMA nodes */
int num_nodes;
@@ -40,11 +88,16 @@ struct NumaState {
/* NUMA nodes information */
NodeInfo nodes[MAX_NODES];
+
+ /* NUMA nodes HMAT Locality Latency and Bandwidth Information */
+ HMAT_LB_Info *hmat_lb[HMAT_LB_LEVELS][HMAT_LB_TYPES];
};
typedef struct NumaState NumaState;
void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp);
void parse_numa_opts(MachineState *ms);
+void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
+ Error **errp);
void numa_complete_configuration(MachineState *ms);
void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms);
extern QemuOptsList qemu_numa_opts;
diff --git a/qapi/machine.json b/qapi/machine.json
index 27d0e37534..cf9851fcd1 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -426,10 +426,12 @@
#
# @cpu: property based CPU(s) to node mapping (Since: 2.10)
#
+# @hmat-lb: memory latency and bandwidth information (Since: 5.0)
+#
# Since: 2.1
##
{ 'enum': 'NumaOptionsType',
- 'data': [ 'node', 'dist', 'cpu' ] }
+ 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] }
##
# @NumaOptions:
@@ -444,7 +446,8 @@
'data': {
'node': 'NumaNodeOptions',
'dist': 'NumaDistOptions',
- 'cpu': 'NumaCpuOptions' }}
+ 'cpu': 'NumaCpuOptions',
+ 'hmat-lb': 'NumaHmatLBOptions' }}
##
# @NumaNodeOptions:
@@ -557,6 +560,92 @@
'base': 'CpuInstanceProperties',
'data' : {} }
+##
+# @HmatLBMemoryHierarchy:
+#
+# The memory hierarchy in the System Locality Latency and Bandwidth
+# Information Structure of HMAT (Heterogeneous Memory Attribute Table)
+#
+# For more information about @HmatLBMemoryHierarchy see chapter
+# 5.2.27.4: Table 5-146: Field "Flags" of ACPI 6.3 spec.
+#
+# @memory: the structure represents the memory performance
+#
+# @first-level: first level of memory side cache
+#
+# @second-level: second level of memory side cache
+#
+# @third-level: third level of memory side cache
+#
+# Since: 5.0
+##
+{ 'enum': 'HmatLBMemoryHierarchy',
+ 'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] }
+
+##
+# @HmatLBDataType:
+#
+# Data type in the System Locality Latency and Bandwidth
+# Information Structure of HMAT (Heterogeneous Memory Attribute Table)
+#
+# For more information about @HmatLBDataType see chapter
+# 5.2.27.4: Table 5-146: Field "Data Type" of ACPI 6.3 spec.
+#
+# @access-latency: access latency (nanoseconds)
+#
+# @read-latency: read latency (nanoseconds)
+#
+# @write-latency: write latency (nanoseconds)
+#
+# @access-bandwidth: access bandwidth (Bytes per second)
+#
+# @read-bandwidth: read bandwidth (Bytes per second)
+#
+# @write-bandwidth: write bandwidth (Bytes per second)
+#
+# Since: 5.0
+##
+{ 'enum': 'HmatLBDataType',
+ 'data': [ 'access-latency', 'read-latency', 'write-latency',
+ 'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] }
+
+##
+# @NumaHmatLBOptions:
+#
+# Set the system locality latency and bandwidth information
+# between Initiator and Target proximity Domains.
+#
+# For more information about @NumaHmatLBOptions see chapter
+# 5.2.27.4: Table 5-146 of ACPI 6.3 spec.
+#
+# @initiator: the Initiator Proximity Domain.
+#
+# @target: the Target Proximity Domain.
+#
+# @hierarchy: the Memory Hierarchy. Indicates the performance
+# of memory or side cache.
+#
+# @data-type: presents the type of data, access/read/write
+# latency or hit latency.
+#
+# @latency: the value of latency from @initiator to @target
+# proximity domain, the latency unit is "ns(nanosecond)".
+#
+# @bandwidth: the value of bandwidth between @initiator and @target
+# proximity domain, the bandwidth unit is
+# "Bytes per second".
+#
+# Since: 5.0
+##
+{ 'struct': 'NumaHmatLBOptions',
+ 'data': {
+ 'initiator': 'uint16',
+ 'target': 'uint16',
+ 'hierarchy': 'HmatLBMemoryHierarchy',
+ 'data-type': 'HmatLBDataType',
+ '*latency': 'uint64',
+ '*bandwidth': 'size' }}
+
##
# @HostMemPolicy:
#
diff --git a/qemu-options.hx b/qemu-options.hx
index 63f6b33322..23303fc7d7 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -168,16 +168,19 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa,
"-numa
node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
"-numa
node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n"
"-numa dist,src=source,dst=destination,val=distance\n"
- "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n",
+ "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n"
+ "-numa
hmat-lb,initiator=node,target=node,hierarchy=memory|first-level|second-level|third-level,data-type=access-latency|read-latency|write-latency[,latency=lat][,bandwidth=bw]\n",
QEMU_ARCH_ALL)
STEXI
@item -numa
node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
@itemx -numa
node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}][,initiator=@var{initiator}]
@itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance}
@itemx -numa
cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}]
+@itemx -numa
hmat-lb,initiator=@var{node},target=@var{node},hierarchy=@var{hierarchy},data-type=@var{tpye}[,latency=@var{lat}][,bandwidth=@var{bw}]
@findex -numa
Define a NUMA node and assign RAM and VCPUs to it.
Set the NUMA distance from a source node to a destination node.
+Set the ACPI Heterogeneous Memory Attributes for the given nodes.
Legacy VCPU assignment uses @samp{cpus} option where
@var{firstcpu} and @var{lastcpu} are CPU indexes. Each
@@ -256,6 +259,49 @@ specified resources, it just assigns existing resources to
NUMA
nodes. This means that one still has to use the @option{-m},
@option{-smp} options to allocate RAM and VCPUs respectively.
+Use @samp{hmat-lb} to set System Locality Latency and Bandwidth Information
+between initiator and target NUMA nodes in ACPI Heterogeneous Attribute Memory
Table (HMAT).
+Initiator NUMA node can create memory requests, usually it has one or more
processors.
+Target NUMA node contains addressable memory.
+
+In @samp{hmat-lb} option, @var{node} are NUMA node IDs. @var{hierarchy} is the
memory
+hierarchy of the target NUMA node: if @var{hierarchy} is 'memory', the
structure
+represents the memory performance; if @var{hierarchy} is
'first-level|second-level|third-level',
+this structure represents aggregated performance of memory side caches for
each domain.
+@var{type} of 'data-type' is type of data represented by this structure
instance:
+if 'hierarchy' is 'memory', 'data-type' is 'access|read|write' latency or
'access|read|write'
+bandwidth of the target memory; if 'hierarchy' is
'first-level|second-level|third-level',
+'data-type' is 'access|read|write' hit latency or 'access|read|write' hit
bandwidth of the
+target memory side cache.
+
+@var{lat} is latency value in nanoseconds. @var{bw} is bandwidth value,
+the possible value and units are NUM[M|G|T], mean that the bandwidth value are
+NUM byte per second (or MB/s, GB/s or TB/s depending on used suffix).
+And if input bandwidth value without any unit, the unit will be byte per
second.
+Note that if latency or bandwidth value is 0, means the corresponding latency
or
+bandwidth information is not provided.
+
+For example, the following options describe 2 NUMA nodes. Node 0 has 2 cpus and
+a ram, node 1 has only a ram. The processors in node 0 access memory in node
+0 with access-latency 5 nanoseconds, access-bandwidth is 200 MB/s;
+The processors in NUMA node 0 access memory in NUMA node 1 with access-latency
10
+nanoseconds, access-bandwidth is 100 MB/s.
+@example
+-machine hmat=on \
+-m 2G \
+-object memory-backend-ram,size=1G,id=m0 \
+-object memory-backend-ram,size=1G,id=m1 \
+-smp 2 \
+-numa node,nodeid=0,memdev=m0 \
+-numa node,nodeid=1,memdev=m1,initiator=0 \
+-numa cpu,node-id=0,socket-id=0 \
+-numa cpu,node-id=0,socket-id=1 \
+-numa
hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,latency=5
\
+-numa
hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,bandwidth=200M
\
+-numa
hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,latency=10
\
+-numa
hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,bandwidth=100M
+@end example
+
ETEXI
DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd,
--
2.20.1