[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering
From: |
Igor Mammedov |
Subject: |
Re: [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering |
Date: |
Tue, 24 Jun 2014 09:33:48 +0200 |
On Mon, 23 Jun 2014 12:33:10 -0700
Nishanth Aravamudan <address@hidden> wrote:
> Add a present field to NodeInfo which indicates if a given nodeid was
> present on the command-line or not. Current code relies on a memory
> value being passed for a node to indicate presence, which is
> insufficient in the presence of memoryless nodes or sparse numbering.
> Adjust the iteration of various NUMA loops to use the maximum known NUMA
> ID rather than the number of NUMA nodes.
Why would you need sparse numbering?
What task exactly are you trying to solve?
>
> numa.c::set_numa_nodes() has become a bit more convoluted for
> round-robin'ing the CPUs over known nodes when not specified by the
> user.
>
> Note that architecture-specific code still needs changes (forthcoming)
> for both sparse node numbering and memoryless nodes.
>
> Examples:
>
> (1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa
> node,nodeid=2 -smp 16
>
> Before:
>
> node 0 cpus: 0 2 4 6 8 10 12 14
> node 0 size: 2048 MB
> node 1 cpus: 1 3 5 7 9 11 13 15
> node 1 size: 2048 MB
>
> After:
>
> node 2 cpus: 0 2 4 6 8 10 12 14
> |
> node 2 size: 2048 MB
> |
> node 3 cpus: 1 3 5 7 9 11 13 15
> |
> node 3 size: 2048 MB
>
> (2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa
> node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16
>
> Before:
>
> node 0 cpus: 0 4 8 12
> |
> node 0 size: 1024 MB
> |
> node 1 cpus: 1 5 9 13
> |
> node 1 size: 1024 MB
> |
> node 2 cpus: 2 6 10 14
> |
> node 2 size: 1024 MB
> |
> node 3 cpus: 3 7 11 15
> |
> node 3 size: 1024 MB
>
> After:
>
> node 0 cpus: 0 4 8 12
> node 0 size: 1024 MB
> node 1 cpus: 1 5 9 13
> node 1 size: 1024 MB
> node 2 cpus: 2 6 10 14
> node 2 size: 1024 MB
> node 3 cpus: 3 7 11 15
> node 3 size: 1024 MB
>
> Signed-off-by: Nishanth Aravamudan <address@hidden>
>
> ---
> Based off mst's for_upstream tag, which has the NodeInfo changes.
>
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 277230d..b90bf66 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -145,11 +145,13 @@ extern int mem_prealloc;
> */
> #define MAX_CPUMASK_BITS 255
>
> -extern int nb_numa_nodes;
> +extern int nb_numa_nodes; /* Number of NUMA nodes */
> +extern int max_numa_node; /* Highest specified NUMA node ID */
> typedef struct node_info {
> uint64_t node_mem;
> DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> struct HostMemoryBackend *node_memdev;
> + bool present;
> } NodeInfo;
> extern NodeInfo numa_info[MAX_NODES];
> void set_numa_nodes(void);
> diff --git a/monitor.c b/monitor.c
> index c7f8797..4721996 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict
> *qdict)
> CPUState *cpu;
>
> monitor_printf(mon, "%d nodes\n", nb_numa_nodes);
> - for (i = 0; i < nb_numa_nodes; i++) {
> + for (i = 0; i < max_numa_node; i++) {
> + if (!numa_info[i].present) {
> + continue;
> + }
> monitor_printf(mon, "node %d cpus:", i);
> CPU_FOREACH(cpu) {
> if (cpu->numa_node == i) {
> diff --git a/numa.c b/numa.c
> index e471afe..c31857d 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -106,6 +106,10 @@ static void numa_node_parse(NumaNodeOptions *node,
> QemuOpts *opts, Error **errp)
> numa_info[nodenr].node_mem = object_property_get_int(o, "size",
> NULL);
> numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
> }
> + numa_info[nodenr].present = true;
> + if (nodenr >= max_numa_node) {
> + max_numa_node = nodenr + 1;
> + }
> }
>
> int numa_init_func(QemuOpts *opts, void *opaque)
> @@ -155,7 +159,7 @@ void set_numa_nodes(void)
> {
> if (nb_numa_nodes > 0) {
> uint64_t numa_total;
> - int i;
> + int i, j, last_j = -1;
>
> if (nb_numa_nodes > MAX_NODES) {
> nb_numa_nodes = MAX_NODES;
> @@ -164,27 +168,29 @@ void set_numa_nodes(void)
> /* If no memory size if given for any node, assume the default case
> * and distribute the available memory equally across all nodes
> */
> - for (i = 0; i < nb_numa_nodes; i++) {
> - if (numa_info[i].node_mem != 0) {
> + for (i = 0; i < max_numa_node; i++) {
> + if (numa_info[i].present && numa_info[i].node_mem != 0) {
> break;
> }
> }
> - if (i == nb_numa_nodes) {
> + if (i == max_numa_node) {
> uint64_t usedmem = 0;
>
> /* On Linux, the each node's border has to be 8MB aligned,
> * the final node gets the rest.
> */
> - for (i = 0; i < nb_numa_nodes - 1; i++) {
> - numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> - ~((1 << 23UL) - 1);
> - usedmem += numa_info[i].node_mem;
> + for (i = 0; i < max_numa_node - 1; i++) {
> + if (numa_info[i].present) {
> + numa_info[i].node_mem = (ram_size / nb_numa_nodes) &
> + ~((1 << 23UL) - 1);
> + usedmem += numa_info[i].node_mem;
> + }
> }
> numa_info[i].node_mem = ram_size - usedmem;
> }
>
> numa_total = 0;
> - for (i = 0; i < nb_numa_nodes; i++) {
> + for (i = 0; i < max_numa_node; i++) {
> numa_total += numa_info[i].node_mem;
> }
> if (numa_total != ram_size) {
> @@ -194,8 +200,9 @@ void set_numa_nodes(void)
> exit(1);
> }
>
> - for (i = 0; i < nb_numa_nodes; i++) {
> - if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
> + for (i = 0; i < max_numa_node; i++) {
> + if (numa_info[i].present &&
> + !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
> break;
> }
> }
> @@ -203,9 +210,19 @@ void set_numa_nodes(void)
> * must cope with this anyway, because there are BIOSes out there in
> * real machines which also use this scheme.
> */
> - if (i == nb_numa_nodes) {
> + if (i == max_numa_node) {
> for (i = 0; i < max_cpus; i++) {
> - set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
> + for (j = last_j + 1;
> + j != (last_j == 0 ? max_numa_node : last_j - 1); j++) {
> + if (j >= max_numa_node) {
> + j = 0;
> + }
> + if (numa_info[j].present) {
> + break;
> + }
> + }
> + last_j = j;
> + set_bit(i, numa_info[j].node_cpu);
> }
> }
> }
> @@ -217,7 +234,7 @@ void set_numa_modes(void)
> int i;
>
> CPU_FOREACH(cpu) {
> - for (i = 0; i < nb_numa_nodes; i++) {
> + for (i = 0; i < max_numa_node; i++) {
> if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) {
> cpu->numa_node = i;
> }
> diff --git a/vl.c b/vl.c
> index 54b4627..e1a6ab8 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
> QTAILQ_HEAD_INITIALIZER(fw_boot_order);
>
> int nb_numa_nodes;
> +int max_numa_node;
> NodeInfo numa_info[MAX_NODES];
>
> uint8_t qemu_uuid[16];
> @@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp)
>
> for (i = 0; i < MAX_NODES; i++) {
> numa_info[i].node_mem = 0;
> + numa_info[i].present = false;
> bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);
> }
>
> nb_numa_nodes = 0;
> + max_numa_node = -1;
> nb_nics = 0;
>
> bdrv_init_with_whitelist();
>
>
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, (continued)
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, Michael S. Tsirkin, 2014/06/25
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, Hu Tao, 2014/06/26
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, Nishanth Aravamudan, 2014/06/26
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, Eduardo Habkost, 2014/06/26
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, Eduardo Habkost, 2014/06/26
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, Nishanth Aravamudan, 2014/06/30
- Re: [Qemu-devel] [RFC PATCH v3] numa: enable sparse node numbering, Eduardo Habkost, 2014/06/30
Re: [Qemu-devel] [RFC PATCH] numa: enable sparse node numbering,
Igor Mammedov <=