Using libnuma calls on RedHat 6.3 x86_64, with the default kernel and with kernels up to 3.4.29,
does not allocate on the specified NUMA nodes, even when forced with numactl.

It appears that setting the NUMA policy and/or the NUMA nodes does little for large allocations. Use HUGETLBFS, and you get memory on almost any node BUT the one you asked for.
It appears that it allocates on the last node that did a free().

Here is a small program to demonstrate the lack of NUMA awareness from user space.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sched.h>              // for sched_getcpu() call
#include <sys/shm.h>
#include <numa.h>
#include <numaif.h>

// Page-size constants. The "...1" variants are size-minus-one and serve as
// offset masks (value & MASK is non-zero when value is not size-aligned).
static const unsigned long HUGE_PAGE_SIZE = 1UL << 21;  // a 2MB huge page
static const unsigned long HUGE_PAGE_SIZE1 = (1UL << 21) - 1;   // less one
static const unsigned long PAGE_SIZE = 1UL << 12;       // a 4KB page (was 1 << 10, i.e. only 1KB)
static const unsigned long PAGE_SIZE1 = (1UL << 12) - 1;        // less one

// Forward declarations of the page-placement helpers defined further down in
// this file; Allocate() calls both of them before their definitions appear.
int VerifyNumaNode(void *ptr,   // address
                   int node,    // target node
                   int Count);  // count of 4KB pages
// Same parameter meaning as above: start address, target node, 4KB page count.
int MoveAddrToNodeMulti(void *ptr, int node, int Count);

void *Allocate(size_t length, int OnNode)
{
   int shmid = -1;
   void *shmaddr = NULL;
   size_t new_length = length;
   int MaxNumaNode = numa_max_node();   // find highest NUMA number
   int LocalNumaNode = numa_node_of_cpu(sched_getcpu());
   int NewNumaNode = LocalNumaNode;
   unsigned long MaskBits[2] = { 0UL, 0UL };    // up to 128 nodes
   struct bitmask NewMask;
   NewMask.size = 8;            // Max nodes on an HP
   struct bitmask *CurrentMask = numa_get_membind();

   // see if NUMA allocation is desired
   if (OnNode >= 0)
   {
      if (OnNode > MaxNumaNode)
      {
fprintf(stderr, "Invalid NUMA HUGEPages allocation node %d max is %d\n", OnNode, MaxNumaNode);
      }
      else
      {
         NewNumaNode = OnNode;
      }
   }
   MaskBits[0] = 1UL << NewNumaNode;
   numa_set_membind(&NewMask);  // restrict to this node

   if (new_length < HUGE_PAGE_SIZE)     /* 2MB min alloc for huge pages */
   {
      new_length = HUGE_PAGE_SIZE;
   }

   if (new_length & HUGE_PAGE_SIZE1)    /* 2MB min alloc for huge pages */
   {
      new_length = ((new_length >> 21) + 1) << 21;
   }
   if ((shmid = shmget(IPC_PRIVATE, new_length, /* length */
                       SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) == -1)
   {
fprintf(stderr, "shmget() failed for %ldMB\n", (long) (new_length >> 20));
      numa_set_membind(CurrentMask);    // unrestrict to this node
      return NULL;
   }

   shmaddr = shmat(shmid, NULL, 0);
   if (shmaddr == (void *) -1)
   {
      shmctl(shmid, IPC_RMID, NULL);
      numa_set_membind(CurrentMask);    // unrestrict to this node
      return NULL;
   }
   else if ((unsigned long) (shmaddr) & (PAGE_SIZE - 1))
   {
      fprintf(stderr, "huge page allocation was not page aligned\n");
   }

   memset(shmaddr, 0x00, new_length);
   if (VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL) > 0)
   {
      MoveAddrToNodeMulti(shmaddr, NewNumaNode, new_length / 4096UL);
   }
   numa_set_membind(CurrentMask);       // unrestrict to this node
   VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL);
   /* now delete the ID so it will free itself on exit */
   shmctl(shmid, IPC_RMID, NULL);
   return shmaddr;
}

// Release memory obtained from Allocate() by detaching the shared memory
// segment attached at addr. Allocate() marks its segment IPC_RMID right after
// attaching, so the detach is the last step needed to let it go.
// A NULL addr is ignored; a failed detach is reported on stderr.
void Free(void *addr)
{
   if (addr == NULL)
   {
      return;
   }
   if (shmdt(addr) == -1)
   {
      fprintf(stderr, "shmdt() failed for %p\n", addr);
   }
}

// Ask the kernel which NUMA node currently backs the page containing Address.
//
// Uses move_pages() in query mode (nodes == NULL), which only fills in the
// status array and moves nothing.
//
// Returns the value move_pages() stored in the status slot: the node number
// on success, or a negative value when the page could not be resolved. The
// slot is preset to -1, so -1 is returned if the call fills nothing in.
int NumaNodeFromAddress(void *Address)
{
   int status[1] = { -1 };
   // The pages array must hold the page address itself, rounded down to a
   // page boundary. The previous code stored the address of a local variable
   // here, so it reported the node of the caller's stack instead.
   void *pages[1] = { (void *) (((unsigned long) Address) & ~4095UL) };

   int retval = move_pages(0,   // this thread
                           1,   // just one pointer
                           pages,       // The given address
                           NULL,        // array of nodes, no moving, just asking
                           status,      // array of node results
                           MPOL_MF_MOVE);
   if (retval)
   {
      fprintf(stderr, "Invalid Address %p - No NUMA node\n", Address);
   }
   return status[0];
}

// Ask the kernel to migrate Count consecutive 4KB pages, starting at the page
// containing ptr, to NUMA node 'node'.
//
//   ptr    any address inside the first page
//   node   destination node for every page
//   Count  number of 4KB pages; values <= 0 do nothing and return 0
//
// Returns 0 when every page reports the requested node afterwards, the number
// of pages that ended up elsewhere (each one is listed on stderr), or the
// non-zero move_pages() result if the call itself failed.
int MoveAddrToNodeMulti(void *ptr, int node, int Count)
{
   if (Count <= 0)
   {
      return 0;
   }

   void **Pages = new void *[Count];
   int *Nodes = new int[Count];
   int *Status = new int[Count];
   const unsigned long Base = ((unsigned long) ptr) & ~4095UL;
   int retval = 0;
   int i = 0;

   for (i = 0; i < Count; i++)
   {
      // move_pages() wants the page addresses themselves. The previous code
      // passed pointers to the bookkeeping array, so it acted on the wrong
      // memory. The offset is computed in unsigned long so regions larger
      // than 2GB do not overflow int.
      Pages[i] = (void *) (Base + ((unsigned long) i) * 4096UL);
      Nodes[i] = node;
      Status[i] = -1;
   }

   retval = move_pages(0,       // this thread
                       Count,   // lots of pointers
                       Pages,   // the page addresses
                       Nodes,   // move to new node please
                       Status,  // array of node results
                       MPOL_MF_MOVE);
   if (retval)
   {
      fprintf(stderr, "MoveAddrToNodeMulti to failed\n");
   }
   else
   {
      // count (and list) every page that did not land on the target node
      for (i = 0; i < Count; i++)
      {
         if (Status[i] != node)
         {
            fprintf(stderr, "Addr 0x%08lx is node %d not %d\n", (unsigned long) Pages[i], Status[i], node);
            retval++;
         }
      }
   }

   delete [] Status;
   delete [] Nodes;
   delete [] Pages;
   return retval;
}

// Check that Count consecutive 4KB pages, starting at the page containing
// ptr, currently live on NUMA node 'node'. Nothing is moved: move_pages() is
// called with a NULL node array, which only reports each page's location.
//
//   ptr    any address inside the first page
//   node   node every page is expected to be on
//   Count  number of 4KB pages; values <= 0 do nothing and return 0
//
// Returns 0 when all pages are on 'node', the number of pages that are not
// (each one is listed on stderr), or the non-zero move_pages() result if the
// call itself failed.
int VerifyNumaNode(void *ptr, int node, int Count)
{
   if (Count <= 0)
   {
      return 0;
   }

   void **Pages = new void *[Count];
   int *Status = new int[Count];
   const unsigned long Base = ((unsigned long) ptr) & ~4095UL;
   int retval = 0;
   int i = 0;

   for (i = 0; i < Count; i++)
   {
      // Query the pages themselves. The previous code passed pointers to the
      // bookkeeping array, so it reported where that array lived rather than
      // the memory under test. The offset is computed in unsigned long so
      // regions larger than 2GB do not overflow int.
      Pages[i] = (void *) (Base + ((unsigned long) i) * 4096UL);
      Status[i] = -1;
   }

   retval = move_pages(0,       // this thread
                       Count,   // lots of pointers
                       Pages,   // the page addresses
                       NULL,    // no new node
                       Status,  // array of node results
                       MPOL_MF_MOVE);
   if (retval)
   {
      fprintf(stderr, "VerifyNumaNode move_pages failed\n");
   }
   else
   {
      // count (and list) every page that is not on the expected node
      for (i = 0; i < Count; i++)
      {
         if (Status[i] != node)
         {
            fprintf(stderr, "Verify Addr 0x%08lx is node %d not %d\n", (unsigned long) Pages[i], Status[i], node);
            retval++;
         }
      }
   }

   // release temp stuff
   delete [] Status;
   delete [] Pages;
   return retval;
}

// small demo program showing:
//
// a: huge page allocations via hugetlb are not node allocated
// b: huge pages cannot be move_page()'ed
// c: Replacing the shm*() with numa_alloc_node() has the exact same problem
// d: 4KB pages or 2MB pages act the same.

int main(int argc, char **argv)
{
   int Node = -1;
   unsigned long Size = 32UL * 1024UL *1024UL;  // default to 32MB

   if (argc >= 2)
   {
      Node = atoi(argv[1]);
   }
   if (argc >= 3)
   {
      Size = atol(argv[2]) * 1024UL * 1024UL;
   }

   unsigned long *Array = (unsigned long *) Allocate(Size, Node);
   exit(-1);
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to