Public bug reported: Disclaimer: I haven't reproduced this in a functional test, but based on the traceback that I gathered from a real environment as well as the fact that the proposed fix actually fixes this, I think my theory is correct
Description =========== `socket` PCI NUMA policy doesn't work if another instance is booted first on the same host Steps to reproduce ================== 1. Boot any instance. 2. Boot an instance with the `socket` PCI NUMA policy on the same host. Expected result =============== `socket` instance boots. Actual result ============= Instance creation fails with Details: Fault: {'code': 500, 'created': '2022-10-28T20:17:31Z', 'message': 'NotImplementedError'}. Server boot request ID: req-e3fd15d7-fb79-440f-b2f3-e6b2a5505e56. Environment =========== Originally reported as part of QE verification of [1], so stable/wallaby. Additional info =============== Playing around with the whitebox test for the socket policy [2] on a wallaby deployment, I noticed that the `socket` field in the compute.numa_topology column was being switched to `null` then back to its correct value (0 or 1). I added logging of the stack trace to the resource tracker _update() method right before it calls compute_node.save(), and found that `null` was getting saved when an instance was being booted or deleted. 
Example of a traceback: File "/usr/lib/python3.9/site-packages/nova/utils.py", line 686, in context_wrapper\n func(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 2126, in _locked_do_build_and_run_instance\n result = self._do_build_and_run_instance(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/exception_wrapper.py", line 63, in wrapped\n return f(self, context, *args, **kw)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 154, in decorated_function\n return function(self, context, *args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/utils.py", line 1434, in decorated_function\n return function(self, context, *args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 200, in decorated_function\n return function(self, context, *args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 2232, in _do_build_and_run_instance\n self._build_and_run_instance(context, instance, image,\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 2383, in _build_and_run_instance\n with self.rt.instance_claim(context, instance, node, allocs,\n' ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 197, in instance_claim\n self._update(elevated, cn)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 1247, in _update\n LOG.debug(\'artom: %s\', traceback.format_stack())\n'] _update /usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py:1247 Similarly for delete: 2022-10-28 21:57:27.091 2 DEBUG nova.compute.resource_tracker [req-c9fa718c-983e-416c-bc87-9564b8747294 
d6d16a793ab74fe6a0b5594d037d3165 599a6777a45d46a09a7e233a926b7675 - default default] artom: [' File "/usr/lib/python3.9/site-packages /eventlet/greenpool.py", line 88, in _spawn_n_impl\n func(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/futurist/_green.py", line 71, in __call__\n self.work.run()\n', ' File "/usr/lib/python3.9/site-packages/futur ist/_utils.py", line 49, in run\n result = self.fn(*self.args, **self.kwargs)\n', ' File "/usr/lib/python3.9/site-packages/oslo_messaging/rpc/server.py", line 165, in _process_incoming\n res = self.dispatcher.dispatch(message)\n', ' File "/usr/lib/python3.9/site-packages/oslo_messaging/rpc/dispatcher.py", line 309, in dispatch\n return self._do_dispatch(endpoint, method, ctxt, args)\n', ' File "/usr/lib/python3.9/site-packages/oslo_messaging/rpc/dispatcher.py" , line 229, in _do_dispatch\n result = func(ctxt, **new_args)\n', ' File "/usr/lib/python3.9/site-packages/nova/exception_wrapper.py", line 63, in wrapped\n return f(self, context, *args, **kw)\n', ' File "/usr/lib/python3.9/site- packages/nova/compute/manager.py", line 154, in decorated_function\n return function(self, context, *args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/utils.py", line 1434, in decorated_function\n return fun ction(self, context, *args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 200, in decorated_function\n return function(self, context, *args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packag es/nova/compute/manager.py", line 3072, in terminate_instance\n do_terminate_instance(instance, bdms)\n', ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 3060, in do_terminate_instance\n self._delete_instance(context, instance, bdms)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 3024, in 
_delete_instance\n self._complete_deletion(context, instance)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 828, in _complete_deletion\n self._update_resource_tracker(context, instance)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 596, in _update_resource_tracker\n self.rt.update_usage(context, instance, instance.node)\n', ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", li ne 360, in inner\n return f(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 658, in update_usage\n self._update(context.elevated(), self.compute_nodes[nodename])\n', ' File "/u sr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 1247, in _update\n LOG.debug(\'artom: %s\', traceback.format_stack())\n'] _update /usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py:1247 On the other hand, the resource tracker's periodic resource update task was saving the socket correctly: 2022-10-28 21:57:59.794 2 DEBUG nova.compute.resource_tracker [req-31329b8b-0de4-4b30-b2a1-dcd4d62369b4 - - - - -] artom: [' File "/usr/lib/python3.9/site-packages/eventlet/greenthread.py", line 221, in main\n result = function(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/oslo_service/loopingcall.py", line 150, in _run_loop\n result = func(*self.args, **self.kw)\n', ' File "/usr/lib/python3.9/site-packages/nova/service.py", line 307, in periodic_tasks\n return self.manager.periodic_tasks(ctxt, raise_on_error=raise_on_error)\n', ' File "/usr/lib/python3.9/site- packages/nova/manager.py", line 104, in periodic_tasks\n return self.run_periodic_tasks(context, raise_on_error=raise_on_error)\n', ' File "/usr/lib/python3.9/site-packages/oslo_service/periodic_task.py", line 216, in run_periodic_tasks\n task(self, context)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 10026, in update_available_resource\n 
self._update_available_resource_for_node(context, nodename,\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 9935, in _update_available_resource_for_node\n self.rt.update_available_resource(context, nodename,\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 896, in update_available_resource\n self._update_available_resource(context, resources, startup=startup)\n', ' File "/usr/lib/python3.9/site- packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site- packages/nova/compute/resource_tracker.py", line 1003, in _update_available_resource\n self._update(context, cn, startup=startup)\n', ' File "/usr/lib/python3.9/site- packages/nova/compute/resource_tracker.py", line 1247, in _update\n LOG.debug(\'artom: %s\', traceback.format_stack())\n'] _update /usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py:1247 Not included in the above for brevity is the log line showing what was actually being saved, you'll just have to trust me on this one ;) [1] https://bugzilla.redhat.com/show_bug.cgi?id=1883554 [2] https://review.opendev.org/c/openstack/whitebox-tempest-plugin/+/851447 ** Affects: nova Importance: Undecided Status: In Progress -- You received this bug notification because you are a member of Yahoo! Engineering Team, which is subscribed to OpenStack Compute (nova). 
https://bugs.launchpad.net/bugs/1995153 Title: `socket` PCI NUMA policy doesn't work if another instance is booted first on the same host Status in OpenStack Compute (nova): In Progress Bug description: Disclaimer: I haven't reproduced this in a functional test, but based on the traceback that I gathered from a real environment as well as the fact that the proposed fix actually fixes this, I think my theory is correct Description =========== `socket` PCI NUMA policy doesn't work if another instance is booted first on the same host Steps to reproduce ================== 1. Boot any instance. 2. Boot an instance with the `socket` PCI NUMA policy on the same host. Expected result =============== `socket` instance boots. Actual result ============= Instance creation fails with Details: Fault: {'code': 500, 'created': '2022-10-28T20:17:31Z', 'message': 'NotImplementedError'}. Server boot request ID: req-e3fd15d7-fb79-440f-b2f3-e6b2a5505e56. Environment =========== Originally reported as part of QE verification of [1], so stable/wallaby. Additional info =============== Playing around with the whitebox test for the socket policy [2] on a wallaby deployment, I noticed that the `socket` field in the compute.numa_topology column was being switched to `null` then back to its correct value (0 or 1). I added logging of the stack trace to the resource tracker _update() method right before it calls compute_node.save(), and found that `null` was getting saved when an instance was being booted or deleted. 
Example of a traceback: File "/usr/lib/python3.9/site-packages/nova/utils.py", line 686, in context_wrapper\n func(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 2126, in _locked_do_build_and_run_instance\n result = self._do_build_and_run_instance(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/exception_wrapper.py", line 63, in wrapped\n return f(self, context, *args, **kw)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 154, in decorated_function\n return function(self, context, *args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/utils.py", line 1434, in decorated_function\n return function(self, context, *args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 200, in decorated_function\n return function(self, context, *args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 2232, in _do_build_and_run_instance\n self._build_and_run_instance(context, instance, image,\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 2383, in _build_and_run_instance\n with self.rt.instance_claim(context, instance, node, allocs,\n' ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 197, in instance_claim\n self._update(elevated, cn)\n' ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 1247, in _update\n LOG.debug(\'artom: %s\', traceback.format_stack())\n'] _update /usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py:1247 Similarly for delete: 2022-10-28 21:57:27.091 2 DEBUG nova.compute.resource_tracker [req-c9fa718c-983e-416c-bc87-9564b8747294 
d6d16a793ab74fe6a0b5594d037d3165 599a6777a45d46a09a7e233a926b7675 - default default] artom: [' File "/usr/lib/python3.9/site-packages /eventlet/greenpool.py", line 88, in _spawn_n_impl\n func(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/futurist/_green.py", line 71, in __call__\n self.work.run()\n', ' File "/usr/lib/python3.9/site-packages/futur ist/_utils.py", line 49, in run\n result = self.fn(*self.args, **self.kwargs)\n', ' File "/usr/lib/python3.9/site-packages/oslo_messaging/rpc/server.py", line 165, in _process_incoming\n res = self.dispatcher.dispatch(message)\n', ' File "/usr/lib/python3.9/site-packages/oslo_messaging/rpc/dispatcher.py", line 309, in dispatch\n return self._do_dispatch(endpoint, method, ctxt, args)\n', ' File "/usr/lib/python3.9/site-packages/oslo_messaging/rpc/dispatcher.py" , line 229, in _do_dispatch\n result = func(ctxt, **new_args)\n', ' File "/usr/lib/python3.9/site-packages/nova/exception_wrapper.py", line 63, in wrapped\n return f(self, context, *args, **kw)\n', ' File "/usr/lib/python3.9/site- packages/nova/compute/manager.py", line 154, in decorated_function\n return function(self, context, *args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/utils.py", line 1434, in decorated_function\n return fun ction(self, context, *args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 200, in decorated_function\n return function(self, context, *args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packag es/nova/compute/manager.py", line 3072, in terminate_instance\n do_terminate_instance(instance, bdms)\n', ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 3060, in do_terminate_instance\n self._delete_instance(context, instance, bdms)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 3024, in 
_delete_instance\n self._complete_deletion(context, instance)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 828, in _complete_deletion\n self._update_resource_tracker(context, instance)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 596, in _update_resource_tracker\n self.rt.update_usage(context, instance, instance.node)\n', ' File "/usr/lib/python3.9/site-packages/oslo_concurrency/lockutils.py", li ne 360, in inner\n return f(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 658, in update_usage\n self._update(context.elevated(), self.compute_nodes[nodename])\n', ' File "/u sr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 1247, in _update\n LOG.debug(\'artom: %s\', traceback.format_stack())\n'] _update /usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py:1247 On the other hand, the resource tracker's periodic resource update task was saving the socket correctly: 2022-10-28 21:57:59.794 2 DEBUG nova.compute.resource_tracker [req-31329b8b-0de4-4b30-b2a1-dcd4d62369b4 - - - - -] artom: [' File "/usr/lib/python3.9/site-packages/eventlet/greenthread.py", line 221, in main\n result = function(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site-packages/oslo_service/loopingcall.py", line 150, in _run_loop\n result = func(*self.args, **self.kw)\n', ' File "/usr/lib/python3.9/site-packages/nova/service.py", line 307, in periodic_tasks\n return self.manager.periodic_tasks(ctxt, raise_on_error=raise_on_error)\n', ' File "/usr/lib/python3.9/site- packages/nova/manager.py", line 104, in periodic_tasks\n return self.run_periodic_tasks(context, raise_on_error=raise_on_error)\n', ' File "/usr/lib/python3.9/site-packages/oslo_service/periodic_task.py", line 216, in run_periodic_tasks\n task(self, context)\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 10026, in update_available_resource\n 
self._update_available_resource_for_node(context, nodename,\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/manager.py", line 9935, in _update_available_resource_for_node\n self.rt.update_available_resource(context, nodename,\n', ' File "/usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py", line 896, in update_available_resource\n self._update_available_resource(context, resources, startup=startup)\n', ' File "/usr/lib/python3.9/site- packages/oslo_concurrency/lockutils.py", line 360, in inner\n return f(*args, **kwargs)\n', ' File "/usr/lib/python3.9/site- packages/nova/compute/resource_tracker.py", line 1003, in _update_available_resource\n self._update(context, cn, startup=startup)\n', ' File "/usr/lib/python3.9/site- packages/nova/compute/resource_tracker.py", line 1247, in _update\n LOG.debug(\'artom: %s\', traceback.format_stack())\n'] _update /usr/lib/python3.9/site-packages/nova/compute/resource_tracker.py:1247 Not included in the above for brevity is the log line showing what was actually being saved, you'll just have to trust me on this one ;) [1] https://bugzilla.redhat.com/show_bug.cgi?id=1883554 [2] https://review.opendev.org/c/openstack/whitebox-tempest-plugin/+/851447 To manage notifications about this bug go to: https://bugs.launchpad.net/nova/+bug/1995153/+subscriptions -- Mailing list: https://launchpad.net/~yahoo-eng-team Post to : yahoo-eng-team@lists.launchpad.net Unsubscribe : https://launchpad.net/~yahoo-eng-team More help : https://help.launchpad.net/ListHelp