Public bug reported:

It is possible that placement gets out of sync, which can cause scheduling problems that would otherwise go unnoticed. I've built out this script, which would be nice to have as `nova-manage placement audit`:
================================================================================
#!/usr/bin/env python3
import argparse
import sys

from openstack import connection
import openstack.config

config = openstack.config.OpenStackConfig()

parser = argparse.ArgumentParser()
config.register_argparse_arguments(parser, sys.argv)
options = parser.parse_args()

cloud_region = config.get_one(argparse=options)
conn = connection.Connection(config=cloud_region)

# Grab a list of all hypervisors and their servers.
hypervisors = conn.compute.get(
    '/os-hypervisors?with_servers=true',
    microversion='2.53').json().get('hypervisors')

# Generate a dictionary mapping of hypervisor => [instances].
hypervisor_mapping = {h['id']: [s['uuid'] for s in h.get('servers', [])]
                      for h in hypervisors}
hypervisor_names = {h['id']: h['hypervisor_hostname'] for h in hypervisors}

# Grab a list of all resource providers.
resource_providers = conn.placement.get(
    '/resource_providers').json().get('resource_providers')

for rp in resource_providers:
    # Skip providers that have neither VCPU nor MEMORY_MB inventory
    # (i.e. they are not compute nodes).
    inventories = conn.placement.get(
        '/resource_providers/%s/inventories' % rp['uuid']
    ).json().get('inventories')
    if 'MEMORY_MB' not in inventories and 'VCPU' not in inventories:
        continue

    # Get all allocations for the RP.
    allocations = conn.placement.get(
        '/resource_providers/%s/allocations' % rp['uuid']
    ).json().get('allocations')

    # Is there a compute node for this RP?
    if rp['uuid'] not in hypervisor_mapping:
        print("openstack resource provider delete %s "
              "# resource provider has no matching compute node"
              % rp['uuid'])
        continue

    for allocation_id, info in allocations.items():
        # The instance does not exist where placement says it should be.
        if allocation_id not in hypervisor_mapping[rp['uuid']]:
            hypervisor = None

            # Try to find where it's hiding.
            for hyp, instances in hypervisor_mapping.items():
                if allocation_id in instances:
                    hypervisor = hyp
                    break

            # We found it.
            if hypervisor:
                classes = ','.join("%s=%s" % (key, value)
                                   for key, value
                                   in info.get('resources').items())
                print("openstack resource provider allocation set "
                      "--allocation rp=%s,%s %s "
                      "# instance allocated on wrong rp"
                      % (hypervisor, classes, allocation_id))
                continue

            # We don't know where this is. Let's see if it exists in
            # Nova (servers live in the compute API, not placement).
            server = conn.compute.get('/servers/%s' % allocation_id)
            if server.status_code == 404:
                print("openstack resource provider allocation delete %s "
                      "# instance deleted" % allocation_id)
                continue

            # TODO: idk? edge cases?
            raise Exception('unhandled allocation %s on %s'
                            % (allocation_id, rp['uuid']))
================================================================================

It would likely need to be rewritten to use the built-in placement HTTP client and objects to avoid extra API calls.
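One easy win on the extra-API-calls front: the per-provider inventory lookups above can be pushed server-side with the `resources` query parameter, which the placement API has supported since microversion 1.4. A rough sketch, for illustration only (it reuses the same openstacksdk request pattern as the script; `openstack.connect()` just replaces the explicit config plumbing):

================================================================================
#!/usr/bin/env python3
# Sketch only, not the proposed nova-manage code: ask placement to
# filter providers server-side instead of fetching every provider's
# inventories one request at a time.
import openstack

conn = openstack.connect()

# A single request returns only providers that can satisfy VCPU and
# MEMORY_MB requests, i.e. compute nodes, so the audit loop no longer
# needs a GET /resource_providers/{uuid}/inventories call per provider.
compute_rps = conn.placement.get(
    '/resource_providers?resources=VCPU:1,MEMORY_MB:1',
    microversion='1.4').json().get('resource_providers', [])

for rp in compute_rps:
    allocations = conn.placement.get(
        '/resource_providers/%s/allocations' % rp['uuid']
    ).json().get('allocations', {})
    print('%s: %d consumers' % (rp['uuid'], len(allocations)))
================================================================================

An in-tree `nova-manage` version would presumably go through nova's own placement report client (nova/scheduler/client/report.py) rather than raw HTTP, but the same server-side filtering would apply.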
** Affects: nova
     Importance: Undecided
         Status: New

https://bugs.launchpad.net/bugs/1793569

Title:
  Add placement audit commands

Status in OpenStack Compute (nova):
  New

To manage notifications about this bug go to:
https://bugs.launchpad.net/nova/+bug/1793569/+subscriptions