Skia has proposed merging ~hyask/autopkgtest-cloud:skia/amqp_semaphores into autopkgtest-cloud:master.
Requested reviews: Canonical's Ubuntu QA (canonical-ubuntu-qa) For more details, see: https://code.launchpad.net/~hyask/autopkgtest-cloud/+git/autopkgtest-cloud/+merge/465282 Fix locking mechanism of `cache-amqp`, plus add a `--refresh-semaphores` flag to help repair the system when it breaks. -- Your team Canonical's Ubuntu QA is requested to review the proposed merge of ~hyask/autopkgtest-cloud:skia/amqp_semaphores into autopkgtest-cloud:master.
diff --git a/charms/focal/autopkgtest-web/webcontrol/cache-amqp b/charms/focal/autopkgtest-web/webcontrol/cache-amqp index aaa252d..9461ea2 100755 --- a/charms/focal/autopkgtest-web/webcontrol/cache-amqp +++ b/charms/focal/autopkgtest-web/webcontrol/cache-amqp @@ -19,7 +19,7 @@ AMQP_CONTEXTS = ["ubuntu", "huge", "ppa", "upstream"] class AutopkgtestQueueContents: - def __init__(self, amqp_uri, database): + def __init__(self, amqp_uri, database, refresh_semaphores=False): assert amqp_uri is not None assert database is not None @@ -39,6 +39,14 @@ class AutopkgtestQueueContents: for release, arches in self.release_arches.items(): for arch in arches: queue_name = f"semaphore-{context}-{release}-{arch}" + if ( + os.path.exists("/run/autopkgtest-web-is-leader") + and refresh_semaphores + ): + self.amqp_channel.queue_delete(queue_name) + logger.info( + f"Semaphore queue '{queue_name}' deleted for recreation" + ) try: self.amqp_channel.queue_declare( queue_name, durable=True, passive=True @@ -70,14 +78,6 @@ class AutopkgtestQueueContents: "We are not the leader, and there is no semaphore queue yet, we can't do anything - exiting." 
) sys.exit(0) - # if the queue is empty basic_get will return None - if os.path.exists( - "/run/autopkgtest-web-is-leader" - ) and not self.amqp_channel.basic_get(queue_name): - self.amqp_channel.basic_publish( - amqp.Message(f"{queue_name}", delivery_mode=2), - routing_key=queue_name, - ) @property def release_arches(self): @@ -230,6 +230,13 @@ if __name__ == "__main__": help="Print debugging (give twice for super verbose output)", ) parser.add_argument( + "--refresh-semaphores", + dest="refresh_semaphores", + action="store_true", + help="Force the recreation of the semaphore queues if something broke them " + "(make sure to stop every running cache-amqp script before use)", + ) + parser.add_argument( "-o", "--output", dest="output", @@ -282,7 +289,9 @@ if __name__ == "__main__": # make the queue size go crazy in the KPI if os.path.isfile("/run/autopkgtest-web-is-leader"): # Get queue details from rabbitmq directly - aq = AutopkgtestQueueContents(amqp_uri, database) + aq = AutopkgtestQueueContents( + amqp_uri, database, args.refresh_semaphores + ) queue_contents = aq.get_queue_contents() else: # We get queues.json from autopkgtest.ubuntu.com, see if it's diff --git a/docs/administration.rst b/docs/administration.rst index 49739da..4673fd1 100644 --- a/docs/administration.rst +++ b/docs/administration.rst @@ -148,6 +148,22 @@ indicates something to be looked into. ``armhf`` cluster nodes in error almost always need checking out, as they usually indicate that the LXD host has gone down and needs redeploying. +If the queues are non empty but flat +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This may indicate that the infra is somehow unable to process jobs, but +sometimes this is just related to ``cache-amqp`` being stuck somehow. +This script runs on the webunits, and does its job on the leader of those +units. It has a semaphore mechanism, so should be able to work in a fully +distributed system. 
However, this hasn't been maintained much, and sometimes +these semaphores can break, either by having more than one message in the +``semaphore-<queuename>-<release>-<arch>`` queue, or by having none. You can fix +that by stopping all the ``cache-amqp`` services (on all units!), and manually +running ``cache-amqp --refresh-semaphores --debug`` on the leader, which will +nuke the semaphore queues and recreate them. The ``--debug`` will help you +figure out if something goes wrong. + + Opening up a new series -----------------------
-- Mailing list: https://launchpad.net/~canonical-ubuntu-qa Post to : canonical-ubuntu-qa@lists.launchpad.net Unsubscribe : https://launchpad.net/~canonical-ubuntu-qa More help : https://help.launchpad.net/ListHelp