From a20c9cc6d7059bf40989babfd148363d59deb152 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Fri, 31 Jul 2020 18:59:28 -0700 Subject: [PATCH] zephyr: Use exponential backoffs in retry loops. This reduces the number of retries that might spam APIs. There is some complexity here which is left un-managed -- for instance, maybe_restart_mirroring_script does a number of restart attempts, and then fails, but will be retried every 15s by the surrounding `process_loop`. Previously, it would merely have looped forever inside maybe_restart_mirroring_script. Three loops are intentionally left as infinite `while True` loops, that merely cap their backoff at the default 90s. Their callers do not expect, or have any way to handle more gracefully, a failure of the expected-infinite-loop in `process_loop` or `zulip_to_zephyr`. They maintain their previous behavior of retrying forever, albeit more slowly. --- .../zephyr/zephyr_mirror_backend.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/zulip/integrations/zephyr/zephyr_mirror_backend.py b/zulip/integrations/zephyr/zephyr_mirror_backend.py index 8b51971..326c588 100755 --- a/zulip/integrations/zephyr/zephyr_mirror_backend.py +++ b/zulip/integrations/zephyr/zephyr_mirror_backend.py @@ -17,6 +17,8 @@ import hashlib import tempfile import select +from zulip import RandomExponentialBackoff + DEFAULT_SITE = "https://api.zulip.com" class States: @@ -218,32 +220,41 @@ def maybe_restart_mirroring_script() -> None: except OSError: # We don't care whether we failed to cancel subs properly, but we should log it logger.exception("") - while True: + backoff = RandomExponentialBackoff( + maximum_retries=3, + ) + while backoff.keep_going(): try: os.execvp(os.path.abspath(__file__), sys.argv) + # No need for backoff.succeed, since this can't be reached except Exception: logger.exception("Error restarting mirroring script; trying again... Traceback:") - time.sleep(1) + backoff.fail() + raise Exception("Failed to reload too many times, aborting!") def process_loop(log: Optional[IO[Any]]) -> None: restart_check_count = 0 last_check_time = time.time() + recieve_backoff = RandomExponentialBackoff() while True: select.select([zephyr._z.getFD()], [], [], 15) try: + process_backoff = RandomExponentialBackoff() # Fetch notices from the queue until its empty while True: notice = zephyr.receive(block=False) + recieve_backoff.succeed() if notice is None: break try: process_notice(notice, log) + process_backoff.succeed() except Exception: logger.exception("Error relaying zephyr:") - time.sleep(2) + process_backoff.fail() except Exception: logger.exception("Error checking for new zephyrs:") - time.sleep(1) + recieve_backoff.fail() continue if time.time() - last_check_time > 15: @@ -759,12 +770,13 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None: def zulip_to_zephyr(options: int) -> None: # Sync messages from zulip to zephyr logger.info("Starting syncing messages.") + backoff = RandomExponentialBackoff(timeout_success_equivalent=120) while True: try: zulip_client.call_on_each_message(maybe_forward_to_zephyr) except Exception: logger.exception("Error syncing messages:") - time.sleep(1) + backoff.fail() def subscribed_to_mail_messages() -> bool: # In case we have lost our AFS tokens and those won't be able to