zephyr: Use exponential backoffs in retry loops.
This reduces the number of retries that might otherwise spam APIs. Some complexity is deliberately left unmanaged: for instance, `maybe_restart_mirroring_script` now makes a bounded number of restart attempts and then fails, but the surrounding `process_loop` will retry it every 15s; previously, it would simply have looped forever inside `maybe_restart_mirroring_script`. Three loops are intentionally left as infinite `while True` loops that merely cap their backoff at the default 90s: their callers do not expect, and have no way to handle more gracefully, a failure of the expected-infinite loops in `process_loop` or `zulip_to_zephyr`, so those loops keep their previous behavior of retrying forever, albeit more slowly.
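For reference, the bounded-retry shape that `maybe_restart_mirroring_script` adopts below looks roughly like the following sketch (not part of the diff). `do_restart()` is a hypothetical stand-in for re-exec'ing the script, and the sketch assumes, as the diff does, that `RandomExponentialBackoff.fail()` records the failure and sleeps for a randomized, exponentially growing interval before returning:

import logging

from zulip import RandomExponentialBackoff

logger = logging.getLogger(__name__)


def do_restart() -> None:
    # Hypothetical stand-in for os.execvp(os.path.abspath(__file__), sys.argv).
    raise OSError("restart failed")


def restart_with_bounded_retries() -> None:
    # Up to three attempts; each failure sleeps via backoff.fail() with an
    # exponentially growing, randomized delay.
    backoff = RandomExponentialBackoff(maximum_retries=3)
    while backoff.keep_going():
        try:
            do_restart()
            return  # a successful exec never reaches this point in the real script
        except Exception:
            logger.exception("Restart attempt failed; backing off...")
            backoff.fail()
    # keep_going() returned False: give up and let the caller retry later.
    raise Exception("Failed to reload too many times, aborting!")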
parent 8670cce8e9
commit a20c9cc6d7
@@ -17,6 +17,8 @@ import hashlib
 import tempfile
 import select
 
+from zulip import RandomExponentialBackoff
+
 DEFAULT_SITE = "https://api.zulip.com"
 
 class States:
@@ -218,32 +220,41 @@ def maybe_restart_mirroring_script() -> None:
         except OSError:
             # We don't care whether we failed to cancel subs properly, but we should log it
             logger.exception("")
-        while True:
+        backoff = RandomExponentialBackoff(
+            maximum_retries=3,
+        )
+        while backoff.keep_going():
             try:
                 os.execvp(os.path.abspath(__file__), sys.argv)
+                # No need for backoff.succeed, since this can't be reached
             except Exception:
                 logger.exception("Error restarting mirroring script; trying again... Traceback:")
-                time.sleep(1)
+                backoff.fail()
+        raise Exception("Failed to reload too many times, aborting!")
 
 def process_loop(log: Optional[IO[Any]]) -> None:
     restart_check_count = 0
     last_check_time = time.time()
+    recieve_backoff = RandomExponentialBackoff()
     while True:
         select.select([zephyr._z.getFD()], [], [], 15)
         try:
+            process_backoff = RandomExponentialBackoff()
             # Fetch notices from the queue until its empty
             while True:
                 notice = zephyr.receive(block=False)
+                recieve_backoff.succeed()
                 if notice is None:
                     break
                 try:
                     process_notice(notice, log)
+                    process_backoff.succeed()
                 except Exception:
                     logger.exception("Error relaying zephyr:")
-                    time.sleep(2)
+                    process_backoff.fail()
         except Exception:
             logger.exception("Error checking for new zephyrs:")
-            time.sleep(1)
+            recieve_backoff.fail()
             continue
 
         if time.time() - last_check_time > 15:
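Aside, not part of the diff: `process_loop` above keeps two independent backoff counters, one for failures while checking for new zephyrs and one for failures while relaying an individual notice, so a run of bad notices does not slow down polling and vice versa. A stripped-down sketch of that shape, with `poll_once()` and `handle()` as hypothetical stand-ins for draining `zephyr.receive(block=False)` and for `process_notice(...)`:

import logging

from zulip import RandomExponentialBackoff

logger = logging.getLogger(__name__)


def poll_once() -> list:
    # Hypothetical stand-in for draining zephyr.receive(block=False).
    return []


def handle(item: object) -> None:
    # Hypothetical stand-in for process_notice(notice, log).
    pass


def process_items_forever() -> None:
    receive_backoff = RandomExponentialBackoff()
    while True:
        try:
            process_backoff = RandomExponentialBackoff()
            items = poll_once()
            receive_backoff.succeed()  # polling itself worked; reset that backoff
            for item in items:
                try:
                    handle(item)
                    process_backoff.succeed()
                except Exception:
                    logger.exception("Error handling item:")
                    process_backoff.fail()  # sleeps, slowing only this batch
        except Exception:
            logger.exception("Error polling:")
            receive_backoff.fail()  # sleeps before the next poll attempt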
@@ -759,12 +770,13 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
 def zulip_to_zephyr(options: int) -> None:
     # Sync messages from zulip to zephyr
     logger.info("Starting syncing messages.")
+    backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
     while True:
         try:
             zulip_client.call_on_each_message(maybe_forward_to_zephyr)
         except Exception:
             logger.exception("Error syncing messages:")
-            time.sleep(1)
+            backoff.fail()
 
 def subscribed_to_mail_messages() -> bool:
     # In case we have lost our AFS tokens and those won't be able to
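For the loops that are intentionally left infinite, such as `zulip_to_zephyr` above, the shape is roughly the following sketch (again not part of the diff). `run_sync()` is a hypothetical stand-in for `zulip_client.call_on_each_message(...)`, and `timeout_success_equivalent=120` is presumably there so that a call which ran for a couple of minutes before failing counts as a success and resets the backoff; with no `maximum_retries`, the delay simply grows until it hits the default 90s cap:

import logging

from zulip import RandomExponentialBackoff

logger = logging.getLogger(__name__)


def run_sync() -> None:
    # Hypothetical stand-in for zulip_client.call_on_each_message(...),
    # which normally blocks and only returns or raises on error.
    raise ConnectionError("lost connection")


def sync_forever() -> None:
    # Retries forever: each failure sleeps via backoff.fail(), with the delay
    # growing up to the default cap (90s, per the commit message).
    backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
    while True:
        try:
            run_sync()
        except Exception:
            logger.exception("Error syncing messages:")
            backoff.fail()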