zephyr: Use exponential backoffs in retry loops.

This reduces the number of retries that might spam APIs.

There is some complexity here which is left un-managed -- for
instance, maybe_restart_mirroring_script does a number of restart
attempts, and then fails, but will be retried every 15s by the
surrounding `process_loop`.  Previously, it would merely have looped
forever inside maybe_restart_mirroring_script.

Three loops are intentionally left as infinite `while True` loops,
that merely cap their backoff at the default 90s.  Their callers do
not expect, or have any way to handle more gracefully, a failure of
the expected-infinite-loop in `process_loop` or `zulip_to_zephyr`.
They maintain their previous behavior of retrying forever, albeit more
slowly.
This commit is contained in:
Alex Vandiver 2020-07-31 18:59:28 -07:00 committed by Alex Vandiver
parent 8670cce8e9
commit a20c9cc6d7

View file

@ -17,6 +17,8 @@ import hashlib
import tempfile
import select
from zulip import RandomExponentialBackoff
DEFAULT_SITE = "https://api.zulip.com"
class States:
@ -218,32 +220,41 @@ def maybe_restart_mirroring_script() -> None:
except OSError:
# We don't care whether we failed to cancel subs properly, but we should log it
logger.exception("")
while True:
backoff = RandomExponentialBackoff(
maximum_retries=3,
)
while backoff.keep_going():
try:
os.execvp(os.path.abspath(__file__), sys.argv)
# No need for backoff.succeed, since this can't be reached
except Exception:
logger.exception("Error restarting mirroring script; trying again... Traceback:")
time.sleep(1)
backoff.fail()
raise Exception("Failed to reload too many times, aborting!")
def process_loop(log: Optional[IO[Any]]) -> None:
restart_check_count = 0
last_check_time = time.time()
recieve_backoff = RandomExponentialBackoff()
while True:
select.select([zephyr._z.getFD()], [], [], 15)
try:
process_backoff = RandomExponentialBackoff()
# Fetch notices from the queue until its empty
while True:
notice = zephyr.receive(block=False)
recieve_backoff.succeed()
if notice is None:
break
try:
process_notice(notice, log)
process_backoff.succeed()
except Exception:
logger.exception("Error relaying zephyr:")
time.sleep(2)
process_backoff.fail()
except Exception:
logger.exception("Error checking for new zephyrs:")
time.sleep(1)
recieve_backoff.fail()
continue
if time.time() - last_check_time > 15:
@ -759,12 +770,13 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
def zulip_to_zephyr(options: int) -> None:
# Sync messages from zulip to zephyr
logger.info("Starting syncing messages.")
backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
while True:
try:
zulip_client.call_on_each_message(maybe_forward_to_zephyr)
except Exception:
logger.exception("Error syncing messages:")
time.sleep(1)
backoff.fail()
def subscribed_to_mail_messages() -> bool:
# In case we have lost our AFS tokens and those won't be able to