zephyr: Use exponential backoffs in retry loops.
This reduces the number of retries that might spam APIs. There is some complexity here which is left un-managed -- for instance, maybe_restart_mirroring_script does a number of restart attempts, and then fails, but will be retried every 15s by the surrounding `process_loop`. Previously, it would merely have looped forever inside maybe_restart_mirroring_script. Three loops are intentionally left as infinite `while True` loops, that merely cap their backoff at the default 90s. Their callers do not expect, or have any way to handle more gracefully, a failure of the expected-infinite-loop in `process_loop` or `zulip_to_zephyr`. They maintain their previous behavior of retrying forever, albeit more slowly.
This commit is contained in:
		
							parent
							
								
									8670cce8e9
								
							
						
					
					
						commit
						a20c9cc6d7
					
				
					 1 changed files with 17 additions and 5 deletions
				
			
		| 
						 | 
				
			
			@ -17,6 +17,8 @@ import hashlib
 | 
			
		|||
import tempfile
 | 
			
		||||
import select
 | 
			
		||||
 | 
			
		||||
from zulip import RandomExponentialBackoff
 | 
			
		||||
 | 
			
		||||
DEFAULT_SITE = "https://api.zulip.com"
 | 
			
		||||
 | 
			
		||||
class States:
 | 
			
		||||
| 
						 | 
				
			
			@ -218,32 +220,41 @@ def maybe_restart_mirroring_script() -> None:
 | 
			
		|||
        except OSError:
 | 
			
		||||
            # We don't care whether we failed to cancel subs properly, but we should log it
 | 
			
		||||
            logger.exception("")
 | 
			
		||||
        while True:
 | 
			
		||||
        backoff = RandomExponentialBackoff(
 | 
			
		||||
            maximum_retries=3,
 | 
			
		||||
        )
 | 
			
		||||
        while backoff.keep_going():
 | 
			
		||||
            try:
 | 
			
		||||
                os.execvp(os.path.abspath(__file__), sys.argv)
 | 
			
		||||
                # No need for backoff.succeed, since this can't be reached
 | 
			
		||||
            except Exception:
 | 
			
		||||
                logger.exception("Error restarting mirroring script; trying again... Traceback:")
 | 
			
		||||
                time.sleep(1)
 | 
			
		||||
                backoff.fail()
 | 
			
		||||
        raise Exception("Failed to reload too many times, aborting!")
 | 
			
		||||
 | 
			
		||||
def process_loop(log: Optional[IO[Any]]) -> None:
 | 
			
		||||
    restart_check_count = 0
 | 
			
		||||
    last_check_time = time.time()
 | 
			
		||||
    recieve_backoff = RandomExponentialBackoff()
 | 
			
		||||
    while True:
 | 
			
		||||
        select.select([zephyr._z.getFD()], [], [], 15)
 | 
			
		||||
        try:
 | 
			
		||||
            process_backoff = RandomExponentialBackoff()
 | 
			
		||||
            # Fetch notices from the queue until its empty
 | 
			
		||||
            while True:
 | 
			
		||||
                notice = zephyr.receive(block=False)
 | 
			
		||||
                recieve_backoff.succeed()
 | 
			
		||||
                if notice is None:
 | 
			
		||||
                    break
 | 
			
		||||
                try:
 | 
			
		||||
                    process_notice(notice, log)
 | 
			
		||||
                    process_backoff.succeed()
 | 
			
		||||
                except Exception:
 | 
			
		||||
                    logger.exception("Error relaying zephyr:")
 | 
			
		||||
                    time.sleep(2)
 | 
			
		||||
                    process_backoff.fail()
 | 
			
		||||
        except Exception:
 | 
			
		||||
            logger.exception("Error checking for new zephyrs:")
 | 
			
		||||
            time.sleep(1)
 | 
			
		||||
            recieve_backoff.fail()
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
        if time.time() - last_check_time > 15:
 | 
			
		||||
| 
						 | 
				
			
			@ -759,12 +770,13 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
 | 
			
		|||
def zulip_to_zephyr(options: int) -> None:
 | 
			
		||||
    # Sync messages from zulip to zephyr
 | 
			
		||||
    logger.info("Starting syncing messages.")
 | 
			
		||||
    backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
 | 
			
		||||
    while True:
 | 
			
		||||
        try:
 | 
			
		||||
            zulip_client.call_on_each_message(maybe_forward_to_zephyr)
 | 
			
		||||
        except Exception:
 | 
			
		||||
            logger.exception("Error syncing messages:")
 | 
			
		||||
            time.sleep(1)
 | 
			
		||||
            backoff.fail()
 | 
			
		||||
 | 
			
		||||
def subscribed_to_mail_messages() -> bool:
 | 
			
		||||
    # In case we have lost our AFS tokens and those won't be able to
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue