zmirror: Add backoff-retry code for the Zephyr=>Humbug initialization.

(imported from commit bde30d87d6dae75bbfdcd49b30a1f08b3c585ec7)
This commit is contained in:
Tim Abbott 2012-12-21 11:59:41 -05:00
parent 805e508d02
commit 67ee231ff9

View file

@ -34,6 +34,37 @@ import signal
import logging import logging
import hashlib import hashlib
import tempfile import tempfile
import random
class CountingBackoff(object):
def __init__(self, maximum_retries=10):
self.number_of_retries = 0
self.maximum_retries = maximum_retries
def keep_going(self):
return self.number_of_retries < self.maximum_retries
def succeed(self):
self.number_of_retries = 0
def fail(self):
self.number_of_retries = min(self.number_of_retries + 1,
self.maximum_retries)
class RandomExponentialBackoff(CountingBackoff):
def fail(self):
self.number_of_retries = min(self.number_of_retries + 1,
self.maximum_retries)
# Exponential growth with ratio sqrt(2); compute random delay
# between x and 2x where x is growing exponentially
delay_scale = int(2 ** (self.number_of_retries / 2.0 - 1)) + 1
delay = delay_scale + random.randint(1, delay_scale)
message = "Sleeping for %ss [max %s] before retrying." % (delay, delay_scale * 2)
try:
logger.warning(message)
except NameError:
print message
time.sleep(delay)
DEFAULT_SITE = "https://humbughq.com" DEFAULT_SITE = "https://humbughq.com"
@ -199,18 +230,21 @@ def update_subscriptions():
if len(classes_to_subscribe) > 0: if len(classes_to_subscribe) > 0:
zephyr_bulk_subscribe(list(classes_to_subscribe)) zephyr_bulk_subscribe(list(classes_to_subscribe))
def maybe_restart_mirroring_script(): def maybe_kill_child():
if os.stat(os.path.join(options.root_path, "stamps", "restart_stamp")).st_mtime > start_time or \
((options.user == "tabbott" or options.user == "tabbott/extra") and
os.stat(os.path.join(options.root_path, "stamps", "tabbott_stamp")).st_mtime > start_time):
logger.warning("")
logger.warning("zephyr mirroring script has been updated; restarting...")
try: try:
if child_pid is not None: if child_pid is not None:
os.kill(child_pid, signal.SIGTERM) os.kill(child_pid, signal.SIGTERM)
except OSError: except OSError:
# We don't care if the child process no longer exists, so just log the error # We don't care if the child process no longer exists, so just log the error
logger.exception("") logger.exception("")
def maybe_restart_mirroring_script():
if os.stat(os.path.join(options.root_path, "stamps", "restart_stamp")).st_mtime > start_time or \
((options.user == "tabbott" or options.user == "tabbott/extra") and
os.stat(os.path.join(options.root_path, "stamps", "tabbott_stamp")).st_mtime > start_time):
logger.warning("")
logger.warning("zephyr mirroring script has been updated; restarting...")
maybe_kill_child()
try: try:
zephyr._z.cancelSubs() zephyr._z.cancelSubs()
except IOError: except IOError:
@ -358,18 +392,43 @@ def decode_unicode_byte_strings(zeph):
zeph[field] = decoded zeph[field] = decoded
return zeph return zeph
def quit_failed_initialization(message):
logger.error(message)
maybe_kill_child()
sys.exit(1)
def zephyr_init_autoretry():
backoff = RandomExponentialBackoff()
while backoff.keep_going():
try:
# zephyr.init() tries to clear old subscriptions, and thus
# sometimes gets a SERVNAK from the server
zephyr.init()
backoff.succeed()
return
except IOError:
logger.exception("Error initializing Zephyr library (retrying). Traceback:")
backoff.fail()
quit_failed_initialization("Could not initialize Zephyr library, quitting!")
def zephyr_subscribe_autoretry(sub): def zephyr_subscribe_autoretry(sub):
while True: backoff = RandomExponentialBackoff()
while backoff.keep_going():
try: try:
zephyr.Subscriptions().add(sub) zephyr.Subscriptions().add(sub)
backoff.succeed()
return return
except IOError: except IOError:
# Probably a SERVNAK from the zephyr server, but log the # Probably a SERVNAK from the zephyr server, but log the
# traceback just in case it's something else # traceback just in case it's something else
logger.exception("Error subscribing to personals (retrying). Traceback:") logger.exception("Error subscribing to personals (retrying). Traceback:")
time.sleep(1) backoff.fail()
quit_failed_initialization("Could not subscribe to personals, quitting!")
def zephyr_to_humbug(options): def zephyr_to_humbug(options):
zephyr_init_autoretry()
if options.forward_class_messages: if options.forward_class_messages:
update_subscriptions() update_subscriptions()
if options.forward_personals: if options.forward_personals:
@ -404,7 +463,7 @@ def zephyr_to_humbug(options):
logger.exception("Could not send saved zephyr:") logger.exception("Could not send saved zephyr:")
time.sleep(2) time.sleep(2)
logger.info("Starting receive loop.") logger.info("Successfully initialized; Starting receive loop.")
if options.log_path is not None: if options.log_path is not None:
with open(options.log_path, 'a') as log: with open(options.log_path, 'a') as log:
@ -898,15 +957,6 @@ or specify the --api-key-file option.""" % (options.api_key_file,))))
CURRENT_STATE = States.ZephyrToHumbug CURRENT_STATE = States.ZephyrToHumbug
import zephyr import zephyr
while True:
try:
# zephyr.init() tries to clear old subscriptions, and thus
# sometimes gets a SERVNAK from the server
zephyr.init()
break
except IOError:
logger.exception("")
time.sleep(1)
logger_name = "zephyr=>humbug" logger_name = "zephyr=>humbug"
if options.shard is not None: if options.shard is not None:
logger_name += "(%s)" % (options.shard,) logger_name += "(%s)" % (options.shard,)