Rewrite check-mirroring to be more robust and flexible.

Features include:
* Not forking into two processes (shells out to zwrite to send
  instead).  This makes life easier since we're not doing concurrent
  programming.
* Eliminated a lot of hard-to-read or unnecessary debugging output.
* Adding explanatory test suggesting the likely problem for some
  common sets of received messages.
* Much less code duplication.
* Support for testing a sharded zephyr_mirror script (--sharded).
* Use of the logging module to print timestamps -- makes debugging
  some issues a lot.
* Only one sleep, and for only 10 seconds, between sending the
  outgoing messages and checking that they were received.
* Support for running two copies of this script at the same time, so that
  running it manually doesn't screw up Nagios.
* Passed running 100 tests run in a row.

(imported from commit a3ec02ac1d1a04972e469ca30fec1790c4fb53bc)
This commit is contained in:
Tim Abbott 2012-11-28 22:34:49 -05:00
parent abb95fca04
commit 4bfd784423

View file

@ -4,6 +4,8 @@ import time
import optparse
import os
import random
import logging
import subprocess
parser = optparse.OptionParser()
parser.add_option('--verbose',
@ -14,6 +16,9 @@ parser.add_option('--site',
dest='site',
default="https://humbughq.com",
action='store')
parser.add_option('--sharded',
default=False,
action='store_true')
parser.add_option('--root-path',
dest='root_path',
default="/home/humbug",
@ -27,11 +32,6 @@ sys.path[:0] = [os.path.join(options.root_path, "python-zephyr"),
mit_user = 'tabbott/extra@ATHENA.MIT.EDU'
humbug_user = 'tabbott/extra@mit.edu'
hzkey1 = random.getrandbits(32)
hzkey2 = random.getrandbits(32)
zhkey1 = random.getrandbits(32)
zhkey2 = random.getrandbits(32)
sys.path.append(".")
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
import api.common
@ -41,6 +41,51 @@ humbug_client = api.common.HumbugAPI(email=humbug_user,
client="test: Humbug API",
site=options.site)
# Configure logging
log_file = "/home/humbug/check-mirroring-log"
log_format = "%(asctime)s: %(message)s"
logging.basicConfig(format=log_format)
formatter = logging.Formatter(log_format)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
# Initialize list of streams to test
if options.sharded:
# NOTE: Streams in this list must be in humbug_user's Humbug
# subscriptions, or we won't receive messages via Humbug.
# The sharded stream list has a bunch of pairs
# (stream, shard_name), where sha1sum(stream).startswith(shard_name)
test_streams = [
("message", "p"),
("tabbott-nagios-test-15", "0"),
("tabbott-nagios-test-1", "1"),
("tabbott-nagios-test-4", "2"),
("tabbott-nagios-test-32", "3"),
("tabbott-nagios-test-6", "4"),
("tabbott-nagios-test-93", "5"),
("tabbott-nagios-test-23", "6"),
("tabbott-nagios-test-16", "7"),
("tabbott-nagios-test-22", "8"),
("tabbott-nagios-test-3", "9"),
("tabbott-nagios-test-2", "a"),
("tabbott-nagios-test-10", "b"),
("tabbott-nagios-test-14", "c"),
("tabbott-nagios-test-8", "d"),
("tabbott-nagios-test-13", "e"),
("tabbott-nagios-test-45", "f"),
]
else:
test_streams = [
("message", "p"),
("tabbott-nagios-test", "a"),
]
def print_status_and_exit(status):
# The output of this script is used by Nagios. Various outputs,
# e.g. true success and punting due to a SERVNAK, result in a
@ -49,121 +94,207 @@ def print_status_and_exit(status):
print status
sys.exit(status)
def print_zephyr(notice):
print notice.cls, notice.instance, notice.sender, notice.message.split('\0')[1]
def send_humbug(message):
result = humbug_client.send_message(message)
if result["result"] != "success":
logger.error("Error sending humbug, args were:")
logger.error(message)
logger.error(result)
print_status_and_exit(1)
def print_humbug(message):
if message['type'] == "stream":
print message["type"], message['display_recipient'], message['subject'], \
message['sender_email'], message['content']
def send_zephyr(zwrite_args, content):
p = subprocess.Popen(zwrite_args, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate(input=content.encode("utf-8"))
if p.returncode != 0:
logger.error("Error sending zephyr:")
logger.info(stdout)
logger.error(stderr)
print_status_and_exit(1)
# Subscribe to Humbugs
try:
res = humbug_client.get_profile()
max_message_id = res.get('max_message_id')
if max_message_id is None:
logging.error("Error subscribing to Humbugs!")
logging.error(res)
print_status_and_exit(1)
except Exception:
logger.exception("Unexpected error subscribing to Humbugs")
print_status_and_exit(1)
# Subscribe to Zephyrs
import zephyr
zephyr_subs_to_add = []
for (stream, test) in test_streams:
if stream == "message":
zephyr_subs_to_add.append((stream, 'personal', mit_user))
else:
print message["type"], message['sender_email'], \
message['display_recipient'], message['content']
zephyr_subs_to_add.append((stream, '*', '*'))
max_message_id = humbug_client.get_profile()['max_message_id']
for tries in xrange(10):
try:
zephyr.init()
zephyr._z.subAll(zephyr_subs_to_add)
zephyr_subs = zephyr._z.getSubscriptions()
for elt in zephyr_subs_to_add:
if elt not in zephyr_subs:
logging.error("Failed to subscribe to %s" % (elt,))
continue
break
except IOError, e:
if tries > 5:
if "SERVNAK received" in e:
logger.error("SERVNAK repeatedly received, punting rest of test")
else:
logger.exception("Exception subscribing to zephyrs")
print_status_and_exit(1)
child_pid = os.fork()
if child_pid == 0:
# Run the humbug => zephyr mirror in the child
time.sleep(5)
result = humbug_client.send_message({
# Prepare keys
zhkeys = {}
hzkeys = {}
def gen_keys(key_dict):
for (stream, test) in test_streams:
bits = str(random.getrandbits(32))
while bits in key_dict:
# Avoid the unlikely event that we get the same bits twice
bits = str(random.getrandbits(32))
key_dict[bits] = (stream, test)
gen_keys(zhkeys)
gen_keys(hzkeys)
logger.info("Starting sending messages!")
# Send zephyrs
zsig = "Timothy Good Abbott"
for key, (stream, test) in zhkeys.items():
if stream == "message":
zwrite_args = ["zwrite", "-s", zsig, mit_user]
else:
zwrite_args = ["zwrite", "-s", zsig, "-c", stream, "-i", "test"]
send_zephyr(zwrite_args, str(key))
logger.info("Sent Zephyr messages!")
# Send Humbugs
for key, (stream, test) in hzkeys.items():
if stream == "message":
send_humbug({
"type": "private",
"content": str(hzkey1),
"content": str(key),
"to": humbug_user,
})
if result["result"] != "success":
print "key1 send error:"
print result
result = humbug_client.send_message({
else:
send_humbug({
"type": "stream",
"subject": "test",
"content": str(hzkey2),
"to": "tabbott-nagios-test",
"content": str(key),
"to": stream,
})
if result["result"] != "success":
print "key2 send error:"
print result
logger.info("Sent Humbug messages!")
if options.verbose:
print "Sent Humbug messages!"
import zephyr
try:
zephyr.init()
except IOError, e:
if "SERVNAK received" in e:
print "SERVNAK received, punting rest of test"
print_status_and_exit(0)
zsig = "Timothy Good Abbott"
zeph = zephyr.ZNotice(sender=mit_user, auth=True, recipient=mit_user,
cls="message", instance="personal")
zeph.setmessage("%s\0%s" % (zsig, zhkey1))
zeph.send()
zeph = zephyr.ZNotice(sender=mit_user, auth=True,
cls="tabbott-nagios-test", instance="test")
zeph.setmessage("%s\0%s" % (zsig, zhkey2))
zeph.send()
if options.verbose:
print "Sent Zephyr messages!"
else:
failed = False
import zephyr
try:
zephyr.init()
zephyr._z.subAll([('message', 'personal', 'tabbott/extra@ATHENA.MIT.EDU'),
('tabbott-nagios-test', '*', '*')])
except IOError, e:
if "SERVNAK received" in e:
print "SERVNAK received, punting rest of test"
print_status_and_exit(0)
time.sleep(20)
if options.verbose:
print "Receiving messages!"
# Normally messages manage to forward through in under 3 seconds, but
# sleep 10 to give a safe margin since the messages do need to do 2
# round trips. This alert is for correctness, not performance, and so
# we want it to reliably alert only when messages aren't being
# delivered at all.
time.sleep(10)
logger.info("Starting receiving messages!")
# receive humbugs
messages = humbug_client.get_messages({'last': str(max_message_id)})['messages']
logger.info("Received Humbug messages!")
# receive zephyrs
notices = []
while True:
try:
notice = zephyr.receive(block=False)
except Exception:
logging.exception("Exception receiving zephyrs:")
notice = None
if notice is None:
break
if notice.opcode != "":
continue
notices.append(notice)
if len(notices) != 4:
print "humbug=>zephyr: Got wrong number of messages back!"
failed = True
elif (set(notice.message.split('\0')[1] for notice in notices) !=
set([str(hzkey1), str(hzkey2), str(zhkey1), str(zhkey2)])):
print "humbug=>zephyr: Didn't get back right values!"
failed = True
if failed:
for notice in notices:
print_zephyr(notice)
logger.info("Received Zephyr messages!")
messages = humbug_client.get_messages({'first': '0',
'last': str(max_message_id),
'server_generation': '0'})['messages']
if len(messages) != 4:
print "zephyr=>humbug: Didn't get exactly 4 messages!"
for message in messages:
print_humbug(message)
failed = True
elif (set(message["content"] for message in messages) !=
set([str(hzkey1), str(hzkey2), str(zhkey1), str(zhkey2)])):
print "zephyr=>humbug: Didn't get back right values!"
for message in messages:
print_humbug(message)
failed = True
all_keys = set(zhkeys.keys() + hzkeys.keys())
def process_keys(content_list):
# Start by filtering out any keys that might have come from
# concurrent check-mirroring processes
content_keys = [key for key in content_list if key in all_keys]
key_counts = {}
for key in all_keys:
key_counts[key] = 0
for key in content_keys:
key_counts[key] += 1
z_missing = set(key for key in zhkeys.keys() if key_counts[key] == 0)
h_missing = set(key for key in hzkeys.keys() if key_counts[key] == 0)
duplicates = any(val > 1 for val in key_counts.values())
success = all(val == 1 for val in key_counts.values())
return key_counts, z_missing, h_missing, duplicates, success
if failed:
print "original keys:", hzkey1, hzkey2, zhkey1, zhkey2
print_status_and_exit(1)
# The h_foo variables are about the messages we _received_ in Humbug
# The z_foo variables are about the messages we _received_ in Zephyr
h_contents = [message["content"] for message in messages]
z_contents = [notice.message.split('\0')[1] for notice in notices]
(h_key_counts, h_missing_z, h_missing_h, h_duplicates, h_success) = process_keys(h_contents)
(z_key_counts, z_missing_z, z_missing_h, z_duplicates, z_success) = process_keys(z_contents)
print "Success!"
if z_success and h_success:
logger.info("Success!")
print_status_and_exit(0)
elif z_success:
logger.info("Received everything correctly in Zephyr!")
elif h_success:
logger.info("Received everything correctly in Humbug!")
logger.error("Messages received the wrong number of times:")
for key in all_keys:
if z_key_counts[key] == 1 and h_key_counts[key] == 1:
continue
if key in zhkeys:
(stream, test) = zhkeys[key]
logger.warning("%10s: z got %s, h got %s. Sent via Zephyr(%s): class %s" % \
(key, z_key_counts[key], h_key_counts[key], test, stream))
if key in hzkeys:
(stream, test) = hzkeys[key]
logger.warning("%10s: z got %s. h got %s. Sent via Humbug(%s): class %s" % \
(key, z_key_counts[key], h_key_counts[key], test, stream))
logger.error("")
logger.error("Summary of specific problems:")
if h_duplicates:
logger.error("humbug: Received duplicate messages!")
logger.error("humbug: This is probably a bug in our message loop detection.")
logger.error("humbug: where Humbugs go humbug=>zephyr=>humbug")
if z_duplicates:
logger.error("zephyr: Received duplicate messages!")
logger.error("zephyr: This is probably a bug in our message loop detection.")
logger.error("zephyr: where Zephyrs go zephyr=>humbug=>zephyr")
if z_missing_z:
logger.error("zephyr: Didn't receive all the Zephyrs we sent on the Zephyr end!")
logger.error("zephyr: This is probably an issue with check-mirroring sending or receiving Zephyrs.")
if h_missing_h:
logger.error("humbug: Didn't receive all the Humbugs we sent on the Humbug end!")
logger.error("humbug: This is probably an issue with check-mirroring sending or receiving Humbugs.")
if z_missing_h:
logger.error("zephyr: Didn't receive all the Humbugs we sent on the Zephyr end!")
if z_missing_h == h_missing_h:
logger.error("zephyr: Including some Humbugs that we did receive on the Humbug end.")
logger.error("zephyr: This suggests we have a humbug=>zephyr mirroring problem.")
logger.error("zephyr: aka the personals mirroring script has issues.")
if h_missing_z:
logger.error("humbug: Didn't receive all the Zephyrs we sent on the Humbug end!")
if h_missing_z == z_missing_z:
logger.error("humbug: Including some Zephyrs that we did receive on the Zephyr end.")
logger.error("humbug: This suggests we have a zephyr=>humbug mirroring problem.")
logger.error("humbug: aka the global class mirroring script has issues.")
print_status_and_exit(1)