2013-02-18 19:00:54 -05:00
|
|
|
#!/usr/bin/env python
|
2012-10-19 12:20:52 -04:00
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
import optparse
|
|
|
|
import os
|
|
|
|
import random
|
2012-11-28 22:34:49 -05:00
|
|
|
import logging
|
|
|
|
import subprocess
|
2012-12-12 13:46:18 -05:00
|
|
|
import hashlib
|
2012-10-19 12:20:52 -04:00
|
|
|
|
|
|
|
parser = optparse.OptionParser()
|
|
|
|
parser.add_option('--verbose',
|
|
|
|
dest='verbose',
|
|
|
|
default=False,
|
|
|
|
action='store_true')
|
|
|
|
parser.add_option('--site',
|
|
|
|
dest='site',
|
2013-07-24 17:53:39 -04:00
|
|
|
default="https://api.zulip.com",
|
2012-10-19 12:20:52 -04:00
|
|
|
action='store')
|
2012-11-28 22:34:49 -05:00
|
|
|
parser.add_option('--sharded',
|
|
|
|
default=False,
|
|
|
|
action='store_true')
|
2012-11-08 13:16:34 -05:00
|
|
|
parser.add_option('--root-path',
|
|
|
|
dest='root_path',
|
|
|
|
default="/home/humbug",
|
|
|
|
action='store')
|
2012-10-19 12:20:52 -04:00
|
|
|
(options, args) = parser.parse_args()
|
|
|
|
|
2013-08-07 11:51:03 -04:00
|
|
|
# The 'api' directory needs to go first, so that 'import zulip' won't pick up
|
2012-12-03 17:13:41 -05:00
|
|
|
# some other directory named 'humbug'.
|
2013-08-19 14:16:47 -04:00
|
|
|
pyzephyr_lib_path = "python-zephyr/build/lib.linux-%s-%s/" % (os.uname()[4], sys.version[0:3])
|
2012-12-03 17:13:41 -05:00
|
|
|
sys.path[:0] = [os.path.join(options.root_path, "api/"),
|
|
|
|
os.path.join(options.root_path, "python-zephyr"),
|
2013-08-19 14:16:47 -04:00
|
|
|
os.path.join(options.root_path, pyzephyr_lib_path),
|
2012-11-08 13:16:34 -05:00
|
|
|
options.root_path]
|
|
|
|
|
2012-10-19 12:20:52 -04:00
|
|
|
mit_user = 'tabbott/extra@ATHENA.MIT.EDU'
|
2013-08-07 13:36:54 -04:00
|
|
|
zulip_user = 'tabbott/extra@mit.edu'
|
2012-10-19 12:20:52 -04:00
|
|
|
|
|
|
|
sys.path.append(".")
|
2013-08-07 11:51:03 -04:00
|
|
|
import zulip
|
|
|
|
zulip_client = zulip.Client(
|
2013-08-07 13:36:54 -04:00
|
|
|
email=zulip_user,
|
2012-12-03 12:23:06 -05:00
|
|
|
api_key="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
|
|
verbose=True,
|
|
|
|
client="test: Humbug API",
|
|
|
|
site=options.site)
|
2012-10-19 12:20:52 -04:00
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
# Configure logging
|
2013-09-25 14:18:32 -04:00
|
|
|
log_file = "/var/log/zulip/check-mirroring-log"
|
2012-11-28 22:34:49 -05:00
|
|
|
log_format = "%(asctime)s: %(message)s"
|
|
|
|
logging.basicConfig(format=log_format)
|
|
|
|
|
|
|
|
formatter = logging.Formatter(log_format)
|
|
|
|
file_handler = logging.FileHandler(log_file)
|
|
|
|
file_handler.setFormatter(formatter)
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
logger.setLevel(logging.DEBUG)
|
|
|
|
logger.addHandler(file_handler)
|
|
|
|
|
|
|
|
# Initialize list of streams to test
|
|
|
|
if options.sharded:
|
2013-08-07 13:36:54 -04:00
|
|
|
# NOTE: Streams in this list must be in zulip_user's Zulip
|
2013-08-06 15:32:15 -04:00
|
|
|
# subscriptions, or we won't receive messages via Zulip.
|
2012-11-28 22:34:49 -05:00
|
|
|
|
|
|
|
# The sharded stream list has a bunch of pairs
|
|
|
|
# (stream, shard_name), where sha1sum(stream).startswith(shard_name)
|
|
|
|
test_streams = [
|
|
|
|
("message", "p"),
|
2012-12-12 13:46:18 -05:00
|
|
|
("tabbott-nagios-test-32", "0"),
|
|
|
|
("tabbott-nagios-test-33", "1"),
|
|
|
|
("tabbott-nagios-test-2", "2"),
|
|
|
|
("tabbott-nagios-test-5", "3"),
|
|
|
|
("tabbott-nagios-test-13", "4"),
|
|
|
|
("tabbott-nagios-test-7", "5"),
|
|
|
|
("tabbott-nagios-test-22", "6"),
|
|
|
|
("tabbott-nagios-test-35", "7"),
|
|
|
|
("tabbott-nagios-test-4", "8"),
|
2012-11-28 22:34:49 -05:00
|
|
|
("tabbott-nagios-test-3", "9"),
|
2012-12-12 13:46:18 -05:00
|
|
|
("tabbott-nagios-test-1", "a"),
|
|
|
|
("tabbott-nagios-test-49", "b"),
|
|
|
|
("tabbott-nagios-test-34", "c"),
|
|
|
|
("tabbott-nagios-test-12", "d"),
|
|
|
|
("tabbott-nagios-test-11", "e"),
|
|
|
|
("tabbott-nagios-test-9", "f"),
|
2012-11-28 22:34:49 -05:00
|
|
|
]
|
2012-12-12 13:46:18 -05:00
|
|
|
for (stream, test) in test_streams:
|
|
|
|
if stream == "message":
|
|
|
|
continue
|
|
|
|
assert(hashlib.sha1(stream).hexdigest().startswith(test))
|
2012-11-28 22:34:49 -05:00
|
|
|
else:
|
|
|
|
test_streams = [
|
|
|
|
("message", "p"),
|
|
|
|
("tabbott-nagios-test", "a"),
|
|
|
|
]
|
|
|
|
|
2012-11-15 14:24:38 -05:00
|
|
|
def print_status_and_exit(status):
|
|
|
|
# The output of this script is used by Nagios. Various outputs,
|
|
|
|
# e.g. true success and punting due to a SERVNAK, result in a
|
|
|
|
# non-alert case, so to give us something unambiguous to check in
|
|
|
|
# Nagios, print the exit status.
|
|
|
|
print status
|
|
|
|
sys.exit(status)
|
|
|
|
|
2013-08-07 13:36:54 -04:00
|
|
|
def send_zulip(message):
|
2013-08-07 11:51:03 -04:00
|
|
|
result = zulip_client.send_message(message)
|
2012-11-15 14:24:38 -05:00
|
|
|
if result["result"] != "success":
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("Error sending zulip, args were:")
|
2012-11-28 22:34:49 -05:00
|
|
|
logger.error(message)
|
|
|
|
logger.error(result)
|
|
|
|
print_status_and_exit(1)
|
2012-11-15 14:24:38 -05:00
|
|
|
|
2012-12-18 10:54:39 -05:00
|
|
|
# Returns True if and only if we "Detected server failure" sending the zephyr.
|
|
|
|
def send_zephyr(zwrite_args, content):
|
2012-11-28 22:34:49 -05:00
|
|
|
p = subprocess.Popen(zwrite_args, stdin=subprocess.PIPE,
|
|
|
|
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
|
|
stdout, stderr = p.communicate(input=content.encode("utf-8"))
|
|
|
|
if p.returncode != 0:
|
2012-12-18 10:54:39 -05:00
|
|
|
if "Detected server failure while receiving acknowledgement for" in stdout:
|
2012-12-10 13:37:50 -05:00
|
|
|
logger.warning("Got server failure error sending zephyr; retrying")
|
|
|
|
logger.warning(stderr)
|
2012-12-18 10:54:39 -05:00
|
|
|
return True
|
2012-11-28 22:34:49 -05:00
|
|
|
logger.error("Error sending zephyr:")
|
|
|
|
logger.info(stdout)
|
|
|
|
logger.error(stderr)
|
|
|
|
print_status_and_exit(1)
|
2012-12-18 10:54:39 -05:00
|
|
|
return False
|
2012-11-15 14:24:38 -05:00
|
|
|
|
2013-08-06 15:32:15 -04:00
|
|
|
# Subscribe to Zulip
|
2012-11-28 22:34:49 -05:00
|
|
|
try:
|
2013-08-07 11:51:03 -04:00
|
|
|
res = zulip_client.get_profile()
|
2012-11-28 22:34:49 -05:00
|
|
|
max_message_id = res.get('max_message_id')
|
|
|
|
if max_message_id is None:
|
2013-08-06 15:32:15 -04:00
|
|
|
logging.error("Error subscribing to Zulips!")
|
2012-11-28 22:34:49 -05:00
|
|
|
logging.error(res)
|
|
|
|
print_status_and_exit(1)
|
|
|
|
except Exception:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.exception("Unexpected error subscribing to Zulips")
|
2012-11-28 22:34:49 -05:00
|
|
|
print_status_and_exit(1)
|
2012-11-15 14:24:38 -05:00
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
# Subscribe to Zephyrs
|
|
|
|
import zephyr
|
|
|
|
zephyr_subs_to_add = []
|
|
|
|
for (stream, test) in test_streams:
|
|
|
|
if stream == "message":
|
|
|
|
zephyr_subs_to_add.append((stream, 'personal', mit_user))
|
|
|
|
else:
|
|
|
|
zephyr_subs_to_add.append((stream, '*', '*'))
|
2012-10-19 12:20:52 -04:00
|
|
|
|
2013-01-02 10:27:23 -05:00
|
|
|
actually_subscribed = False
|
2012-11-28 22:34:49 -05:00
|
|
|
for tries in xrange(10):
|
2012-11-15 14:24:38 -05:00
|
|
|
try:
|
|
|
|
zephyr.init()
|
2012-11-28 22:34:49 -05:00
|
|
|
zephyr._z.subAll(zephyr_subs_to_add)
|
|
|
|
zephyr_subs = zephyr._z.getSubscriptions()
|
2013-01-02 10:27:23 -05:00
|
|
|
|
|
|
|
missing = 0
|
2012-11-28 22:34:49 -05:00
|
|
|
for elt in zephyr_subs_to_add:
|
|
|
|
if elt not in zephyr_subs:
|
|
|
|
logging.error("Failed to subscribe to %s" % (elt,))
|
2013-01-02 10:27:23 -05:00
|
|
|
missing += 1
|
|
|
|
if missing == 0:
|
|
|
|
actually_subscribed = True
|
|
|
|
break
|
2012-11-15 14:24:38 -05:00
|
|
|
except IOError, e:
|
2013-01-02 10:27:23 -05:00
|
|
|
if "SERVNAK received" in e:
|
|
|
|
logger.error("SERVNAK repeatedly received, punting rest of test")
|
|
|
|
else:
|
|
|
|
logger.exception("Exception subscribing to zephyrs")
|
|
|
|
|
|
|
|
if not actually_subscribed:
|
|
|
|
logger.error("Failed to subscribe to zephyrs")
|
|
|
|
print_status_and_exit(1)
|
2012-11-15 14:24:38 -05:00
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
# Prepare keys
|
|
|
|
zhkeys = {}
|
|
|
|
hzkeys = {}
|
2012-12-18 10:54:39 -05:00
|
|
|
def gen_key(key_dict):
|
|
|
|
bits = str(random.getrandbits(32))
|
|
|
|
while bits in key_dict:
|
|
|
|
# Avoid the unlikely event that we get the same bits twice
|
|
|
|
bits = str(random.getrandbits(32))
|
|
|
|
return bits
|
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
def gen_keys(key_dict):
|
|
|
|
for (stream, test) in test_streams:
|
2012-12-18 10:54:39 -05:00
|
|
|
key_dict[gen_key(key_dict)] = (stream, test)
|
2012-10-19 12:20:52 -04:00
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
gen_keys(zhkeys)
|
|
|
|
gen_keys(hzkeys)
|
2012-10-19 12:20:52 -04:00
|
|
|
|
2012-12-11 17:38:34 -05:00
|
|
|
notices = []
|
|
|
|
|
|
|
|
# We check for new zephyrs multiple times, to avoid filling the zephyr
|
|
|
|
# receive queue with 30+ messages, which might result in messages
|
|
|
|
# being dropped.
|
|
|
|
def receive_zephyrs():
|
|
|
|
while True:
|
|
|
|
try:
|
|
|
|
notice = zephyr.receive(block=False)
|
|
|
|
except Exception:
|
|
|
|
logging.exception("Exception receiving zephyrs:")
|
|
|
|
notice = None
|
|
|
|
if notice is None:
|
|
|
|
break
|
|
|
|
if notice.opcode != "":
|
|
|
|
continue
|
|
|
|
notices.append(notice)
|
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
logger.info("Starting sending messages!")
|
|
|
|
# Send zephyrs
|
|
|
|
zsig = "Timothy Good Abbott"
|
|
|
|
for key, (stream, test) in zhkeys.items():
|
|
|
|
if stream == "message":
|
2012-12-10 10:21:49 -05:00
|
|
|
zwrite_args = ["zwrite", "-n", "-s", zsig, mit_user]
|
2012-11-28 22:34:49 -05:00
|
|
|
else:
|
2012-12-10 10:21:49 -05:00
|
|
|
zwrite_args = ["zwrite", "-n", "-s", zsig, "-c", stream, "-i", "test"]
|
2012-12-18 10:54:39 -05:00
|
|
|
server_failure = send_zephyr(zwrite_args, str(key))
|
|
|
|
if server_failure:
|
|
|
|
# Replace the key we're not sure was delivered with a new key
|
|
|
|
value = zhkeys.pop(key)
|
|
|
|
new_key = gen_key(zhkeys)
|
|
|
|
zhkeys[new_key] = value
|
|
|
|
server_failure_again = send_zephyr(zwrite_args, str(new_key))
|
|
|
|
if server_failure_again:
|
|
|
|
logging.error("Zephyr server failure twice in a row on keys %s and %s! Aborting." %
|
|
|
|
(key, new_key))
|
|
|
|
print_status_and_exit(1)
|
|
|
|
else:
|
|
|
|
logging.warning("Replaced key %s with %s due to Zephyr server failure." %
|
|
|
|
(key, new_key))
|
2013-01-02 10:36:47 -05:00
|
|
|
receive_zephyrs()
|
2012-12-11 17:38:34 -05:00
|
|
|
|
|
|
|
receive_zephyrs()
|
2012-11-28 22:34:49 -05:00
|
|
|
logger.info("Sent Zephyr messages!")
|
2012-10-19 12:20:52 -04:00
|
|
|
|
2013-08-06 15:32:15 -04:00
|
|
|
# Send Zulips
|
2012-11-28 22:34:49 -05:00
|
|
|
for key, (stream, test) in hzkeys.items():
|
|
|
|
if stream == "message":
|
2013-08-07 13:36:54 -04:00
|
|
|
send_zulip({
|
2012-11-28 22:34:49 -05:00
|
|
|
"type": "private",
|
|
|
|
"content": str(key),
|
2013-08-07 13:36:54 -04:00
|
|
|
"to": zulip_user,
|
2012-11-28 22:34:49 -05:00
|
|
|
})
|
|
|
|
else:
|
2013-08-07 13:36:54 -04:00
|
|
|
send_zulip({
|
2012-11-28 22:34:49 -05:00
|
|
|
"type": "stream",
|
|
|
|
"subject": "test",
|
|
|
|
"content": str(key),
|
|
|
|
"to": stream,
|
|
|
|
})
|
2012-12-11 17:38:34 -05:00
|
|
|
receive_zephyrs()
|
2012-11-28 22:34:49 -05:00
|
|
|
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.info("Sent Zulip messages!")
|
2012-11-28 22:34:49 -05:00
|
|
|
|
|
|
|
# Normally messages manage to forward through in under 3 seconds, but
|
|
|
|
# sleep 10 to give a safe margin since the messages do need to do 2
|
|
|
|
# round trips. This alert is for correctness, not performance, and so
|
|
|
|
# we want it to reliably alert only when messages aren't being
|
|
|
|
# delivered at all.
|
|
|
|
time.sleep(10)
|
2012-12-11 17:38:34 -05:00
|
|
|
receive_zephyrs()
|
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
logger.info("Starting receiving messages!")
|
|
|
|
|
2013-08-06 15:32:15 -04:00
|
|
|
# receive zulips
|
2013-08-07 11:51:03 -04:00
|
|
|
messages = zulip_client.get_messages({'last': str(max_message_id)})['messages']
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.info("Finished receiving Zulip messages!")
|
2012-11-28 22:34:49 -05:00
|
|
|
|
2012-12-11 17:38:34 -05:00
|
|
|
receive_zephyrs()
|
|
|
|
logger.info("Finished receiving Zephyr messages!")
|
2012-11-28 22:34:49 -05:00
|
|
|
|
|
|
|
all_keys = set(zhkeys.keys() + hzkeys.keys())
|
|
|
|
def process_keys(content_list):
|
|
|
|
# Start by filtering out any keys that might have come from
|
|
|
|
# concurrent check-mirroring processes
|
|
|
|
content_keys = [key for key in content_list if key in all_keys]
|
|
|
|
key_counts = {}
|
|
|
|
for key in all_keys:
|
|
|
|
key_counts[key] = 0
|
|
|
|
for key in content_keys:
|
|
|
|
key_counts[key] += 1
|
|
|
|
z_missing = set(key for key in zhkeys.keys() if key_counts[key] == 0)
|
|
|
|
h_missing = set(key for key in hzkeys.keys() if key_counts[key] == 0)
|
|
|
|
duplicates = any(val > 1 for val in key_counts.values())
|
|
|
|
success = all(val == 1 for val in key_counts.values())
|
|
|
|
return key_counts, z_missing, h_missing, duplicates, success
|
|
|
|
|
2013-08-06 15:32:15 -04:00
|
|
|
# The h_foo variables are about the messages we _received_ in Zulip
|
2012-11-28 22:34:49 -05:00
|
|
|
# The z_foo variables are about the messages we _received_ in Zephyr
|
|
|
|
h_contents = [message["content"] for message in messages]
|
|
|
|
z_contents = [notice.message.split('\0')[1] for notice in notices]
|
|
|
|
(h_key_counts, h_missing_z, h_missing_h, h_duplicates, h_success) = process_keys(h_contents)
|
|
|
|
(z_key_counts, z_missing_z, z_missing_h, z_duplicates, z_success) = process_keys(z_contents)
|
2012-10-19 12:20:52 -04:00
|
|
|
|
2012-11-28 22:34:49 -05:00
|
|
|
if z_success and h_success:
|
|
|
|
logger.info("Success!")
|
2012-11-15 14:24:38 -05:00
|
|
|
print_status_and_exit(0)
|
2012-11-28 22:34:49 -05:00
|
|
|
elif z_success:
|
|
|
|
logger.info("Received everything correctly in Zephyr!")
|
|
|
|
elif h_success:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.info("Received everything correctly in Zulip!")
|
2012-11-28 22:34:49 -05:00
|
|
|
|
|
|
|
logger.error("Messages received the wrong number of times:")
|
|
|
|
for key in all_keys:
|
|
|
|
if z_key_counts[key] == 1 and h_key_counts[key] == 1:
|
|
|
|
continue
|
|
|
|
if key in zhkeys:
|
|
|
|
(stream, test) = zhkeys[key]
|
|
|
|
logger.warning("%10s: z got %s, h got %s. Sent via Zephyr(%s): class %s" % \
|
|
|
|
(key, z_key_counts[key], h_key_counts[key], test, stream))
|
|
|
|
if key in hzkeys:
|
|
|
|
(stream, test) = hzkeys[key]
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.warning("%10s: z got %s. h got %s. Sent via Zulip(%s): class %s" % \
|
2012-11-28 22:34:49 -05:00
|
|
|
(key, z_key_counts[key], h_key_counts[key], test, stream))
|
|
|
|
logger.error("")
|
|
|
|
logger.error("Summary of specific problems:")
|
|
|
|
|
|
|
|
if h_duplicates:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("zulip: Received duplicate messages!")
|
|
|
|
logger.error("zulip: This is probably a bug in our message loop detection.")
|
|
|
|
logger.error("zulip: where Zulips go zulip=>zephyr=>zulip")
|
2012-11-28 22:34:49 -05:00
|
|
|
if z_duplicates:
|
|
|
|
logger.error("zephyr: Received duplicate messages!")
|
|
|
|
logger.error("zephyr: This is probably a bug in our message loop detection.")
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("zephyr: where Zephyrs go zephyr=>zulip=>zephyr")
|
2012-11-28 22:34:49 -05:00
|
|
|
|
|
|
|
if z_missing_z:
|
|
|
|
logger.error("zephyr: Didn't receive all the Zephyrs we sent on the Zephyr end!")
|
|
|
|
logger.error("zephyr: This is probably an issue with check-mirroring sending or receiving Zephyrs.")
|
|
|
|
if h_missing_h:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("zulip: Didn't receive all the Zulips we sent on the Zulip end!")
|
|
|
|
logger.error("zulip: This is probably an issue with check-mirroring sending or receiving Zulips.")
|
2012-11-28 22:34:49 -05:00
|
|
|
if z_missing_h:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("zephyr: Didn't receive all the Zulips we sent on the Zephyr end!")
|
2012-11-28 22:34:49 -05:00
|
|
|
if z_missing_h == h_missing_h:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("zephyr: Including some Zulips that we did receive on the Zulip end.")
|
|
|
|
logger.error("zephyr: This suggests we have a zulip=>zephyr mirroring problem.")
|
2012-11-28 22:34:49 -05:00
|
|
|
logger.error("zephyr: aka the personals mirroring script has issues.")
|
|
|
|
if h_missing_z:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("zulip: Didn't receive all the Zephyrs we sent on the Zulip end!")
|
2012-11-28 22:34:49 -05:00
|
|
|
if h_missing_z == z_missing_z:
|
2013-08-06 15:32:15 -04:00
|
|
|
logger.error("zulip: Including some Zephyrs that we did receive on the Zephyr end.")
|
|
|
|
logger.error("zulip: This suggests we have a zephyr=>zulip mirroring problem.")
|
|
|
|
logger.error("zulip: aka the global class mirroring script has issues.")
|
2012-11-28 22:34:49 -05:00
|
|
|
|
|
|
|
print_status_and_exit(1)
|