From be6d71f13d65914502be5ea45b7f67e485a14cbd Mon Sep 17 00:00:00 2001 From: Steve Howell Date: Thu, 26 Dec 2013 13:20:35 -0500 Subject: [PATCH] Add summarize_stream.py. (imported from commit 061b1ccc6649acb9a9fc40370282fa34c645afed) --- bots/summarize_stream.py | 79 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 bots/summarize_stream.py diff --git a/bots/summarize_stream.py b/bots/summarize_stream.py new file mode 100644 index 0000000..55d9943 --- /dev/null +++ b/bots/summarize_stream.py @@ -0,0 +1,79 @@ +# This is hacky code to analyze data on our support stream. The main +# reusable bits are get_recent_messages and get_words. + +import zulip +import re +import collections + +def get_recent_messages(client, narrow, count=100): + narrow = [word.split(':') for word in narrow.split()] + req = { + 'narrow': narrow, + 'num_before': count, + 'num_after': 0, + 'anchor': 1000000000, + 'apply_markdown': False + } + old_messages = client.do_api_query(req, zulip.API_VERSTRING + 'messages', method='GET') + if 'messages' not in old_messages: + return [] + return old_messages['messages'] + +def get_words(content): + regex = "[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+" + words = re.findall(regex, content, re.M) + words = [w.lower() for w in words] + # words = [w.rstrip('s') for w in words] + return words + +def analyze_messages(msgs, word_count, email_count): + for msg in msgs: + if False: + if ' ack' in msg['content']: + name = msg['sender_full_name'].split()[0] + print 'ACK', name + m = re.search('ticket (Z....).*email: (\S+).*~~~(.*)', msg['content'], re.M | re.S) + if m: + ticket, email, req = m.groups() + words = get_words(req) + for word in words: + word_count[word] += 1 + email_count[email] += 1 + if False: + print + for k, v in msg.items(): + print '%-20s: %s' % (k, v) + +def generate_support_stats(): + client = zulip.Client() + narrow = 'stream:support' + count = 2000 + msgs = get_recent_messages(client, narrow, count) + msgs_by_topic = collections.defaultdict(list) + for msg in msgs: + topic = msg['subject'] + msgs_by_topic[topic].append(msg) + + word_count = collections.defaultdict(int) + email_count = collections.defaultdict(int) + + if False: + for topic in msgs_by_topic: + msgs = msgs_by_topic[topic] + analyze_messages(msgs, word_count, email_count) + + if True: + words = word_count.keys() + words = filter(lambda w: word_count[w] >= 10, words) + words = filter(lambda w: len(w) >= 5, words) + words = sorted(words, key=lambda w: word_count[w], reverse=True) + for word in words: + print word, word_count[word] + + if False: + emails = email_count.keys() + emails = sorted(emails, key=lambda w: email_count[w], reverse=True) + for email in emails: + print email, email_count[email] + +generate_support_stats()