Add summarize_stream.py.

(imported from commit 061b1ccc6649acb9a9fc40370282fa34c645afed)
2013-12-26 13:20:35 -05:00 · 2013-12-26 13:20:35 -05:00 · be6d71f13d
commit be6d71f13d
parent b0c7153107
1 changed files with 79 additions and 0 deletions
--- a/bots/summarize_stream.py
+++ b/bots/summarize_stream.py
@ -0,0 +1,79 @@
+# This is hacky code to analyze data on our support stream.  The main
+# reusable bits are get_recent_messages and get_words.
+
+import zulip
+import re
+import collections
+
+def get_recent_messages(client, narrow, count=100):
+    """Fetch the most recent messages matching a narrow.
+
+    client -- object exposing do_api_query(request, url, method=...); this
+              script passes a zulip.Client.
+    narrow -- whitespace-separated 'operator:operand' terms, for example
+              'stream:support'.
+    count  -- number of messages before the anchor to request.
+
+    Returns the list stored under 'messages' in the API response, or an
+    empty list when the response has no such key.
+    """
+    # Split on the first colon only, so an operand that itself contains a
+    # colon stays in one piece and each term is an [operator, operand] pair.
+    terms = [term.split(':', 1) for term in narrow.split()]
+    request = {
+        'narrow': terms,
+        'num_before': count,
+        'num_after': 0,
+        # NOTE(review): presumably chosen to exceed every real message id so
+        # that the window ends at the newest message -- confirm.
+        'anchor': 1000000000,
+        # NOTE(review): presumably asks for raw rather than rendered
+        # content -- confirm against the API.
+        'apply_markdown': False
+    }
+    response = client.do_api_query(request, zulip.API_VERSTRING + 'messages', method='GET')
+    return response.get('messages', [])
+
+def get_words(content):
+    """Split text into lower-cased words.
+
+    At each position the first alternative that matches wins:
+      * two or more ASCII capitals not followed by a lower-case letter
+        (so 'HTTPServer' yields 'HTTP', then 'Server');
+      * one capital plus lower-case letters when another capital follows
+        (so 'FooBar' yields 'Foo', then 'Bar');
+      * any run of word characters, apostrophes and hyphens.
+
+    Returns the matches in order of appearance, lower-cased.
+    """
+    # Raw string so the backslashes reach the regex engine untouched.
+    pattern = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+"
+    # Trailing-'s' stripping (w.rstrip('s')) is not applied.
+    return [word.lower() for word in re.findall(pattern, content)]
+
+def analyze_messages(msgs, word_count, email_count,
+                     show_acks=False, dump_messages=False):
+    """Tally request words and requester emails from support messages.
+
+    msgs          -- iterable of message dicts; 'content' is always read,
+                     'sender_full_name' only when show_acks is set.
+    word_count    -- mapping updated in place, word -> occurrences.  Missing
+                     keys must read as 0 (e.g. collections.defaultdict(int)).
+    email_count   -- mapping updated in place, email -> number of matching
+                     messages; same default-to-0 requirement.
+    show_acks     -- debugging aid: print 'ACK <first name>' for every
+                     message whose content contains ' ack'.
+    dump_messages -- debugging aid: print every field of every message.
+
+    Both debugging aids default to off.  Messages that do not match the
+    ticket pattern contribute nothing to either tally.
+    """
+    # DOTALL lets '.' cross newlines.  The '.*' runs are greedy, so the
+    # request text is whatever follows the LAST '~~~' in the message.
+    # NOTE(review): if requests are wrapped in an opening and a closing
+    # '~~~' fence this captures only the text after the closing one --
+    # confirm against the real message format.
+    ticket_re = re.compile(r'ticket (Z....).*email: (\S+).*~~~(.*)', re.S)
+    for msg in msgs:
+        content = msg['content']
+        if show_acks and ' ack' in content:
+            first_name = msg['sender_full_name'].split()[0]
+            print('ACK %s' % first_name)
+        match = ticket_re.search(content)
+        if match:
+            # Group 1 (the ticket id) is matched but not tallied.
+            email, request_text = match.group(2, 3)
+            for word in get_words(request_text):
+                word_count[word] += 1
+            email_count[email] += 1
+        if dump_messages:
+            print('')
+            for key, value in msg.items():
+                print('%-20s: %s' % (key, value))
+
+def generate_support_stats(narrow='stream:support', count=2000,
+                           min_occurrences=10, min_word_length=5,
+                           show_words=True, show_emails=False):
+    """Print word (and optionally email) frequencies for recent messages.
+
+    narrow          -- narrow passed to get_recent_messages.
+    count           -- number of recent messages to fetch.
+    min_occurrences -- a word is listed only if seen at least this often.
+    min_word_length -- a word is listed only if at least this long.
+    show_words      -- print 'word count' lines, most frequent first.
+    show_emails     -- print 'email count' lines, most frequent first.
+
+    Creates a zulip.Client with its default configuration, fetches the
+    messages, tallies them with analyze_messages and writes the selected
+    reports to standard output.  Returns None.
+    """
+    client = zulip.Client()
+    msgs = get_recent_messages(client, narrow, count)
+
+    word_count = collections.defaultdict(int)
+    email_count = collections.defaultdict(int)
+    analyze_messages(msgs, word_count, email_count)
+
+    if show_words:
+        frequent = [word for word, seen in word_count.items()
+                    if seen >= min_occurrences and len(word) >= min_word_length]
+        frequent.sort(key=word_count.get, reverse=True)
+        for word in frequent:
+            print('%s %s' % (word, word_count[word]))
+
+    if show_emails:
+        for email in sorted(email_count, key=email_count.get, reverse=True):
+            print('%s %s' % (email, email_count[email]))
+
+# Run the report only when executed as a script, so the helper functions
+# above can be imported without creating a client and querying the server.
+if __name__ == '__main__':
+    generate_support_stats()