Add an example RSS bot that runs out of cron.

(imported from commit 81d7e010f6316dfbe0ce7b645a7b61dde2881483)
2013-01-20 22:39:29 -05:00 · 2013-01-20 22:39:29 -05:00 · 4ea45de9bd
commit 4ea45de9bd
parent c34842397a
1 changed files with 195 additions and 0 deletions
--- a/demos/rss-bot
+++ b/demos/rss-bot
@ -0,0 +1,195 @@
+#!/usr/bin/env python
+import calendar
+import errno
+import hashlib
+from HTMLParser import HTMLParser
+import logging
+import optparse
+import os
+import sys
+import time
+import urlparse
+
+import feedparser
+import humbug
+
+RSS_DATA_DIR = os.path.expanduser(os.path.join('~', '.cache', 'humbug-rss'))
+OLDNESS_THRESHOLD = 30 # days
+
+usage = """Usage: Send summaries of RSS entries for your favorite feeds to Humbug.
+
+This bot requires the feedparser module.
+
+To use this script:
+
+1. Create an RSS feed file containing 1 feed URL per line (default feed
+   file location: ~/.cache/humbug-rss/rss-feeds)
+2. Subscribe to the stream that will receive RSS updates (default stream: rss)
+3. Test the script by running it manually, like this:
+
+/usr/local/share/humbug/demos/rss-bot
+
+You can customize the location on the feed file and recipient stream, e.g.:
+
+/usr/local/share/humbug/demos/rss-bot --feed-file=/path/to/my-feeds --stream=my-rss-stream
+
+4. Configure a crontab entry for this script. A sample crontab entry for
+processing feeds stored in the default location and sending to the default
+stream every 5 minutes is:
+
+*/5 * * * * /usr/local/share/humbug/demos/rss-bot"""
+
+parser = optparse.OptionParser(usage)
+parser.add_option('--email',
+                  dest='email',
+                  help='The email address for your Humbug account.',
+                  metavar='EMAIL')
+parser.add_option('--api-key',
+                  dest='api_key',
+                  help='API key for that user [default: read ~/.humbug-api-key]',
+                  action='store')
+parser.add_option('--stream',
+                  dest='stream',
+                  help='The stream to which to send RSS messages.',
+                  default="rss",
+                  action='store')
+parser.add_option('--data-dir',
+                  dest='data_dir',
+                  help='The directory where feed metadata is stored',
+                  default=os.path.join(RSS_DATA_DIR),
+                  action='store')
+parser.add_option('--feed-file',
+                  dest='feed_file',
+                  help='The file containing a list of RSS feed URLs to follow, one URL per line',
+                  default=os.path.join(RSS_DATA_DIR, "rss-feeds"),
+                  action='store')
+parser.add_option('--site',
+                  dest='site',
+                  default="https://humbughq.com",
+                  help=optparse.SUPPRESS_HELP,
+                  action='store')
+(opts, args) = parser.parse_args()
+
+def mkdir_p(path):
+    # Python doesn't have an analog to `mkdir -p` < Python 3.2.
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise
+
+try:
+    mkdir_p(opts.data_dir)
+except OSError:
+    # We can't write to the logfile, so just print and give up.
+    print >>sys.stderr, "Unable to store RSS data at %s." % (opts.data_dir,)
+    exit(1)
+
+log_file = os.path.join(opts.data_dir, "rss-bot.log")
+log_format = "%(asctime)s: %(message)s"
+logging.basicConfig(format=log_format)
+
+formatter = logging.Formatter(log_format)
+file_handler = logging.FileHandler(log_file)
+file_handler.setFormatter(formatter)
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(file_handler)
+
+def log_error_and_exit(error):
+    logger.error(error)
+    logger.error(usage)
+    exit(1)
+
+class MLStripper(HTMLParser):
+    def __init__(self):
+        self.reset()
+        self.fed = []
+
+    def handle_data(self, data):
+        self.fed.append(data)
+
+    def get_data(self):
+        return ''.join(self.fed)
+
+def strip_tags(html):
+    stripper = MLStripper()
+    stripper.feed(html)
+    return stripper.get_data()
+
+def compute_entry_hash(entry):
+    entry_time = entry.get("published", entry.get("updated"))
+    return hashlib.md5(entry.id + entry_time).hexdigest()
+
+def send_humbug(entry, feed_name):
+    content = "**%s**\n%s\n%s" % (entry.title,
+                                  strip_tags(entry.summary),
+                                  entry.link)
+    message = {"type": "stream",
+               "sender": opts.email,
+               "to": opts.stream,
+               "subject": feed_name,
+               "content": content,
+               }
+    return client.send_message(message)
+
+try:
+    with open(opts.feed_file, "r") as f:
+        feed_urls = [feed.strip() for feed in f.readlines()]
+except IOError:
+    log_error_and_exit("Unable to read feed file at %s." % (opts.feed_file,))
+
+client = humbug.Client(email=opts.email, api_key=opts.api_key,
+                       site=opts.site)
+
+first_message = True
+
+for feed_url in feed_urls:
+    feed_file = os.path.join(opts.data_dir, urlparse.urlparse(feed_url).netloc)
+
+    try:
+        with open(feed_file, "r") as f:
+            old_feed_hashes = dict((line.strip(), True) for line in f.readlines())
+    except IOError:
+        old_feed_hashes = {}
+
+    new_hashes = []
+    data = feedparser.parse(feed_url)
+
+    for entry in data.entries:
+        entry_hash = compute_entry_hash(entry)
+        # An entry has either been published or updated.
+        entry_time  = entry.get("published_parsed", entry.get("updated_parsed"))
+        if (time.time() - calendar.timegm(entry_time)) > OLDNESS_THRESHOLD * 60 * 60 * 24:
+            # As a safeguard against misbehaving feeds, don't try to process
+            # entries older than some threshold.
+            continue
+        if entry_hash in old_feed_hashes:
+            # We've already seen this. No need to process any older entries.
+            break
+        if (not old_feed_hashes) and (len(new_hashes) >= 3):
+            # On a first run, pick up the 3 most recent entries. An RSS feed has
+            # entries in reverse chronological order.
+            break
+
+        response = send_humbug(entry, data.feed.title)
+        if response["result"] != "success":
+            logger.error("Error processing %s" % (feed_url,))
+            logger.error(response)
+            if first_message:
+                # This is probably some fundamental problem like the stream not
+                # existing or something being misconfigured, so bail instead of
+                # getting the same error for every RSS entry.
+                log_error_and_exit("Failed to process first message")
+        # Go ahead and move on -- perhaps this entry is corrupt.
+        new_hashes.append(entry_hash)
+        first_message = False
+
+    with open(feed_file, "a") as f:
+        for hash in new_hashes:
+            f.write(hash + "\n")
+
+    logger.info("Sent humbugs for %d %s entries" % (len(new_hashes), feed_url))