#!/usr/bin/env python
"""Send summaries of new RSS feed entries to a stream.

Reads one feed URL per line from the feed file, remembers which entries
were already sent (as MD5 hashes stored in one file per feed host inside
the data directory) and posts every new, sufficiently recent entry through
the ``humbug`` client.  Intended to be run periodically, e.g. from cron.

The syntax is kept valid for both Python 2.6+ and Python 3.
"""
import calendar
import errno
import hashlib
import logging
import optparse
import os
import sys
import time

try:
    from HTMLParser import HTMLParser
except ImportError:
    # The module was renamed in Python 3.
    from html.parser import HTMLParser

try:
    import urlparse
except ImportError:
    # The module was renamed in Python 3.
    import urllib.parse as urlparse

import feedparser
import humbug

RSS_DATA_DIR = os.path.expanduser(os.path.join('~', '.cache', 'humbug-rss'))
OLDNESS_THRESHOLD = 30  # days
OLDNESS_THRESHOLD_SECONDS = OLDNESS_THRESHOLD * 60 * 60 * 24

usage = """Usage: Send summaries of RSS entries for your favorite feeds to Zulip.

This bot requires the feedparser module.

To use this script:

1. Create an RSS feed file containing 1 feed URL per line (default feed
   file location: ~/.cache/humbug-rss/rss-feeds)
2. Subscribe to the stream that will receive RSS updates (default stream: rss)
3. create a ~/.zuliprc, or specify user and api-key with command line arguments
4. Test the script by running it manually, like this:

/usr/local/share/humbug/demos/rss-bot

You can customize the location on the feed file and recipient stream, e.g.:

/usr/local/share/humbug/demos/rss-bot --feed-file=/path/to/my-feeds --stream=my-rss-stream

5. Configure a crontab entry for this script. A sample crontab entry for
processing feeds stored in the default location and sending to the default
stream every 5 minutes is:

*/5 * * * * /usr/local/share/humbug/demos/rss-bot"""

parser = optparse.OptionParser(usage)
parser.add_option('--stream',
                  dest='stream',
                  help='The stream to which to send RSS messages.',
                  default="rss",
                  action='store')
parser.add_option('--data-dir',
                  dest='data_dir',
                  help='The directory where feed metadata is stored',
                  default=RSS_DATA_DIR,
                  action='store')
parser.add_option('--feed-file',
                  dest='feed_file',
                  help='The file containing a list of RSS feed URLs to follow, one URL per line',
                  default=os.path.join(RSS_DATA_DIR, "rss-feeds"),
                  action='store')
parser.add_option_group(humbug.generate_option_group(parser))
(opts, args) = parser.parse_args()


def mkdir_p(path):
    """Create *path* and any missing parents; succeed if it already exists.

    Equivalent to ``mkdir -p``, which Python lacks before version 3.2.

    Raises:
        OSError: if creation fails for any reason other than *path* already
            being an existing directory.
    """
    try:
        os.makedirs(path)
    except OSError as e:
        # EEXIST on something that is not a directory is still an error.
        if not (e.errno == errno.EEXIST and os.path.isdir(path)):
            raise


try:
    mkdir_p(opts.data_dir)
except OSError:
    # We can't write to the logfile, so just print and give up.
    sys.stderr.write("Unable to store RSS data at %s.\n" % (opts.data_dir,))
    sys.exit(1)

log_file = os.path.join(opts.data_dir, "rss-bot.log")
log_format = "%(asctime)s: %(message)s"
logging.basicConfig(format=log_format)

formatter = logging.Formatter(log_format)
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)


def log_error_and_exit(error):
    """Log *error* followed by the usage text, then exit with status 1."""
    logger.error(error)
    logger.error(usage)
    # sys.exit rather than the bare `exit`, which only exists when the
    # `site` module has been loaded.
    sys.exit(1)


class MLStripper(HTMLParser):
    """HTML parser that keeps only the text content of what it is fed."""

    def __init__(self):
        # Run the base initializer (which resets the parser state) so the
        # parser is fully set up on every Python version.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, data):
        """Collect one run of text found between tags."""
        self.fed.append(data)

    def get_data(self):
        """Return all collected text joined together."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return *html* with its markup removed, leaving only the text."""
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()


def compute_entry_hash(entry):
    """Return the MD5 hex digest identifying one revision of *entry*.

    The digest covers the entry id plus its ``published`` (or, failing
    that, ``updated``) string; an entry carrying neither contributes an
    empty timestamp instead of raising.
    """
    entry_time = entry.get("published", entry.get("updated")) or ""
    key = entry.id + entry_time
    if not isinstance(key, bytes):
        # hashlib needs bytes.  UTF-8 leaves ASCII input unchanged, so
        # hashes stored by earlier runs still match.
        key = key.encode("utf-8")
    return hashlib.md5(key).hexdigest()


def send_humbug(entry, feed_name):
    """Post *entry* to the configured stream under the subject *feed_name*.

    Returns whatever ``client.send_message`` returns; the caller inspects
    its ``"result"`` key.
    """
    content = "**%s**\n%s\n%s" % (entry.title,
                                  strip_tags(entry.summary),
                                  entry.link)
    message = {"type": "stream",
               "sender": opts.email,
               "to": opts.stream,
               "subject": feed_name,
               "content": content,
               }
    return client.send_message(message)


try:
    with open(opts.feed_file, "r") as f:
        stripped_lines = (line.strip() for line in f)
        # Skip blank lines: an empty URL would make the per-feed hash file
        # path resolve to the data directory itself.
        feed_urls = [url for url in stripped_lines if url]
except IOError:
    log_error_and_exit("Unable to read feed file at %s." % (opts.feed_file,))

client = humbug.Client(email=opts.email, api_key=opts.api_key,
                       site=opts.site)

first_message = True

for feed_url in feed_urls:
    # Hashes of already-sent entries are stored in a file named after the
    # feed's host.
    feed_file = os.path.join(opts.data_dir, urlparse.urlparse(feed_url).netloc)

    try:
        with open(feed_file, "r") as f:
            old_feed_hashes = set(line.strip() for line in f)
    except IOError:
        old_feed_hashes = set()

    new_hashes = []
    data = feedparser.parse(feed_url)

    try:
        for entry in data.entries:
            entry_hash = compute_entry_hash(entry)
            # An entry has either been published or updated.
            entry_time = entry.get("published_parsed",
                                   entry.get("updated_parsed"))
            if (entry_time is not None
                    and (time.time() - calendar.timegm(entry_time))
                    > OLDNESS_THRESHOLD_SECONDS):
                # As a safeguard against misbehaving feeds, don't try to
                # process entries older than some threshold.  Entries with
                # no usable timestamp cannot be aged and are handled
                # normally; the hash check below still prevents re-sending.
                continue
            if entry_hash in old_feed_hashes:
                # We've already seen this. No need to process any older
                # entries.
                break
            if (not old_feed_hashes) and (len(new_hashes) >= 3):
                # On a first run, pick up the 3 most recent entries. An RSS
                # feed has entries in reverse chronological order.
                break

            response = send_humbug(entry, data.feed.title)
            if response["result"] != "success":
                logger.error("Error processing %s", feed_url)
                logger.error(response)
                if first_message:
                    # This is probably some fundamental problem like the
                    # stream not existing or something being misconfigured,
                    # so bail instead of getting the same error for every
                    # RSS entry.
                    log_error_and_exit("Failed to process first message")
            # Go ahead and move on -- perhaps this entry is corrupt.
            new_hashes.append(entry_hash)
            first_message = False
    finally:
        # Record what was handled even if the loop was interrupted, so the
        # next run does not send the same entries again.
        with open(feed_file, "a") as f:
            for entry_hash in new_hashes:
                f.write(entry_hash + "\n")

    logger.info("Sent humbugs for %d %s entries", len(new_hashes), feed_url)