diff --git a/demos/rss-bot b/demos/rss-bot new file mode 100755 index 0000000..162d474 --- /dev/null +++ b/demos/rss-bot @@ -0,0 +1,195 @@ +#!/usr/bin/env python +import calendar +import errno +import hashlib +from HTMLParser import HTMLParser +import logging +import optparse +import os +import sys +import time +import urlparse + +import feedparser +import humbug + +RSS_DATA_DIR = os.path.expanduser(os.path.join('~', '.cache', 'humbug-rss')) +OLDNESS_THRESHOLD = 30 # days + +usage = """Usage: Send summaries of RSS entries for your favorite feeds to Humbug. + +This bot requires the feedparser module. + +To use this script: + +1. Create an RSS feed file containing 1 feed URL per line (default feed + file location: ~/.cache/humbug-rss/rss-feeds) +2. Subscribe to the stream that will receive RSS updates (default stream: rss) +3. Test the script by running it manually, like this: + +/usr/local/share/humbug/demos/rss-bot + +You can customize the location on the feed file and recipient stream, e.g.: + +/usr/local/share/humbug/demos/rss-bot --feed-file=/path/to/my-feeds --stream=my-rss-stream + +4. Configure a crontab entry for this script. A sample crontab entry for +processing feeds stored in the default location and sending to the default +stream every 5 minutes is: + +*/5 * * * * /usr/local/share/humbug/demos/rss-bot""" + +parser = optparse.OptionParser(usage) +parser.add_option('--email', + dest='email', + help='The email address for your Humbug account.', + metavar='EMAIL') +parser.add_option('--api-key', + dest='api_key', + help='API key for that user [default: read ~/.humbug-api-key]', + action='store') +parser.add_option('--stream', + dest='stream', + help='The stream to which to send RSS messages.', + default="rss", + action='store') +parser.add_option('--data-dir', + dest='data_dir', + help='The directory where feed metadata is stored', + default=os.path.join(RSS_DATA_DIR), + action='store') +parser.add_option('--feed-file', + dest='feed_file', + help='The file containing a list of RSS feed URLs to follow, one URL per line', + default=os.path.join(RSS_DATA_DIR, "rss-feeds"), + action='store') +parser.add_option('--site', + dest='site', + default="https://humbughq.com", + help=optparse.SUPPRESS_HELP, + action='store') +(opts, args) = parser.parse_args() + +def mkdir_p(path): + # Python doesn't have an analog to `mkdir -p` < Python 3.2. + try: + os.makedirs(path) + except OSError, e: + if e.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + +try: + mkdir_p(opts.data_dir) +except OSError: + # We can't write to the logfile, so just print and give up. + print >>sys.stderr, "Unable to store RSS data at %s." % (opts.data_dir,) + exit(1) + +log_file = os.path.join(opts.data_dir, "rss-bot.log") +log_format = "%(asctime)s: %(message)s" +logging.basicConfig(format=log_format) + +formatter = logging.Formatter(log_format) +file_handler = logging.FileHandler(log_file) +file_handler.setFormatter(formatter) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +logger.addHandler(file_handler) + +def log_error_and_exit(error): + logger.error(error) + logger.error(usage) + exit(1) + +class MLStripper(HTMLParser): + def __init__(self): + self.reset() + self.fed = [] + + def handle_data(self, data): + self.fed.append(data) + + def get_data(self): + return ''.join(self.fed) + +def strip_tags(html): + stripper = MLStripper() + stripper.feed(html) + return stripper.get_data() + +def compute_entry_hash(entry): + entry_time = entry.get("published", entry.get("updated")) + return hashlib.md5(entry.id + entry_time).hexdigest() + +def send_humbug(entry, feed_name): + content = "**%s**\n%s\n%s" % (entry.title, + strip_tags(entry.summary), + entry.link) + message = {"type": "stream", + "sender": opts.email, + "to": opts.stream, + "subject": feed_name, + "content": content, + } + return client.send_message(message) + +try: + with open(opts.feed_file, "r") as f: + feed_urls = [feed.strip() for feed in f.readlines()] +except IOError: + log_error_and_exit("Unable to read feed file at %s." % (opts.feed_file,)) + +client = humbug.Client(email=opts.email, api_key=opts.api_key, + site=opts.site) + +first_message = True + +for feed_url in feed_urls: + feed_file = os.path.join(opts.data_dir, urlparse.urlparse(feed_url).netloc) + + try: + with open(feed_file, "r") as f: + old_feed_hashes = dict((line.strip(), True) for line in f.readlines()) + except IOError: + old_feed_hashes = {} + + new_hashes = [] + data = feedparser.parse(feed_url) + + for entry in data.entries: + entry_hash = compute_entry_hash(entry) + # An entry has either been published or updated. + entry_time = entry.get("published_parsed", entry.get("updated_parsed")) + if (time.time() - calendar.timegm(entry_time)) > OLDNESS_THRESHOLD * 60 * 60 * 24: + # As a safeguard against misbehaving feeds, don't try to process + # entries older than some threshold. + continue + if entry_hash in old_feed_hashes: + # We've already seen this. No need to process any older entries. + break + if (not old_feed_hashes) and (len(new_hashes) >= 3): + # On a first run, pick up the 3 most recent entries. An RSS feed has + # entries in reverse chronological order. + break + + response = send_humbug(entry, data.feed.title) + if response["result"] != "success": + logger.error("Error processing %s" % (feed_url,)) + logger.error(response) + if first_message: + # This is probably some fundamental problem like the stream not + # existing or something being misconfigured, so bail instead of + # getting the same error for every RSS entry. + log_error_and_exit("Failed to process first message") + # Go ahead and move on -- perhaps this entry is corrupt. + new_hashes.append(entry_hash) + first_message = False + + with open(feed_file, "a") as f: + for hash in new_hashes: + f.write(hash + "\n") + + logger.info("Sent humbugs for %d %s entries" % (len(new_hashes), feed_url))