Add an example RSS bot that runs out of cron.
(imported from commit 81d7e010f6316dfbe0ce7b645a7b61dde2881483)
This commit is contained in:
parent
c34842397a
commit
4ea45de9bd
195
demos/rss-bot
Executable file
195
demos/rss-bot
Executable file
|
@ -0,0 +1,195 @@
|
|||
#!/usr/bin/env python
|
||||
import calendar
|
||||
import errno
|
||||
import hashlib
|
||||
from HTMLParser import HTMLParser
|
||||
import logging
|
||||
import optparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urlparse
|
||||
|
||||
import feedparser
|
||||
import humbug
|
||||
|
||||
RSS_DATA_DIR = os.path.expanduser(os.path.join('~', '.cache', 'humbug-rss'))
|
||||
OLDNESS_THRESHOLD = 30 # days
|
||||
|
||||
usage = """Usage: Send summaries of RSS entries for your favorite feeds to Humbug.
|
||||
|
||||
This bot requires the feedparser module.
|
||||
|
||||
To use this script:
|
||||
|
||||
1. Create an RSS feed file containing 1 feed URL per line (default feed
|
||||
file location: ~/.cache/humbug-rss/rss-feeds)
|
||||
2. Subscribe to the stream that will receive RSS updates (default stream: rss)
|
||||
3. Test the script by running it manually, like this:
|
||||
|
||||
/usr/local/share/humbug/demos/rss-bot
|
||||
|
||||
You can customize the location on the feed file and recipient stream, e.g.:
|
||||
|
||||
/usr/local/share/humbug/demos/rss-bot --feed-file=/path/to/my-feeds --stream=my-rss-stream
|
||||
|
||||
4. Configure a crontab entry for this script. A sample crontab entry for
|
||||
processing feeds stored in the default location and sending to the default
|
||||
stream every 5 minutes is:
|
||||
|
||||
*/5 * * * * /usr/local/share/humbug/demos/rss-bot"""
|
||||
|
||||
parser = optparse.OptionParser(usage)
|
||||
parser.add_option('--email',
|
||||
dest='email',
|
||||
help='The email address for your Humbug account.',
|
||||
metavar='EMAIL')
|
||||
parser.add_option('--api-key',
|
||||
dest='api_key',
|
||||
help='API key for that user [default: read ~/.humbug-api-key]',
|
||||
action='store')
|
||||
parser.add_option('--stream',
|
||||
dest='stream',
|
||||
help='The stream to which to send RSS messages.',
|
||||
default="rss",
|
||||
action='store')
|
||||
parser.add_option('--data-dir',
|
||||
dest='data_dir',
|
||||
help='The directory where feed metadata is stored',
|
||||
default=os.path.join(RSS_DATA_DIR),
|
||||
action='store')
|
||||
parser.add_option('--feed-file',
|
||||
dest='feed_file',
|
||||
help='The file containing a list of RSS feed URLs to follow, one URL per line',
|
||||
default=os.path.join(RSS_DATA_DIR, "rss-feeds"),
|
||||
action='store')
|
||||
parser.add_option('--site',
|
||||
dest='site',
|
||||
default="https://humbughq.com",
|
||||
help=optparse.SUPPRESS_HELP,
|
||||
action='store')
|
||||
(opts, args) = parser.parse_args()
|
||||
|
||||
def mkdir_p(path):
|
||||
# Python doesn't have an analog to `mkdir -p` < Python 3.2.
|
||||
try:
|
||||
os.makedirs(path)
|
||||
except OSError, e:
|
||||
if e.errno == errno.EEXIST and os.path.isdir(path):
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
|
||||
try:
|
||||
mkdir_p(opts.data_dir)
|
||||
except OSError:
|
||||
# We can't write to the logfile, so just print and give up.
|
||||
print >>sys.stderr, "Unable to store RSS data at %s." % (opts.data_dir,)
|
||||
exit(1)
|
||||
|
||||
log_file = os.path.join(opts.data_dir, "rss-bot.log")
|
||||
log_format = "%(asctime)s: %(message)s"
|
||||
logging.basicConfig(format=log_format)
|
||||
|
||||
formatter = logging.Formatter(log_format)
|
||||
file_handler = logging.FileHandler(log_file)
|
||||
file_handler.setFormatter(formatter)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
def log_error_and_exit(error):
|
||||
logger.error(error)
|
||||
logger.error(usage)
|
||||
exit(1)
|
||||
|
||||
class MLStripper(HTMLParser):
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
self.fed = []
|
||||
|
||||
def handle_data(self, data):
|
||||
self.fed.append(data)
|
||||
|
||||
def get_data(self):
|
||||
return ''.join(self.fed)
|
||||
|
||||
def strip_tags(html):
|
||||
stripper = MLStripper()
|
||||
stripper.feed(html)
|
||||
return stripper.get_data()
|
||||
|
||||
def compute_entry_hash(entry):
|
||||
entry_time = entry.get("published", entry.get("updated"))
|
||||
return hashlib.md5(entry.id + entry_time).hexdigest()
|
||||
|
||||
def send_humbug(entry, feed_name):
|
||||
content = "**%s**\n%s\n%s" % (entry.title,
|
||||
strip_tags(entry.summary),
|
||||
entry.link)
|
||||
message = {"type": "stream",
|
||||
"sender": opts.email,
|
||||
"to": opts.stream,
|
||||
"subject": feed_name,
|
||||
"content": content,
|
||||
}
|
||||
return client.send_message(message)
|
||||
|
||||
try:
|
||||
with open(opts.feed_file, "r") as f:
|
||||
feed_urls = [feed.strip() for feed in f.readlines()]
|
||||
except IOError:
|
||||
log_error_and_exit("Unable to read feed file at %s." % (opts.feed_file,))
|
||||
|
||||
client = humbug.Client(email=opts.email, api_key=opts.api_key,
|
||||
site=opts.site)
|
||||
|
||||
first_message = True
|
||||
|
||||
for feed_url in feed_urls:
|
||||
feed_file = os.path.join(opts.data_dir, urlparse.urlparse(feed_url).netloc)
|
||||
|
||||
try:
|
||||
with open(feed_file, "r") as f:
|
||||
old_feed_hashes = dict((line.strip(), True) for line in f.readlines())
|
||||
except IOError:
|
||||
old_feed_hashes = {}
|
||||
|
||||
new_hashes = []
|
||||
data = feedparser.parse(feed_url)
|
||||
|
||||
for entry in data.entries:
|
||||
entry_hash = compute_entry_hash(entry)
|
||||
# An entry has either been published or updated.
|
||||
entry_time = entry.get("published_parsed", entry.get("updated_parsed"))
|
||||
if (time.time() - calendar.timegm(entry_time)) > OLDNESS_THRESHOLD * 60 * 60 * 24:
|
||||
# As a safeguard against misbehaving feeds, don't try to process
|
||||
# entries older than some threshold.
|
||||
continue
|
||||
if entry_hash in old_feed_hashes:
|
||||
# We've already seen this. No need to process any older entries.
|
||||
break
|
||||
if (not old_feed_hashes) and (len(new_hashes) >= 3):
|
||||
# On a first run, pick up the 3 most recent entries. An RSS feed has
|
||||
# entries in reverse chronological order.
|
||||
break
|
||||
|
||||
response = send_humbug(entry, data.feed.title)
|
||||
if response["result"] != "success":
|
||||
logger.error("Error processing %s" % (feed_url,))
|
||||
logger.error(response)
|
||||
if first_message:
|
||||
# This is probably some fundamental problem like the stream not
|
||||
# existing or something being misconfigured, so bail instead of
|
||||
# getting the same error for every RSS entry.
|
||||
log_error_and_exit("Failed to process first message")
|
||||
# Go ahead and move on -- perhaps this entry is corrupt.
|
||||
new_hashes.append(entry_hash)
|
||||
first_message = False
|
||||
|
||||
with open(feed_file, "a") as f:
|
||||
for hash in new_hashes:
|
||||
f.write(hash + "\n")
|
||||
|
||||
logger.info("Sent humbugs for %d %s entries" % (len(new_hashes), feed_url))
|
Loading…
Reference in a new issue