#!/usr/bin/env python3
#
# RSS integration for Zulip
#

import calendar
import errno
import hashlib
from html.parser import HTMLParser
import logging
import argparse
import os
import re
import sys
import time
import urllib.parse
from typing import Dict, List, Optional, Any

import feedparser
import zulip

VERSION = "0.9"  # type: str
RSS_DATA_DIR = os.path.expanduser(os.path.join('~', '.cache', 'zulip-rss'))  # type: str
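# Skip entries older than this many days, as a safeguard against misbehaving feeds.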
OLDNESS_THRESHOLD = 30  # type: int

usage = """Usage: Send summaries of RSS entries for your favorite feeds to Zulip.

This bot requires the feedparser module.

To use this script:

1. Create an RSS feed file containing 1 feed URL per line (default feed
   file location: ~/.cache/zulip-rss/rss-feeds)
2. Subscribe to the stream that will receive RSS updates (default stream: rss)
3. Create a ~/.zuliprc as described on https://zulip.com/api/configuring-python-bindings
4. Test the script by running it manually, like this:

/usr/local/share/zulip/integrations/rss/rss-bot

You can customize the location of the feed file and recipient stream, e.g.:

/usr/local/share/zulip/integrations/rss/rss-bot --feed-file=/path/to/my-feeds --stream=my-rss-stream

5. Configure a crontab entry for this script. A sample crontab entry for
processing feeds stored in the default location and sending to the default
stream every 5 minutes is:

*/5 * * * * /usr/local/share/zulip/integrations/rss/rss-bot"""
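
# A feed file is plain text with one URL per line; a purely hypothetical example:
#
#     https://blog.example.com/rss
#     https://news.example.org/feed.xml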

parser = zulip.add_default_arguments(argparse.ArgumentParser(usage=usage))  # type: argparse.ArgumentParser
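# add_default_arguments() also installs the standard Zulip connection options,
# read below as opts.zulip_email, opts.zulip_api_key, opts.zulip_config_file,
# and opts.zulip_site.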
parser.add_argument('--stream',
                    dest='stream',
                    help='The stream to which to send RSS messages.',
                    default="rss",
                    action='store')
parser.add_argument('--data-dir',
                    dest='data_dir',
                    help='The directory where feed metadata is stored',
                    default=RSS_DATA_DIR,
                    action='store')
parser.add_argument('--feed-file',
                    dest='feed_file',
                    help='The file containing a list of RSS feed URLs to follow, one URL per line',
                    default=os.path.join(RSS_DATA_DIR, "rss-feeds"),
                    action='store')
parser.add_argument('--unwrap',
                    dest='unwrap',
                    action='store_true',
                    help='Convert word-wrapped paragraphs into single lines',
                    default=False)
parser.add_argument('--math',
                    dest='math',
                    action='store_true',
                    help='Convert $ to $$ (for KaTeX processing)',
                    default=False)

opts = parser.parse_args()  # type: Any

def mkdir_p(path: str) -> None:
    # Python < 3.2 has no analog to `mkdir -p`, so emulate it: ignore the
    # "already exists" error. (On Python >= 3.2, os.makedirs(path,
    # exist_ok=True) does the same thing.)
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

try:
    mkdir_p(opts.data_dir)
except OSError:
    # We can't write to the logfile, so just print and give up.
    print("Unable to store RSS data at %s." % (opts.data_dir,), file=sys.stderr)
    sys.exit(1)
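# Log to stderr (via basicConfig) as well as to a file in the data directory.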
log_file = os.path.join(opts.data_dir, "rss-bot.log")  # type: str
log_format = "%(asctime)s: %(message)s"  # type: str
logging.basicConfig(format=log_format)

formatter = logging.Formatter(log_format)  # type: logging.Formatter
file_handler = logging.FileHandler(log_file)  # type: logging.FileHandler
file_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)  # type: logging.Logger
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)

def log_error_and_exit(error: str) -> None:
    logger.error(error)
    logger.error(usage)
    sys.exit(1)
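# Reduce an HTML fragment to its text content: the parser drops the tags and
# collects only the data nodes passed to handle_data().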
class MLStripper(HTMLParser):
    def __init__(self) -> None:
        super().__init__()  # HTMLParser must initialize its own state before feed() is called
        self.reset()
        self.fed = []  # type: List[str]

    def handle_data(self, data: str) -> None:
        self.fed.append(data)

    def get_data(self) -> str:
        return ''.join(self.fed)

def strip_tags(html: str) -> str:
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
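# Fingerprint an entry by its id (or link) plus its published/updated time, so
# an entry whose timestamp changes is treated as new and re-sent.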
def compute_entry_hash(entry: Dict[str, Any]) -> str:
    entry_time = entry.get("published", entry.get("updated"))
    entry_id = entry.get("id", entry.get("link"))
    return hashlib.md5((entry_id + str(entry_time)).encode()).hexdigest()
def unwrap_text(body: str) -> str:
    # Replace \n by space if it is preceded and followed by a non-\n.
    # Example: '\na\nb\nc\n\nd\n' -> '\na b c\n\nd\n'
    return re.sub('(?<=[^\n])\n(?=[^\n])', ' ', body)
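# Zulip topic names are limited to 60 characters; truncate longer feed names
# with an ellipsis.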
def elide_subject(subject: str) -> str:
    MAX_TOPIC_LENGTH = 60
    if len(subject) > MAX_TOPIC_LENGTH:
        subject = subject[:MAX_TOPIC_LENGTH - 3].rstrip() + '...'
    return subject
def send_zulip(entry: Any, feed_name: str) -> Dict[str, Any]:
    body = entry.summary  # type: str
    if opts.unwrap:
        body = unwrap_text(body)

    content = "**[%s](%s)**\n%s\n%s" % (entry.title,
                                        entry.link,
                                        strip_tags(body),
                                        entry.link)  # type: str

    if opts.math:
        content = content.replace('$', '$$')

    message = {"type": "stream",
               "sender": opts.zulip_email,
               "to": opts.stream,
               "subject": elide_subject(feed_name),
               "content": content,
               }  # type: Dict[str, str]
    return client.send_message(message)
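# Read the newline-delimited list of feed URLs to poll.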
try:
    with open(opts.feed_file) as f:
        feed_urls = [feed.strip() for feed in f.readlines()]  # type: List[str]
except OSError:
    log_error_and_exit("Unable to read feed file at %s." % (opts.feed_file,))
client = zulip.Client(email=opts.zulip_email, api_key=opts.zulip_api_key,
                      config_file=opts.zulip_config_file,
                      site=opts.zulip_site, client="ZulipRSS/" + VERSION)  # type: zulip.Client
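# Lets us bail out if the very first send fails; see the error handling below.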
first_message = True  # type: bool
for feed_url in feed_urls:
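    # Each feed's state is a file named for the feed's hostname; it holds the
    # hashes of entries that have already been sent.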
    feed_file = os.path.join(opts.data_dir, urllib.parse.urlparse(feed_url).netloc)  # type: str

    try:
        with open(feed_file) as f:
            old_feed_hashes = {line.strip(): True for line in f.readlines()}  # type: Dict[str, bool]
    except OSError:
        old_feed_hashes = {}

    new_hashes = []  # type: List[str]
    data = feedparser.parse(feed_url)  # type: feedparser.FeedParserDict

    for entry in data.entries:
        entry_hash = compute_entry_hash(entry)  # type: str
        # An entry has either been published or updated.
        entry_time = entry.get("published_parsed", entry.get("updated_parsed"))  # type: Optional[time.struct_time]
        if entry_time is not None and (time.time() - calendar.timegm(entry_time)) > OLDNESS_THRESHOLD * 60 * 60 * 24:
            # As a safeguard against misbehaving feeds, don't try to process
            # entries older than some threshold.
            continue

        if entry_hash in old_feed_hashes:
            # We've already seen this. No need to process any older entries.
            break

        if (not old_feed_hashes) and (len(new_hashes) >= 3):
            # On a first run, pick up the 3 most recent entries. An RSS feed has
            # entries in reverse chronological order.
            break

        feed_name = data.feed.title or feed_url  # type: str

        response = send_zulip(entry, feed_name)  # type: Dict[str, Any]
        if response["result"] != "success":
            logger.error("Error processing %s" % (feed_url,))
            logger.error(str(response))
            if first_message:
                # This is probably some fundamental problem like the stream not
                # existing or something being misconfigured, so bail instead of
                # getting the same error for every RSS entry.
                log_error_and_exit("Failed to process first message")
        # Go ahead and move on -- perhaps this entry is corrupt.
        new_hashes.append(entry_hash)
        first_message = False

    with open(feed_file, "a") as f:
        for hash in new_hashes:
            f.write(hash + "\n")

    logger.info("Sent zulips for %d entries from feed %s" % (len(new_hashes), feed_url))