black: Reformat skipping string normalization.
This commit is contained in:
parent
5580c68ae5
commit
fba21bb00d
178 changed files with 6562 additions and 4469 deletions
|
@ -48,35 +48,48 @@ stream every 5 minutes is:
|
|||
|
||||
*/5 * * * * /usr/local/share/zulip/integrations/rss/rss-bot"""
|
||||
|
||||
# Build the command-line parser: start from an ArgumentParser carrying the
# usage text, pass it through zulip.add_default_arguments, then register the
# RSS-specific options listed in the table below.
parser = zulip.add_default_arguments(
    argparse.ArgumentParser(usage)
)  # type: argparse.ArgumentParser

# One (flag, add_argument keyword arguments) pair per option, in the order
# they are registered.
rss_options = [
    (
        '--stream',
        dict(
            dest='stream',
            help='The stream to which to send RSS messages.',
            default="rss",
            action='store',
        ),
    ),
    (
        '--data-dir',
        dict(
            dest='data_dir',
            help='The directory where feed metadata is stored',
            default=os.path.join(RSS_DATA_DIR),
            action='store',
        ),
    ),
    (
        '--feed-file',
        dict(
            dest='feed_file',
            help='The file containing a list of RSS feed URLs to follow, one URL per line',
            default=os.path.join(RSS_DATA_DIR, "rss-feeds"),
            action='store',
        ),
    ),
    (
        '--unwrap',
        dict(
            dest='unwrap',
            action='store_true',
            help='Convert word-wrapped paragraphs into single lines',
            default=False,
        ),
    ),
    (
        '--math',
        dict(
            dest='math',
            action='store_true',
            help='Convert $ to $$ (for KaTeX processing)',
            default=False,
        ),
    ),
]
for flag, option_kwargs in rss_options:
    parser.add_argument(flag, **option_kwargs)

# Parsed once at import time; the functions below read options from `opts`.
opts = parser.parse_args()  # type: Any
|
||||
|
||||
|
||||
def mkdir_p(path: str) -> None:
|
||||
# Python doesn't have an analog to `mkdir -p` < Python 3.2.
|
||||
try:
|
||||
|
@ -87,6 +100,7 @@ def mkdir_p(path: str) -> None:
|
|||
else:
|
||||
raise
|
||||
|
||||
|
||||
try:
|
||||
mkdir_p(opts.data_dir)
|
||||
except OSError:
|
||||
|
@ -106,11 +120,13 @@ logger = logging.getLogger(__name__) # type: logging.Logger
|
|||
logger.setLevel(logging.DEBUG)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
|
||||
def log_error_and_exit(error: str) -> None:
    """Log *error* and the usage text at ERROR level, then exit with status 1.

    Args:
        error: Message describing why the script cannot continue.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    # Imported locally so this function does not depend on the file's import block.
    import sys

    logger.error(error)
    logger.error(usage)
    # sys.exit instead of the bare `exit` builtin: `exit` is injected by the
    # `site` module for interactive sessions and is absent under `python -S`.
    sys.exit(1)
|
||||
|
||||
|
||||
class MLStripper(HTMLParser):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
@ -123,57 +139,70 @@ class MLStripper(HTMLParser):
|
|||
def get_data(self) -> str:
|
||||
return ''.join(self.fed)
|
||||
|
||||
|
||||
def strip_tags(html: str) -> str:
    """Feed *html* through a fresh MLStripper and return the data it collected."""
    tag_stripper = MLStripper()
    tag_stripper.feed(html)
    collected = tag_stripper.get_data()
    return collected
|
||||
|
||||
|
||||
def compute_entry_hash(entry: Dict[str, Any]) -> str:
    """Return an MD5 hex digest identifying a feed entry.

    The digest covers the entry's identifier ("id", falling back to "link")
    concatenated with its timestamp ("published", falling back to "updated").

    Args:
        entry: Mapping of feed-entry fields.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    entry_time = entry.get("published", entry.get("updated"))
    entry_id = entry.get("id", entry.get("link"))
    if entry_id is None:
        # An entry with neither "id" nor "link" used to raise TypeError on the
        # concatenation below; hash the timestamp alone instead.
        entry_id = ""
    # str() on the timestamp keeps a missing one hashing as "None", as before;
    # str() on the identifier is a no-op for the usual string ids.
    return hashlib.md5((str(entry_id) + str(entry_time)).encode()).hexdigest()
|
||||
|
||||
|
||||
def unwrap_text(body: str) -> str:
    # Join word-wrapped lines: a newline becomes a space only when the
    # characters on both sides of it exist and are not newlines themselves.
    # Runs of two or more newlines, and newlines at either end of the text,
    # are left alone.
    # Example: '\na\nb\nc\n\nd\n' -> '\na b c\n\nd\n'
    lone_newline = re.compile('(?<=[^\n])\n(?=[^\n])')
    return lone_newline.sub(' ', body)
|
||||
|
||||
|
||||
def elide_subject(subject: str) -> str:
    """Shorten *subject* to at most 60 characters.

    A subject within the limit is returned unchanged. A longer one is cut to
    its first 57 characters, stripped of trailing whitespace, and suffixed
    with '...'.
    """
    MAX_TOPIC_LENGTH = 60
    if len(subject) <= MAX_TOPIC_LENGTH:
        return subject
    kept = subject[: MAX_TOPIC_LENGTH - 3].rstrip()
    return kept + '...'
|
||||
|
||||
|
||||
def send_zulip(entry: Any, feed_name: str) -> Dict[str, Any]:
    """Format one feed entry as a stream message and send it via `client`.

    The content is a bold Markdown link built from the entry's title and
    link, then the tag-stripped summary, then the bare link. The summary is
    unwrapped first when `opts.unwrap` is set, and every '$' in the finished
    content is doubled when `opts.math` is set.

    Args:
        entry: Feed entry exposing `summary`, `title` and `link` attributes.
        feed_name: Used as the message subject after eliding.

    Returns:
        Whatever `client.send_message` returns.
    """
    summary = entry.summary  # type: str
    if opts.unwrap:
        summary = unwrap_text(summary)

    template = "**[%s](%s)**\n%s\n%s"
    content = template % (
        entry.title,
        entry.link,
        strip_tags(summary),
        entry.link,
    )  # type: str

    if opts.math:
        content = content.replace('$', '$$')

    message = dict(
        type="stream",
        sender=opts.zulip_email,
        to=opts.stream,
        subject=elide_subject(feed_name),
        content=content,
    )  # type: Dict[str, str]
    return client.send_message(message)
|
||||
|
||||
|
||||
# Load the feed URLs, one per line, from the configured feed file. If the file
# cannot be read, log the problem and exit.
try:
    with open(opts.feed_file) as f:
        # Skip blank lines (for example a trailing empty line) so an empty
        # string is never treated as a feed URL.
        feed_urls = [feed.strip() for feed in f if feed.strip()]  # type: List[str]
except OSError:
    log_error_and_exit("Unable to read feed file at %s." % (opts.feed_file,))

# API client built from the parsed command-line options; the client string
# identifies this integration and its version.
client = zulip.Client(
    email=opts.zulip_email,
    api_key=opts.zulip_api_key,
    config_file=opts.zulip_config_file,
    site=opts.zulip_site,
    client="ZulipRSS/" + VERSION,
)  # type: zulip.Client

first_message = True  # type: bool
|
||||
|
||||
|
@ -182,7 +211,9 @@ for feed_url in feed_urls:
|
|||
|
||||
try:
|
||||
with open(feed_file) as f:
|
||||
old_feed_hashes = {line.strip(): True for line in f.readlines()} # type: Dict[str, bool]
|
||||
old_feed_hashes = {
|
||||
line.strip(): True for line in f.readlines()
|
||||
} # type: Dict[str, bool]
|
||||
except OSError:
|
||||
old_feed_hashes = {}
|
||||
|
||||
|
@ -192,8 +223,13 @@ for feed_url in feed_urls:
|
|||
for entry in data.entries:
|
||||
entry_hash = compute_entry_hash(entry) # type: str
|
||||
# An entry has either been published or updated.
|
||||
entry_time = entry.get("published_parsed", entry.get("updated_parsed")) # type: Tuple[int, int]
|
||||
if entry_time is not None and (time.time() - calendar.timegm(entry_time)) > OLDNESS_THRESHOLD * 60 * 60 * 24:
|
||||
entry_time = entry.get(
|
||||
"published_parsed", entry.get("updated_parsed")
|
||||
) # type: Tuple[int, int]
|
||||
if (
|
||||
entry_time is not None
|
||||
and (time.time() - calendar.timegm(entry_time)) > OLDNESS_THRESHOLD * 60 * 60 * 24
|
||||
):
|
||||
# As a safeguard against misbehaving feeds, don't try to process
|
||||
# entries older than some threshold.
|
||||
continue
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue