// poop-graph/src/fetch.rs (152 lines, 4.8 KiB, Rust)

use std::{
path::PathBuf,
time::{Duration, Instant, SystemTime},
};
use futures::{sink::SinkExt, TryStreamExt};
use lazy_static::lazy_static;
use log::{info, warn};
use reqwest::Url;
use scraper::Selector;
use time::PrimitiveDateTime;
use tokio_util::codec;
lazy_static! {
    /// Page that is scraped for the link to the data PDF.
    static ref CHARTS_URL: Url =
        Url::parse("https://www.mwra.com/biobot/biobotdata.htm").unwrap();

    /// CSS selector matching anchors whose `href` ends in `-data.pdf`.
    static ref PDF_SEL: Selector =
        Selector::parse(r#"a[href$="-data.pdf"]"#).unwrap();

    /// Minimum time between two checks of the origin server (five minutes).
    static ref MIN_CHECK_INTERVAL: Duration = Duration::from_secs(5 * 60);
}
/// Format used to parse the `Last-Modified` response header, e.g.
/// `Wed, 21 Oct 2015 07:28:00 GMT`. The trailing `GMT` is matched literally.
const LAST_MODIFIED_FORMAT: &'static [time::format_description::FormatItem<'static>] =
    time::macros::format_description!(
        "[weekday repr:short], [day] [month repr:short] [year] [hour repr:24]:[minute]:[second] GMT"
    );
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("HTTP request failed")]
Http(#[from] reqwest::Error),
#[error("couldn't parse URL")]
UrlParse(#[from] url::ParseError),
#[error("couldn't read PDF")]
Pdf(#[from] pdf::error::PdfError),
#[error("I/O error")]
Io(#[from] std::io::Error),
#[error("system clock anomaly")]
SystemClock(#[from] std::time::SystemTimeError),
#[error("data not found")]
NotFound,
#[error("requesting data too soon")]
TooSoon,
}
/// Fetches the data PDF from the origin site and keeps a copy of it on disk.
///
/// The origin is contacted at most once per `MIN_CHECK_INTERVAL`; in between,
/// the on-disk copy is served.
#[derive(Debug)]
pub struct PdfFetcher {
    /// HTTP client reused for every request.
    client: reqwest::Client,
    /// Location of the on-disk copy of the PDF.
    cached_pdf_path: PathBuf,
    /// When the origin was last checked to completion; `None` until the first check.
    last_checked: Option<Instant>,
}
impl PdfFetcher {
pub fn new(cached_pdf_path: PathBuf) -> Result<Self, Error> {
Ok(Self {
client: reqwest::ClientBuilder::new().build()?,
cached_pdf_path,
last_checked: None,
})
}
pub async fn fetch(&mut self) -> Result<pdf::file::File<Vec<u8>>, Error> {
info!("Fetching data PDF");
let cache_modtime = match tokio::fs::File::open(&self.cached_pdf_path).await {
Ok(file) => Some(
file.metadata()
.await?
.modified()?
.duration_since(SystemTime::UNIX_EPOCH)?
.as_secs(),
),
Err(_) => None,
};
let now = Instant::now();
if let Some(instant) = self.last_checked {
if now - instant < *MIN_CHECK_INTERVAL {
return if cache_modtime.is_some() {
info!("Already checked origin recently, not rechecking");
self.cached_pdf()
} else {
Err(Error::TooSoon)
};
}
}
let body = self
.client
.get(CHARTS_URL.as_ref())
.send()
.await?
.text()
.await?;
let document = scraper::Html::parse_document(&body);
let pdf_href = document
.select(&PDF_SEL)
.next()
.ok_or(Error::NotFound)?
.value()
.attr("href")
.ok_or(Error::NotFound)?;
let pdf_url = CHARTS_URL.join(pdf_href)?;
let head_resp = self.client.head(pdf_url.clone()).send().await?;
let origin_modtime = head_resp
.headers()
.get(reqwest::header::LAST_MODIFIED)
.and_then(|val| {
u64::try_from(
PrimitiveDateTime::parse(val.to_str().ok()?, LAST_MODIFIED_FORMAT)
.ok()?
.assume_utc()
.unix_timestamp(),
)
.ok()
});
let outdated = cache_modtime
.zip(origin_modtime)
.map_or(true, |(cache, origin)| origin > cache);
if !head_resp.status().is_success() {
warn!(
"MWRA server returned unexpected response, not fetching updated PDF: {:?}",
head_resp
);
// Just use the cached PDF, assuming we have it
} else if outdated {
info!("Cached PDF is outdated, downloading latest version");
let mut pdf_stream = self
.client
.get(pdf_url)
.send()
.await?
.bytes_stream()
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e));
let cache_file = tokio::fs::File::create(&self.cached_pdf_path).await?;
let mut sink = codec::FramedWrite::new(cache_file, codec::BytesCodec::new());
sink.send_all(&mut pdf_stream).await?;
<dyn futures::Sink<bytes::Bytes, Error = std::io::Error> + Unpin>::close(&mut sink)
.await?;
} else {
info!("Cached PDF is already up to date");
}
self.last_checked = Some(now);
self.cached_pdf()
}
fn cached_pdf(&self) -> Result<pdf::file::File<Vec<u8>>, Error> {
Ok(pdf::file::File::open(&self.cached_pdf_path)?)
}
}