2022-04-05 18:21:33 -04:00
|
|
|
use std::time::{Duration, Instant, SystemTime};
|
|
|
|
|
|
|
|
use futures::{sink::SinkExt, TryStreamExt};
|
|
|
|
use lazy_static::lazy_static;
|
2022-04-16 00:03:09 -04:00
|
|
|
use log::info;
|
2022-04-05 18:21:33 -04:00
|
|
|
use reqwest::Url;
|
|
|
|
use scraper::Selector;
|
|
|
|
use time::PrimitiveDateTime;
|
|
|
|
use tokio_util::codec;
|
|
|
|
|
|
|
|
lazy_static! {
    /// Page that is scraped for the link to the current data PDF; relative
    /// links found on it are resolved against this URL.
    static ref CHARTS_URL: Url = Url::parse("https://www.mwra.com/biobot/biobotdata.htm").unwrap();

    /// CSS selector matching anchors whose `href` ends in `-data.pdf`.
    static ref PDF_SEL: Selector = Selector::parse(r#"a[href$="-data.pdf"]"#).unwrap();

    /// Minimum time that must pass between two checks of the origin server
    /// (five minutes).
    static ref MIN_CHECK_INTERVAL: Duration = Duration::from_secs(5 * 60);
}
|
|
|
|
|
|
|
|
/// Path of the on-disk copy of the data PDF. It is a relative path, so it is
/// resolved against the process's current working directory.
const CACHED_PDF_PATH: &str = "data.pdf";
|
|
|
|
/// `time` format description used to parse the value of an HTTP
/// `Last-Modified` header, e.g. `Wed, 21 Oct 2015 07:28:00 GMT`.
///
/// The trailing `GMT` is matched literally; callers treat the parsed value as
/// UTC.
const LAST_MODIFIED_FORMAT: &[time::format_description::FormatItem<'static>] =
    time::macros::format_description!(
        "[weekday repr:short], [day] [month repr:short] [year] [hour repr:24]:[minute]:[second] GMT"
    );
|
|
|
|
|
|
|
|
#[derive(thiserror::Error, Debug)]
|
|
|
|
pub enum Error {
|
|
|
|
#[error("HTTP request failed")]
|
|
|
|
Http(#[from] reqwest::Error),
|
|
|
|
#[error("couldn't parse URL")]
|
|
|
|
UrlParse(#[from] url::ParseError),
|
|
|
|
#[error("couldn't read PDF")]
|
|
|
|
Pdf(#[from] pdf::error::PdfError),
|
|
|
|
#[error("I/O error")]
|
|
|
|
Io(#[from] std::io::Error),
|
|
|
|
#[error("system clock anomaly")]
|
|
|
|
SystemClock(#[from] std::time::SystemTimeError),
|
|
|
|
#[error("data not found")]
|
|
|
|
NotFound,
|
|
|
|
#[error("requesting data too soon")]
|
|
|
|
TooSoon,
|
|
|
|
}
|
|
|
|
|
|
|
|
pub struct PdfFetcher {
|
|
|
|
client: reqwest::Client,
|
|
|
|
last_checked: Option<Instant>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl PdfFetcher {
|
|
|
|
pub fn new() -> Result<Self, Error> {
|
|
|
|
Ok(Self {
|
|
|
|
client: reqwest::ClientBuilder::new().build()?,
|
|
|
|
last_checked: None,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
pub async fn fetch(&mut self) -> Result<pdf::file::File<Vec<u8>>, Error> {
|
2022-04-16 00:03:09 -04:00
|
|
|
info!("Fetching data PDF");
|
|
|
|
|
2022-04-05 18:21:33 -04:00
|
|
|
let cache_modtime = match tokio::fs::File::open(CACHED_PDF_PATH).await {
|
|
|
|
Ok(file) => Some(
|
|
|
|
file.metadata()
|
|
|
|
.await?
|
|
|
|
.modified()?
|
|
|
|
.duration_since(SystemTime::UNIX_EPOCH)?
|
|
|
|
.as_secs(),
|
|
|
|
),
|
|
|
|
Err(_) => None,
|
|
|
|
};
|
|
|
|
|
2022-04-05 18:50:55 -04:00
|
|
|
let now = Instant::now();
|
2022-04-05 18:21:33 -04:00
|
|
|
if let Some(instant) = self.last_checked {
|
2022-04-05 18:50:55 -04:00
|
|
|
if now - instant < *MIN_CHECK_INTERVAL {
|
2022-04-05 18:21:33 -04:00
|
|
|
return if cache_modtime.is_some() {
|
2022-04-16 00:03:09 -04:00
|
|
|
info!("Already checked origin recently, not rechecking");
|
2022-04-05 18:21:33 -04:00
|
|
|
self.cached_pdf()
|
|
|
|
} else {
|
|
|
|
Err(Error::TooSoon)
|
|
|
|
};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let body = self
|
|
|
|
.client
|
|
|
|
.get(CHARTS_URL.as_ref())
|
|
|
|
.send()
|
|
|
|
.await?
|
|
|
|
.text()
|
|
|
|
.await?;
|
|
|
|
let document = scraper::Html::parse_document(&body);
|
|
|
|
let pdf_href = document
|
|
|
|
.select(&PDF_SEL)
|
|
|
|
.next()
|
|
|
|
.ok_or(Error::NotFound)?
|
|
|
|
.value()
|
|
|
|
.attr("href")
|
|
|
|
.ok_or(Error::NotFound)?;
|
|
|
|
let pdf_url = CHARTS_URL.join(pdf_href)?;
|
|
|
|
|
|
|
|
let origin_modtime = self
|
|
|
|
.client
|
|
|
|
.head(pdf_url.clone())
|
|
|
|
.send()
|
|
|
|
.await?
|
|
|
|
.headers()
|
|
|
|
.get(reqwest::header::LAST_MODIFIED)
|
|
|
|
.and_then(|val| {
|
|
|
|
u64::try_from(
|
|
|
|
PrimitiveDateTime::parse(val.to_str().ok()?, LAST_MODIFIED_FORMAT)
|
|
|
|
.ok()?
|
|
|
|
.assume_utc()
|
|
|
|
.unix_timestamp(),
|
|
|
|
)
|
|
|
|
.ok()
|
|
|
|
});
|
|
|
|
|
|
|
|
let outdated = cache_modtime
|
|
|
|
.zip(origin_modtime)
|
|
|
|
.map_or(true, |(cache, origin)| origin > cache);
|
|
|
|
|
|
|
|
if outdated {
|
2022-04-16 00:03:09 -04:00
|
|
|
info!("Cached PDF is outdated, downloading latest version");
|
|
|
|
|
2022-04-05 18:21:33 -04:00
|
|
|
let mut pdf_stream = self
|
|
|
|
.client
|
|
|
|
.get(pdf_url)
|
|
|
|
.send()
|
|
|
|
.await?
|
|
|
|
.bytes_stream()
|
|
|
|
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e));
|
|
|
|
|
|
|
|
let cache_file = tokio::fs::File::create(CACHED_PDF_PATH).await?;
|
|
|
|
let mut sink = codec::FramedWrite::new(cache_file, codec::BytesCodec::new());
|
|
|
|
sink.send_all(&mut pdf_stream).await?;
|
|
|
|
<dyn futures::Sink<bytes::Bytes, Error = std::io::Error> + Unpin>::close(&mut sink)
|
|
|
|
.await?;
|
2022-04-16 00:03:09 -04:00
|
|
|
} else {
|
|
|
|
info!("Cached PDF is already up to date");
|
2022-04-05 18:21:33 -04:00
|
|
|
}
|
2022-04-05 18:50:55 -04:00
|
|
|
self.last_checked = Some(now);
|
2022-04-05 18:21:33 -04:00
|
|
|
self.cached_pdf()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn cached_pdf(&self) -> Result<pdf::file::File<Vec<u8>>, Error> {
|
|
|
|
Ok(pdf::file::File::open(CACHED_PDF_PATH)?)
|
|
|
|
}
|
|
|
|
}
|