//! poop-graph/src/extract.rs — extracts a dated table (`DataSet`) from the
//! text operations of a PDF document.

use std::{collections::HashMap, sync::Arc};
use lazy_static::lazy_static;
use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}};
use regex::Regex;
use time::Date;
/// Maximum |Δy| between a numeric value and the current row's date for the
/// value to be assigned to that row (see the baseline check in `extract`).
const POSITION_ERROR_MARGIN: f32 = 2.0;
lazy_static! {
    /// Matches a complete M/D/YYYY date, e.g. "3/14/2021".
    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
    /// Matches a cell consisting solely of an unsigned integer.
    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
    /// Matches runs of whitespace; used to normalize extracted text.
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}
/// Parse format for the dates matched by `DATE_REGEX` (no zero padding on
/// month/day, four-digit year).
const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] =
    time::macros::format_description!("[month padding:none]/[day padding:none]/[year]");
/// A table extracted from a PDF: named columns plus one `DataPoint` per date row.
pub struct DataSet {
    /// Column names, ordered left-to-right by their x position on the page.
    pub columns: Vec<Arc<String>>,
    /// One data point per date row, in document order.
    pub rows: Vec<DataPoint>,
}
/// Errors produced while extracting a `DataSet` from a PDF.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// The document's text ran out before any date row was found.
    #[error("PDF contained no data rows")]
    NoData,
}
impl DataSet {
    /// Extracts a tabular data set from `doc`.
    ///
    /// Scans the document's text runs in order. Runs recognized by
    /// `is_new_column_header` start a (possibly multi-run) column header;
    /// the first M/D/YYYY date ends the header phase. After that, each date
    /// starts a new row, and each integer on (approximately) the same
    /// baseline as the row's date is assigned to the nearest column whose
    /// x origin lies to its left.
    ///
    /// # Errors
    /// Returns [`Error::NoData`] if the document contains no date row.
    ///
    /// # Panics
    /// Panics if a run matching `DATE_REGEX` fails to parse with
    /// `DATE_PARSE_FORMAT`, or a run matching `VALUE_REGEX` overflows `u32`.
    pub fn extract<B: Backend>(doc: &pdf::file::File<B>) -> Result<Self, Error> {
        let mut doc_iter = DocumentIterator::new(doc).peekable();
        let mut columns: Vec<(Arc<String>, f32)> = Vec::new();
        let mut rows: Vec<DataPoint> = Vec::new();
        // Header phase: collect column names until the first date row appears.
        let (mut current_datapoint, mut current_y) = loop {
            if let Some(text) = doc_iter.next() {
                if is_new_column_header(&text.text) {
                    let mut column_name = text.text;
                    let column_x = text.point.x;
                    // A header may span several text runs; keep appending
                    // until the next header or the first date begins.
                    while let Some(more) = doc_iter.peek() {
                        if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
                            break;
                        }
                        column_name += " ";
                        column_name += &more.text;
                        doc_iter.next();
                    }
                    // Push unconditionally so a header that ends the document
                    // (iterator exhausted mid-header) is not silently dropped.
                    columns.push((Arc::new(column_name), column_x));
                } else if DATE_REGEX.is_match(&text.text) {
                    break (
                        DataPoint::new(&text).expect("Failed to parse date!"),
                        text.point.y,
                    );
                }
            } else {
                return Err(Error::NoData);
            }
        };
        // Order columns left-to-right so the rev().find() below selects the
        // right-most column starting left of a value.
        columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
        for text in doc_iter {
            if DATE_REGEX.is_match(&text.text) {
                // A new date begins the next row; archive the finished one.
                rows.push(std::mem::replace(
                    &mut current_datapoint,
                    DataPoint::new(&text).expect("Failed to parse date!"),
                ));
                current_y = text.point.y;
            } else if VALUE_REGEX.is_match(&text.text) {
                // Skip numbers that are not on the current row's baseline.
                if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN {
                    continue;
                }
                if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) {
                    current_datapoint.values.insert(
                        column.clone(),
                        text.text.parse().expect("Failed to parse value!"),
                    );
                }
            }
        }
        // The in-progress row is only archived when a *newer* date appears,
        // so flush it explicitly — otherwise the last data row is lost.
        rows.push(current_datapoint);
        Ok(Self {
            columns: columns.into_iter().map(|(column, _)| column).collect(),
            rows,
        })
    }
}
/// A single table row: a date plus the value found under each column.
pub struct DataPoint {
    pub date: Date,
    /// Values keyed by column name; a column is absent from the map when no
    /// number was found for it on this row.
    pub values: HashMap<Arc<String>, u32>,
}
impl DataPoint {
    /// Builds an empty data point whose date is parsed from the text run.
    ///
    /// # Errors
    /// Returns the `time` parse error when the run is not a valid
    /// `DATE_PARSE_FORMAT` date.
    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
        Date::parse(&text.text, DATE_PARSE_FORMAT).map(|date| Self {
            date,
            values: HashMap::new(),
        })
    }
}
/// A decoded text run together with the position it was drawn at.
struct DocumentText {
    point: Point,
    text: String,
}
/// Streams decoded text runs from every page of a PDF document.
struct DocumentIterator<'a> {
    /// Current text origin, updated by each `SetTextMatrix` operation.
    point: Point,
    /// Flattened content-stream operations across all pages.
    operations: Box<dyn Iterator<Item = Op> + 'a>,
}
impl<'a> DocumentIterator<'a> {
    /// Creates an iterator over the content-stream operations of every page
    /// in `document`. Pages that fail to load, have no contents, or whose
    /// operations fail to decode are skipped rather than surfaced as errors.
    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
        let operations = document.pages().flat_map(|page| {
            page.ok()
                .and_then(|page| page.contents.clone())
                .and_then(|contents| contents.operations(document).ok())
                .into_iter()
                .flatten()
        });
        Self {
            point: Point { x: 0.0, y: 0.0 },
            operations: Box::new(operations),
        }
    }
}
impl<'a> Iterator for DocumentIterator<'a> {
    type Item = DocumentText;

    /// Advances through the drawing operations, tracking the text origin,
    /// and yields the next successfully decoded text run. Runs that fail to
    /// decode are skipped; returns `None` when the operations run out.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.operations.next()? {
                // Remember where subsequent text will be drawn.
                Op::SetTextMatrix { matrix } => {
                    self.point = Point { x: matrix.e, y: matrix.f };
                }
                Op::TextDraw { text } => {
                    if let Ok(text) = text.to_string() {
                        return Some(DocumentText {
                            point: self.point,
                            text,
                        });
                    }
                }
                Op::TextDrawAdjusted { array } => {
                    if let Some(text) = concatenate_adjusted_text(array) {
                        return Some(DocumentText {
                            point: self.point,
                            text,
                        });
                    }
                }
                _ => {}
            }
        }
    }
}
/// Joins the text fragments of a `TextDrawAdjusted` array (non-text entries
/// are skipped), then trims the result and collapses every whitespace run to
/// a single space. Returns `None` if any fragment fails to decode.
fn concatenate_adjusted_text(array: Vec<TextDrawAdjusted>) -> Option<String> {
    let mut combined = String::new();
    for element in &array {
        if let TextDrawAdjusted::Text(fragment) = element {
            combined.push_str(fragment.to_string().ok()?.as_ref());
        }
    }
    Some(WHITESPACE_REGEX.replace_all(combined.trim(), " ").into_owned())
}
/// Returns `true` when the text run begins a new column header, i.e. starts
/// with one of the known region prefixes.
fn is_new_column_header(s: &str) -> bool {
    ["Northern", "Southern"]
        .iter()
        .any(|prefix| s.starts_with(prefix))
}