//! poop-graph/src/extract.rs — extracts a dated table (`DataSet`) from the
//! text operations of a PDF document.

use std::{collections::HashMap, sync::Arc};
use lazy_static::lazy_static;
use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}};
use regex::Regex;
use time::Date;
/// Maximum |Δy| between a numeric value and the current row's date for the
/// value to be assigned to that row (see the baseline check in `extract`).
const POSITION_ERROR_MARGIN: f32 = 2.0;
lazy_static! {
    /// Matches a complete M/D/YYYY date, e.g. "3/14/2021".
    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
    /// Matches a cell consisting solely of an unsigned integer.
    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
    /// Matches runs of whitespace; used to normalize extracted text.
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}
/// Parse format for the dates matched by `DATE_REGEX` (no zero padding on
/// month/day, four-digit year).
const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] =
    time::macros::format_description!("[month padding:none]/[day padding:none]/[year]");
/// A table extracted from a PDF: named columns plus one `DataPoint` per date row.
pub struct DataSet {
    /// Column names, ordered left-to-right by their x position on the page.
    pub columns: Vec<Arc<String>>,
    /// One data point per date row, in document order.
    pub rows: Vec<DataPoint>,
}
/// Errors produced while extracting a `DataSet` from a PDF.
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// The document's text ran out before any date row was found.
    #[error("PDF contained no data rows")]
    NoData,
}
impl DataSet {
    /// Extracts a tabular data set from `doc`.
    ///
    /// Scans the document's text runs in order. Runs recognized by
    /// `is_new_column_header` start a (possibly multi-run) column header;
    /// the first M/D/YYYY date ends the header phase. After that, each date
    /// starts a new row, and each integer on (approximately) the same
    /// baseline as the row's date is assigned to the nearest column whose
    /// x origin lies to its left.
    ///
    /// # Errors
    /// Returns [`Error::NoData`] if the document contains no date row.
    ///
    /// # Panics
    /// Panics if a run matching `DATE_REGEX` fails to parse with
    /// `DATE_PARSE_FORMAT`, or a run matching `VALUE_REGEX` overflows `u32`.
    pub fn extract<B: Backend>(doc: &pdf::file::File<B>) -> Result<Self, Error> {
        let mut doc_iter = DocumentIterator::new(doc).peekable();
        let mut columns: Vec<(Arc<String>, f32)> = Vec::new();
        let mut rows: Vec<DataPoint> = Vec::new();
        // Header phase: collect column names until the first date row appears.
        let (mut current_datapoint, mut current_y) = loop {
            if let Some(text) = doc_iter.next() {
                if is_new_column_header(&text.text) {
                    let mut column_name = text.text;
                    let column_x = text.point.x;
                    // A header may span several text runs; keep appending
                    // until the next header or the first date begins.
                    while let Some(more) = doc_iter.peek() {
                        if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
                            break;
                        }
                        column_name += " ";
                        column_name += &more.text;
                        doc_iter.next();
                    }
                    // Push unconditionally so a header that ends the document
                    // (iterator exhausted mid-header) is not silently dropped.
                    columns.push((Arc::new(column_name), column_x));
                } else if DATE_REGEX.is_match(&text.text) {
                    break (
                        DataPoint::new(&text).expect("Failed to parse date!"),
                        text.point.y,
                    );
                }
            } else {
                return Err(Error::NoData);
            }
        };
        // Order columns left-to-right so the rev().find() below selects the
        // right-most column starting left of a value.
        columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
        for text in doc_iter {
            if DATE_REGEX.is_match(&text.text) {
                // A new date begins the next row; archive the finished one.
                rows.push(std::mem::replace(
                    &mut current_datapoint,
                    DataPoint::new(&text).expect("Failed to parse date!"),
                ));
                current_y = text.point.y;
            } else if VALUE_REGEX.is_match(&text.text) {
                // Skip numbers that are not on the current row's baseline.
                if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN {
                    continue;
                }
                if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) {
                    current_datapoint.values.insert(
                        column.clone(),
                        text.text.parse().expect("Failed to parse value!"),
                    );
                }
            }
        }
        // The in-progress row is only archived when a *newer* date appears,
        // so flush it explicitly — otherwise the last data row is lost.
        rows.push(current_datapoint);
        Ok(Self {
            columns: columns.into_iter().map(|(column, _)| column).collect(),
            rows,
        })
    }
}
/// A single table row: a date plus the value found under each column.
pub struct DataPoint {
    pub date: Date,
    /// Values keyed by column name; a column is absent from the map when no
    /// number was found for it on this row.
    pub values: HashMap<Arc<String>, u32>,
}
impl DataPoint {
    /// Builds an empty data point whose date is parsed from the text run.
    ///
    /// # Errors
    /// Returns the `time` parse error when the run is not a valid
    /// `DATE_PARSE_FORMAT` date.
    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
        Date::parse(&text.text, DATE_PARSE_FORMAT).map(|date| Self {
            date,
            values: HashMap::new(),
        })
    }
}
/// A decoded text run together with the position it was drawn at.
struct DocumentText {
    point: Point,
    text: String,
}
/// Streams decoded text runs from every page of a PDF document.
struct DocumentIterator<'a> {
    /// Current text origin, updated by each `SetTextMatrix` operation.
    point: Point,
    /// Flattened content-stream operations across all pages.
    operations: Box<dyn Iterator<Item = Op> + 'a>,
}
impl<'a> DocumentIterator<'a> {
    /// Creates an iterator over the content-stream operations of every page
    /// in `document`. Pages that fail to load, have no contents, or whose
    /// operations fail to decode are skipped rather than surfaced as errors.
    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
        let operations = document.pages().flat_map(|page| {
            page.ok()
                .and_then(|page| page.contents.clone())
                .and_then(|contents| contents.operations(document).ok())
                .into_iter()
                .flatten()
        });
        Self {
            point: Point { x: 0.0, y: 0.0 },
            operations: Box::new(operations),
        }
    }
}
impl<'a> Iterator for DocumentIterator<'a> {
    type Item = DocumentText;

    /// Advances through the drawing operations, tracking the text origin,
    /// and yields the next successfully decoded text run. Runs that fail to
    /// decode are skipped; returns `None` when the operations run out.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.operations.next()? {
                // Remember where subsequent text will be drawn.
                Op::SetTextMatrix { matrix } => {
                    self.point = Point { x: matrix.e, y: matrix.f };
                }
                Op::TextDraw { text } => {
                    if let Ok(text) = text.to_string() {
                        return Some(DocumentText {
                            point: self.point,
                            text,
                        });
                    }
                }
                Op::TextDrawAdjusted { array } => {
                    if let Some(text) = concatenate_adjusted_text(array) {
                        return Some(DocumentText {
                            point: self.point,
                            text,
                        });
                    }
                }
                _ => {}
            }
        }
    }
}
/// Joins the text fragments of a `TextDrawAdjusted` array (non-text entries
/// are skipped), then trims the result and collapses every whitespace run to
/// a single space. Returns `None` if any fragment fails to decode.
fn concatenate_adjusted_text(array: Vec<TextDrawAdjusted>) -> Option<String> {
    let mut combined = String::new();
    for element in &array {
        if let TextDrawAdjusted::Text(fragment) = element {
            combined.push_str(fragment.to_string().ok()?.as_ref());
        }
    }
    Some(WHITESPACE_REGEX.replace_all(combined.trim(), " ").into_owned())
}
/// Returns `true` when the text run begins a new column header, i.e. starts
/// with one of the known region prefixes.
fn is_new_column_header(s: &str) -> bool {
    ["Northern", "Southern"]
        .iter()
        .any(|prefix| s.starts_with(prefix))
}