use std::{collections::HashMap, sync::Arc}; use lazy_static::lazy_static; use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}, file::{NoCache, NoLog}}; use regex::Regex; use time::Date; const POSITION_ERROR_MARGIN: f32 = 2.0; lazy_static! { static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap(); static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap(); static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap(); } const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!("[month padding:none]/[day padding:none]/[year]"); pub struct DataSet { pub columns: Vec>, pub rows: Vec, } #[derive(thiserror::Error, Debug)] pub enum Error { #[error("PDF contained no data rows")] NoData, } impl DataSet { pub fn extract(doc: &pdf::file::File) -> Result { let mut doc_iter = DocumentIterator::new(doc).peekable(); let mut columns: Vec<(Arc, f32)> = Vec::new(); let mut rows: Vec = Vec::new(); let (mut current_datapoint, mut current_y) = loop { if let Some(text) = doc_iter.next() { if is_new_column_header(&text.text) { let mut column_name = text.text; let column_x = text.point.x; while let Some(more) = doc_iter.peek() { if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) { columns.push((Arc::new(column_name), column_x)); break; } column_name += " "; column_name += &more.text; doc_iter.next(); } } else if DATE_REGEX.is_match(&text.text) { break ( DataPoint::new(&text).expect("Failed to parse date!"), text.point.y, ); } } else { return Err(Error::NoData); } }; columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal)); for text in doc_iter { if DATE_REGEX.is_match(&text.text) { rows.push(std::mem::replace( &mut current_datapoint, DataPoint::new(&text).expect("Failed to parse date!"), )); current_y = text.point.y; } else if VALUE_REGEX.is_match(&text.text) { if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN { continue; } if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) { current_datapoint.values.insert( column.clone(), text.text.parse().expect("Failed to parse value!"), ); } } } Ok(Self { columns: columns.into_iter().map(|(column, _)| column).collect(), rows, }) } } pub struct DataPoint { pub date: Date, pub values: HashMap, u32>, } impl DataPoint { fn new(text: &DocumentText) -> Result { Ok(Self { date: Date::parse(&text.text, DATE_PARSE_FORMAT)?, values: HashMap::new(), }) } } struct DocumentText { point: Point, text: String, } struct DocumentIterator<'a> { point: Point, operations: Box + 'a>, } impl<'a> DocumentIterator<'a> { fn new(document: &'a pdf::file::File) -> Self { Self { point: Point { x: 0.0, y: 0.0 }, operations: Box::new( document .pages() .filter_map(|page| Some(page.ok()?.contents.clone()?.operations(&document.resolver()).ok()?.into_iter())) .flatten(), ), } } } impl<'a> Iterator for DocumentIterator<'a> { type Item = DocumentText; fn next(&mut self) -> Option { for operation in self.operations.as_mut() { match operation { Op::SetTextMatrix { matrix } => { self.point = Point { x: matrix.e, y: matrix.f }; } Op::TextDraw { text } => { if let Ok(text) = text.to_string() { return Some(DocumentText { point: self.point, text, }); } } Op::TextDrawAdjusted { array } => { if let Some(text) = concatenate_adjusted_text(array) { return Some(DocumentText { point: self.point, text, }); } } _ => continue, } } None } } fn concatenate_adjusted_text(array: Vec) -> Option { let mut acc = String::new(); for element in array.iter() { if let TextDrawAdjusted::Text(s) = element { acc += s.to_string().ok()?.as_ref(); } }; Some( WHITESPACE_REGEX .replace_all(acc.trim(), " ") .to_string(), ) } fn is_new_column_header(s: &str) -> bool { s.starts_with("Northern") || s.starts_with("Southern") }