use std::{collections::HashMap, sync::Arc}; use lazy_static::lazy_static; use pdf::{backend::Backend, content::Operation, primitive::Primitive}; use regex::Regex; use time::Date; const POSITION_ERROR_MARGIN: f32 = 2.0; lazy_static! { static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap(); static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap(); static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap(); } const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!("[month padding:none]/[day padding:none]/[year]"); pub struct DataSet { pub columns: Vec>, pub rows: Vec, } #[derive(thiserror::Error, Debug)] pub enum Error { #[error("PDF contained no data rows")] NoData, } impl DataSet { pub fn extract(doc: &pdf::file::File) -> Result { let mut doc_iter = DocumentIterator::new(doc).peekable(); let mut columns: Vec<(Arc, f32)> = Vec::new(); let mut rows: Vec = Vec::new(); let (mut current_datapoint, mut current_y) = loop { if let Some(text) = doc_iter.next() { if is_new_column_header(&text.text) { let mut column_name = text.text; let column_x = text.x; while let Some(more) = doc_iter.peek() { if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) { columns.push((Arc::new(column_name), column_x)); break; } column_name += " "; column_name += &more.text; doc_iter.next(); } } else if DATE_REGEX.is_match(&text.text) { break ( DataPoint::new(&text).expect("Failed to parse date!"), text.y, ); } } else { return Err(Error::NoData); } }; columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal)); for text in doc_iter { if DATE_REGEX.is_match(&text.text) { rows.push(std::mem::replace( &mut current_datapoint, DataPoint::new(&text).expect("Failed to parse date!"), )); current_y = text.y; } else if VALUE_REGEX.is_match(&text.text) { if (current_y - text.y).abs() > POSITION_ERROR_MARGIN { continue; } if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) { current_datapoint.values.insert( column.clone(), text.text.parse().expect("Failed to parse value!"), ); } } } Ok(Self { columns: columns.into_iter().map(|(column, _)| column).collect(), rows, }) } } pub struct DataPoint { pub date: Date, pub values: HashMap, u32>, } impl DataPoint { fn new(text: &DocumentText) -> Result { Ok(Self { date: Date::parse(&text.text, DATE_PARSE_FORMAT)?, values: HashMap::new(), }) } } struct DocumentText { x: f32, y: f32, text: String, } struct DocumentIterator<'a> { x: f32, y: f32, operations: Box + 'a>, } impl<'a> DocumentIterator<'a> { fn new(document: &'a pdf::file::File) -> Self { Self { x: 0.0, y: 0.0, operations: Box::new( document .pages() .filter_map(|page| Some(page.ok()?.contents.clone()?.operations.into_iter())) .flatten(), ), } } } impl<'a> Iterator for DocumentIterator<'a> { type Item = DocumentText; fn next(&mut self) -> Option { for Operation { operator, operands } in self.operations.as_mut() { if operator == "Tm" { if let (Some(x), Some(y)) = ( operands.get(4).and_then(extract_number), operands.get(5).and_then(extract_number), ) { self.x = x; self.y = y; } } else if operator == "TJ" || operator == "Tj" { if let Some(text) = operands.get(0).and_then(extract_string) { return Some(DocumentText { x: self.x, y: self.y, text, }); } } } None } } fn extract_number(p: &Primitive) -> Option { match p { Primitive::Number(n) => Some(*n), Primitive::Integer(n) => Some(*n as f32), _ => None, } } fn extract_string(p: &Primitive) -> Option { let result: Box> = match p { Primitive::Array(array) => { let mut acc = String::new(); for element in array.iter() { if let Primitive::String(s) = element { acc += s.as_str().ok()?.as_ref(); } } Box::new(acc) } Primitive::String(s) => Box::new(s.as_str().ok()?), _ => return None, }; Some( WHITESPACE_REGEX .replace_all((*result).as_ref().trim(), " ") .to_string(), ) } fn is_new_column_header(s: &str) -> bool { s.starts_with("Northern") || s.starts_with("Southern") }