use std::collections::HashMap; use lazy_static::lazy_static; use pdf::{primitive::Primitive, content::Operation, backend::Backend}; use regex::Regex; use time::Date; const POSITION_ERROR_MARGIN: f32 = 2.0; lazy_static! { static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap(); static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap(); static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap(); } const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!( "[month padding:none]/[day padding:none]/[year]" ); const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!( "[year]-[month]-[day]" ); #[derive(Debug)] struct DataPoint<'a> { date: Date, values: HashMap<&'a str, u32>, } impl<'a> DataPoint<'a> { fn new(text: &DocumentText) -> Result { Ok(Self { date: Date::parse(&text.text, DATE_PARSE_FORMAT)?, values: HashMap::new() }) } } struct DocumentText { x: f32, y: f32, text: String, } struct DocumentIterator<'a> { x: f32, y: f32, operations: Box + 'a>, } impl<'a> DocumentIterator<'a> { fn new(document: &'a pdf::file::File) -> Self { Self { x: 0.0, y: 0.0, operations: Box::new( document .pages() .filter_map(|page| { Some(page.ok()?.contents.clone()?.operations.into_iter()) }) .flatten() ), } } } impl<'a> Iterator for DocumentIterator<'a> { type Item = DocumentText; fn next(&mut self) -> Option { for Operation { operator, operands } in self.operations.as_mut() { if operator == "Tm" { if let (Some(x), Some(y)) = ( operands.get(4).and_then(extract_number), operands.get(5).and_then(extract_number), ) { self.x = x; self.y = y; } } else if operator == "TJ" || operator == "Tj" { if let Some(text) = operands.get(0).and_then(extract_string) { return Some(DocumentText { x: self.x, y: self.y, text, }); } } } None } } fn extract_number(p: &Primitive) -> Option { match p { Primitive::Number(n) => Some(*n), Primitive::Integer(n) => Some(*n as f32), _ => None, } } fn extract_string(p: &Primitive) -> Option { let result: Box> = match p { Primitive::Array(array) => { let mut acc = String::new(); for element in array.iter() { if let Primitive::String(s) = element { acc += s.as_str().ok()?.as_ref(); } } Box::new(acc) } Primitive::String(s) => Box::new(s.as_str().ok()?), _ => return None }; Some(WHITESPACE_REGEX.replace_all((*result).as_ref().trim(), " ").to_string()) } fn is_new_column_header(s: &str) -> bool { s.starts_with("Northern") || s.starts_with("Southern") } fn main() { let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF"); let mut doc_iter = DocumentIterator::new(&doc).peekable(); let mut columns: Vec<(String, f32)> = Vec::new(); let mut datapoints: Vec = Vec::new(); let (mut current_datapoint, mut current_y) = loop { if let Some(text) = doc_iter.next() { if is_new_column_header(&text.text) { let mut column_name = text.text; let column_x = text.x; while let Some(more) = doc_iter.peek() { if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) { columns.push((column_name, column_x)); break; } column_name += " "; column_name += &more.text; doc_iter.next(); } } else if DATE_REGEX.is_match(&text.text) { break (DataPoint::new(&text).expect("Failed to parse date!"), text.y); } } else { return; } }; columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal)); for text in doc_iter { if DATE_REGEX.is_match(&text.text) { datapoints.push(std::mem::replace( &mut current_datapoint, DataPoint::new(&text).expect("Failed to parse date!"), )); current_y = text.y; } else if VALUE_REGEX.is_match(&text.text) { if (current_y - text.y).abs() > POSITION_ERROR_MARGIN { continue; } if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) { current_datapoint.values.insert(column, text.text.parse().expect("Failed to parse value!")); } } } print!("Date"); for (column, _) in columns.iter() { print!(",{}", column); } println!(); for datapoint in datapoints.iter() { print!( "{}", datapoint .date .format(DATE_DISPLAY_FORMAT) .expect("Failed to format date!") ); for (column, _) in columns.iter() { if let Some(val) = datapoint.values.get(&column.as_ref()) { print!(",{}", val); } else { print!(","); } } println!(); } }