diff --git a/src/extract.rs b/src/extract.rs index 7890738..62dde2e 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -1,7 +1,7 @@ use std::{collections::HashMap, sync::Arc}; use lazy_static::lazy_static; -use pdf::{backend::Backend, content::Operation, primitive::Primitive}; +use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}}; use regex::Regex; use time::Date; @@ -38,7 +38,7 @@ impl DataSet { if let Some(text) = doc_iter.next() { if is_new_column_header(&text.text) { let mut column_name = text.text; - let column_x = text.x; + let column_x = text.point.x; while let Some(more) = doc_iter.peek() { if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) { columns.push((Arc::new(column_name), column_x)); @@ -51,7 +51,7 @@ impl DataSet { } else if DATE_REGEX.is_match(&text.text) { break ( DataPoint::new(&text).expect("Failed to parse date!"), - text.y, + text.point.y, ); } } else { @@ -67,12 +67,12 @@ impl DataSet { &mut current_datapoint, DataPoint::new(&text).expect("Failed to parse date!"), )); - current_y = text.y; + current_y = text.point.y; } else if VALUE_REGEX.is_match(&text.text) { - if (current_y - text.y).abs() > POSITION_ERROR_MARGIN { + if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN { continue; } - if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) { + if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) { current_datapoint.values.insert( column.clone(), text.text.parse().expect("Failed to parse value!"), @@ -103,26 +103,23 @@ impl DataPoint { } struct DocumentText { - x: f32, - y: f32, + point: Point, text: String, } struct DocumentIterator<'a> { - x: f32, - y: f32, - operations: Box + 'a>, + point: Point, + operations: Box + 'a>, } impl<'a> DocumentIterator<'a> { fn new(document: &'a pdf::file::File) -> Self { Self { - x: 0.0, - y: 0.0, + point: Point { x: 0.0, y: 0.0 }, operations: Box::new( document .pages() - .filter_map(|page| Some(page.ok()?.contents.clone()?.operations.into_iter())) + .filter_map(|page| Some(page.ok()?.contents.clone()?.operations(document).ok()?.into_iter())) .flatten(), ), } @@ -133,54 +130,44 @@ impl<'a> Iterator for DocumentIterator<'a> { type Item = DocumentText; fn next(&mut self) -> Option { - for Operation { operator, operands } in self.operations.as_mut() { - if operator == "Tm" { - if let (Some(x), Some(y)) = ( - operands.get(4).and_then(extract_number), - operands.get(5).and_then(extract_number), - ) { - self.x = x; - self.y = y; + for operation in self.operations.as_mut() { + match operation { + Op::SetTextMatrix { matrix } => { + self.point = Point { x: matrix.e, y: matrix.f }; } - } else if operator == "TJ" || operator == "Tj" { - if let Some(text) = operands.get(0).and_then(extract_string) { - return Some(DocumentText { - x: self.x, - y: self.y, - text, - }); + Op::TextDraw { text } => { + if let Ok(text) = text.to_string() { + return Some(DocumentText { + point: self.point, + text, + }); + } } + Op::TextDrawAdjusted { array } => { + if let Some(text) = concatenate_adjusted_text(array) { + return Some(DocumentText { + point: self.point, + text, + }); + } + } + _ => continue, } } None } } -fn extract_number(p: &Primitive) -> Option { - match p { - Primitive::Number(n) => Some(*n), - Primitive::Integer(n) => Some(*n as f32), - _ => None, - } -} - -fn extract_string(p: &Primitive) -> Option { - let result: Box> = match p { - Primitive::Array(array) => { - let mut acc = String::new(); - for element in array.iter() { - if let Primitive::String(s) = element { - acc += s.as_str().ok()?.as_ref(); - } - } - Box::new(acc) +fn concatenate_adjusted_text(array: Vec) -> Option { + let mut acc = String::new(); + for element in array.iter() { + if let TextDrawAdjusted::Text(s) = element { + acc += s.to_string().ok()?.as_ref(); } - Primitive::String(s) => Box::new(s.as_str().ok()?), - _ => return None, }; Some( WHITESPACE_REGEX - .replace_all((*result).as_ref().trim(), " ") + .replace_all(acc.trim(), " ") .to_string(), ) }