update extractor for pdf 8.0

This commit is contained in:
xenofem 2023-03-31 18:13:56 -04:00
parent e54bf1350c
commit f6da7a7642

View file

@ -1,7 +1,7 @@
use std::{collections::HashMap, sync::Arc}; use std::{collections::HashMap, sync::Arc};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use pdf::{backend::Backend, content::Operation, primitive::Primitive}; use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}};
use regex::Regex; use regex::Regex;
use time::Date; use time::Date;
@ -38,7 +38,7 @@ impl DataSet {
if let Some(text) = doc_iter.next() { if let Some(text) = doc_iter.next() {
if is_new_column_header(&text.text) { if is_new_column_header(&text.text) {
let mut column_name = text.text; let mut column_name = text.text;
let column_x = text.x; let column_x = text.point.x;
while let Some(more) = doc_iter.peek() { while let Some(more) = doc_iter.peek() {
if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) { if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
columns.push((Arc::new(column_name), column_x)); columns.push((Arc::new(column_name), column_x));
@ -51,7 +51,7 @@ impl DataSet {
} else if DATE_REGEX.is_match(&text.text) { } else if DATE_REGEX.is_match(&text.text) {
break ( break (
DataPoint::new(&text).expect("Failed to parse date!"), DataPoint::new(&text).expect("Failed to parse date!"),
text.y, text.point.y,
); );
} }
} else { } else {
@ -67,12 +67,12 @@ impl DataSet {
&mut current_datapoint, &mut current_datapoint,
DataPoint::new(&text).expect("Failed to parse date!"), DataPoint::new(&text).expect("Failed to parse date!"),
)); ));
current_y = text.y; current_y = text.point.y;
} else if VALUE_REGEX.is_match(&text.text) { } else if VALUE_REGEX.is_match(&text.text) {
if (current_y - text.y).abs() > POSITION_ERROR_MARGIN { if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN {
continue; continue;
} }
if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) { if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) {
current_datapoint.values.insert( current_datapoint.values.insert(
column.clone(), column.clone(),
text.text.parse().expect("Failed to parse value!"), text.text.parse().expect("Failed to parse value!"),
@ -103,26 +103,23 @@ impl DataPoint {
} }
struct DocumentText { struct DocumentText {
x: f32, point: Point,
y: f32,
text: String, text: String,
} }
struct DocumentIterator<'a> { struct DocumentIterator<'a> {
x: f32, point: Point,
y: f32, operations: Box<dyn Iterator<Item = Op> + 'a>,
operations: Box<dyn Iterator<Item = Operation> + 'a>,
} }
impl<'a> DocumentIterator<'a> { impl<'a> DocumentIterator<'a> {
fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self { fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
Self { Self {
x: 0.0, point: Point { x: 0.0, y: 0.0 },
y: 0.0,
operations: Box::new( operations: Box::new(
document document
.pages() .pages()
.filter_map(|page| Some(page.ok()?.contents.clone()?.operations.into_iter())) .filter_map(|page| Some(page.ok()?.contents.clone()?.operations(document).ok()?.into_iter()))
.flatten(), .flatten(),
), ),
} }
@ -133,54 +130,44 @@ impl<'a> Iterator for DocumentIterator<'a> {
type Item = DocumentText; type Item = DocumentText;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
for Operation { operator, operands } in self.operations.as_mut() { for operation in self.operations.as_mut() {
if operator == "Tm" { match operation {
if let (Some(x), Some(y)) = ( Op::SetTextMatrix { matrix } => {
operands.get(4).and_then(extract_number), self.point = Point { x: matrix.e, y: matrix.f };
operands.get(5).and_then(extract_number),
) {
self.x = x;
self.y = y;
} }
} else if operator == "TJ" || operator == "Tj" { Op::TextDraw { text } => {
if let Some(text) = operands.get(0).and_then(extract_string) { if let Ok(text) = text.to_string() {
return Some(DocumentText { return Some(DocumentText {
x: self.x, point: self.point,
y: self.y, text,
text, });
}); }
} }
Op::TextDrawAdjusted { array } => {
if let Some(text) = concatenate_adjusted_text(array) {
return Some(DocumentText {
point: self.point,
text,
});
}
}
_ => continue,
} }
} }
None None
} }
} }
fn extract_number(p: &Primitive) -> Option<f32> { fn concatenate_adjusted_text(array: Vec<TextDrawAdjusted>) -> Option<String> {
match p { let mut acc = String::new();
Primitive::Number(n) => Some(*n), for element in array.iter() {
Primitive::Integer(n) => Some(*n as f32), if let TextDrawAdjusted::Text(s) = element {
_ => None, acc += s.to_string().ok()?.as_ref();
}
}
fn extract_string(p: &Primitive) -> Option<String> {
let result: Box<dyn AsRef<str>> = match p {
Primitive::Array(array) => {
let mut acc = String::new();
for element in array.iter() {
if let Primitive::String(s) = element {
acc += s.as_str().ok()?.as_ref();
}
}
Box::new(acc)
} }
Primitive::String(s) => Box::new(s.as_str().ok()?),
_ => return None,
}; };
Some( Some(
WHITESPACE_REGEX WHITESPACE_REGEX
.replace_all((*result).as_ref().trim(), " ") .replace_all(acc.trim(), " ")
.to_string(), .to_string(),
) )
} }