update extractor for pdf 8.0
This commit is contained in:
parent
e54bf1350c
commit
f6da7a7642
|
@ -1,7 +1,7 @@
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
|
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use pdf::{backend::Backend, content::Operation, primitive::Primitive};
|
use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use time::Date;
|
use time::Date;
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ impl DataSet {
|
||||||
if let Some(text) = doc_iter.next() {
|
if let Some(text) = doc_iter.next() {
|
||||||
if is_new_column_header(&text.text) {
|
if is_new_column_header(&text.text) {
|
||||||
let mut column_name = text.text;
|
let mut column_name = text.text;
|
||||||
let column_x = text.x;
|
let column_x = text.point.x;
|
||||||
while let Some(more) = doc_iter.peek() {
|
while let Some(more) = doc_iter.peek() {
|
||||||
if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
|
if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
|
||||||
columns.push((Arc::new(column_name), column_x));
|
columns.push((Arc::new(column_name), column_x));
|
||||||
|
@ -51,7 +51,7 @@ impl DataSet {
|
||||||
} else if DATE_REGEX.is_match(&text.text) {
|
} else if DATE_REGEX.is_match(&text.text) {
|
||||||
break (
|
break (
|
||||||
DataPoint::new(&text).expect("Failed to parse date!"),
|
DataPoint::new(&text).expect("Failed to parse date!"),
|
||||||
text.y,
|
text.point.y,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -67,12 +67,12 @@ impl DataSet {
|
||||||
&mut current_datapoint,
|
&mut current_datapoint,
|
||||||
DataPoint::new(&text).expect("Failed to parse date!"),
|
DataPoint::new(&text).expect("Failed to parse date!"),
|
||||||
));
|
));
|
||||||
current_y = text.y;
|
current_y = text.point.y;
|
||||||
} else if VALUE_REGEX.is_match(&text.text) {
|
} else if VALUE_REGEX.is_match(&text.text) {
|
||||||
if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
|
if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
|
if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) {
|
||||||
current_datapoint.values.insert(
|
current_datapoint.values.insert(
|
||||||
column.clone(),
|
column.clone(),
|
||||||
text.text.parse().expect("Failed to parse value!"),
|
text.text.parse().expect("Failed to parse value!"),
|
||||||
|
@ -103,26 +103,23 @@ impl DataPoint {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct DocumentText {
|
struct DocumentText {
|
||||||
x: f32,
|
point: Point,
|
||||||
y: f32,
|
|
||||||
text: String,
|
text: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct DocumentIterator<'a> {
|
struct DocumentIterator<'a> {
|
||||||
x: f32,
|
point: Point,
|
||||||
y: f32,
|
operations: Box<dyn Iterator<Item = Op> + 'a>,
|
||||||
operations: Box<dyn Iterator<Item = Operation> + 'a>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DocumentIterator<'a> {
|
impl<'a> DocumentIterator<'a> {
|
||||||
fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
|
fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
x: 0.0,
|
point: Point { x: 0.0, y: 0.0 },
|
||||||
y: 0.0,
|
|
||||||
operations: Box::new(
|
operations: Box::new(
|
||||||
document
|
document
|
||||||
.pages()
|
.pages()
|
||||||
.filter_map(|page| Some(page.ok()?.contents.clone()?.operations.into_iter()))
|
.filter_map(|page| Some(page.ok()?.contents.clone()?.operations(document).ok()?.into_iter()))
|
||||||
.flatten(),
|
.flatten(),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
@ -133,54 +130,44 @@ impl<'a> Iterator for DocumentIterator<'a> {
|
||||||
type Item = DocumentText;
|
type Item = DocumentText;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
for Operation { operator, operands } in self.operations.as_mut() {
|
for operation in self.operations.as_mut() {
|
||||||
if operator == "Tm" {
|
match operation {
|
||||||
if let (Some(x), Some(y)) = (
|
Op::SetTextMatrix { matrix } => {
|
||||||
operands.get(4).and_then(extract_number),
|
self.point = Point { x: matrix.e, y: matrix.f };
|
||||||
operands.get(5).and_then(extract_number),
|
|
||||||
) {
|
|
||||||
self.x = x;
|
|
||||||
self.y = y;
|
|
||||||
}
|
}
|
||||||
} else if operator == "TJ" || operator == "Tj" {
|
Op::TextDraw { text } => {
|
||||||
if let Some(text) = operands.get(0).and_then(extract_string) {
|
if let Ok(text) = text.to_string() {
|
||||||
return Some(DocumentText {
|
return Some(DocumentText {
|
||||||
x: self.x,
|
point: self.point,
|
||||||
y: self.y,
|
|
||||||
text,
|
text,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Op::TextDrawAdjusted { array } => {
|
||||||
|
if let Some(text) = concatenate_adjusted_text(array) {
|
||||||
|
return Some(DocumentText {
|
||||||
|
point: self.point,
|
||||||
|
text,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => continue,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_number(p: &Primitive) -> Option<f32> {
|
fn concatenate_adjusted_text(array: Vec<TextDrawAdjusted>) -> Option<String> {
|
||||||
match p {
|
|
||||||
Primitive::Number(n) => Some(*n),
|
|
||||||
Primitive::Integer(n) => Some(*n as f32),
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn extract_string(p: &Primitive) -> Option<String> {
|
|
||||||
let result: Box<dyn AsRef<str>> = match p {
|
|
||||||
Primitive::Array(array) => {
|
|
||||||
let mut acc = String::new();
|
let mut acc = String::new();
|
||||||
for element in array.iter() {
|
for element in array.iter() {
|
||||||
if let Primitive::String(s) = element {
|
if let TextDrawAdjusted::Text(s) = element {
|
||||||
acc += s.as_str().ok()?.as_ref();
|
acc += s.to_string().ok()?.as_ref();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
Box::new(acc)
|
|
||||||
}
|
|
||||||
Primitive::String(s) => Box::new(s.as_str().ok()?),
|
|
||||||
_ => return None,
|
|
||||||
};
|
};
|
||||||
Some(
|
Some(
|
||||||
WHITESPACE_REGEX
|
WHITESPACE_REGEX
|
||||||
.replace_all((*result).as_ref().trim(), " ")
|
.replace_all(acc.trim(), " ")
|
||||||
.to_string(),
|
.to_string(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue