178 lines
5.5 KiB
Rust
178 lines
5.5 KiB
Rust
use std::{collections::HashMap, sync::Arc};
|
|
|
|
use lazy_static::lazy_static;
|
|
use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}};
|
|
use regex::Regex;
|
|
use time::Date;
|
|
|
|
/// Largest vertical distance between a value's text position and the `y` of the
/// current row's date text at which the value is still attributed to that row.
const POSITION_ERROR_MARGIN: f32 = 2.0;
|
|
|
|
lazy_static! {
    /// Matches a text fragment that is exactly a `M/D/YYYY` date (one or two digit
    /// month and day, four digit year).
    ///
    /// Uses `[0-9]` rather than `\d`: `\d` is Unicode-aware in the `regex` crate and
    /// would accept non-ASCII digits that the date parser subsequently rejects.
    static ref DATE_REGEX: Regex = Regex::new(r"^[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}$").unwrap();

    /// Matches a text fragment made up solely of ASCII decimal digits.
    ///
    /// Uses `[0-9]` rather than `\d` so that every match consists of characters
    /// that integer parsing understands.
    static ref VALUE_REGEX: Regex = Regex::new(r"^[0-9]+$").unwrap();

    /// Matches one or more consecutive whitespace characters.
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}
|
|
|
|
/// Format description used to parse date text: month, day and year separated by
/// `/`, with no padding on the month and day.
const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] =
    time::macros::format_description!("[month padding:none]/[day padding:none]/[year]");
|
|
|
|
pub struct DataSet {
|
|
pub columns: Vec<Arc<String>>,
|
|
pub rows: Vec<DataPoint>,
|
|
}
|
|
|
|
#[derive(thiserror::Error, Debug)]
|
|
pub enum Error {
|
|
#[error("PDF contained no data rows")]
|
|
NoData,
|
|
}
|
|
|
|
impl DataSet {
|
|
pub fn extract<B: Backend>(doc: &pdf::file::File<B>) -> Result<Self, Error> {
|
|
let mut doc_iter = DocumentIterator::new(doc).peekable();
|
|
|
|
let mut columns: Vec<(Arc<String>, f32)> = Vec::new();
|
|
let mut rows: Vec<DataPoint> = Vec::new();
|
|
|
|
let (mut current_datapoint, mut current_y) = loop {
|
|
if let Some(text) = doc_iter.next() {
|
|
if is_new_column_header(&text.text) {
|
|
let mut column_name = text.text;
|
|
let column_x = text.point.x;
|
|
while let Some(more) = doc_iter.peek() {
|
|
if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
|
|
columns.push((Arc::new(column_name), column_x));
|
|
break;
|
|
}
|
|
column_name += " ";
|
|
column_name += &more.text;
|
|
doc_iter.next();
|
|
}
|
|
} else if DATE_REGEX.is_match(&text.text) {
|
|
break (
|
|
DataPoint::new(&text).expect("Failed to parse date!"),
|
|
text.point.y,
|
|
);
|
|
}
|
|
} else {
|
|
return Err(Error::NoData);
|
|
}
|
|
};
|
|
|
|
columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
|
|
|
|
for text in doc_iter {
|
|
if DATE_REGEX.is_match(&text.text) {
|
|
rows.push(std::mem::replace(
|
|
&mut current_datapoint,
|
|
DataPoint::new(&text).expect("Failed to parse date!"),
|
|
));
|
|
current_y = text.point.y;
|
|
} else if VALUE_REGEX.is_match(&text.text) {
|
|
if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN {
|
|
continue;
|
|
}
|
|
if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) {
|
|
current_datapoint.values.insert(
|
|
column.clone(),
|
|
text.text.parse().expect("Failed to parse value!"),
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(Self {
|
|
columns: columns.into_iter().map(|(column, _)| column).collect(),
|
|
rows,
|
|
})
|
|
}
|
|
}
|
|
|
|
pub struct DataPoint {
|
|
pub date: Date,
|
|
pub values: HashMap<Arc<String>, u32>,
|
|
}
|
|
|
|
impl DataPoint {
|
|
fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
|
|
Ok(Self {
|
|
date: Date::parse(&text.text, DATE_PARSE_FORMAT)?,
|
|
values: HashMap::new(),
|
|
})
|
|
}
|
|
}
|
|
|
|
struct DocumentText {
|
|
point: Point,
|
|
text: String,
|
|
}
|
|
|
|
struct DocumentIterator<'a> {
|
|
point: Point,
|
|
operations: Box<dyn Iterator<Item = Op> + 'a>,
|
|
}
|
|
|
|
impl<'a> DocumentIterator<'a> {
|
|
fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
|
|
Self {
|
|
point: Point { x: 0.0, y: 0.0 },
|
|
operations: Box::new(
|
|
document
|
|
.pages()
|
|
.filter_map(|page| Some(page.ok()?.contents.clone()?.operations(document).ok()?.into_iter()))
|
|
.flatten(),
|
|
),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator for DocumentIterator<'a> {
|
|
type Item = DocumentText;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
for operation in self.operations.as_mut() {
|
|
match operation {
|
|
Op::SetTextMatrix { matrix } => {
|
|
self.point = Point { x: matrix.e, y: matrix.f };
|
|
}
|
|
Op::TextDraw { text } => {
|
|
if let Ok(text) = text.to_string() {
|
|
return Some(DocumentText {
|
|
point: self.point,
|
|
text,
|
|
});
|
|
}
|
|
}
|
|
Op::TextDrawAdjusted { array } => {
|
|
if let Some(text) = concatenate_adjusted_text(array) {
|
|
return Some(DocumentText {
|
|
point: self.point,
|
|
text,
|
|
});
|
|
}
|
|
}
|
|
_ => continue,
|
|
}
|
|
}
|
|
None
|
|
}
|
|
}
|
|
|
|
fn concatenate_adjusted_text(array: Vec<TextDrawAdjusted>) -> Option<String> {
|
|
let mut acc = String::new();
|
|
for element in array.iter() {
|
|
if let TextDrawAdjusted::Text(s) = element {
|
|
acc += s.to_string().ok()?.as_ref();
|
|
}
|
|
};
|
|
Some(
|
|
WHITESPACE_REGEX
|
|
.replace_all(acc.trim(), " ")
|
|
.to_string(),
|
|
)
|
|
}
|
|
|
|
/// Reports whether `s` begins a new column header, i.e. starts with
/// `"Northern"` or `"Southern"` (case-sensitive, no leading whitespace allowed).
fn is_new_column_header(s: &str) -> bool {
    ["Northern", "Southern"]
        .iter()
        .any(|prefix| s.starts_with(*prefix))
}
|