poop-graph/src/main.rs

191 lines
5.8 KiB
Rust
Raw Normal View History

2022-04-05 14:07:28 -04:00
use std::collections::HashMap;
use lazy_static::lazy_static;
use pdf::{primitive::Primitive, content::Operation, backend::Backend};
use regex::Regex;
use time::Date;
/// Largest allowed difference between a value's `y` coordinate and the `y`
/// coordinate of the current date for the value to be attributed to that
/// date's row. Expressed in the same units as the coordinates read from the
/// PDF text-positioning operands.
const POSITION_ERROR_MARGIN: f32 = 2.0;
// Regular expressions compiled once on first use and shared by the whole program.
lazy_static! {
// Matches a whole string of the form `M/D/YYYY` (1-2 digit month and day, 4 digit year).
static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
// Matches a whole string made only of digits (a table cell value).
static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
// Matches any run of whitespace; used to collapse such runs into a single space.
static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}
/// Format used to parse dates found in the PDF: month, day and year separated
/// by `/`, with month and day written without padding.
const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
"[month padding:none]/[day padding:none]/[year]"
);
/// Format used when printing dates in the output: year, month and day
/// separated by `-`.
const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
"[year]-[month]-[day]"
);
/// One output row: a date together with the values recorded for it.
#[derive(Debug)]
struct DataPoint<'a> {
// Date this row belongs to.
date: Date,
// Values for this date, keyed by a borrowed column name.
values: HashMap<&'a str, u32>,
}
impl<'a> DataPoint<'a> {
    /// Creates a data point with no values yet, dated by parsing `text.text`
    /// with [`DATE_PARSE_FORMAT`].
    ///
    /// # Errors
    ///
    /// Returns the `time` parse error when the text is not a valid date in
    /// that format.
    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
        let date = Date::parse(text.text.as_str(), DATE_PARSE_FORMAT)?;
        let values = HashMap::new();
        Ok(Self { date, values })
    }
}
/// A piece of text extracted from the PDF, tagged with the position that was
/// current when it was shown.
struct DocumentText {
// Horizontal coordinate recorded for this text.
x: f32,
// Vertical coordinate recorded for this text.
y: f32,
// The extracted text itself.
text: String,
}
/// Iterator over the text shown by a PDF document's content operations,
/// yielding [`DocumentText`] items.
struct DocumentIterator<'a> {
// Horizontal coordinate taken from the most recent `Tm` operation.
x: f32,
// Vertical coordinate taken from the most recent `Tm` operation.
y: f32,
// Stream of content operations still to be processed.
operations: Box<dyn Iterator<Item = Operation> + 'a>,
}
impl<'a> DocumentIterator<'a> {
    /// Creates an iterator over the content operations of every page in
    /// `document`, chained together in page order.
    ///
    /// Pages that fail to load, and pages without a content stream, are
    /// skipped. The tracked position starts at `(0.0, 0.0)`.
    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
        let all_operations = document
            .pages()
            // Keep only pages that loaded successfully and carry contents.
            .filter_map(|page| page.ok()?.contents.clone())
            .flat_map(|content| content.operations);

        Self {
            x: 0.0,
            y: 0.0,
            operations: Box::new(all_operations),
        }
    }
}
impl<'a> Iterator for DocumentIterator<'a> {
    type Item = DocumentText;

    /// Advances through the remaining operations until one shows text.
    ///
    /// * `Tm` updates the tracked position from operands 4 and 5, when both
    ///   are numeric; otherwise the previous position is kept.
    /// * `TJ` / `Tj` yield the text in their first operand, when it can be
    ///   extracted, tagged with the tracked position.
    /// * Every other operation is ignored.
    ///
    /// Returns `None` once the operations are exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // Split the borrow so the closure can update the position while the
        // operation stream is being driven.
        let Self { x, y, operations } = self;

        operations.find_map(|Operation { operator, operands }| {
            if operator == "Tm" {
                let horizontal = operands.get(4).and_then(extract_number);
                let vertical = operands.get(5).and_then(extract_number);
                if let (Some(new_x), Some(new_y)) = (horizontal, vertical) {
                    *x = new_x;
                    *y = new_y;
                }
                return None;
            }

            if operator == "TJ" || operator == "Tj" {
                let text = operands.first().and_then(extract_string)?;
                return Some(DocumentText { x: *x, y: *y, text });
            }

            None
        })
    }
}
/// Reads a numeric PDF primitive as an `f32`.
///
/// `Number` values are returned as-is and `Integer` values are cast to `f32`.
/// Any other primitive yields `None`.
fn extract_number(p: &Primitive) -> Option<f32> {
    if let Primitive::Number(real) = p {
        return Some(*real);
    }
    if let Primitive::Integer(whole) = p {
        return Some(*whole as f32);
    }
    None
}
/// Extracts normalised text from a text-showing operand.
///
/// * For an `Array`, the string elements are concatenated in order and
///   non-string elements are skipped.
/// * For a `String`, its decoded text is used directly.
/// * For any other primitive, `None` is returned.
///
/// The result is trimmed and every run of whitespace is collapsed into a
/// single space. `None` is also returned when any string involved cannot be
/// decoded.
fn extract_string(p: &Primitive) -> Option<String> {
    // Both arms produce an owned `String`, so no boxed trait object is needed
    // to unify them.
    let raw: String = match p {
        Primitive::Array(array) => {
            let mut acc = String::new();
            for element in array.iter() {
                if let Primitive::String(s) = element {
                    acc.push_str(s.as_str().ok()?.as_ref());
                }
            }
            acc
        }
        Primitive::String(s) => s.as_str().ok()?.to_string(),
        _ => return None,
    };
    Some(WHITESPACE_REGEX.replace_all(raw.trim(), " ").into_owned())
}
/// Reports whether `s` begins a new column header, i.e. whether it starts
/// with `"Northern"` or `"Southern"` (case-sensitive, no leading whitespace
/// allowed).
fn is_new_column_header(s: &str) -> bool {
    const HEADER_PREFIXES: [&str; 2] = ["Northern", "Southern"];
    HEADER_PREFIXES.iter().any(|&prefix| s.starts_with(prefix))
}
/// Reads `data.pdf`, reconstructs its table of dated values and prints it to
/// standard output as CSV (`Date` followed by one field per column).
///
/// The work happens in three phases:
///
/// 1. Scan the text up to the first date, collecting column headers and the
///    `x` position where each one starts.
/// 2. Scan the remaining text: every date opens a new row, and every
///    all-digit text on the same line as the current date is stored under the
///    right-most column that starts to the left of it.
/// 3. Print the header line and one line per row; a missing value is printed
///    as an empty field.
///
/// If the document contains no date at all, nothing is printed.
///
/// # Panics
///
/// Panics if the PDF cannot be opened, if a text matching the date pattern is
/// not a valid date, if a value does not fit in a `u32`, or if a date cannot
/// be formatted.
fn main() {
    let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF");
    let mut doc_iter = DocumentIterator::new(&doc).peekable();
    let mut columns: Vec<(String, f32)> = Vec::new();
    let mut datapoints: Vec<DataPoint> = Vec::new();

    // Phase 1: gather column headers until the first date shows up.
    let (mut current_datapoint, mut current_y) = loop {
        if let Some(text) = doc_iter.next() {
            if is_new_column_header(&text.text) {
                let mut column_name = text.text;
                let column_x = text.x;
                // A header may be split over several text pieces; keep
                // appending until the next header or the first date. The
                // terminating piece is only peeked so the outer loop still
                // gets to handle it.
                while let Some(more) = doc_iter.peek() {
                    if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
                        columns.push((column_name, column_x));
                        break;
                    }
                    column_name += " ";
                    column_name += &more.text;
                    doc_iter.next();
                }
            } else if DATE_REGEX.is_match(&text.text) {
                break (DataPoint::new(&text).expect("Failed to parse date!"), text.y);
            }
        } else {
            // The document ended before any date was found: no data to print.
            return;
        }
    };

    // Order columns left to right so the reverse search below finds the
    // nearest column starting to the left of a value.
    columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));

    // Phase 2: collect rows.
    for text in doc_iter {
        if DATE_REGEX.is_match(&text.text) {
            // A new date closes the row in progress and opens the next one.
            datapoints.push(std::mem::replace(
                &mut current_datapoint,
                DataPoint::new(&text).expect("Failed to parse date!"),
            ));
            current_y = text.y;
        } else if VALUE_REGEX.is_match(&text.text) {
            // Ignore numbers that are not on the same line as the current date.
            if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
                continue;
            }
            if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
                current_datapoint
                    .values
                    .insert(column, text.text.parse().expect("Failed to parse value!"));
            }
        }
    }
    // The loop above only stores a row when the *next* date is seen, so the
    // last row is still pending here and has to be stored explicitly;
    // otherwise the final date would be missing from the output.
    datapoints.push(current_datapoint);

    // Phase 3: emit CSV.
    print!("Date");
    for (column, _) in columns.iter() {
        print!(",{}", column);
    }
    println!();
    for datapoint in datapoints.iter() {
        print!(
            "{}",
            datapoint
                .date
                .format(DATE_DISPLAY_FORMAT)
                .expect("Failed to format date!")
        );
        for (column, _) in columns.iter() {
            if let Some(val) = datapoint.values.get(column.as_str()) {
                print!(",{}", val);
            } else {
                print!(",");
            }
        }
        println!();
    }
}