poop graph data extractor prototype

2022-04-05 14:07:28 -04:00 · 2022-04-05 14:07:28 -04:00 · 7b22bbab16
commit 7b22bbab16
4 changed files with 751 additions and 0 deletions
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,190 @@
+use std::collections::HashMap;
+
+use lazy_static::lazy_static;
+use pdf::{primitive::Primitive, content::Operation, backend::Backend};
+use regex::Regex;
+use time::Date;
+
+const POSITION_ERROR_MARGIN: f32 = 2.0; // Largest |y| gap between a date and a value for the value to be kept for that date's row.
+
+lazy_static! { // Regexes used to classify and clean the extracted text.
+    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap(); // Whole text is a date such as 4/5/2022.
+    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap(); // Whole text is one or more digits.
+    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap(); // Any whitespace run; `extract_string` replaces each with one space.
+}
+
+const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
+    "[month padding:none]/[day padding:none]/[year]"
+); // Layout used by `DataPoint::new` to parse dates found in the document.
+const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
+    "[year]-[month]-[day]"
+); // Layout used by `main` when printing the date column.
+
+/// One output row: a date plus the values collected for it, keyed by column name.
+#[derive(Debug)]
+struct DataPoint<'a> {
+    /// Date parsed from the text that started this row.
+    date: Date,
+    /// Value per column; the keys are borrowed column names.
+    values: HashMap<&'a str, u32>,
+}
+
+impl<'a> DataPoint<'a> {
+    /// Creates a data point with no values whose date is parsed from `text.text`
+    /// using `DATE_PARSE_FORMAT`.
+    ///
+    /// # Errors
+    /// Returns the `time` parse error when the text is not a date in that format.
+    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
+        Date::parse(&text.text, DATE_PARSE_FORMAT).map(|date| Self {
+            date,
+            values: HashMap::new(),
+        })
+    }
+}
+
+struct DocumentText { // A piece of text yielded by `DocumentIterator`, tagged with the position current at that moment.
+    x: f32, // The iterator's `x` when the text was produced.
+    y: f32, // The iterator's `y` when the text was produced.
+    text: String, // String returned by `extract_string` for the text-showing operator.
+}
+
+/// Iterates over the text of a PDF by walking the content operations of all its pages
+/// while remembering the most recently seen text position.
+struct DocumentIterator<'a> {
+    /// Last `x` taken from a `Tm` operator (0.0 until one is seen).
+    x: f32,
+    /// Last `y` taken from a `Tm` operator (0.0 until one is seen).
+    y: f32,
+    /// Operations of every readable page, chained in page order.
+    operations: Box<dyn Iterator<Item = Operation> + 'a>,
+}
+
+impl<'a> DocumentIterator<'a> {
+    /// Builds an iterator over `document`, starting at position (0, 0).
+    ///
+    /// Pages that fail to load, or that have no contents, are skipped silently.
+    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
+        let all_operations = document
+            .pages()
+            .filter_map(|page| {
+                let contents = page.ok()?.contents.clone()?;
+                Some(contents.operations.into_iter())
+            })
+            .flatten();
+
+        Self {
+            x: 0.0,
+            y: 0.0,
+            operations: Box::new(all_operations),
+        }
+    }
+}
+
+impl<'a> Iterator for DocumentIterator<'a> {
+    type Item = DocumentText;
+
+    /// Advances through the operations until a text-showing operator (`TJ`/`Tj`) yields a
+    /// string, and returns that string with the last recorded position.
+    ///
+    /// `Tm` operators update the recorded position from operands 4 and 5 (only when both are
+    /// numeric); every other operator is ignored. Returns `None` once the operations run out.
+    fn next(&mut self) -> Option<Self::Item> {
+        for Operation { operator, operands } in self.operations.by_ref() {
+            if operator == "Tm" {
+                let new_x = operands.get(4).and_then(extract_number);
+                let new_y = operands.get(5).and_then(extract_number);
+                // Update both coordinates together or not at all.
+                if let (Some(new_x), Some(new_y)) = (new_x, new_y) {
+                    self.x = new_x;
+                    self.y = new_y;
+                }
+                continue;
+            }
+
+            let shows_text = operator == "TJ" || operator == "Tj";
+            if !shows_text {
+                continue;
+            }
+
+            // A text operator without a usable string operand is skipped, not an error.
+            if let Some(text) = operands.first().and_then(extract_string) {
+                return Some(DocumentText {
+                    x: self.x,
+                    y: self.y,
+                    text,
+                });
+            }
+        }
+        None
+    }
+}
+
+/// Returns the numeric value of `p` as `f32`.
+///
+/// `Primitive::Number` is returned unchanged and `Primitive::Integer` is cast with `as f32`;
+/// any other primitive gives `None`.
+fn extract_number(p: &Primitive) -> Option<f32> {
+    match *p {
+        Primitive::Integer(int_value) => Some(int_value as f32),
+        Primitive::Number(float_value) => Some(float_value),
+        _ => None,
+    }
+}
+
+/// Pulls the text out of a primitive and normalises its whitespace.
+///
+/// * `Primitive::String` — the decoded string is used directly.
+/// * `Primitive::Array` — all string elements are concatenated in order; non-string
+///   elements are skipped.
+/// * anything else — `None`.
+///
+/// `None` is also returned when any string fails to decode via `as_str()`. The resulting
+/// text is trimmed and every whitespace run is replaced by a single space.
+fn extract_string(p: &Primitive) -> Option<String> {
+    // Post-processing shared by both operand shapes.
+    let normalize = |raw: &str| WHITESPACE_REGEX.replace_all(raw.trim(), " ").to_string();
+
+    match p {
+        Primitive::String(s) => {
+            let decoded = s.as_str().ok()?;
+            Some(normalize(decoded.as_ref()))
+        }
+        Primitive::Array(array) => {
+            let mut joined = String::new();
+            for element in array.iter() {
+                if let Primitive::String(s) = element {
+                    joined += s.as_str().ok()?.as_ref();
+                }
+            }
+            Some(normalize(&joined))
+        }
+        _ => None,
+    }
+}
+
+/// Tells whether `s` opens a new column header, i.e. starts with "Northern" or "Southern".
+fn is_new_column_header(s: &str) -> bool {
+    ["Northern", "Southern"]
+        .iter()
+        .any(|prefix| s.starts_with(*prefix))
+}
+
+/// Extracts the table contained in `data.pdf` and prints it to stdout as CSV.
+///
+/// The document text is consumed in two phases:
+///
+/// 1. Header phase — every text starting with "Northern"/"Southern" opens a column. The
+///    texts that follow are appended to its name (space separated) until the next header
+///    or the first date is reached. The x position of the opening text becomes the
+///    column's position.
+/// 2. Data phase — each date starts a new row. Digit-only texts whose y lies within
+///    `POSITION_ERROR_MARGIN` of the row's date are stored under the rightmost column
+///    whose x is smaller than the value's x; other texts are ignored.
+///
+/// Nothing is printed when the document contains no date at all.
+///
+/// # Panics
+/// Panics if the PDF cannot be opened, or if a text that matched the date/value regex
+/// still fails to parse, or if a date fails to format.
+fn main() {
+    let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF");
+    let mut doc_iter = DocumentIterator::new(&doc).peekable();
+
+    let mut columns: Vec<(String, f32)> = Vec::new();
+    let mut datapoints: Vec<DataPoint> = Vec::new();
+
+    // Header phase: collect column names until the first date, which opens the first row.
+    let (mut current_datapoint, mut current_y) = loop {
+        if let Some(text) = doc_iter.next() {
+            if is_new_column_header(&text.text) {
+                let mut column_name = text.text;
+                let column_x = text.x;
+                // Peek so the terminating header/date stays in the iterator for the outer loop.
+                while let Some(more) = doc_iter.peek() {
+                    if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
+                        columns.push((column_name, column_x));
+                        break;
+                    }
+                    column_name += " ";
+                    column_name += &more.text;
+                    doc_iter.next();
+                }
+            } else if DATE_REGEX.is_match(&text.text) {
+                break (DataPoint::new(&text).expect("Failed to parse date!"), text.y);
+            }
+        } else {
+            // No date anywhere in the document: there is no data to print.
+            return;
+        }
+    };
+
+    // Order columns left to right so values can be matched by x position below.
+    columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
+
+    // Data phase.
+    for text in doc_iter {
+        if DATE_REGEX.is_match(&text.text) {
+            // A new date closes the row in progress and opens the next one.
+            datapoints.push(std::mem::replace(
+                &mut current_datapoint,
+                DataPoint::new(&text).expect("Failed to parse date!"),
+            ));
+            current_y = text.y;
+        } else if VALUE_REGEX.is_match(&text.text) {
+            // Numbers that are not on the current row's line do not belong to it.
+            if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
+                continue;
+            }
+            // Rightmost column that starts to the left of the value.
+            if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
+                current_datapoint.values.insert(column, text.text.parse().expect("Failed to parse value!"));
+            }
+        }
+    }
+
+    // Rows are only flushed when the *next* date shows up, so the final row is still
+    // pending here; without this push the last date of the document would be lost.
+    datapoints.push(current_datapoint);
+
+    // CSV header line.
+    print!("Date");
+    for (column, _) in columns.iter() {
+        print!(",{}", column);
+    }
+    println!();
+
+    // One CSV line per row; a column without a value is left empty.
+    for datapoint in datapoints.iter() {
+        print!(
+            "{}",
+            datapoint
+                .date
+                .format(DATE_DISPLAY_FORMAT)
+                .expect("Failed to format date!")
+        );
+        for (column, _) in columns.iter() {
+            if let Some(val) = datapoint.values.get(&column.as_ref()) {
+                print!(",{}", val);
+            } else {
+                print!(",");
+            }
+        }
+        println!();
+    }
+}