poop-graph/src/main.rs

use std::collections::HashMap;

use lazy_static::lazy_static;
use pdf::{primitive::Primitive, content::Operation, backend::Backend};
use regex::Regex;
use time::Date;

/// Largest allowed difference between the y position of a row's date and the y position of a
/// numeric fragment for that fragment to be counted as part of the same row.
/// NOTE(review): presumably expressed in the units of the PDF text matrix — confirm.
const POSITION_ERROR_MARGIN: f32 = 2.0;

lazy_static! {
    // Whole-fragment match for a date written as M/D/YYYY (one or two digits for month and day).
    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
    // Whole-fragment match for a value made up of digits only.
    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
    // Any run of whitespace; used to collapse such runs into a single space.
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}

/// Format used to parse dates found in the document: month/day/year, month and day unpadded.
const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
    "[month padding:none]/[day padding:none]/[year]"
);
/// Format used when printing dates: year-month-day.
const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
    "[year]-[month]-[day]"
);

/// One row of the extracted table: a date plus the values recorded for it.
#[derive(Debug)]
struct DataPoint<'a> {
    /// Date parsed from the fragment that started this row.
    date: Date,
    /// Values of the row keyed by column name; the keys are borrowed string slices
    /// rather than owned strings.
    values: HashMap<&'a str, u32>,
}

impl<'a> DataPoint<'a> {
    /// Creates an empty data point whose date is parsed from `text.text` using
    /// `DATE_PARSE_FORMAT`.
    ///
    /// # Errors
    ///
    /// Returns the `time` parse error when the fragment is not a valid date.
    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
        let date = Date::parse(&text.text, DATE_PARSE_FORMAT)?;
        let values = HashMap::new();
        Ok(Self { date, values })
    }
}

/// A piece of text yielded by `DocumentIterator`, together with the position that was in
/// effect when it was shown.
struct DocumentText {
    /// Horizontal position taken from the most recent `Tm` operator (0.0 if none was seen).
    x: f32,
    /// Vertical position taken from the most recent `Tm` operator (0.0 if none was seen).
    y: f32,
    /// The text itself, trimmed and with whitespace runs collapsed to single spaces.
    text: String,
}

/// Iterator that walks the content operations of a PDF document and yields the text it shows,
/// tracking the current text position along the way.
struct DocumentIterator<'a> {
    /// Horizontal position set by the last `Tm` operator that carried two numeric coordinates.
    x: f32,
    /// Vertical position set by the last `Tm` operator that carried two numeric coordinates.
    y: f32,
    /// Remaining content operations of all readable pages, in page order.
    operations: Box<dyn Iterator<Item = Operation> + 'a>,
}

impl<'a> DocumentIterator<'a> {
    /// Creates an iterator over the text of `document`, starting at position (0, 0).
    ///
    /// The operations of every page are chained into a single stream. Pages that fail to load
    /// and pages without a content stream are skipped silently.
    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
        let page_operations = document.pages().filter_map(|page| {
            let page = page.ok()?;
            let content = page.contents.clone()?;
            Some(content.operations.into_iter())
        });

        Self {
            x: 0.0,
            y: 0.0,
            operations: Box::new(page_operations.flatten()),
        }
    }
}

impl<'a> Iterator for DocumentIterator<'a> {
    type Item = DocumentText;

    /// Advances through the operation stream until the next text-showing operator.
    ///
    /// * `Tm` updates the tracked position from its operands at index 4 and 5 (the translation
    ///   components of the text matrix), but only when both are numeric.
    /// * `Tj` / `TJ` yield their first operand as text, tagged with the tracked position, when
    ///   that operand can be converted to a string.
    ///
    /// All other operators are ignored. Returns `None` once the stream is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        for Operation { operator, operands } in self.operations.by_ref() {
            if operator == "Tm" {
                let new_x = operands.get(4).and_then(extract_number);
                let new_y = operands.get(5).and_then(extract_number);
                if let (Some(new_x), Some(new_y)) = (new_x, new_y) {
                    self.x = new_x;
                    self.y = new_y;
                }
                continue;
            }

            if operator != "TJ" && operator != "Tj" {
                continue;
            }

            if let Some(text) = operands.first().and_then(extract_string) {
                let (x, y) = (self.x, self.y);
                return Some(DocumentText { x, y, text });
            }
        }
        None
    }
}

/// Reads a numeric PDF primitive as `f32`.
///
/// Real numbers are returned as-is and integers are converted with `as f32`; every other
/// kind of primitive yields `None`.
fn extract_number(p: &Primitive) -> Option<f32> {
    if let Primitive::Number(real) = p {
        return Some(*real);
    }
    if let Primitive::Integer(int) = p {
        return Some(*int as f32);
    }
    None
}

/// Extracts the text carried by a PDF primitive and normalizes its whitespace.
///
/// * `Primitive::String` is decoded directly.
/// * `Primitive::Array` has every string element decoded and concatenated in order;
///   non-string elements are skipped.
///
/// The resulting text is trimmed and every run of whitespace is collapsed to one space.
///
/// Returns `None` for any other kind of primitive, or when any string fails to decode.
fn extract_string(p: &Primitive) -> Option<String> {
    // Shared normalization step. Taking `&str` lets both branches feed it directly, with no
    // boxed trait object needed to unify their different string types.
    let normalize = |raw: &str| WHITESPACE_REGEX.replace_all(raw.trim(), " ").into_owned();

    match p {
        Primitive::Array(array) => {
            let mut acc = String::new();
            for element in array.iter() {
                if let Primitive::String(s) = element {
                    acc += s.as_str().ok()?.as_ref();
                }
            }
            Some(normalize(&acc))
        }
        Primitive::String(s) => Some(normalize(s.as_str().ok()?.as_ref())),
        _ => None,
    }
}

/// Reports whether `s` begins a new column header, i.e. starts with `"Northern"` or
/// `"Southern"` (case-sensitive, no leading whitespace allowed).
fn is_new_column_header(s: &str) -> bool {
    const HEADER_PREFIXES: [&str; 2] = ["Northern", "Southern"];
    HEADER_PREFIXES.iter().any(|&prefix| s.starts_with(prefix))
}

/// Reads `data.pdf`, reconstructs its date/value table and prints it to stdout as CSV.
///
/// The document is consumed as a stream of positioned text fragments in two phases:
///
/// 1. **Header phase** — a fragment starting with "Northern"/"Southern" opens a column.
///    Following fragments are appended to its name (separated by spaces) until the next header
///    or the first date is seen. The x position of the opening fragment is recorded as the
///    column's position.
/// 2. **Data phase** — every date fragment starts a new row. A digits-only fragment whose y
///    position is within `POSITION_ERROR_MARGIN` of the row's date is assigned to the
///    right-most column whose position is strictly left of the fragment.
///
/// The output is a `Date,<column>...` header line followed by one line per row, with dates
/// formatted using `DATE_DISPLAY_FORMAT` and missing values left empty. If the document
/// contains no date at all, nothing is printed.
///
/// # Panics
///
/// Panics if `data.pdf` cannot be opened, if a fragment matching `DATE_REGEX` is not a valid
/// date, if a value does not parse as `u32`, or if a date cannot be formatted.
fn main() {
    let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF");
    let mut doc_iter = DocumentIterator::new(&doc).peekable();

    let mut columns: Vec<(String, f32)> = Vec::new();
    let mut datapoints: Vec<DataPoint> = Vec::new();

    // Header phase: collect the column names until the first date shows up.
    let (mut current_datapoint, mut current_y) = loop {
        let text = match doc_iter.next() {
            Some(text) => text,
            // The document ended before any date was found: there is no table to print.
            None => return,
        };

        if is_new_column_header(&text.text) {
            let mut column_name = text.text;
            let column_x = text.x;
            // Peek instead of consuming so that the fragment ending this header (the next
            // header or the first date) is still seen by the outer loop.
            while let Some(more) = doc_iter.peek() {
                if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
                    columns.push((column_name, column_x));
                    break;
                }
                column_name += " ";
                column_name += &more.text;
                doc_iter.next();
            }
        } else if DATE_REGEX.is_match(&text.text) {
            break (DataPoint::new(&text).expect("Failed to parse date!"), text.y);
        }
    };

    // Order the columns left to right so the reverse search below finds the right-most match.
    columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));

    // Data phase: a date closes the row in progress and opens the next one.
    for text in doc_iter {
        if DATE_REGEX.is_match(&text.text) {
            datapoints.push(std::mem::replace(
                &mut current_datapoint,
                DataPoint::new(&text).expect("Failed to parse date!"),
            ));
            current_y = text.y;
        } else if VALUE_REGEX.is_match(&text.text) {
            // Ignore numbers that are not on the same line as the current row's date.
            if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
                continue;
            }
            if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
                current_datapoint
                    .values
                    .insert(column, text.text.parse().expect("Failed to parse value!"));
            }
        }
    }

    // The row in progress is only pushed when the next date appears, so the final row has to
    // be flushed explicitly; otherwise the last line of the table would be lost.
    datapoints.push(current_datapoint);

    print!("Date");
    for (column, _) in columns.iter() {
        print!(",{}", column);
    }
    println!();

    for datapoint in datapoints.iter() {
        print!(
            "{}",
            datapoint
                .date
                .format(DATE_DISPLAY_FORMAT)
                .expect("Failed to format date!")
        );
        for (column, _) in columns.iter() {
            match datapoint.values.get(column.as_str()) {
                Some(val) => print!(",{}", val),
                None => print!(","),
            }
        }
        println!();
    }
}
poop graph data extractor prototype 2022-04-05 14:07:28 -04:00			`use std::collections::HashMap;`

			`use lazy_static::lazy_static;`
			`use pdf::{primitive::Primitive, content::Operation, backend::Backend};`
			`use regex::Regex;`
			`use time::Date;`

			`const POSITION_ERROR_MARGIN: f32 = 2.0;`

			`lazy_static! {`
			`static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();`
			`static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();`
			`static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();`
			`}`

			`const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(`
			`"[month padding:none]/[day padding:none]/[year]"`
			`);`
			`const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(`
			`"[year]-[month]-[day]"`
			`);`

			`#[derive(Debug)]`
			`struct DataPoint<'a> {`
			`date: Date,`
			`values: HashMap<&'a str, u32>,`
			`}`

			`impl<'a> DataPoint<'a> {`
			`fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {`
			`Ok(Self { date: Date::parse(&text.text, DATE_PARSE_FORMAT)?, values: HashMap::new() })`
			`}`
			`}`

			`struct DocumentText {`
			`x: f32,`
			`y: f32,`
			`text: String,`
			`}`

			`struct DocumentIterator<'a> {`
			`x: f32,`
			`y: f32,`
			`operations: Box<dyn Iterator<Item = Operation> + 'a>,`
			`}`

			`impl<'a> DocumentIterator<'a> {`
			`fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {`
			`Self {`
			`x: 0.0,`
			`y: 0.0,`
			`operations: Box::new(`
			`document`
			`.pages()`
			`.filter_map(\|page\| {`
			`Some(page.ok()?.contents.clone()?.operations.into_iter())`
			`})`
			`.flatten()`
			`),`
			`}`
			`}`
			`}`

			`impl<'a> Iterator for DocumentIterator<'a> {`
			`type Item = DocumentText;`

			`fn next(&mut self) -> Option<Self::Item> {`
			`for Operation { operator, operands } in self.operations.as_mut() {`
			`if operator == "Tm" {`
			`if let (Some(x), Some(y)) = (`
			`operands.get(4).and_then(extract_number),`
			`operands.get(5).and_then(extract_number),`
			`) {`
			`self.x = x;`
			`self.y = y;`
			`}`
			`} else if operator == "TJ" \|\| operator == "Tj" {`
			`if let Some(text) = operands.get(0).and_then(extract_string) {`
			`return Some(DocumentText {`
			`x: self.x,`
			`y: self.y,`
			`text,`
			`});`
			`}`
			`}`
			`}`
			`None`
			`}`
			`}`

			`fn extract_number(p: &Primitive) -> Option<f32> {`
			`match p {`
			`Primitive::Number(n) => Some(*n),`
			`Primitive::Integer(n) => Some(*n as f32),`
			`_ => None,`
			`}`
			`}`

			`fn extract_string(p: &Primitive) -> Option<String> {`
			`let result: Box<dyn AsRef<str>> = match p {`
			`Primitive::Array(array) => {`
			`let mut acc = String::new();`
			`for element in array.iter() {`
			`if let Primitive::String(s) = element {`
			`acc += s.as_str().ok()?.as_ref();`
			`}`
			`}`
			`Box::new(acc)`
			`}`
			`Primitive::String(s) => Box::new(s.as_str().ok()?),`
			`_ => return None`
			`};`
			`Some(WHITESPACE_REGEX.replace_all((*result).as_ref().trim(), " ").to_string())`
			`}`

			`fn is_new_column_header(s: &str) -> bool {`
			`s.starts_with("Northern") \|\| s.starts_with("Southern")`
			`}`

			`fn main() {`
			`let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF");`
			`let mut doc_iter = DocumentIterator::new(&doc).peekable();`

			`let mut columns: Vec<(String, f32)> = Vec::new();`
			`let mut datapoints: Vec<DataPoint> = Vec::new();`

			`let (mut current_datapoint, mut current_y) = loop {`
			`if let Some(text) = doc_iter.next() {`
			`if is_new_column_header(&text.text) {`
			`let mut column_name = text.text;`
			`let column_x = text.x;`
			`while let Some(more) = doc_iter.peek() {`
			`if is_new_column_header(&more.text) \|\| DATE_REGEX.is_match(&more.text) {`
			`columns.push((column_name, column_x));`
			`break;`
			`}`
			`column_name += " ";`
			`column_name += &more.text;`
			`doc_iter.next();`
			`}`
			`} else if DATE_REGEX.is_match(&text.text) {`
			`break (DataPoint::new(&text).expect("Failed to parse date!"), text.y);`
			`}`
			`} else {`
			`return;`
			`}`
			`};`

			`columns.sort_by(\|(_, x0), (_, x1)\| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));`

			`for text in doc_iter {`
			`if DATE_REGEX.is_match(&text.text) {`
			`datapoints.push(std::mem::replace(`
			`&mut current_datapoint,`
			`DataPoint::new(&text).expect("Failed to parse date!"),`
			`));`
			`current_y = text.y;`
			`} else if VALUE_REGEX.is_match(&text.text) {`
			`if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {`
			`continue;`
			`}`
			`if let Some((column, _)) = columns.iter().rev().find(\|(_, x)\| *x < text.x) {`
			`current_datapoint.values.insert(column, text.text.parse().expect("Failed to parse value!"));`
			`}`
			`}`
			`}`

			`print!("Date");`
			`for (column, _) in columns.iter() {`
			`print!(",{}", column);`
			`}`
			`println!();`
			`for datapoint in datapoints.iter() {`
			`print!(`
			`"{}",`
			`datapoint`
			`.date`
			`.format(DATE_DISPLAY_FORMAT)`
			`.expect("Failed to format date!")`
			`);`
			`for (column, _) in columns.iter() {`
			`if let Some(val) = datapoint.values.get(&column.as_ref()) {`
			`print!(",{}", val);`
			`} else {`
			`print!(",");`
			`}`
			`}`
			`println!();`
			`}`
			`}`