commit 7b22bbab16e064fb3267e22a957e63db9b9d1be3 Author: xenofem Date: Tue Apr 5 14:07:28 2022 -0400 poop graph data extractor prototype diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..5625650 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,548 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "aes" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "884391ef1066acaa41e766ba8f596341b96e93ce34f9a43e7d24bf0a0eaf0561" +dependencies = [ + "aes-soft", + "aesni", + "cipher", +] + +[[package]] +name = "aes-soft" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be14c7498ea50828a38d0e24a765ed2effe92a705885b57d029cd67d45744072" +dependencies = [ + "cipher", + "opaque-debug", +] + +[[package]] +name = "aesni" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2e11f5e94c2f7d386164cc2aa1f97823fed6f259e486940a71c174dd01b0ce" +dependencies = [ + "cipher", + "opaque-debug", +] + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-modes" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a0e8073e8baa88212fb5823574c02ebccb395136ba9a164ab89379ec6072f0" +dependencies = [ + "block-padding", + "cipher", +] + +[[package]] +name = "block-padding" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d696c370c750c948ada61c69a0ee2cbbb9c50b1019ddb86d9317157a99c2cae" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "time 0.1.44", + "winapi", +] + +[[package]] +name = "cipher" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12f8e7987cbd042a63249497f41aed09f8e65add917ea6566effbc56578d6801" +dependencies = [ + "generic-array", +] + +[[package]] +name = "cpufeatures" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59a6001667ab124aebae2a495118e11d30984c3a653e99d86d58971708cf5e4b" +dependencies = [ + "libc", +] + +[[package]] +name = "deflate" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f95bf05dffba6e6cce8dfbb30def788154949ccd9aed761b472119c21e01c70" +dependencies = [ + "adler32", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "fax" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be28317220fbcf14cead021ae0a973a4433df68fd3c3a022bf8a4fb3bdc3ba8e" +dependencies = [ + "fax_derive", +] + +[[package]] +name = "fax_derive" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c1d7ffc9f2dc8316348c75281a99c8fdc60c1ddf4f82a366d117bf1b74d5a39" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "generic-array" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "inflate" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cdb29978cc5797bd8dcc8e5bf7de604891df2a8dc576973d71a281e916db2ff" +dependencies = [ + "adler32", +] + +[[package]] +name = "itertools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" + +[[package]] +name = "jpeg-decoder" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229d53d58899083193af11e15917b5640cd40b29ff475a1fe4ef725deb02d0f2" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" + +[[package]] +name = "log" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6389c490849ff5bc16be905ae24bc913a9c8892e19b2341dbc175e14c341c2b8" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_threads" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" +dependencies = [ + "libc", +] + +[[package]] +name = "once_cell" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "ordermap" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91409674c628d07a6b4b79cc877c6b63ba5ccbfbadddd77ca822f55069ed1bd4" + +[[package]] +name = "pdf" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f48644b2f4e9c3f3468d7c31fa75e9b823dfb167c8581ec15e90a90c34430d9" +dependencies = [ + "aes", + "block-modes", + "byteorder", + "chrono", + "deflate", + "fax", + "glob", + "inflate", + "itertools", + "jpeg-decoder", + "log", + "md5", + "num-traits", + "once_cell", + "ordermap", + "pdf_derive", + "sha2", + "snafu", + "stringprep", + "weezl", +] + +[[package]] +name = "pdf_derive" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4007262775d0798de87b15cbc64cf1aed5f7ee87eec847e297b69d8ed4b4f8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "poop-graph" +version = "0.1.0" +dependencies = [ + "lazy_static", + "pdf", + "regex", + "time 0.3.9", +] + +[[package]] +name = "proc-macro2" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "632d02bff7f874a36f33ea8bb416cd484b90cc66c1194b1a1110d067a7013f58" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "sha2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" +dependencies = [ + "block-buffer", + "cfg-if", + "cpufeatures", + "digest", + "opaque-debug", +] + +[[package]] +name = "snafu" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eab12d3c261b2308b0d80c26fffb58d17eba81a4be97890101f416b478c79ca7" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1508efa03c362e23817f96cde18abed596a25219a8b2c66e8db33c03543d315b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "stringprep" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "syn" +version = "1.0.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704df27628939572cd88d33f171cd6f896f4eaca85252c6e0a72d8d8287ee86f" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "time" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +dependencies = [ + "libc", + "wasi", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +dependencies = [ + "itoa", + "libc", + "num_threads", + "time-macros", +] + +[[package]] +name = "time-macros" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" + +[[package]] +name = "tinyvec" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "typenum" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" + +[[package]] +name = "unicode-bidi" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "weezl" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b77fdfd5a253be4ab714e4ffa3c49caf146b4de743e97510c0656cf90f1e8e" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..9c3d2f9 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "poop-graph" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +lazy_static = "1.4" +pdf = "0.7.2" +regex = "1.5.5" +time = { version = "0.3.9", features = ["formatting", "macros", "parsing"] } diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..53ac347 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,190 @@ +use std::collections::HashMap; + +use lazy_static::lazy_static; +use pdf::{primitive::Primitive, content::Operation, backend::Backend}; +use regex::Regex; +use time::Date; + +const POSITION_ERROR_MARGIN: f32 = 2.0; + +lazy_static! { + static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap(); + static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap(); + static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap(); +} + +const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!( + "[month padding:none]/[day padding:none]/[year]" +); +const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!( + "[year]-[month]-[day]" +); + +#[derive(Debug)] +struct DataPoint<'a> { + date: Date, + values: HashMap<&'a str, u32>, +} + +impl<'a> DataPoint<'a> { + fn new(text: &DocumentText) -> Result { + Ok(Self { date: Date::parse(&text.text, DATE_PARSE_FORMAT)?, values: HashMap::new() }) + } +} + +struct DocumentText { + x: f32, + y: f32, + text: String, +} + +struct DocumentIterator<'a> { + x: f32, + y: f32, + operations: Box + 'a>, +} + +impl<'a> DocumentIterator<'a> { + fn new(document: &'a pdf::file::File) -> Self { + Self { + x: 0.0, + y: 0.0, + operations: Box::new( + document + .pages() + .filter_map(|page| { + Some(page.ok()?.contents.clone()?.operations.into_iter()) + }) + .flatten() + ), + } + } +} + +impl<'a> Iterator for DocumentIterator<'a> { + type Item = DocumentText; + + fn next(&mut self) -> Option { + for Operation { operator, operands } in self.operations.as_mut() { + if operator == "Tm" { + if let (Some(x), Some(y)) = ( + operands.get(4).and_then(extract_number), + operands.get(5).and_then(extract_number), + ) { + self.x = x; + self.y = y; + } + } else if operator == "TJ" || operator == "Tj" { + if let Some(text) = operands.get(0).and_then(extract_string) { + return Some(DocumentText { + x: self.x, + y: self.y, + text, + }); + } + } + } + None + } +} + +fn extract_number(p: &Primitive) -> Option { + match p { + Primitive::Number(n) => Some(*n), + Primitive::Integer(n) => Some(*n as f32), + _ => None, + } +} + +fn extract_string(p: &Primitive) -> Option { + let result: Box> = match p { + Primitive::Array(array) => { + let mut acc = String::new(); + for element in array.iter() { + if let Primitive::String(s) = element { + acc += s.as_str().ok()?.as_ref(); + } + } + Box::new(acc) + } + Primitive::String(s) => Box::new(s.as_str().ok()?), + _ => return None + }; + Some(WHITESPACE_REGEX.replace_all((*result).as_ref().trim(), " ").to_string()) +} + +fn is_new_column_header(s: &str) -> bool { + s.starts_with("Northern") || s.starts_with("Southern") +} + +fn main() { + let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF"); + let mut doc_iter = DocumentIterator::new(&doc).peekable(); + + let mut columns: Vec<(String, f32)> = Vec::new(); + let mut datapoints: Vec = Vec::new(); + + let (mut current_datapoint, mut current_y) = loop { + if let Some(text) = doc_iter.next() { + if is_new_column_header(&text.text) { + let mut column_name = text.text; + let column_x = text.x; + while let Some(more) = doc_iter.peek() { + if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) { + columns.push((column_name, column_x)); + break; + } + column_name += " "; + column_name += &more.text; + doc_iter.next(); + } + } else if DATE_REGEX.is_match(&text.text) { + break (DataPoint::new(&text).expect("Failed to parse date!"), text.y); + } + } else { + return; + } + }; + + columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal)); + + for text in doc_iter { + if DATE_REGEX.is_match(&text.text) { + datapoints.push(std::mem::replace( + &mut current_datapoint, + DataPoint::new(&text).expect("Failed to parse date!"), + )); + current_y = text.y; + } else if VALUE_REGEX.is_match(&text.text) { + if (current_y - text.y).abs() > POSITION_ERROR_MARGIN { + continue; + } + if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) { + current_datapoint.values.insert(column, text.text.parse().expect("Failed to parse value!")); + } + } + } + + print!("Date"); + for (column, _) in columns.iter() { + print!(",{}", column); + } + println!(); + for datapoint in datapoints.iter() { + print!( + "{}", + datapoint + .date + .format(DATE_DISPLAY_FORMAT) + .expect("Failed to format date!") + ); + for (column, _) in columns.iter() { + if let Some(val) = datapoint.values.get(&column.as_ref()) { + print!(",{}", val); + } else { + print!(","); + } + } + println!(); + } +}