poop graph data extractor prototype

This commit is contained in:
xenofem 2022-04-05 14:07:28 -04:00
commit 7b22bbab16
4 changed files with 751 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

548
Cargo.lock generated Normal file
View file

@ -0,0 +1,548 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "adler32"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
[[package]]
name = "aes"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "884391ef1066acaa41e766ba8f596341b96e93ce34f9a43e7d24bf0a0eaf0561"
dependencies = [
"aes-soft",
"aesni",
"cipher",
]
[[package]]
name = "aes-soft"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be14c7498ea50828a38d0e24a765ed2effe92a705885b57d029cd67d45744072"
dependencies = [
"cipher",
"opaque-debug",
]
[[package]]
name = "aesni"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea2e11f5e94c2f7d386164cc2aa1f97823fed6f259e486940a71c174dd01b0ce"
dependencies = [
"cipher",
"opaque-debug",
]
[[package]]
name = "aho-corasick"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "block-buffer"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
dependencies = [
"generic-array",
]
[[package]]
name = "block-modes"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57a0e8073e8baa88212fb5823574c02ebccb395136ba9a164ab89379ec6072f0"
dependencies = [
"block-padding",
"cipher",
]
[[package]]
name = "block-padding"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d696c370c750c948ada61c69a0ee2cbbb9c50b1019ddb86d9317157a99c2cae"
[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
dependencies = [
"libc",
"num-integer",
"num-traits",
"time 0.1.44",
"winapi",
]
[[package]]
name = "cipher"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12f8e7987cbd042a63249497f41aed09f8e65add917ea6566effbc56578d6801"
dependencies = [
"generic-array",
]
[[package]]
name = "cpufeatures"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59a6001667ab124aebae2a495118e11d30984c3a653e99d86d58971708cf5e4b"
dependencies = [
"libc",
]
[[package]]
name = "deflate"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f95bf05dffba6e6cce8dfbb30def788154949ccd9aed761b472119c21e01c70"
dependencies = [
"adler32",
]
[[package]]
name = "digest"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
dependencies = [
"generic-array",
]
[[package]]
name = "doc-comment"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "fax"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be28317220fbcf14cead021ae0a973a4433df68fd3c3a022bf8a4fb3bdc3ba8e"
dependencies = [
"fax_derive",
]
[[package]]
name = "fax_derive"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c1d7ffc9f2dc8316348c75281a99c8fdc60c1ddf4f82a366d117bf1b74d5a39"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "generic-array"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803"
dependencies = [
"typenum",
"version_check",
]
[[package]]
name = "glob"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
[[package]]
name = "inflate"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cdb29978cc5797bd8dcc8e5bf7de604891df2a8dc576973d71a281e916db2ff"
dependencies = [
"adler32",
]
[[package]]
name = "itertools"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
[[package]]
name = "jpeg-decoder"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "229d53d58899083193af11e15917b5640cd40b29ff475a1fe4ef725deb02d0f2"
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.121"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f"
[[package]]
name = "log"
version = "0.4.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6389c490849ff5bc16be905ae24bc913a9c8892e19b2341dbc175e14c341c2b8"
dependencies = [
"cfg-if",
]
[[package]]
name = "md5"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "memchr"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
[[package]]
name = "num-integer"
version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db"
dependencies = [
"autocfg",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg",
]
[[package]]
name = "num_threads"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0"
dependencies = [
"libc",
]
[[package]]
name = "once_cell"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"
[[package]]
name = "opaque-debug"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"
[[package]]
name = "ordermap"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91409674c628d07a6b4b79cc877c6b63ba5ccbfbadddd77ca822f55069ed1bd4"
[[package]]
name = "pdf"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f48644b2f4e9c3f3468d7c31fa75e9b823dfb167c8581ec15e90a90c34430d9"
dependencies = [
"aes",
"block-modes",
"byteorder",
"chrono",
"deflate",
"fax",
"glob",
"inflate",
"itertools",
"jpeg-decoder",
"log",
"md5",
"num-traits",
"once_cell",
"ordermap",
"pdf_derive",
"sha2",
"snafu",
"stringprep",
"weezl",
]
[[package]]
name = "pdf_derive"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f4007262775d0798de87b15cbc64cf1aed5f7ee87eec847e297b69d8ed4b4f8"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "poop-graph"
version = "0.1.0"
dependencies = [
"lazy_static",
"pdf",
"regex",
"time 0.3.9",
]
[[package]]
name = "proc-macro2"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029"
dependencies = [
"unicode-xid",
]
[[package]]
name = "quote"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "632d02bff7f874a36f33ea8bb416cd484b90cc66c1194b1a1110d067a7013f58"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]]
name = "sha2"
version = "0.9.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800"
dependencies = [
"block-buffer",
"cfg-if",
"cpufeatures",
"digest",
"opaque-debug",
]
[[package]]
name = "snafu"
version = "0.6.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eab12d3c261b2308b0d80c26fffb58d17eba81a4be97890101f416b478c79ca7"
dependencies = [
"doc-comment",
"snafu-derive",
]
[[package]]
name = "snafu-derive"
version = "0.6.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1508efa03c362e23817f96cde18abed596a25219a8b2c66e8db33c03543d315b"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "stringprep"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1"
dependencies = [
"unicode-bidi",
"unicode-normalization",
]
[[package]]
name = "syn"
version = "1.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "704df27628939572cd88d33f171cd6f896f4eaca85252c6e0a72d8d8287ee86f"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
]
[[package]]
name = "time"
version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
dependencies = [
"libc",
"wasi",
"winapi",
]
[[package]]
name = "time"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd"
dependencies = [
"itoa",
"libc",
"num_threads",
"time-macros",
]
[[package]]
name = "time-macros"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792"
[[package]]
name = "tinyvec"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "typenum"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
[[package]]
name = "unicode-bidi"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f"
[[package]]
name = "unicode-normalization"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9"
dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-xid"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "weezl"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8b77fdfd5a253be4ab714e4ffa3c49caf146b4de743e97510c0656cf90f1e8e"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

12
Cargo.toml Normal file
View file

@ -0,0 +1,12 @@
[package]
name = "poop-graph"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
lazy_static = "1.4"
pdf = "0.7.2"
regex = "1.5.5"
time = { version = "0.3.9", features = ["formatting", "macros", "parsing"] }

190
src/main.rs Normal file
View file

@ -0,0 +1,190 @@
use std::collections::HashMap;
use lazy_static::lazy_static;
use pdf::{primitive::Primitive, content::Operation, backend::Backend};
use regex::Regex;
use time::Date;
/// Largest absolute difference allowed between the y position of a value and
/// the y position of the current date cell; `main` skips values whose
/// difference exceeds this margin.
// NOTE(review): the unit is whatever the `Tm` translation operands use — confirm.
const POSITION_ERROR_MARGIN: f32 = 2.0;
// Regexes are compiled once on first use and shared for the rest of the run.
lazy_static! {
// Matches a whole string of the form `M/D/YYYY`: one or two digits, a slash,
// one or two digits, a slash, and four digits. Used to recognise date cells.
static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
// Matches a whole string consisting of one or more digits. Used to recognise
// value cells.
static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
// Matches a run of one or more whitespace characters; `extract_string`
// replaces each run with a single space.
static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}
/// Format used by `DataPoint::new` to parse date cells: month, day and year
/// separated by slashes, with no padding on the month and day.
const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
"[month padding:none]/[day padding:none]/[year]"
);
/// Format used by `main` to print the date in the first output column:
/// year, month and day separated by hyphens.
const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
"[year]-[month]-[day]"
);
/// One row of the extracted table: a date plus the values recorded for it.
///
/// The lifetime `'a` is that of the borrowed column names used as map keys.
#[derive(Debug)]
struct DataPoint<'a> {
/// Date parsed from the row's date cell.
date: Date,
/// Value for each column, keyed by the borrowed column name. Columns for
/// which no value was found are simply absent from the map.
values: HashMap<&'a str, u32>,
}
impl<'a> DataPoint<'a> {
    /// Creates a data point that has a date but no values yet.
    ///
    /// `text.text` is parsed with `DATE_PARSE_FORMAT`.
    ///
    /// # Errors
    ///
    /// Returns the `time::error::Parse` produced by `Date::parse` when the
    /// text is not a valid date in that format.
    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
        let date = Date::parse(&text.text, DATE_PARSE_FORMAT)?;
        let values = HashMap::new();
        Ok(Self { date, values })
    }
}
/// A piece of text yielded by `DocumentIterator`, together with the position
/// that was current when the text was shown.
struct DocumentText {
/// Horizontal position: operand 4 of the most recent usable `Tm` operator.
x: f32,
/// Vertical position: operand 5 of the most recent usable `Tm` operator.
y: f32,
/// The text as returned by `extract_string` (trimmed, with whitespace runs
/// collapsed to single spaces).
text: String,
}
/// Iterator over the text fragments of a PDF document.
///
/// It walks the content-stream operations of the document and tracks the most
/// recently set text position so each fragment can be tagged with it.
struct DocumentIterator<'a> {
/// Last x position taken from a `Tm` operator; starts at 0.0.
x: f32,
/// Last y position taken from a `Tm` operator; starts at 0.0.
y: f32,
/// The operations still to be processed, boxed to hide the concrete
/// iterator type; it borrows from the document for `'a`.
operations: Box<dyn Iterator<Item = Operation> + 'a>,
}
impl<'a> DocumentIterator<'a> {
    /// Creates an iterator over the text fragments of `document`.
    ///
    /// The operations of every page are chained in page order. A page that
    /// fails to load, or that has no contents, contributes no operations and
    /// is skipped without reporting an error. The tracked position starts at
    /// `(0.0, 0.0)`.
    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
        let page_operations = document.pages().filter_map(|page| {
            let page = page.ok()?;
            let content = page.contents.clone()?;
            Some(content.operations.into_iter())
        });

        Self {
            x: 0.0,
            y: 0.0,
            operations: Box::new(page_operations.flatten()),
        }
    }
}
impl<'a> Iterator for DocumentIterator<'a> {
    type Item = DocumentText;

    /// Advances through the remaining operations and returns the next text
    /// fragment, or `None` once the operations are exhausted.
    ///
    /// * `Tm` updates the tracked position from operands 4 and 5, but only
    ///   when both are numeric; otherwise the previous position is kept.
    /// * `TJ` and `Tj` yield a fragment when `extract_string` can decode
    ///   their first operand; otherwise the operation is skipped.
    /// * Every other operator is ignored, so the position is only ever
    ///   changed by `Tm`.
    fn next(&mut self) -> Option<Self::Item> {
        for op in self.operations.by_ref() {
            if op.operator == "Tm" {
                let new_x = op.operands.get(4).and_then(extract_number);
                let new_y = op.operands.get(5).and_then(extract_number);
                if let (Some(new_x), Some(new_y)) = (new_x, new_y) {
                    self.x = new_x;
                    self.y = new_y;
                }
                continue;
            }

            if op.operator != "TJ" && op.operator != "Tj" {
                continue;
            }

            if let Some(text) = op.operands.first().and_then(extract_string) {
                return Some(DocumentText {
                    x: self.x,
                    y: self.y,
                    text,
                });
            }
        }
        None
    }
}
/// Returns the numeric value of `p` as an `f32`.
///
/// `Primitive::Number` is returned unchanged and `Primitive::Integer` is cast
/// to `f32`; any other primitive yields `None`.
fn extract_number(p: &Primitive) -> Option<f32> {
    match *p {
        Primitive::Number(value) => Some(value),
        Primitive::Integer(value) => Some(value as f32),
        _ => None,
    }
}
/// Extracts the text carried by a text-showing operand, normalising whitespace.
///
/// * `Primitive::String`: the decoded string.
/// * `Primitive::Array`: the concatenation of all string elements in order;
///   non-string elements are ignored.
/// * Anything else: `None`.
///
/// `None` is also returned as soon as any string fails to decode. The result
/// is trimmed and every run of whitespace is replaced by a single space.
fn extract_string(p: &Primitive) -> Option<String> {
    // Both supported shapes are gathered into one owned buffer so that the
    // whitespace normalisation below is written only once.
    let mut raw = String::new();
    match p {
        Primitive::Array(elements) => {
            for element in elements.iter() {
                if let Primitive::String(s) = element {
                    raw += s.as_str().ok()?.as_ref();
                }
            }
        }
        Primitive::String(s) => raw += s.as_str().ok()?.as_ref(),
        _ => return None,
    }
    Some(WHITESPACE_REGEX.replace_all(raw.trim(), " ").to_string())
}
/// Reports whether `s` begins a new column header, i.e. whether it starts
/// with `"Northern"` or `"Southern"` (case-sensitive, no leading whitespace).
fn is_new_column_header(s: &str) -> bool {
    ["Northern", "Southern"]
        .iter()
        .any(|&prefix| s.starts_with(prefix))
}
/// Reads `data.pdf` from the current directory, extracts its table of dated
/// values, and prints the table to stdout as CSV.
///
/// The work happens in three phases:
///
/// 1. **Header scan** — text fragments are read until the first date. A
///    fragment accepted by `is_new_column_header` starts a column; following
///    fragments are appended to its name (separated by spaces) until the next
///    header or a date. The column's x position is that of its first fragment.
/// 2. **Row scan** — each date starts a new row. A numeric fragment is added
///    to the current row only if its y position is within
///    `POSITION_ERROR_MARGIN` of the date's y position, and it is assigned to
///    the right-most column whose x position is strictly less than its own.
/// 3. **Output** — a header line (`Date` plus the column names ordered by x),
///    then one line per row with an empty field for each missing value.
///
/// If no date is found at all, the function returns without printing.
///
/// # Panics
///
/// Panics if the PDF cannot be opened, if a fragment matching `DATE_REGEX`
/// is not a valid date, if a numeric fragment does not fit in `u32`, or if a
/// date cannot be formatted.
fn main() {
    let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF");
    let mut doc_iter = DocumentIterator::new(&doc).peekable();

    let mut columns: Vec<(String, f32)> = Vec::new();
    let mut datapoints: Vec<DataPoint> = Vec::new();

    // Phase 1: collect the column headers; the loop ends at the first date,
    // which becomes the first row.
    let (mut current_datapoint, mut current_y) = loop {
        let text = match doc_iter.next() {
            Some(text) => text,
            // The document ended before any date was seen: nothing to output.
            None => return,
        };

        if is_new_column_header(&text.text) {
            let mut column_name = text.text;
            let column_x = text.x;
            // Peek rather than consume, so that the fragment ending this
            // header is handled by the outer loop.
            while let Some(more) = doc_iter.peek() {
                if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
                    columns.push((column_name, column_x));
                    break;
                }
                column_name += " ";
                column_name += &more.text;
                doc_iter.next();
            }
        } else if DATE_REGEX.is_match(&text.text) {
            break (DataPoint::new(&text).expect("Failed to parse date!"), text.y);
        }
    };

    // Order the columns left to right so a value can be matched to the
    // nearest column on its left.
    columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));

    // Phase 2: collect the rows.
    for text in doc_iter {
        if DATE_REGEX.is_match(&text.text) {
            // A new date closes the row in progress and opens the next one.
            datapoints.push(std::mem::replace(
                &mut current_datapoint,
                DataPoint::new(&text).expect("Failed to parse date!"),
            ));
            current_y = text.y;
        } else if VALUE_REGEX.is_match(&text.text) {
            // Ignore numbers that are not on the same line as the current date.
            if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
                continue;
            }
            if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
                current_datapoint
                    .values
                    .insert(column, text.text.parse().expect("Failed to parse value!"));
            }
        }
    }
    // Rows are only pushed when the next date appears, so the row still in
    // progress at the end of the document must be pushed explicitly; without
    // this the last row of the table would be missing from the output.
    datapoints.push(current_datapoint);

    // Phase 3: write the CSV.
    print!("Date");
    for (column, _) in &columns {
        print!(",{}", column);
    }
    println!();

    for datapoint in &datapoints {
        print!(
            "{}",
            datapoint
                .date
                .format(DATE_DISPLAY_FORMAT)
                .expect("Failed to format date!")
        );
        for (column, _) in &columns {
            match datapoint.values.get(column.as_str()) {
                Some(val) => print!(",{}", val),
                None => print!(","),
            }
        }
        println!();
    }
}