From e640a7cd09654d9362b1cd3261e0a0a08c706e0a Mon Sep 17 00:00:00 2001
From: xenofem
Date: Tue, 5 Apr 2022 15:21:52 -0400
Subject: [PATCH] factor out extraction code into a module

---
NOTE(review): this copy of the patch was recovered from a whitespace-collapsed
scrape in which angle-bracket spans (generic arguments, and the author's
e-mail address) had been stripped as if they were HTML tags. Line breaks,
indentation and generic arguments (e.g. `<B: Backend>`, `Vec<DataPoint>`,
`Option<Self::Item>`, `Result<Self, time::error::Parse>`) are reconstructed
from context; confirm them against the repository before applying. The blob
hashes on the `index` lines have not been re-verified.

NOTE(review): `DataSet::extract` (like the old `main`) never pushes the final
`current_datapoint` into `rows`, so the last table row appears to be dropped.
Likewise, a column header that runs to the end of the document is never pushed
to `columns`. Both are left unchanged here because this commit is a pure
refactor.

 src/extract.rs | 212 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs    | 189 ++------------------------------------------------
 2 files changed, 217 insertions(+), 184 deletions(-)
 create mode 100644 src/extract.rs

diff --git a/src/extract.rs b/src/extract.rs
new file mode 100644
index 0000000..c91fda3
--- /dev/null
+++ b/src/extract.rs
@@ -0,0 +1,212 @@
+use std::collections::HashMap;
+use std::fmt::Write;
+use std::rc::Rc;
+
+use lazy_static::lazy_static;
+use pdf::{backend::Backend, content::Operation, primitive::Primitive};
+use regex::Regex;
+use time::Date;
+
+const POSITION_ERROR_MARGIN: f32 = 2.0;
+
+lazy_static! {
+    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
+    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
+    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
+}
+
+const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] =
+    time::macros::format_description!("[month padding:none]/[day padding:none]/[year]");
+const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] =
+    time::macros::format_description!("[year]-[month]-[day]");
+
+pub struct DataSet {
+    pub columns: Vec<Rc<String>>,
+    pub rows: Vec<DataPoint>,
+}
+
+impl DataSet {
+    pub fn extract<B: Backend>(doc: &pdf::file::File<B>) -> Option<Self> {
+        let mut doc_iter = DocumentIterator::new(doc).peekable();
+
+        let mut columns: Vec<(Rc<String>, f32)> = Vec::new();
+        let mut rows: Vec<DataPoint> = Vec::new();
+
+        let (mut current_datapoint, mut current_y) = loop {
+            if let Some(text) = doc_iter.next() {
+                if is_new_column_header(&text.text) {
+                    let mut column_name = text.text;
+                    let column_x = text.x;
+                    while let Some(more) = doc_iter.peek() {
+                        if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
+                            columns.push((Rc::new(column_name), column_x));
+                            break;
+                        }
+                        column_name += " ";
+                        column_name += &more.text;
+                        doc_iter.next();
+                    }
+                } else if DATE_REGEX.is_match(&text.text) {
+                    break (
+                        DataPoint::new(&text).expect("Failed to parse date!"),
+                        text.y,
+                    );
+                }
+            } else {
+                return None;
+            }
+        };
+
+        columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
+
+        for text in doc_iter {
+            if DATE_REGEX.is_match(&text.text) {
+                rows.push(std::mem::replace(
+                    &mut current_datapoint,
+                    DataPoint::new(&text).expect("Failed to parse date!"),
+                ));
+                current_y = text.y;
+            } else if VALUE_REGEX.is_match(&text.text) {
+                if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
+                    continue;
+                }
+                if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
+                    current_datapoint.values.insert(
+                        column.clone(),
+                        text.text.parse().expect("Failed to parse value!"),
+                    );
+                }
+            }
+        }
+
+        Some(Self {
+            columns: columns.into_iter().map(|(column, _)| column).collect(),
+            rows,
+        })
+    }
+
+    pub fn csv_rows(&self) -> impl Iterator<Item = Result<String, std::fmt::Error>> + '_ {
+        std::iter::once_with(|| {
+            let mut header = String::from("Date");
+            for column in self.columns.iter() {
+                write!(&mut header, ",{}", column)?;
+            }
+            Ok(header)
+        })
+        .chain(self.rows.iter().map(|datapoint| {
+            let mut csv_row = datapoint
+                .date
+                .format(DATE_DISPLAY_FORMAT)
+                .expect("Failed to format date!");
+            for column in self.columns.iter() {
+                if let Some(val) = datapoint.values.get(column) {
+                    write!(&mut csv_row, ",{}", val)?;
+                } else {
+                    write!(&mut csv_row, ",")?;
+                }
+            }
+            Ok(csv_row)
+        }))
+    }
+}
+
+pub struct DataPoint {
+    pub date: Date,
+    pub values: HashMap<Rc<String>, u32>,
+}
+
+impl DataPoint {
+    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
+        Ok(Self {
+            date: Date::parse(&text.text, DATE_PARSE_FORMAT)?,
+            values: HashMap::new(),
+        })
+    }
+}
+
+struct DocumentText {
+    x: f32,
+    y: f32,
+    text: String,
+}
+
+struct DocumentIterator<'a> {
+    x: f32,
+    y: f32,
+    operations: Box<dyn Iterator<Item = Operation> + 'a>,
+}
+
+impl<'a> DocumentIterator<'a> {
+    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
+        Self {
+            x: 0.0,
+            y: 0.0,
+            operations: Box::new(
+                document
+                    .pages()
+                    .filter_map(|page| Some(page.ok()?.contents.clone()?.operations.into_iter()))
+                    .flatten(),
+            ),
+        }
+    }
+}
+
+impl<'a> Iterator for DocumentIterator<'a> {
+    type Item = DocumentText;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        for Operation { operator, operands } in self.operations.as_mut() {
+            if operator == "Tm" {
+                if let (Some(x), Some(y)) = (
+                    operands.get(4).and_then(extract_number),
+                    operands.get(5).and_then(extract_number),
+                ) {
+                    self.x = x;
+                    self.y = y;
+                }
+            } else if operator == "TJ" || operator == "Tj" {
+                if let Some(text) = operands.get(0).and_then(extract_string) {
+                    return Some(DocumentText {
+                        x: self.x,
+                        y: self.y,
+                        text,
+                    });
+                }
+            }
+        }
+        None
+    }
+}
+
+fn extract_number(p: &Primitive) -> Option<f32> {
+    match p {
+        Primitive::Number(n) => Some(*n),
+        Primitive::Integer(n) => Some(*n as f32),
+        _ => None,
+    }
+}
+
+fn extract_string(p: &Primitive) -> Option<String> {
+    let result: Box<dyn AsRef<str>> = match p {
+        Primitive::Array(array) => {
+            let mut acc = String::new();
+            for element in array.iter() {
+                if let Primitive::String(s) = element {
+                    acc += s.as_str().ok()?.as_ref();
+                }
+            }
+            Box::new(acc)
+        }
+        Primitive::String(s) => Box::new(s.as_str().ok()?),
+        _ => return None,
+    };
+    Some(
+        WHITESPACE_REGEX
+            .replace_all((*result).as_ref().trim(), " ")
+            .to_string(),
+    )
+}
+
+fn is_new_column_header(s: &str) -> bool {
+    s.starts_with("Northern") || s.starts_with("Southern")
+}
diff --git a/src/main.rs b/src/main.rs
index 53ac347..8af8b79 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,190 +1,11 @@
-use std::collections::HashMap;
+mod extract;
 
-use lazy_static::lazy_static;
-use pdf::{primitive::Primitive, content::Operation, backend::Backend};
-use regex::Regex;
-use time::Date;
-
-const POSITION_ERROR_MARGIN: f32 = 2.0;
-
-lazy_static! {
-    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
-    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
-    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
-}
-
-const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
-    "[month padding:none]/[day padding:none]/[year]"
-);
-const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
-    "[year]-[month]-[day]"
-);
-
-#[derive(Debug)]
-struct DataPoint<'a> {
-    date: Date,
-    values: HashMap<&'a str, u32>,
-}
-
-impl<'a> DataPoint<'a> {
-    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
-        Ok(Self { date: Date::parse(&text.text, DATE_PARSE_FORMAT)?, values: HashMap::new() })
-    }
-}
-
-struct DocumentText {
-    x: f32,
-    y: f32,
-    text: String,
-}
-
-struct DocumentIterator<'a> {
-    x: f32,
-    y: f32,
-    operations: Box<dyn Iterator<Item = Operation> + 'a>,
-}
-
-impl<'a> DocumentIterator<'a> {
-    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
-        Self {
-            x: 0.0,
-            y: 0.0,
-            operations: Box::new(
-                document
-                    .pages()
-                    .filter_map(|page| {
-                        Some(page.ok()?.contents.clone()?.operations.into_iter())
-                    })
-                    .flatten()
-            ),
-        }
-    }
-}
-
-impl<'a> Iterator for DocumentIterator<'a> {
-    type Item = DocumentText;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        for Operation { operator, operands } in self.operations.as_mut() {
-            if operator == "Tm" {
-                if let (Some(x), Some(y)) = (
-                    operands.get(4).and_then(extract_number),
-                    operands.get(5).and_then(extract_number),
-                ) {
-                    self.x = x;
-                    self.y = y;
-                }
-            } else if operator == "TJ" || operator == "Tj" {
-                if let Some(text) = operands.get(0).and_then(extract_string) {
-                    return Some(DocumentText {
-                        x: self.x,
-                        y: self.y,
-                        text,
-                    });
-                }
-            }
-        }
-        None
-    }
-}
-
-fn extract_number(p: &Primitive) -> Option<f32> {
-    match p {
-        Primitive::Number(n) => Some(*n),
-        Primitive::Integer(n) => Some(*n as f32),
-        _ => None,
-    }
-}
-
-fn extract_string(p: &Primitive) -> Option<String> {
-    let result: Box<dyn AsRef<str>> = match p {
-        Primitive::Array(array) => {
-            let mut acc = String::new();
-            for element in array.iter() {
-                if let Primitive::String(s) = element {
-                    acc += s.as_str().ok()?.as_ref();
-                }
-            }
-            Box::new(acc)
-        }
-        Primitive::String(s) => Box::new(s.as_str().ok()?),
-        _ => return None
-    };
-    Some(WHITESPACE_REGEX.replace_all((*result).as_ref().trim(), " ").to_string())
-}
-
-fn is_new_column_header(s: &str) -> bool {
-    s.starts_with("Northern") || s.starts_with("Southern")
-}
+use extract::DataSet;
 
 fn main() {
     let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF");
-    let mut doc_iter = DocumentIterator::new(&doc).peekable();
-
-    let mut columns: Vec<(String, f32)> = Vec::new();
-    let mut datapoints: Vec<DataPoint> = Vec::new();
-
-    let (mut current_datapoint, mut current_y) = loop {
-        if let Some(text) = doc_iter.next() {
-            if is_new_column_header(&text.text) {
-                let mut column_name = text.text;
-                let column_x = text.x;
-                while let Some(more) = doc_iter.peek() {
-                    if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
-                        columns.push((column_name, column_x));
-                        break;
-                    }
-                    column_name += " ";
-                    column_name += &more.text;
-                    doc_iter.next();
-                }
-            } else if DATE_REGEX.is_match(&text.text) {
-                break (DataPoint::new(&text).expect("Failed to parse date!"), text.y);
-            }
-        } else {
-            return;
-        }
-    };
-
-    columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
-
-    for text in doc_iter {
-        if DATE_REGEX.is_match(&text.text) {
-            datapoints.push(std::mem::replace(
-                &mut current_datapoint,
-                DataPoint::new(&text).expect("Failed to parse date!"),
-            ));
-            current_y = text.y;
-        } else if VALUE_REGEX.is_match(&text.text) {
-            if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
-                continue;
-            }
-            if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
-                current_datapoint.values.insert(column, text.text.parse().expect("Failed to parse value!"));
-            }
-        }
-    }
-
-    print!("Date");
-    for (column, _) in columns.iter() {
-        print!(",{}", column);
-    }
-    println!();
-    for datapoint in datapoints.iter() {
-        print!(
-            "{}",
-            datapoint
-                .date
-                .format(DATE_DISPLAY_FORMAT)
-                .expect("Failed to format date!")
-        );
-        for (column, _) in columns.iter() {
-            if let Some(val) = datapoint.values.get(&column.as_ref()) {
-                print!(",{}", val);
-            } else {
-                print!(",");
-            }
-        }
-        println!();
+    let dataset = DataSet::extract(&doc).expect("Failed to extract dataset");
+    for row in dataset.csv_rows() {
+        println!("{}", row.unwrap());
     }
 }