factor out extraction code into a module

2022-04-05 15:21:52 -04:00 · 2022-04-05 15:21:52 -04:00 · e640a7cd09
commit e640a7cd09
parent 7b22bbab16
2 changed files with 217 additions and 184 deletions
--- a/src/extract.rs
+++ b/src/extract.rs
@ -0,0 +1,212 @@
+use std::collections::HashMap;
+use std::fmt::Write;
+use std::rc::Rc;
+
+use lazy_static::lazy_static;
+use pdf::{backend::Backend, content::Operation, primitive::Primitive};
+use regex::Regex;
+use time::Date;
+
+// Maximum vertical distance between a value and the current date row for the
+// value to still be attributed to that row (same units as the text-matrix
+// coordinates reported by `DocumentIterator`).
+const POSITION_ERROR_MARGIN: f32 = 2.0;
+
+lazy_static! {
+    // A whole text fragment of the form M/D/YYYY (1-2 digit month and day, 4 digit year).
+    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
+    // A whole text fragment made only of digits.
+    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
+    // Any run of whitespace; used to collapse such runs into a single space.
+    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
+}
+
+// Format used to parse the dates found in the document (unpadded month/day/year).
+const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] =
+    time::macros::format_description!("[month padding:none]/[day padding:none]/[year]");
+// Format used when writing dates into the CSV output (year-month-day).
+const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] =
+    time::macros::format_description!("[year]-[month]-[day]");
+
+/// A table extracted from a PDF document: named columns plus one row per date.
+pub struct DataSet {
+    /// Column names. `DataSet::extract` orders them by ascending horizontal
+    /// position of their header text.
+    pub columns: Vec<Rc<String>>,
+    /// The dated rows, in the order `DataSet::extract` encountered them.
+    pub rows: Vec<DataPoint>,
+}
+
+impl DataSet {
+    /// Scans the text fragments of `doc` and assembles them into a table.
+    ///
+    /// The scan has two phases:
+    ///
+    /// 1. Header phase: every fragment recognised by `is_new_column_header`
+    ///    starts a column; following fragments are appended to its name
+    ///    (separated by a space) until the next header or the first date.
+    /// 2. Row phase: every date fragment starts a new row; every all-digit
+    ///    fragment within `POSITION_ERROR_MARGIN` of the row's `y` is stored
+    ///    under the right-most column whose header `x` lies strictly to the
+    ///    left of the value.
+    ///
+    /// Returns `None` when the document contains no date fragment at all.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a fragment matching `DATE_REGEX` is not a valid calendar date,
+    /// or if an all-digit fragment does not fit into a `u32`.
+    pub fn extract<B: Backend>(doc: &pdf::file::File<B>) -> Option<Self> {
+        let mut doc_iter = DocumentIterator::new(doc).peekable();
+
+        let mut columns: Vec<(Rc<String>, f32)> = Vec::new();
+        let mut rows: Vec<DataPoint> = Vec::new();
+
+        // Header phase: collect column names until the first date shows up.
+        // Running out of text before any date means there is no table.
+        let (mut current_datapoint, mut current_y) = loop {
+            let text = doc_iter.next()?;
+            if is_new_column_header(&text.text) {
+                let mut column_name = text.text;
+                let column_x = text.x;
+                // Peek so that the fragment ending this header (the next
+                // header or the first date) is left for the outer loop.
+                while let Some(more) = doc_iter.peek() {
+                    if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
+                        columns.push((Rc::new(column_name), column_x));
+                        break;
+                    }
+                    column_name += " ";
+                    column_name += &more.text;
+                    doc_iter.next();
+                }
+            } else if DATE_REGEX.is_match(&text.text) {
+                break (
+                    DataPoint::new(&text).expect("Failed to parse date!"),
+                    text.y,
+                );
+            }
+        };
+
+        // Order columns left to right so a value can be matched to the nearest
+        // header on its left by searching from the right.
+        columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
+
+        // Row phase.
+        for text in doc_iter {
+            if DATE_REGEX.is_match(&text.text) {
+                // A new date closes the row that was being filled.
+                rows.push(std::mem::replace(
+                    &mut current_datapoint,
+                    DataPoint::new(&text).expect("Failed to parse date!"),
+                ));
+                current_y = text.y;
+            } else if VALUE_REGEX.is_match(&text.text) {
+                // Ignore numbers that are not on the same line as the current date.
+                if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
+                    continue;
+                }
+                if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
+                    current_datapoint.values.insert(
+                        column.clone(),
+                        text.text.parse().expect("Failed to parse value!"),
+                    );
+                }
+            }
+        }
+
+        // The row being filled when the text ran out is only closed by a
+        // following date, which never comes for the last one; without this
+        // push the final row of the table would be silently dropped.
+        rows.push(current_datapoint);
+
+        Some(Self {
+            columns: columns.into_iter().map(|(column, _)| column).collect(),
+            rows,
+        })
+    }
+
+    /// Renders the data set as CSV lines (without line terminators).
+    ///
+    /// The first item is the header line (`Date` followed by every column
+    /// name); each following item is one row: the date formatted with
+    /// `DATE_DISPLAY_FORMAT`, then one cell per column, left empty when the
+    /// row has no value for that column.
+    ///
+    /// Column names are written verbatim; no CSV quoting or escaping is applied.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a row's date cannot be formatted.
+    pub fn csv_rows(&self) -> impl Iterator<Item = Result<String, std::fmt::Error>> + '_ {
+        std::iter::once_with(|| {
+            let mut header = String::from("Date");
+            for column in self.columns.iter() {
+                write!(&mut header, ",{}", column)?;
+            }
+            Ok(header)
+        })
+        .chain(self.rows.iter().map(|datapoint| {
+            let mut csv_row = datapoint
+                .date
+                .format(DATE_DISPLAY_FORMAT)
+                .expect("Failed to format date!");
+            for column in self.columns.iter() {
+                if let Some(val) = datapoint.values.get(column) {
+                    write!(&mut csv_row, ",{}", val)?;
+                } else {
+                    write!(&mut csv_row, ",")?;
+                }
+            }
+            Ok(csv_row)
+        }))
+    }
+}
+
+/// One dated row of a `DataSet`.
+pub struct DataPoint {
+    /// The date parsed from the fragment that started this row.
+    pub date: Date,
+    /// Values of this row keyed by column name; columns without a value are absent.
+    pub values: HashMap<Rc<String>, u32>,
+}
+
+impl DataPoint {
+    /// Starts an empty row for the date written in `text`.
+    ///
+    /// Fails with the parse error when the fragment's text is not a date in
+    /// `DATE_PARSE_FORMAT`.
+    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
+        let date = Date::parse(&text.text, DATE_PARSE_FORMAT)?;
+        let values = HashMap::new();
+        Ok(Self { date, values })
+    }
+}
+
+/// A piece of text shown by the document together with the position that was
+/// current when it was shown.
+struct DocumentText {
+    /// Horizontal position taken from the most recent `Tm` operation.
+    x: f32,
+    /// Vertical position taken from the most recent `Tm` operation.
+    y: f32,
+    /// The text, trimmed and with whitespace runs collapsed by `extract_string`.
+    text: String,
+}
+
+/// Iterator over the text fragments of a document, in content-stream order.
+struct DocumentIterator<'a> {
+    /// Horizontal position set by the last `Tm` operation seen (initially 0).
+    x: f32,
+    /// Vertical position set by the last `Tm` operation seen (initially 0).
+    y: f32,
+    /// The content operations of all pages, chained one page after another.
+    operations: Box<dyn Iterator<Item = Operation> + 'a>,
+}
+
+impl<'a> DocumentIterator<'a> {
+    /// Creates an iterator over the content operations of every page of `document`.
+    ///
+    /// Pages that fail to load, or that have no contents, are skipped without
+    /// reporting an error. The tracked position starts at (0, 0).
+    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
+        Self {
+            x: 0.0,
+            y: 0.0,
+            operations: Box::new(
+                document
+                    .pages()
+                    // `?` inside the closure drops unreadable / content-less pages.
+                    .filter_map(|page| Some(page.ok()?.contents.clone()?.operations.into_iter()))
+                    .flatten(),
+            ),
+        }
+    }
+}
+
+impl<'a> Iterator for DocumentIterator<'a> {
+    type Item = DocumentText;
+
+    /// Advances to the next text-showing operation and returns its text.
+    ///
+    /// `Tm` operations update the tracked position from their operands at
+    /// index 4 and 5; `TJ` / `Tj` operations whose first operand yields a
+    /// string produce an item carrying that position. Every other operation
+    /// is skipped.
+    fn next(&mut self) -> Option<Self::Item> {
+        for Operation { operator, operands } in self.operations.as_mut() {
+            if operator == "Tm" {
+                // Only move when both coordinates are present and numeric.
+                let position = operands
+                    .get(4)
+                    .and_then(extract_number)
+                    .zip(operands.get(5).and_then(extract_number));
+                if let Some((new_x, new_y)) = position {
+                    self.x = new_x;
+                    self.y = new_y;
+                }
+                continue;
+            }
+
+            if operator != "TJ" && operator != "Tj" {
+                continue;
+            }
+
+            let shown = operands.first().and_then(extract_string);
+            if let Some(text) = shown {
+                let (x, y) = (self.x, self.y);
+                return Some(DocumentText { x, y, text });
+            }
+        }
+        None
+    }
+}
+
+/// Reads a PDF primitive as an `f32`.
+///
+/// Real numbers are returned as they are and integers are converted; any
+/// other kind of primitive yields `None`.
+fn extract_number(p: &Primitive) -> Option<f32> {
+    if let Primitive::Number(real) = p {
+        return Some(*real);
+    }
+    if let Primitive::Integer(int) = p {
+        return Some(*int as f32);
+    }
+    None
+}
+
+/// Extracts the text carried by a string or array primitive.
+///
+/// * For an array, the string elements are concatenated in order and every
+///   non-string element is ignored.
+/// * For a single string, its text is used directly.
+/// * Any other primitive yields `None`.
+///
+/// `None` is also returned when a string cannot be converted with `as_str`.
+/// The resulting text is trimmed and every run of whitespace is replaced by a
+/// single space.
+fn extract_string(p: &Primitive) -> Option<String> {
+    // Boxed so both branches (an owned `String` and whatever `as_str` returns)
+    // can be handled uniformly as something string-like.
+    let result: Box<dyn AsRef<str>> = match p {
+        Primitive::Array(array) => {
+            let mut acc = String::new();
+            for element in array.iter() {
+                if let Primitive::String(s) = element {
+                    acc += s.as_str().ok()?.as_ref();
+                }
+            }
+            Box::new(acc)
+        }
+        Primitive::String(s) => Box::new(s.as_str().ok()?),
+        _ => return None,
+    };
+    Some(
+        WHITESPACE_REGEX
+            .replace_all((*result).as_ref().trim(), " ")
+            .to_string(),
+    )
+}
+
+/// Tells whether `s` begins a new column header, i.e. whether it starts with
+/// `Northern` or `Southern` (case-sensitive, no leading whitespace allowed).
+fn is_new_column_header(s: &str) -> bool {
+    ["Northern", "Southern"]
+        .iter()
+        .any(|prefix| s.starts_with(*prefix))
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -1,190 +1,11 @@
-use std::collections::HashMap;
+mod extract;

-use lazy_static::lazy_static;
-use pdf::{primitive::Primitive, content::Operation, backend::Backend};
-use regex::Regex;
-use time::Date;
-
-const POSITION_ERROR_MARGIN: f32 = 2.0;
-
-lazy_static! {
-    static ref DATE_REGEX: Regex = Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap();
-    static ref VALUE_REGEX: Regex = Regex::new(r"^\d+$").unwrap();
-    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
-}
-
-const DATE_PARSE_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
-    "[month padding:none]/[day padding:none]/[year]"
-);
-const DATE_DISPLAY_FORMAT: &[time::format_description::FormatItem] = time::macros::format_description!(
-    "[year]-[month]-[day]"
-);
-
-#[derive(Debug)]
-struct DataPoint<'a> {
-    date: Date,
-    values: HashMap<&'a str, u32>,
-}
-
-impl<'a> DataPoint<'a> {
-    fn new(text: &DocumentText) -> Result<Self, time::error::Parse> {
-        Ok(Self { date: Date::parse(&text.text, DATE_PARSE_FORMAT)?, values: HashMap::new() })
-    }
-}
-
-struct DocumentText {
-    x: f32,
-    y: f32,
-    text: String,
-}
-
-struct DocumentIterator<'a> {
-    x: f32,
-    y: f32,
-    operations: Box<dyn Iterator<Item = Operation> + 'a>,
-}
-
-impl<'a> DocumentIterator<'a> {
-    fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self {
-        Self {
-            x: 0.0,
-            y: 0.0,
-            operations: Box::new(
-                document
-                    .pages()
-                    .filter_map(|page| {
-                        Some(page.ok()?.contents.clone()?.operations.into_iter())
-                    })
-                    .flatten()
-            ),
-        }
-    }
-}
-
-impl<'a> Iterator for DocumentIterator<'a> {
-    type Item = DocumentText;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        for Operation { operator, operands } in self.operations.as_mut() {
-            if operator == "Tm" {
-                if let (Some(x), Some(y)) = (
-                    operands.get(4).and_then(extract_number),
-                    operands.get(5).and_then(extract_number),
-                ) {
-                    self.x = x;
-                    self.y = y;
-                }
-            } else if operator == "TJ" || operator == "Tj" {
-                if let Some(text) = operands.get(0).and_then(extract_string) {
-                    return Some(DocumentText {
-                        x: self.x,
-                        y: self.y,
-                        text,
-                    });
-                }
-            }
-        }
-        None
-    }
-}
-
-fn extract_number(p: &Primitive) -> Option<f32> {
-    match p {
-        Primitive::Number(n) => Some(*n),
-        Primitive::Integer(n) => Some(*n as f32),
-        _ => None,
-    }
-}
-
-fn extract_string(p: &Primitive) -> Option<String> {
-    let result: Box<dyn AsRef<str>> = match p {
-        Primitive::Array(array) => {
-            let mut acc = String::new();
-            for element in array.iter() {
-                if let Primitive::String(s) = element {
-                    acc += s.as_str().ok()?.as_ref();
-                }
-            }
-            Box::new(acc)
-        }
-        Primitive::String(s) => Box::new(s.as_str().ok()?),
-        _ => return None
-    };
-    Some(WHITESPACE_REGEX.replace_all((*result).as_ref().trim(), " ").to_string())
-}
-
-fn is_new_column_header(s: &str) -> bool {
-    s.starts_with("Northern") || s.starts_with("Southern")
-}
+use extract::DataSet;

 fn main() {
    let doc = pdf::file::File::open("data.pdf").expect("Failed to read PDF");
-    let mut doc_iter = DocumentIterator::new(&doc).peekable();
-
-    let mut columns: Vec<(String, f32)> = Vec::new();
-    let mut datapoints: Vec<DataPoint> = Vec::new();
-
-    let (mut current_datapoint, mut current_y) = loop {
-        if let Some(text) = doc_iter.next() {
-            if is_new_column_header(&text.text) {
-                let mut column_name = text.text;
-                let column_x = text.x;
-                while let Some(more) = doc_iter.peek() {
-                    if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) {
-                        columns.push((column_name, column_x));
-                        break;
-                    }
-                    column_name += " ";
-                    column_name += &more.text;
-                    doc_iter.next();
-                }
-            } else if DATE_REGEX.is_match(&text.text) {
-                break (DataPoint::new(&text).expect("Failed to parse date!"), text.y);
-            }
-        } else {
-            return;
-        }
-    };
-
-    columns.sort_by(|(_, x0), (_, x1)| x0.partial_cmp(x1).unwrap_or(std::cmp::Ordering::Equal));
-
-    for text in doc_iter {
-        if DATE_REGEX.is_match(&text.text) {
-            datapoints.push(std::mem::replace(
-                &mut current_datapoint, 
-                DataPoint::new(&text).expect("Failed to parse date!"),
-            ));
-            current_y = text.y;
-        } else if VALUE_REGEX.is_match(&text.text) {
-            if (current_y - text.y).abs() > POSITION_ERROR_MARGIN {
-                continue;
-            }
-            if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) {
-                current_datapoint.values.insert(column, text.text.parse().expect("Failed to parse value!"));
-            }
-        }
-    }
-
-    print!("Date");
-    for (column, _) in columns.iter() {
-        print!(",{}", column);
-    }
-    println!();
-    for datapoint in datapoints.iter() {
-        print!(
-            "{}",
-            datapoint
-                .date
-                .format(DATE_DISPLAY_FORMAT)
-                .expect("Failed to format date!")
-        );
-        for (column, _) in columns.iter() {
-            if let Some(val) = datapoint.values.get(&column.as_ref()) {
-                print!(",{}", val);
-            } else {
-                print!(",");
-            }
-        }
-        println!();
+    let dataset = DataSet::extract(&doc).expect("Failed to extract dataset");
+    for row in dataset.csv_rows() {
+        println!("{}", row.unwrap());
    }
 }