update extractor for pdf 8.0
This commit is contained in:
		
							parent
							
								
									e54bf1350c
								
							
						
					
					
						commit
						f6da7a7642
					
				
					 1 changed files with 37 additions and 50 deletions
				
			
		|  | @ -1,7 +1,7 @@ | |||
| use std::{collections::HashMap, sync::Arc}; | ||||
| 
 | ||||
| use lazy_static::lazy_static; | ||||
| use pdf::{backend::Backend, content::Operation, primitive::Primitive}; | ||||
| use pdf::{backend::Backend, content::{Op, Point, TextDrawAdjusted}}; | ||||
| use regex::Regex; | ||||
| use time::Date; | ||||
| 
 | ||||
|  | @ -38,7 +38,7 @@ impl DataSet { | |||
|             if let Some(text) = doc_iter.next() { | ||||
|                 if is_new_column_header(&text.text) { | ||||
|                     let mut column_name = text.text; | ||||
|                     let column_x = text.x; | ||||
|                     let column_x = text.point.x; | ||||
|                     while let Some(more) = doc_iter.peek() { | ||||
|                         if is_new_column_header(&more.text) || DATE_REGEX.is_match(&more.text) { | ||||
|                             columns.push((Arc::new(column_name), column_x)); | ||||
|  | @ -51,7 +51,7 @@ impl DataSet { | |||
|                 } else if DATE_REGEX.is_match(&text.text) { | ||||
|                     break ( | ||||
|                         DataPoint::new(&text).expect("Failed to parse date!"), | ||||
|                         text.y, | ||||
|                         text.point.y, | ||||
|                     ); | ||||
|                 } | ||||
|             } else { | ||||
|  | @ -67,12 +67,12 @@ impl DataSet { | |||
|                     &mut current_datapoint, | ||||
|                     DataPoint::new(&text).expect("Failed to parse date!"), | ||||
|                 )); | ||||
|                 current_y = text.y; | ||||
|                 current_y = text.point.y; | ||||
|             } else if VALUE_REGEX.is_match(&text.text) { | ||||
|                 if (current_y - text.y).abs() > POSITION_ERROR_MARGIN { | ||||
|                 if (current_y - text.point.y).abs() > POSITION_ERROR_MARGIN { | ||||
|                     continue; | ||||
|                 } | ||||
|                 if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.x) { | ||||
|                 if let Some((column, _)) = columns.iter().rev().find(|(_, x)| *x < text.point.x) { | ||||
|                     current_datapoint.values.insert( | ||||
|                         column.clone(), | ||||
|                         text.text.parse().expect("Failed to parse value!"), | ||||
|  | @ -103,26 +103,23 @@ impl DataPoint { | |||
| } | ||||
| 
 | ||||
| struct DocumentText { | ||||
|     x: f32, | ||||
|     y: f32, | ||||
|     point: Point, | ||||
|     text: String, | ||||
| } | ||||
| 
 | ||||
| struct DocumentIterator<'a> { | ||||
|     x: f32, | ||||
|     y: f32, | ||||
|     operations: Box<dyn Iterator<Item = Operation> + 'a>, | ||||
|     point: Point, | ||||
|     operations: Box<dyn Iterator<Item = Op> + 'a>, | ||||
| } | ||||
| 
 | ||||
| impl<'a> DocumentIterator<'a> { | ||||
|     fn new<B: Backend>(document: &'a pdf::file::File<B>) -> Self { | ||||
|         Self { | ||||
|             x: 0.0, | ||||
|             y: 0.0, | ||||
|             point: Point { x: 0.0, y: 0.0 }, | ||||
|             operations: Box::new( | ||||
|                 document | ||||
|                     .pages() | ||||
|                     .filter_map(|page| Some(page.ok()?.contents.clone()?.operations.into_iter())) | ||||
|                     .filter_map(|page| Some(page.ok()?.contents.clone()?.operations(document).ok()?.into_iter())) | ||||
|                     .flatten(), | ||||
|             ), | ||||
|         } | ||||
|  | @ -133,54 +130,44 @@ impl<'a> Iterator for DocumentIterator<'a> { | |||
|     type Item = DocumentText; | ||||
| 
 | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         for Operation { operator, operands } in self.operations.as_mut() { | ||||
|             if operator == "Tm" { | ||||
|                 if let (Some(x), Some(y)) = ( | ||||
|                     operands.get(4).and_then(extract_number), | ||||
|                     operands.get(5).and_then(extract_number), | ||||
|                 ) { | ||||
|                     self.x = x; | ||||
|                     self.y = y; | ||||
|         for operation in self.operations.as_mut() { | ||||
|             match operation { | ||||
|                 Op::SetTextMatrix { matrix } => { | ||||
|                     self.point = Point { x: matrix.e, y: matrix.f }; | ||||
|                 } | ||||
|             } else if operator == "TJ" || operator == "Tj" { | ||||
|                 if let Some(text) = operands.get(0).and_then(extract_string) { | ||||
|                     return Some(DocumentText { | ||||
|                         x: self.x, | ||||
|                         y: self.y, | ||||
|                         text, | ||||
|                     }); | ||||
|                 Op::TextDraw { text } => { | ||||
|                     if let Ok(text) = text.to_string() { | ||||
|                         return Some(DocumentText { | ||||
|                             point: self.point, | ||||
|                             text, | ||||
|                         }); | ||||
|                     } | ||||
|                 } | ||||
|                 Op::TextDrawAdjusted { array } => { | ||||
|                     if let Some(text) = concatenate_adjusted_text(array) { | ||||
|                         return Some(DocumentText { | ||||
|                             point: self.point, | ||||
|                             text, | ||||
|                         }); | ||||
|                     } | ||||
|                 } | ||||
|                 _ => continue, | ||||
|             } | ||||
|         } | ||||
|         None | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| fn extract_number(p: &Primitive) -> Option<f32> { | ||||
|     match p { | ||||
|         Primitive::Number(n) => Some(*n), | ||||
|         Primitive::Integer(n) => Some(*n as f32), | ||||
|         _ => None, | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| fn extract_string(p: &Primitive) -> Option<String> { | ||||
|     let result: Box<dyn AsRef<str>> = match p { | ||||
|         Primitive::Array(array) => { | ||||
|             let mut acc = String::new(); | ||||
|             for element in array.iter() { | ||||
|                 if let Primitive::String(s) = element { | ||||
|                     acc += s.as_str().ok()?.as_ref(); | ||||
|                 } | ||||
|             } | ||||
|             Box::new(acc) | ||||
| fn concatenate_adjusted_text(array: Vec<TextDrawAdjusted>) -> Option<String> { | ||||
|     let mut acc = String::new(); | ||||
|     for element in array.iter() { | ||||
|         if let TextDrawAdjusted::Text(s) = element { | ||||
|             acc += s.to_string().ok()?.as_ref(); | ||||
|         } | ||||
|         Primitive::String(s) => Box::new(s.as_str().ok()?), | ||||
|         _ => return None, | ||||
|     }; | ||||
|     Some( | ||||
|         WHITESPACE_REGEX | ||||
|             .replace_all((*result).as_ref().trim(), " ") | ||||
|             .replace_all(acc.trim(), " ") | ||||
|             .to_string(), | ||||
|     ) | ||||
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue