use bytes::Bytes; use chrono::NaiveDate; use std::num::ParseFloatError; use thiserror::Error; use crate::amf::types::date::naive_date_from_str; use crate::amf::types::date::DateParseError; use crate::CONFIG; #[derive(Debug, Clone)] pub struct AMFPdf { url: String, } #[derive(Debug, Error)] pub enum PatternExtractionError { #[error("Person not found")] PersonNotFoundError, #[error("Date executed not found")] DateExecutedNotFoundError, #[error("Date published not found")] DatePublishedNotFoundError, #[error("Exchange not found")] ExchangeNotFoundError, #[error("Nature not found")] NatureNotFoundError, #[error("Instrument not found")] InstrumentNotFoundError, #[error("Coordonnees section not found")] CoordonneesNotFound, #[error("Company name not found")] CompanyNameNotFound, #[error("Aggregated informations not found")] AggregatedInformationNotFoundError, #[error("Volume not found")] VolumeNotFoundError, #[error("Price not found")] PriceNotFoundError, } #[derive(Debug, Error)] pub enum AMFPdfError { #[error("Download error: {0}")] DownloadError(reqwest::Error), #[error("Bytes conversion error: {0}")] BytesConversionError(reqwest::Error), #[error("Error loading pdf document: {0}")] DocumentLoadError(lopdf::Error), #[error("Error during lopdf text extraction: {0}")] PdfTextExtractionError(lopdf::Error), #[error("Error during extraction of information: {0}")] PatternExtractionError(PatternExtractionError), #[error("Error parsing date: {0}")] DateParseError(DateParseError), #[error("Error parsing unit price: {0}")] PriceParseError(ParseFloatError), #[error("Error parsing volume: {0}")] VolumeParseError(ParseFloatError), } pub struct AMFPdfData { pub company_name: String, pub isin: Option, pub person: String, pub date_published: NaiveDate, pub date_executed: NaiveDate, pub exchange: String, pub nature: String, pub instrument: String, pub volume: f32, pub unit_price: f32, } impl AMFPdf { pub fn new(path: &String) -> AMFPdf { let mut url = (&CONFIG.amf_documents_path).to_string(); url.push_str(&path); AMFPdf { url } } async fn download(&self) -> Result { Ok(reqwest::get(&self.url) .await .map_err(|e| AMFPdfError::DownloadError(e))? .bytes() .await .map_err(|e| AMFPdfError::BytesConversionError(e))?) } async fn extract_text(&self) -> Result { let bfile = self.download().await?; let pdf = lopdf::Document::load_mem(&bfile).map_err(|e| AMFPdfError::DocumentLoadError(e))?; let mut text = "".to_string(); for (idx, _) in pdf.page_iter().enumerate() { text.push_str( &pdf.extract_text(&[idx as u32 + 1]) .map_err(|e| AMFPdfError::PdfTextExtractionError(e))?, ); } Ok(text) } pub async fn extract_info(&self) -> Result { let mut text = self.extract_text().await?; debug!("Extracted text from document:\n{}", text); let isin = extract_pattern( &text, "CODE D’IDENTIFICATION DE L’INSTRUMENT FINANCIER : ", "\n", ) .map_or_else( || { Some( text.lines() .skip_while(|l| l.is_empty()) .nth(1) .unwrap() .split(" ") .next() .unwrap() .get(0..12) .map(|t| t.to_string()), ) }, |t| Some(Some(t.get(0..12).unwrap_or(&t).to_string())), ) .unwrap_or(None); let person = extract_pattern( &text, &"NOM /FONCTION DE LA PERSONNE EXERCANT DES RESPONSABILITES DIRIGEANTES OU DE LAPERSONNE ETROITEMENT LIEE :\n", "\n", ) .ok_or(AMFPdfError::PatternExtractionError( PatternExtractionError::PersonNotFoundError, ))?; let date_published_raw = extract_pattern(&text, &"DATE DE RECEPTION DE LA NOTIFICATION : ", &"\n").ok_or( AMFPdfError::PatternExtractionError( PatternExtractionError::DatePublishedNotFoundError, ), )?; let date_published = naive_date_from_str(&date_published_raw).map_err(|e| AMFPdfError::DateParseError(e))?; let date_executed_raw = extract_pattern(&text, &"DATE DE LA TRANSACTION : ", &"\n").ok_or( AMFPdfError::PatternExtractionError(PatternExtractionError::DateExecutedNotFoundError), )?; let date_executed = naive_date_from_str(&date_executed_raw).map_err(|e| AMFPdfError::DateParseError(e))?; let exchange = extract_pattern(&text, &"LIEU DE LA TRANSACTION : ", &"\n").ok_or( AMFPdfError::PatternExtractionError(PatternExtractionError::ExchangeNotFoundError), )?; let nature = extract_pattern(&text, &"NATURE DE LA TRANSACTION : ", &"\n").ok_or( AMFPdfError::PatternExtractionError(PatternExtractionError::NatureNotFoundError), )?; let instrument = extract_pattern(&text, &"DESCRIPTION DE L’INSTRUMENT FINANCIER : ", &"\n") .ok_or(AMFPdfError::PatternExtractionError( PatternExtractionError::InstrumentNotFoundError, ))?; let inf_coordonnees = text.find("COORDONNEES DE L’EMETTEUR") .ok_or(AMFPdfError::PatternExtractionError( PatternExtractionError::CoordonneesNotFound, ))?; let mut text_cp = text.clone(); text_cp.drain(0..inf_coordonnees); let company_name = extract_pattern(&text_cp, &"NOM : ", &"\n").ok_or( AMFPdfError::PatternExtractionError(PatternExtractionError::CompanyNameNotFound), )?; let inf_aggregees_idx = text.find("INFORMATIONS AGREGEES") .ok_or(AMFPdfError::PatternExtractionError( PatternExtractionError::AggregatedInformationNotFoundError, ))?; text.drain(0..inf_aggregees_idx); let volume = extract_pattern(&text, &"VOLUME : ", &"\n") .ok_or(AMFPdfError::PatternExtractionError( PatternExtractionError::VolumeNotFoundError, ))? .replace(" ", "") .parse::() .map_err(|e| AMFPdfError::VolumeParseError(e))?; let unit_price = extract_pattern(&text, &"PRIX : ".to_string(), &"\n".to_string()) .ok_or(AMFPdfError::PatternExtractionError( PatternExtractionError::PriceNotFoundError, ))? .replace(" ", "") .chars() .take_while(|c| c.is_digit(10) || c == &'.') .collect::() .parse::() .map_err(|e| AMFPdfError::PriceParseError(e))?; Ok(AMFPdfData { company_name, isin, person, date_published, date_executed, exchange, nature, instrument, volume, unit_price, }) } } fn extract_pattern(s: &String, p1: &str, p2: &str) -> Option { let idx1 = s.find(p1)?; let idx2 = s .get(idx1 + p1.len()..) .unwrap() .find(p2) .unwrap_or(s.len()) + idx1 + p1.len(); Some(s.get(idx1 + p1.len()..idx2).unwrap().to_string()) }