|
|
use bytes::Bytes;
|
|
|
use chrono::NaiveDate;
|
|
|
use std::num::ParseFloatError;
|
|
|
use thiserror::Error;
|
|
|
|
|
|
use crate::amf::types::date::naive_date_from_str;
|
|
|
use crate::amf::types::date::DateParseError;
|
|
|
use crate::CONFIG;
|
|
|
|
|
|
/// Handle on a single PDF document, identified by the URL it is fetched from.
#[derive(Debug, Clone)]
pub struct AMFPdf {
    /// Full address of the document.
    url: String,
}
|
|
|
|
|
|
#[derive(Debug, Error)]
|
|
|
pub enum PatternExtractionError {
|
|
|
#[error("Person not found")]
|
|
|
PersonNotFoundError,
|
|
|
#[error("Date executed not found")]
|
|
|
DateExecutedNotFoundError,
|
|
|
#[error("Date published not found")]
|
|
|
DatePublishedNotFoundError,
|
|
|
#[error("Exchange not found")]
|
|
|
ExchangeNotFoundError,
|
|
|
#[error("Nature not found")]
|
|
|
NatureNotFoundError,
|
|
|
#[error("Instrument not found")]
|
|
|
InstrumentNotFoundError,
|
|
|
#[error("Coordonnees section not found")]
|
|
|
CoordonneesNotFound,
|
|
|
#[error("Company name not found")]
|
|
|
CompanyNameNotFound,
|
|
|
#[error("Aggregated informations not found")]
|
|
|
AggregatedInformationNotFoundError,
|
|
|
#[error("Volume not found")]
|
|
|
VolumeNotFoundError,
|
|
|
#[error("Price not found")]
|
|
|
PriceNotFoundError,
|
|
|
}
|
|
|
|
|
|
#[derive(Debug, Error)]
|
|
|
pub enum AMFPdfError {
|
|
|
#[error("Download error: {0}")]
|
|
|
DownloadError(reqwest::Error),
|
|
|
#[error("Bytes conversion error: {0}")]
|
|
|
BytesConversionError(reqwest::Error),
|
|
|
#[error("Error loading pdf document: {0}")]
|
|
|
DocumentLoadError(lopdf::Error),
|
|
|
#[error("Error during lopdf text extraction: {0}")]
|
|
|
PdfTextExtractionError(lopdf::Error),
|
|
|
#[error("Error during extraction of information: {0}")]
|
|
|
PatternExtractionError(PatternExtractionError),
|
|
|
#[error("Error parsing date: {0}")]
|
|
|
DateParseError(DateParseError),
|
|
|
#[error("Error parsing unit price: {0}")]
|
|
|
PriceParseError(ParseFloatError),
|
|
|
#[error("Error parsing volume: {0}")]
|
|
|
VolumeParseError(ParseFloatError),
|
|
|
}
|
|
|
|
|
|
pub struct AMFPdfData {
|
|
|
pub company_name: String,
|
|
|
pub isin: Option<String>,
|
|
|
pub person: String,
|
|
|
pub date_published: NaiveDate,
|
|
|
pub date_executed: NaiveDate,
|
|
|
pub exchange: String,
|
|
|
pub nature: String,
|
|
|
pub instrument: String,
|
|
|
pub volume: f32,
|
|
|
pub unit_price: f32,
|
|
|
}
|
|
|
|
|
|
impl AMFPdf {
|
|
|
pub fn new(path: &String) -> AMFPdf {
|
|
|
let mut url = (&CONFIG.amf_documents_path).to_string();
|
|
|
url.push_str(&path);
|
|
|
AMFPdf { url }
|
|
|
}
|
|
|
|
|
|
async fn download(&self) -> Result<Bytes, AMFPdfError> {
|
|
|
Ok(reqwest::get(&self.url)
|
|
|
.await
|
|
|
.map_err(|e| AMFPdfError::DownloadError(e))?
|
|
|
.bytes()
|
|
|
.await
|
|
|
.map_err(|e| AMFPdfError::BytesConversionError(e))?)
|
|
|
}
|
|
|
|
|
|
async fn extract_text(&self) -> Result<String, AMFPdfError> {
|
|
|
let bfile = self.download().await?;
|
|
|
let pdf =
|
|
|
lopdf::Document::load_mem(&bfile).map_err(|e| AMFPdfError::DocumentLoadError(e))?;
|
|
|
let mut text = "".to_string();
|
|
|
for (idx, _) in pdf.page_iter().enumerate() {
|
|
|
text.push_str(
|
|
|
&pdf.extract_text(&[idx as u32 + 1])
|
|
|
.map_err(|e| AMFPdfError::PdfTextExtractionError(e))?,
|
|
|
);
|
|
|
}
|
|
|
Ok(text)
|
|
|
}
|
|
|
|
|
|
pub async fn extract_info(&self) -> Result<AMFPdfData, AMFPdfError> {
|
|
|
let mut text = self.extract_text().await?;
|
|
|
debug!("Extracted text from document:\n{}", text);
|
|
|
|
|
|
let isin = extract_pattern(
|
|
|
&text,
|
|
|
"CODE D’IDENTIFICATION DE L’INSTRUMENT FINANCIER : ",
|
|
|
"\n",
|
|
|
)
|
|
|
.map_or_else(
|
|
|
|| {
|
|
|
Some(
|
|
|
text.lines()
|
|
|
.skip_while(|l| l.is_empty())
|
|
|
.nth(1)
|
|
|
.unwrap()
|
|
|
.split(" ")
|
|
|
.next()
|
|
|
.unwrap()
|
|
|
.get(0..12)
|
|
|
.map(|t| t.to_string()),
|
|
|
)
|
|
|
},
|
|
|
|t| Some(Some(t.get(0..12).unwrap_or(&t).to_string())),
|
|
|
)
|
|
|
.unwrap_or(None);
|
|
|
|
|
|
let person = extract_pattern(
|
|
|
&text,
|
|
|
&"NOM /FONCTION DE LA PERSONNE EXERCANT DES RESPONSABILITES DIRIGEANTES OU DE LAPERSONNE ETROITEMENT LIEE :\n",
|
|
|
"\n",
|
|
|
)
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
PatternExtractionError::PersonNotFoundError,
|
|
|
))?;
|
|
|
|
|
|
let date_published_raw =
|
|
|
extract_pattern(&text, &"DATE DE RECEPTION DE LA NOTIFICATION : ", &"\n").ok_or(
|
|
|
AMFPdfError::PatternExtractionError(
|
|
|
PatternExtractionError::DatePublishedNotFoundError,
|
|
|
),
|
|
|
)?;
|
|
|
|
|
|
let date_published =
|
|
|
naive_date_from_str(&date_published_raw).map_err(|e| AMFPdfError::DateParseError(e))?;
|
|
|
|
|
|
let date_executed_raw = extract_pattern(&text, &"DATE DE LA TRANSACTION : ", &"\n").ok_or(
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::DateExecutedNotFoundError),
|
|
|
)?;
|
|
|
|
|
|
let date_executed =
|
|
|
naive_date_from_str(&date_executed_raw).map_err(|e| AMFPdfError::DateParseError(e))?;
|
|
|
|
|
|
let exchange = extract_pattern(&text, &"LIEU DE LA TRANSACTION : ", &"\n").ok_or(
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::ExchangeNotFoundError),
|
|
|
)?;
|
|
|
|
|
|
let nature = extract_pattern(&text, &"NATURE DE LA TRANSACTION : ", &"\n").ok_or(
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::NatureNotFoundError),
|
|
|
)?;
|
|
|
|
|
|
let instrument = extract_pattern(&text, &"DESCRIPTION DE L’INSTRUMENT FINANCIER : ", &"\n")
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
PatternExtractionError::InstrumentNotFoundError,
|
|
|
))?;
|
|
|
|
|
|
let inf_coordonnees =
|
|
|
text.find("COORDONNEES DE L’EMETTEUR")
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
PatternExtractionError::CoordonneesNotFound,
|
|
|
))?;
|
|
|
|
|
|
let mut text_cp = text.clone();
|
|
|
text_cp.drain(0..inf_coordonnees);
|
|
|
|
|
|
let company_name = extract_pattern(&text_cp, &"NOM : ", &"\n").ok_or(
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::CompanyNameNotFound),
|
|
|
)?;
|
|
|
|
|
|
let inf_aggregees_idx =
|
|
|
text.find("INFORMATIONS AGREGEES")
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
PatternExtractionError::AggregatedInformationNotFoundError,
|
|
|
))?;
|
|
|
|
|
|
text.drain(0..inf_aggregees_idx);
|
|
|
|
|
|
let volume = extract_pattern(&text, &"VOLUME : ", &"\n")
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
PatternExtractionError::VolumeNotFoundError,
|
|
|
))?
|
|
|
.replace(" ", "")
|
|
|
.parse::<f32>()
|
|
|
.map_err(|e| AMFPdfError::VolumeParseError(e))?;
|
|
|
|
|
|
let unit_price = extract_pattern(&text, &"PRIX : ".to_string(), &"\n".to_string())
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
PatternExtractionError::PriceNotFoundError,
|
|
|
))?
|
|
|
.replace(" ", "")
|
|
|
.chars()
|
|
|
.take_while(|c| c.is_digit(10) || c == &'.')
|
|
|
.collect::<String>()
|
|
|
.parse::<f32>()
|
|
|
.map_err(|e| AMFPdfError::PriceParseError(e))?;
|
|
|
|
|
|
Ok(AMFPdfData {
|
|
|
company_name,
|
|
|
isin,
|
|
|
person,
|
|
|
date_published,
|
|
|
date_executed,
|
|
|
exchange,
|
|
|
nature,
|
|
|
instrument,
|
|
|
volume,
|
|
|
unit_price,
|
|
|
})
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/// Returns the text found between the first occurrence of `p1` in `s` and the
/// next occurrence of `p2` after it.
///
/// * Returns `None` when `p1` does not occur in `s`.
/// * When `p2` does not occur after `p1`, everything up to the end of `s` is
///   returned (the previous implementation panicked in that case because the
///   computed end index ran past the end of the string).
fn extract_pattern(s: &String, p1: &str, p2: &str) -> Option<String> {
    let value_start = s.find(p1)? + p1.len();
    // `find` yields a char-boundary offset and `p1` is a whole match, so both
    // slices below are always on valid boundaries.
    let rest = &s[value_start..];
    let value_end = rest.find(p2).unwrap_or(rest.len());
    Some(rest[..value_end].to_string())
}
|