|
|
use bytes::Bytes;
|
|
|
use chrono::NaiveDate;
|
|
|
use std::num::ParseFloatError;
|
|
|
use thiserror::Error;
|
|
|
|
|
|
use crate::amf::types::date::naive_date_from_str;
|
|
|
use crate::amf::types::date::DateParseError;
|
|
|
use crate::CONFIG;
|
|
|
|
|
|
/// Handle on a single remote PDF document, identified by its full URL.
///
/// Values are created with `AMFPdf::new`, which builds the URL from the
/// configured documents base path and a document-specific path.
#[derive(Debug, Clone)]
pub struct AMFPdf {
    /// Full URL the document is downloaded from.
    url: String,
}
|
|
|
|
|
|
#[derive(Debug, Error)]
|
|
|
pub enum PatternExtractionNotFoundError {
|
|
|
#[error("Person not found")]
|
|
|
Person,
|
|
|
#[error("Date executed not found")]
|
|
|
DateExecuted,
|
|
|
#[error("Date published not found")]
|
|
|
DatePublished,
|
|
|
#[error("Exchange not found")]
|
|
|
Exchange,
|
|
|
#[error("Nature not found")]
|
|
|
Nature,
|
|
|
#[error("Instrument not found")]
|
|
|
Instrument,
|
|
|
#[error("Coordonnees section not found")]
|
|
|
Coordonnees,
|
|
|
#[error("Company name not found")]
|
|
|
CompanyName,
|
|
|
#[error("Aggregated informations not found")]
|
|
|
AggregatedInformation,
|
|
|
#[error("Volume not found")]
|
|
|
Volume,
|
|
|
#[error("Price not found")]
|
|
|
PriceNotFound,
|
|
|
}
|
|
|
|
|
|
#[derive(Debug, Error)]
|
|
|
pub enum AMFPdfError {
|
|
|
#[error("Download error: {0}")]
|
|
|
Download(reqwest::Error),
|
|
|
#[error("Bytes conversion error: {0}")]
|
|
|
BytesConversion(reqwest::Error),
|
|
|
#[error("Error loading pdf document: {0}")]
|
|
|
DocumentLoad(lopdf::Error),
|
|
|
#[error("Error during lopdf text extraction: {0}")]
|
|
|
PdfTextExtraction(lopdf::Error),
|
|
|
#[error("Error during extraction of information: {0}")]
|
|
|
PatternExtraction(PatternExtractionNotFoundError),
|
|
|
#[error("Error parsing date: {0}")]
|
|
|
DateParse(DateParseError),
|
|
|
#[error("Error parsing unit price: {0}")]
|
|
|
PriceParse(ParseFloatError),
|
|
|
#[error("Error parsing volume: {0}")]
|
|
|
VolumeParseError(ParseFloatError),
|
|
|
}
|
|
|
|
|
|
pub struct AMFPdfData {
|
|
|
pub company_name: String,
|
|
|
pub isin: Option<String>,
|
|
|
pub person: String,
|
|
|
pub date_published: NaiveDate,
|
|
|
pub date_executed: NaiveDate,
|
|
|
pub exchange: String,
|
|
|
pub nature: String,
|
|
|
pub instrument: String,
|
|
|
pub volume: f32,
|
|
|
pub unit_price: f32,
|
|
|
}
|
|
|
|
|
|
impl AMFPdf {
|
|
|
pub fn new(path: &str) -> AMFPdf {
|
|
|
let mut url = CONFIG.amf_documents_path.to_string();
|
|
|
url.push_str(path);
|
|
|
AMFPdf { url }
|
|
|
}
|
|
|
|
|
|
async fn download(&self) -> Result<Bytes, AMFPdfError> {
|
|
|
reqwest::get(&self.url)
|
|
|
.await
|
|
|
.map_err(AMFPdfError::Download)?
|
|
|
.bytes()
|
|
|
.await
|
|
|
.map_err(AMFPdfError::BytesConversion)
|
|
|
}
|
|
|
|
|
|
async fn extract_text(&self) -> Result<String, AMFPdfError> {
|
|
|
let bfile = self.download().await?;
|
|
|
let pdf = lopdf::Document::load_mem(&bfile).map_err(AMFPdfError::DocumentLoad)?;
|
|
|
let mut text = "".to_string();
|
|
|
for (idx, _) in pdf.page_iter().enumerate() {
|
|
|
text.push_str(
|
|
|
&pdf.extract_text(&[idx as u32 + 1])
|
|
|
.map_err(AMFPdfError::PdfTextExtraction)?,
|
|
|
);
|
|
|
}
|
|
|
Ok(text)
|
|
|
}
|
|
|
|
|
|
pub async fn extract_info(&self) -> Result<AMFPdfData, AMFPdfError> {
|
|
|
let mut text = self.extract_text().await?;
|
|
|
debug!("Extracted text from document:\n{}", text);
|
|
|
|
|
|
let isin = extract_pattern(
|
|
|
&text,
|
|
|
"CODE D’IDENTIFICATION DE L’INSTRUMENT FINANCIER : ",
|
|
|
"\n",
|
|
|
)
|
|
|
.map_or_else(
|
|
|
|| {
|
|
|
Some(
|
|
|
text.lines()
|
|
|
.skip_while(|l| l.is_empty())
|
|
|
.nth(1)
|
|
|
.unwrap()
|
|
|
.split(' ')
|
|
|
.next()
|
|
|
.unwrap()
|
|
|
.get(0..12)
|
|
|
.map(|t| t.to_string()),
|
|
|
)
|
|
|
},
|
|
|
|t| Some(Some(t.get(0..12).unwrap_or(&t).to_string())),
|
|
|
)
|
|
|
.unwrap_or(None);
|
|
|
|
|
|
let person = extract_pattern(
|
|
|
&text,
|
|
|
"NOM /FONCTION DE LA PERSONNE EXERCANT DES RESPONSABILITES DIRIGEANTES OU DE LAPERSONNE ETROITEMENT LIEE :\n",
|
|
|
"\n",
|
|
|
)
|
|
|
.ok_or_else(|| AMFPdfError::PatternExtraction(
|
|
|
PatternExtractionNotFoundError::Person,
|
|
|
))?;
|
|
|
|
|
|
let date_published_raw =
|
|
|
extract_pattern(&text, "DATE DE RECEPTION DE LA NOTIFICATION : ", "\n").ok_or_else(
|
|
|
|| AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::DatePublished),
|
|
|
)?;
|
|
|
|
|
|
let date_published =
|
|
|
naive_date_from_str(&date_published_raw).map_err(AMFPdfError::DateParse)?;
|
|
|
|
|
|
let date_executed_raw = extract_pattern(&text, "DATE DE LA TRANSACTION : ", "\n")
|
|
|
.ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::DateExecuted)
|
|
|
})?;
|
|
|
|
|
|
let date_executed =
|
|
|
naive_date_from_str(&date_executed_raw).map_err(AMFPdfError::DateParse)?;
|
|
|
|
|
|
let exchange =
|
|
|
extract_pattern(&text, "LIEU DE LA TRANSACTION : ", "\n").ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Exchange)
|
|
|
})?;
|
|
|
|
|
|
let nature =
|
|
|
extract_pattern(&text, "NATURE DE LA TRANSACTION : ", "\n").ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Nature)
|
|
|
})?;
|
|
|
|
|
|
let instrument = extract_pattern(&text, "DESCRIPTION DE L’INSTRUMENT FINANCIER : ", "\n")
|
|
|
.ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Instrument)
|
|
|
})?;
|
|
|
|
|
|
let inf_coordonnees = text.find("COORDONNEES DE L’EMETTEUR").ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Coordonnees)
|
|
|
})?;
|
|
|
|
|
|
let mut text_cp = text.clone();
|
|
|
text_cp.drain(0..inf_coordonnees);
|
|
|
|
|
|
let company_name = extract_pattern(&text_cp, "NOM : ", "\n").ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::CompanyName)
|
|
|
})?;
|
|
|
|
|
|
let inf_aggregees_idx = text.find("INFORMATIONS AGREGEES").ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::AggregatedInformation)
|
|
|
})?;
|
|
|
|
|
|
text.drain(0..inf_aggregees_idx);
|
|
|
|
|
|
let volume = extract_pattern(&text, "VOLUME : ", "\n")
|
|
|
.ok_or_else(|| AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Volume))?
|
|
|
.replace(' ', "")
|
|
|
.parse::<f32>()
|
|
|
.map_err(AMFPdfError::VolumeParseError)?;
|
|
|
|
|
|
let unit_price = extract_pattern(&text, "PRIX : ", "\n")
|
|
|
.ok_or_else(|| {
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::PriceNotFound)
|
|
|
})?
|
|
|
.replace(' ', "")
|
|
|
.chars()
|
|
|
.take_while(|c| c.is_ascii_digit() || c == &'.')
|
|
|
.collect::<String>()
|
|
|
.parse::<f32>()
|
|
|
.map_err(AMFPdfError::PriceParse)?;
|
|
|
|
|
|
Ok(AMFPdfData {
|
|
|
company_name,
|
|
|
isin,
|
|
|
person,
|
|
|
date_published,
|
|
|
date_executed,
|
|
|
exchange,
|
|
|
nature,
|
|
|
instrument,
|
|
|
volume,
|
|
|
unit_price,
|
|
|
})
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/// Returns the text found in `s` between the first occurrence of `p1` and the
/// next occurrence of `p2` after it.
///
/// * Returns `None` when `p1` does not occur in `s`.
/// * When `p2` does not occur after `p1`, everything up to the end of `s` is
///   returned, so a value on the last line needs no trailing delimiter.
///
/// Neither `p1` nor `p2` is included in the result.
fn extract_pattern(s: &str, p1: &str, p2: &str) -> Option<String> {
    let (_, rest) = s.split_once(p1)?;
    // A missing terminator means the value runs to the end of the input.
    let value = rest.split_once(p2).map_or(rest, |(value, _)| value);
    Some(value.to_string())
}
|