You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

238 lines
7.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

use bytes::Bytes;
use chrono::NaiveDate;
use std::num::ParseFloatError;
use thiserror::Error;
use crate::amf::types::date::naive_date_from_str;
use crate::amf::types::date::DateParseError;
use crate::CONFIG;
/// A PDF document addressed by its URL.
///
/// The struct only stores the location; the document itself is fetched
/// when one of the extraction methods is called.
#[derive(Debug, Clone)]
pub struct AMFPdf {
    /// Full URL the document is downloaded from.
    url: String,
}
/// A piece of information that could not be located in the text
/// extracted from a document.
///
/// Each variant corresponds to one label (or section heading) searched
/// for while extracting the document's information.
#[derive(Debug, Error)]
pub enum PatternExtractionError {
    /// The label introducing the declaring person was not found.
    #[error("Person not found")]
    PersonNotFoundError,
    /// The transaction date label was not found.
    #[error("Date executed not found")]
    DateExecutedNotFoundError,
    /// The notification reception date label was not found.
    #[error("Date published not found")]
    DatePublishedNotFoundError,
    /// The transaction venue label was not found.
    #[error("Exchange not found")]
    ExchangeNotFoundError,
    /// The transaction nature label was not found.
    #[error("Nature not found")]
    NatureNotFoundError,
    /// The financial instrument description label was not found.
    #[error("Instrument not found")]
    InstrumentNotFoundError,
    /// The issuer contact details section heading was not found.
    #[error("Coordonnees section not found")]
    CoordonneesNotFound,
    /// The company name label was not found in the issuer section.
    #[error("Company name not found")]
    CompanyNameNotFound,
    /// The aggregated information section heading was not found.
    #[error("Aggregated informations not found")]
    AggregatedInformationNotFoundError,
    /// The volume label was not found in the aggregated section.
    #[error("Volume not found")]
    VolumeNotFoundError,
    /// The price label was not found in the aggregated section.
    #[error("Price not found")]
    PriceNotFoundError,
}
/// Any failure that can occur while downloading a document and turning
/// it into structured data.
#[derive(Debug, Error)]
pub enum AMFPdfError {
    /// The HTTP request for the document failed.
    #[error("Download error: {0}")]
    DownloadError(reqwest::Error),
    /// The response body could not be read as bytes.
    #[error("Bytes conversion error: {0}")]
    BytesConversionError(reqwest::Error),
    /// The downloaded bytes could not be loaded as a PDF document.
    #[error("Error loading pdf document: {0}")]
    DocumentLoadError(lopdf::Error),
    /// Text extraction failed for one of the pages.
    #[error("Error during lopdf text extraction: {0}")]
    PdfTextExtractionError(lopdf::Error),
    /// An expected label or section was missing from the extracted text.
    #[error("Error during extraction of information: {0}")]
    PatternExtractionError(PatternExtractionError),
    /// A date field was found but could not be parsed.
    #[error("Error parsing date: {0}")]
    DateParseError(DateParseError),
    /// The unit price was found but is not a valid floating-point number.
    #[error("Error parsing unit price: {0}")]
    PriceParseError(ParseFloatError),
    /// The volume was found but is not a valid floating-point number.
    #[error("Error parsing volume: {0}")]
    VolumeParseError(ParseFloatError),
}
/// Structured information extracted from the text of one document.
///
/// Derives `Debug`, `Clone` and `PartialEq` so values can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct AMFPdfData {
    /// Company name read from the issuer contact details section.
    pub company_name: String,
    /// Instrument identification code (at most its first 12 bytes when read
    /// from the labelled line); `None` when no code could be extracted.
    pub isin: Option<String>,
    /// Name / function of the declaring person, as written in the document.
    pub person: String,
    /// Date on which the notification was received.
    pub date_published: NaiveDate,
    /// Date of the transaction.
    pub date_executed: NaiveDate,
    /// Venue of the transaction.
    pub exchange: String,
    /// Nature of the transaction.
    pub nature: String,
    /// Description of the financial instrument.
    pub instrument: String,
    /// Volume read from the aggregated information section.
    pub volume: f32,
    /// Unit price read from the aggregated information section.
    // NOTE(review): `f32` may lose precision for large amounts — confirm this is acceptable.
    pub unit_price: f32,
}
impl AMFPdf {
pub fn new(path: &String) -> AMFPdf {
let mut url = (&CONFIG.amf_documents_path).to_string();
url.push_str(&path);
AMFPdf { url }
}
async fn download(&self) -> Result<Bytes, AMFPdfError> {
Ok(reqwest::get(&self.url)
.await
.map_err(|e| AMFPdfError::DownloadError(e))?
.bytes()
.await
.map_err(|e| AMFPdfError::BytesConversionError(e))?)
}
async fn extract_text(&self) -> Result<String, AMFPdfError> {
let bfile = self.download().await?;
let pdf =
lopdf::Document::load_mem(&bfile).map_err(|e| AMFPdfError::DocumentLoadError(e))?;
let mut text = "".to_string();
for (idx, _) in pdf.page_iter().enumerate() {
text.push_str(
&pdf.extract_text(&[idx as u32 + 1])
.map_err(|e| AMFPdfError::PdfTextExtractionError(e))?,
);
}
Ok(text)
}
pub async fn extract_info(&self) -> Result<AMFPdfData, AMFPdfError> {
let mut text = self.extract_text().await?;
debug!("Extracted text from document:\n{}", text);
let isin = extract_pattern(
&text,
"CODE DIDENTIFICATION DE LINSTRUMENT FINANCIER : ",
"\n",
)
.map_or_else(
|| {
Some(
text.lines()
.skip_while(|l| l.is_empty())
.nth(1)
.unwrap()
.split(" ")
.next()
.unwrap()
.get(0..12)
.map(|t| t.to_string()),
)
},
|t| Some(Some(t.get(0..12).unwrap_or(&t).to_string())),
)
.unwrap_or(None);
let person = extract_pattern(
&text,
&"NOM /FONCTION DE LA PERSONNE EXERCANT DES RESPONSABILITES DIRIGEANTES OU DE LAPERSONNE ETROITEMENT LIEE :\n",
"\n",
)
.ok_or(AMFPdfError::PatternExtractionError(
PatternExtractionError::PersonNotFoundError,
))?;
let date_published_raw =
extract_pattern(&text, &"DATE DE RECEPTION DE LA NOTIFICATION : ", &"\n").ok_or(
AMFPdfError::PatternExtractionError(
PatternExtractionError::DatePublishedNotFoundError,
),
)?;
let date_published =
naive_date_from_str(&date_published_raw).map_err(|e| AMFPdfError::DateParseError(e))?;
let date_executed_raw = extract_pattern(&text, &"DATE DE LA TRANSACTION : ", &"\n").ok_or(
AMFPdfError::PatternExtractionError(PatternExtractionError::DateExecutedNotFoundError),
)?;
let date_executed =
naive_date_from_str(&date_executed_raw).map_err(|e| AMFPdfError::DateParseError(e))?;
let exchange = extract_pattern(&text, &"LIEU DE LA TRANSACTION : ", &"\n").ok_or(
AMFPdfError::PatternExtractionError(PatternExtractionError::ExchangeNotFoundError),
)?;
let nature = extract_pattern(&text, &"NATURE DE LA TRANSACTION : ", &"\n").ok_or(
AMFPdfError::PatternExtractionError(PatternExtractionError::NatureNotFoundError),
)?;
let instrument = extract_pattern(&text, &"DESCRIPTION DE LINSTRUMENT FINANCIER : ", &"\n")
.ok_or(AMFPdfError::PatternExtractionError(
PatternExtractionError::InstrumentNotFoundError,
))?;
let inf_coordonnees =
text.find("COORDONNEES DE LEMETTEUR")
.ok_or(AMFPdfError::PatternExtractionError(
PatternExtractionError::CoordonneesNotFound,
))?;
let mut text_cp = text.clone();
text_cp.drain(0..inf_coordonnees);
let company_name = extract_pattern(&text_cp, &"NOM : ", &"\n").ok_or(
AMFPdfError::PatternExtractionError(PatternExtractionError::CompanyNameNotFound),
)?;
let inf_aggregees_idx =
text.find("INFORMATIONS AGREGEES")
.ok_or(AMFPdfError::PatternExtractionError(
PatternExtractionError::AggregatedInformationNotFoundError,
))?;
text.drain(0..inf_aggregees_idx);
let volume = extract_pattern(&text, &"VOLUME : ", &"\n")
.ok_or(AMFPdfError::PatternExtractionError(
PatternExtractionError::VolumeNotFoundError,
))?
.replace(" ", "")
.parse::<f32>()
.map_err(|e| AMFPdfError::VolumeParseError(e))?;
let unit_price = extract_pattern(&text, &"PRIX : ".to_string(), &"\n".to_string())
.ok_or(AMFPdfError::PatternExtractionError(
PatternExtractionError::PriceNotFoundError,
))?
.replace(" ", "")
.chars()
.take_while(|c| c.is_digit(10) || c == &'.')
.collect::<String>()
.parse::<f32>()
.map_err(|e| AMFPdfError::PriceParseError(e))?;
Ok(AMFPdfData {
company_name,
isin,
person,
date_published,
date_executed,
exchange,
nature,
instrument,
volume,
unit_price,
})
}
}
/// Returns the text found between the first occurrence of `p1` in `s` and
/// the next occurrence of `p2` after it.
///
/// * Returns `None` when `p1` does not occur in `s`.
/// * When `p2` does not occur after `p1`, everything up to the end of `s`
///   is returned.
///
/// Neither marker is included in the result.
fn extract_pattern(s: &str, p1: &str, p2: &str) -> Option<String> {
    // `find` returns a character boundary and `p1` is a whole pattern, so
    // slicing right after the match is always valid.
    let start = s.find(p1)? + p1.len();
    let rest = &s[start..];
    // The end offset is relative to `rest`, so a missing end marker means
    // "to the end of `rest`", not "to the length of the whole input".
    let end = rest.find(p2).unwrap_or(rest.len());
    Some(rest[..end].to_string())
}