|
|
|
|
@@ -13,47 +13,47 @@ pub struct AMFPdf {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Error)]
|
|
|
|
|
pub enum PatternExtractionError {
|
|
|
|
|
pub enum PatternExtractionNotFoundError {
|
|
|
|
|
#[error("Person not found")]
|
|
|
|
|
PersonNotFoundError,
|
|
|
|
|
Person,
|
|
|
|
|
#[error("Date executed not found")]
|
|
|
|
|
DateExecutedNotFoundError,
|
|
|
|
|
DateExecuted,
|
|
|
|
|
#[error("Date published not found")]
|
|
|
|
|
DatePublishedNotFoundError,
|
|
|
|
|
DatePublished,
|
|
|
|
|
#[error("Exchange not found")]
|
|
|
|
|
ExchangeNotFoundError,
|
|
|
|
|
Exchange,
|
|
|
|
|
#[error("Nature not found")]
|
|
|
|
|
NatureNotFoundError,
|
|
|
|
|
Nature,
|
|
|
|
|
#[error("Instrument not found")]
|
|
|
|
|
InstrumentNotFoundError,
|
|
|
|
|
Instrument,
|
|
|
|
|
#[error("Coordonnees section not found")]
|
|
|
|
|
CoordonneesNotFound,
|
|
|
|
|
Coordonnees,
|
|
|
|
|
#[error("Company name not found")]
|
|
|
|
|
CompanyNameNotFound,
|
|
|
|
|
CompanyName,
|
|
|
|
|
#[error("Aggregated informations not found")]
|
|
|
|
|
AggregatedInformationNotFoundError,
|
|
|
|
|
AggregatedInformation,
|
|
|
|
|
#[error("Volume not found")]
|
|
|
|
|
VolumeNotFoundError,
|
|
|
|
|
Volume,
|
|
|
|
|
#[error("Price not found")]
|
|
|
|
|
PriceNotFoundError,
|
|
|
|
|
PriceNotFound,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Error)]
|
|
|
|
|
pub enum AMFPdfError {
|
|
|
|
|
#[error("Download error: {0}")]
|
|
|
|
|
DownloadError(reqwest::Error),
|
|
|
|
|
Download(reqwest::Error),
|
|
|
|
|
#[error("Bytes conversion error: {0}")]
|
|
|
|
|
BytesConversionError(reqwest::Error),
|
|
|
|
|
BytesConversion(reqwest::Error),
|
|
|
|
|
#[error("Error loading pdf document: {0}")]
|
|
|
|
|
DocumentLoadError(lopdf::Error),
|
|
|
|
|
DocumentLoad(lopdf::Error),
|
|
|
|
|
#[error("Error during lopdf text extraction: {0}")]
|
|
|
|
|
PdfTextExtractionError(lopdf::Error),
|
|
|
|
|
PdfTextExtraction(lopdf::Error),
|
|
|
|
|
#[error("Error during extraction of information: {0}")]
|
|
|
|
|
PatternExtractionError(PatternExtractionError),
|
|
|
|
|
PatternExtraction(PatternExtractionNotFoundError),
|
|
|
|
|
#[error("Error parsing date: {0}")]
|
|
|
|
|
DateParseError(DateParseError),
|
|
|
|
|
DateParse(DateParseError),
|
|
|
|
|
#[error("Error parsing unit price: {0}")]
|
|
|
|
|
PriceParseError(ParseFloatError),
|
|
|
|
|
PriceParse(ParseFloatError),
|
|
|
|
|
#[error("Error parsing volume: {0}")]
|
|
|
|
|
VolumeParseError(ParseFloatError),
|
|
|
|
|
}
|
|
|
|
|
@@ -72,30 +72,29 @@ pub struct AMFPdfData {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl AMFPdf {
|
|
|
|
|
pub fn new(path: &String) -> AMFPdf {
|
|
|
|
|
let mut url = (&CONFIG.amf_documents_path).to_string();
|
|
|
|
|
url.push_str(&path);
|
|
|
|
|
pub fn new(path: &str) -> AMFPdf {
|
|
|
|
|
let mut url = CONFIG.amf_documents_path.to_string();
|
|
|
|
|
url.push_str(path);
|
|
|
|
|
AMFPdf { url }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn download(&self) -> Result<Bytes, AMFPdfError> {
|
|
|
|
|
Ok(reqwest::get(&self.url)
|
|
|
|
|
reqwest::get(&self.url)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| AMFPdfError::DownloadError(e))?
|
|
|
|
|
.map_err(AMFPdfError::Download)?
|
|
|
|
|
.bytes()
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| AMFPdfError::BytesConversionError(e))?)
|
|
|
|
|
.map_err(AMFPdfError::BytesConversion)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn extract_text(&self) -> Result<String, AMFPdfError> {
|
|
|
|
|
let bfile = self.download().await?;
|
|
|
|
|
let pdf =
|
|
|
|
|
lopdf::Document::load_mem(&bfile).map_err(|e| AMFPdfError::DocumentLoadError(e))?;
|
|
|
|
|
let pdf = lopdf::Document::load_mem(&bfile).map_err(AMFPdfError::DocumentLoad)?;
|
|
|
|
|
let mut text = "".to_string();
|
|
|
|
|
for (idx, _) in pdf.page_iter().enumerate() {
|
|
|
|
|
text.push_str(
|
|
|
|
|
&pdf.extract_text(&[idx as u32 + 1])
|
|
|
|
|
.map_err(|e| AMFPdfError::PdfTextExtractionError(e))?,
|
|
|
|
|
.map_err(AMFPdfError::PdfTextExtraction)?,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
Ok(text)
|
|
|
|
|
@@ -117,7 +116,7 @@ impl AMFPdf {
|
|
|
|
|
.skip_while(|l| l.is_empty())
|
|
|
|
|
.nth(1)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.split(" ")
|
|
|
|
|
.split(' ')
|
|
|
|
|
.next()
|
|
|
|
|
.unwrap()
|
|
|
|
|
.get(0..12)
|
|
|
|
|
@@ -130,82 +129,77 @@ impl AMFPdf {
|
|
|
|
|
|
|
|
|
|
let person = extract_pattern(
|
|
|
|
|
&text,
|
|
|
|
|
&"NOM /FONCTION DE LA PERSONNE EXERCANT DES RESPONSABILITES DIRIGEANTES OU DE LAPERSONNE ETROITEMENT LIEE :\n",
|
|
|
|
|
"NOM /FONCTION DE LA PERSONNE EXERCANT DES RESPONSABILITES DIRIGEANTES OU DE LAPERSONNE ETROITEMENT LIEE :\n",
|
|
|
|
|
"\n",
|
|
|
|
|
)
|
|
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
|
|
PatternExtractionError::PersonNotFoundError,
|
|
|
|
|
.ok_or_else(|| AMFPdfError::PatternExtraction(
|
|
|
|
|
PatternExtractionNotFoundError::Person,
|
|
|
|
|
))?;
|
|
|
|
|
|
|
|
|
|
let date_published_raw =
|
|
|
|
|
extract_pattern(&text, &"DATE DE RECEPTION DE LA NOTIFICATION : ", &"\n").ok_or(
|
|
|
|
|
AMFPdfError::PatternExtractionError(
|
|
|
|
|
PatternExtractionError::DatePublishedNotFoundError,
|
|
|
|
|
),
|
|
|
|
|
extract_pattern(&text, "DATE DE RECEPTION DE LA NOTIFICATION : ", "\n").ok_or_else(
|
|
|
|
|
|| AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::DatePublished),
|
|
|
|
|
)?;
|
|
|
|
|
|
|
|
|
|
let date_published =
|
|
|
|
|
naive_date_from_str(&date_published_raw).map_err(|e| AMFPdfError::DateParseError(e))?;
|
|
|
|
|
naive_date_from_str(&date_published_raw).map_err(AMFPdfError::DateParse)?;
|
|
|
|
|
|
|
|
|
|
let date_executed_raw = extract_pattern(&text, &"DATE DE LA TRANSACTION : ", &"\n").ok_or(
|
|
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::DateExecutedNotFoundError),
|
|
|
|
|
)?;
|
|
|
|
|
let date_executed_raw = extract_pattern(&text, "DATE DE LA TRANSACTION : ", "\n")
|
|
|
|
|
.ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::DateExecuted)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let date_executed =
|
|
|
|
|
naive_date_from_str(&date_executed_raw).map_err(|e| AMFPdfError::DateParseError(e))?;
|
|
|
|
|
naive_date_from_str(&date_executed_raw).map_err(AMFPdfError::DateParse)?;
|
|
|
|
|
|
|
|
|
|
let exchange = extract_pattern(&text, &"LIEU DE LA TRANSACTION : ", &"\n").ok_or(
|
|
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::ExchangeNotFoundError),
|
|
|
|
|
)?;
|
|
|
|
|
let exchange =
|
|
|
|
|
extract_pattern(&text, "LIEU DE LA TRANSACTION : ", "\n").ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Exchange)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let nature = extract_pattern(&text, &"NATURE DE LA TRANSACTION : ", &"\n").ok_or(
|
|
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::NatureNotFoundError),
|
|
|
|
|
)?;
|
|
|
|
|
let nature =
|
|
|
|
|
extract_pattern(&text, "NATURE DE LA TRANSACTION : ", "\n").ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Nature)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let instrument = extract_pattern(&text, &"DESCRIPTION DE L’INSTRUMENT FINANCIER : ", &"\n")
|
|
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
|
|
PatternExtractionError::InstrumentNotFoundError,
|
|
|
|
|
))?;
|
|
|
|
|
let instrument = extract_pattern(&text, "DESCRIPTION DE L’INSTRUMENT FINANCIER : ", "\n")
|
|
|
|
|
.ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Instrument)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let inf_coordonnees =
|
|
|
|
|
text.find("COORDONNEES DE L’EMETTEUR")
|
|
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
|
|
PatternExtractionError::CoordonneesNotFound,
|
|
|
|
|
))?;
|
|
|
|
|
let inf_coordonnees = text.find("COORDONNEES DE L’EMETTEUR").ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Coordonnees)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let mut text_cp = text.clone();
|
|
|
|
|
text_cp.drain(0..inf_coordonnees);
|
|
|
|
|
|
|
|
|
|
let company_name = extract_pattern(&text_cp, &"NOM : ", &"\n").ok_or(
|
|
|
|
|
AMFPdfError::PatternExtractionError(PatternExtractionError::CompanyNameNotFound),
|
|
|
|
|
)?;
|
|
|
|
|
let company_name = extract_pattern(&text_cp, "NOM : ", "\n").ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::CompanyName)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let inf_aggregees_idx =
|
|
|
|
|
text.find("INFORMATIONS AGREGEES")
|
|
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
|
|
PatternExtractionError::AggregatedInformationNotFoundError,
|
|
|
|
|
))?;
|
|
|
|
|
let inf_aggregees_idx = text.find("INFORMATIONS AGREGEES").ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::AggregatedInformation)
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
text.drain(0..inf_aggregees_idx);
|
|
|
|
|
|
|
|
|
|
let volume = extract_pattern(&text, &"VOLUME : ", &"\n")
|
|
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
|
|
PatternExtractionError::VolumeNotFoundError,
|
|
|
|
|
))?
|
|
|
|
|
.replace(" ", "")
|
|
|
|
|
let volume = extract_pattern(&text, "VOLUME : ", "\n")
|
|
|
|
|
.ok_or_else(|| AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::Volume))?
|
|
|
|
|
.replace(' ', "")
|
|
|
|
|
.parse::<f32>()
|
|
|
|
|
.map_err(|e| AMFPdfError::VolumeParseError(e))?;
|
|
|
|
|
.map_err(AMFPdfError::VolumeParseError)?;
|
|
|
|
|
|
|
|
|
|
let unit_price = extract_pattern(&text, &"PRIX : ".to_string(), &"\n".to_string())
|
|
|
|
|
.ok_or(AMFPdfError::PatternExtractionError(
|
|
|
|
|
PatternExtractionError::PriceNotFoundError,
|
|
|
|
|
))?
|
|
|
|
|
.replace(" ", "")
|
|
|
|
|
let unit_price = extract_pattern(&text, "PRIX : ", "\n")
|
|
|
|
|
.ok_or_else(|| {
|
|
|
|
|
AMFPdfError::PatternExtraction(PatternExtractionNotFoundError::PriceNotFound)
|
|
|
|
|
})?
|
|
|
|
|
.replace(' ', "")
|
|
|
|
|
.chars()
|
|
|
|
|
.take_while(|c| c.is_digit(10) || c == &'.')
|
|
|
|
|
.take_while(|c| c.is_ascii_digit() || c == &'.')
|
|
|
|
|
.collect::<String>()
|
|
|
|
|
.parse::<f32>()
|
|
|
|
|
.map_err(|e| AMFPdfError::PriceParseError(e))?;
|
|
|
|
|
.map_err(AMFPdfError::PriceParse)?;
|
|
|
|
|
|
|
|
|
|
Ok(AMFPdfData {
|
|
|
|
|
company_name,
|
|
|
|
|
|