cal8tor/src/info.rs

use chrono::{DateTime, TimeZone, Utc};
use regex::{Captures, Regex};
use scraper::{Html, Selector};
use std::collections::HashMap;

/// Scrape the start and resume dates from the info page.
///
/// Only the second and third `<ul>` of the document are read. Inside each of
/// them, every `<li>` whose inner HTML starts with `Début` or `Reprise` yields
/// one `(date, repetition)` pair: the date comes from the `d` capture group
/// and the repetition from the `r` capture group.
///
/// The returned map is keyed by the 1-based position of the list (`1` for the
/// second `<ul>`, `2` for the third). A `Début` item starts a fresh list of
/// pairs for its key and every `Reprise` item is appended to that list.
/// NOTE(review): the key is presumably the semester number; confirm with the
/// callers.
///
/// # Panics
///
/// Panics if the page can't be loaded, if a `Début`/`Reprise` item doesn't
/// match the expected pattern, or if its date can't be parsed.
pub async fn info() -> HashMap<usize, Vec<(DateTime<Utc>, i64)>> {
    let document = get_webpage().await.expect("Can't reach info website.");

    // Selectors
    let sel_ul = Selector::parse("ul").unwrap();
    let sel_li = Selector::parse("li").unwrap();

    // d => date
    // r => repetition
    // NOTE(review): `.+` is greedy and `r` is a single `\d`, so `r` is always
    // the last digit found in the item. A repetition of 10 or more, or any
    // digit after the repetition, would give a wrong value; confirm against
    // the real page before relying on it.
    let re = Regex::new(r"(?P<d>\d{1,2} \w+ \d{4}).+(?P<r>\d)").unwrap();

    let mut data: HashMap<usize, Vec<(DateTime<Utc>, i64)>> = HashMap::new();

    // The raw infos live in the lists at index 1 and 2 of the page
    for (i, ul) in document.select(&sel_ul).skip(1).take(2).enumerate() {
        let key = i + 1;

        for element in ul.select(&sel_li) {
            let text = element.inner_html();

            let is_start = text.starts_with("Début");
            if !is_start && !text.starts_with("Reprise") {
                continue;
            }

            let captures = re
                .captures(&text)
                .unwrap_or_else(|| panic!("Unexpected date format: {}", text));

            // Both groups are mandatory in the pattern, so indexing can't fail
            // once the regex matched
            let start_date = get_date(&captures["d"]);
            let rep: i64 = captures["r"]
                .parse()
                .expect("repetition capture should be a single digit");

            // Mutate the stored vector in place: no clone, and a `Reprise`
            // seen before any `Début` no longer panics
            let entries = data.entry(key).or_default();
            if is_start {
                // A `Début` restarts the list for this key
                entries.clear();
            }
            entries.push((start_date, rep));
        }
    }

    data
}

/// Get info webpage
///
/// The network fetch is currently disabled (its code is kept in the comment
/// below); the page is read from the local file `target/debug-sch.htm`
/// instead.
///
/// # Errors
///
/// Returns the underlying I/O error when the local file can't be read.
async fn get_webpage() -> Result<Html, Box<dyn std::error::Error>> {
    /* let url = "https://informatique.up8.edu/licence-iv/edt";

    // We don't use reqwest::get() but a client with a custom user-agent
    // in order to avoid getting rate limit
    let client = reqwest::Client::builder()
        .user_agent("bypass-rate_limit")
        .build()?;
    let html = client.get(url).send().await?.text().await?;

    // Panic on error
    crate::utils::check_errors(&html, url); */

    // Propagate a read failure through the `Result` instead of panicking
    // here, so the caller decides how to report it
    let html = std::fs::read_to_string("target/debug-sch.htm")?;

    Ok(Html::parse_document(&html))
}

/// Turn a french date to an english one
///
/// Every lowercase French month name found in `date` is replaced by its
/// lowercase English equivalent, then ` 12:00` is appended so the result
/// carries a time for the chrono parser. Text that contains no known month
/// name is returned unchanged apart from the appended time.
fn anglophonization(date: &str) -> String {
    // All twelve months. No French name is a substring of another French name
    // or of any English name, so the order of replacement doesn't matter.
    const MONTHS: [(&str, &str); 12] = [
        ("janvier", "january"),
        ("février", "february"),
        ("mars", "march"),
        ("avril", "april"),
        ("mai", "may"),
        ("juin", "june"),
        ("juillet", "july"),
        ("août", "august"),
        ("septembre", "september"),
        ("octobre", "october"),
        ("novembre", "november"),
        ("décembre", "december"),
    ];

    // Replace french by english month
    let mut english = date.to_owned();
    for &(fr, en) in MONTHS.iter() {
        if english.contains(fr) {
            english = english.replace(fr, en);
        }
    }

    // Use 12:00 for chrono parser
    format!("{} 12:00", english)
}

/// Parse a French date string into a UTC [`DateTime`].
///
/// The month name is first translated with [`anglophonization`], which also
/// appends the `12:00` time consumed by the `%H:%M` part of the format.
///
/// # Panics
///
/// Panics if the translated string doesn't match `%e %B %Y %H:%M`.
fn get_date(date: &str) -> DateTime<Utc> {
    let english = anglophonization(date);

    // The value is read as UTC and kept that way: the hour is pinned to 12h
    // and Paris 8 is in France, so this causes no problem
    let parsed = Utc.datetime_from_str(&english, "%e %B %Y %H:%M");

    parsed.unwrap()
}
give rep 2022-08-16 14:41:07 +02:00			`use chrono::{DateTime, TimeZone, Utc};`
convert string date to datetime object 2022-08-15 17:25:36 +02:00			`use regex::{Captures, Regex};`
wip fetching dates 2022-08-15 14:53:09 +02:00			`use scraper::{Html, Selector};`
refactor 2022-08-16 11:55:51 +02:00			`use std::collections::HashMap;`
move info stuff into one file 2022-08-15 12:20:16 +02:00
give rep 2022-08-16 14:41:07 +02:00			`pub async fn info() -> HashMap<usize, Vec<(DateTime<Utc>, i64)>> {`
wip fetching dates 2022-08-15 14:53:09 +02:00			`let document = get_webpage().await.expect("Can't reach info website.");`

			`// Selectors`
			`let sel_ul = Selector::parse("ul").unwrap();`
wip: parsing school range 2022-08-15 15:46:58 +02:00			`let sel_li = Selector::parse("li").unwrap();`
wip fetching dates 2022-08-15 14:53:09 +02:00
			`// Find the raw infos in html page`
wip: parsing school range 2022-08-15 15:46:58 +02:00			`let mut raw_data = Vec::new();`
wip fetching dates 2022-08-15 14:53:09 +02:00			`for (i, data) in document.select(&sel_ul).enumerate() {`
			`if [1, 2].contains(&i) {`
wip: parsing school range 2022-08-15 15:46:58 +02:00			`raw_data.push(data);`
wip fetching dates 2022-08-15 14:53:09 +02:00			`}`
			`}`
wip: parsing school range 2022-08-15 15:46:58 +02:00
convert string date to datetime object 2022-08-15 17:25:36 +02:00			`let mut data = HashMap::new();`
wip: parsing school range 2022-08-15 15:46:58 +02:00			`// d => date`
			`// r => repetition`
			`let re = Regex::new(r"(?P<d>\d{1,2} \w+ \d{4}).+(?P<r>\d)").unwrap();`
			`for (i, ul) in raw_data.into_iter().enumerate() {`
			`for element in ul.select(&sel_li) {`
			`match element.inner_html() {`
			`e if e.starts_with("Début") => {`
			`let captures = re.captures(&e).unwrap();`
convert string date to datetime object 2022-08-15 17:25:36 +02:00
turn all the date into datetime objects 2022-08-15 17:50:16 +02:00			`let start_date = get_date(captures.name("d").unwrap().as_str());`
convert string date to datetime object 2022-08-15 17:25:36 +02:00
store only the datetime objects 2022-08-15 17:58:40 +02:00			`let rep: i64 = captures.name("r").unwrap().as_str().parse().unwrap();`
convert string date to datetime object 2022-08-15 17:25:36 +02:00
give rep 2022-08-16 14:41:07 +02:00			`data.insert(i + 1, vec![(start_date, rep)]);`
wip: parsing school range 2022-08-15 15:46:58 +02:00			`}`
			`e if e.starts_with("Reprise") => {`
			`let captures = re.captures(&e).unwrap();`
			`captures.name("g");`
turn all the date into datetime objects 2022-08-15 17:50:16 +02:00
			`let start_date = get_date(captures.name("d").unwrap().as_str());`

store only the datetime objects 2022-08-15 17:58:40 +02:00			`let rep: i64 = captures.name("r").unwrap().as_str().parse().unwrap();`
turn all the date into datetime objects 2022-08-15 17:50:16 +02:00
fix typo 2022-08-15 20:12:22 +02:00			`let it = i + 1;`

			`let mut vec = data.get(&it).unwrap().to_owned();`
give rep 2022-08-16 14:41:07 +02:00			`vec.push((start_date, rep));`
store only the datetime objects 2022-08-15 17:58:40 +02:00
fix typo 2022-08-15 20:12:22 +02:00			`data.insert(it, vec);`
wip: parsing school range 2022-08-15 15:46:58 +02:00			`}`
			`_ => (),`
			`}`
			`}`
			`}`

return data 2022-08-15 19:06:36 +02:00			`data`
move info stuff into one file 2022-08-15 12:20:16 +02:00			`}`

convert string date to datetime object 2022-08-15 17:25:36 +02:00			`/// Get info webpage`
move info stuff into one file 2022-08-15 12:20:16 +02:00			`async fn get_webpage() -> Result<Html, Box<dyn std::error::Error>> {`
WIP: table 2022-08-23 16:51:02 +02:00			`/* let url = "https://informatique.up8.edu/licence-iv/edt";`
bypass ratelimit 2022-08-16 15:56:41 +02:00
add comments 2022-08-16 16:07:58 +02:00			`// We don't use reqwest::get() but a client with a custom user-agent`
			`// in order to avoid getting rate limit`
			`let client = reqwest::Client::builder()`
			`.user_agent("bypass-rate_limit")`
			`.build()?;`
bypass ratelimit 2022-08-16 15:56:41 +02:00			`let html = client.get(url).send().await?.text().await?;`
move info stuff into one file 2022-08-15 12:20:16 +02:00
add http error detection 2022-08-16 15:48:13 +02:00			`// Panic on error`
WIP: table 2022-08-23 16:51:02 +02:00			`crate::utils::check_errors(&html, url); */`

			`let html = std::fs::read_to_string("target/debug-sch.htm").unwrap();`
move info stuff into one file 2022-08-15 12:20:16 +02:00
add http error detection 2022-08-16 15:48:13 +02:00			`Ok(Html::parse_document(&html))`
move info stuff into one file 2022-08-15 12:20:16 +02:00			`}`
convert string date to datetime object 2022-08-15 17:25:36 +02:00
			`/// Turn a french date to an english one`
			`fn anglophonization(date: &str) -> String {`
			`let dico = HashMap::from([`
			`("janvier", "january"),`
			`("mars", "march"),`
			`("septembre", "september"),`
			`("novembre", "november"),`
			`]);`

			`// New regex of all the french month`
			`let re = Regex::new(&format!(`
			`"({})",`
			`dico.keys().cloned().collect::<Vec<_>>().join("\|")`
			`))`
			`.unwrap();`

			`format!(`
			`// Use 12:00 for chrono parser`
			`"{} 12:00",`
			`// Replace french by english month`
			`re.replace_all(date, \|cap: &Captures\| {`
			`match &cap[0] {`
			`month if dico.contains_key(month) => dico.get(month).unwrap(),`
			`month => panic!("Unknown month: {}", month),`
			`}`
			`})`
			`)`
			`}`
turn all the date into datetime objects 2022-08-15 17:50:16 +02:00
			`/// Turn a string to a DateTime`
return data 2022-08-15 19:06:36 +02:00			`fn get_date(date: &str) -> DateTime<Utc> {`
turn all the date into datetime objects 2022-08-15 17:50:16 +02:00			`// Use and keep UTC time, we have the hour set to 12h and`
			`// Paris 8 is in France so there is no problems`
			`Utc.datetime_from_str(&anglophonization(date), "%e %B %Y %H:%M")`
			`.unwrap()`
			`}`