Skip to content

Instantly share code, notes, and snippets.

@oldtune
Created June 15, 2023 08:18
Show Gist options
  • Save oldtune/ac043b97cc92577e1fcd35d5b00608eb to your computer and use it in GitHub Desktop.
Save oldtune/ac043b97cc92577e1fcd35d5b00608eb to your computer and use it in GitHub Desktop.
A small web-crawler code sample using reqwest and tl-rs
# Manifest for the `crawler` example crate.
[package]
name = "crawler"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# HTML parser: the code calls `tl::parse` and queries the resulting DOM.
tl = "0.7.7"
# HTTP client: the `gzip` feature goes with the `.gzip(true)` call on the client builder.
reqwest = { version = "0.11.18", features = ["gzip"] }
# Async runtime behind the `#[tokio::main]` attribute on `main`.
tokio = { version = "1.28.2", features = ["full"] }
# Error handling: `anyhow::Result` is the return type of `main` and `get_dom`.
anyhow = "1.0.71"
/// Program entry point (first, skeletal version).
///
/// Only builds the HTTP client so far; any error from the builder is
/// propagated to the caller through `anyhow`.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Note that the code is incomplete here
    let client = build_http_client()?;
    Ok(())
}
/// Builds the `reqwest` client used by the crawler, with gzip enabled.
///
/// # Errors
/// Returns whatever error `ClientBuilder::build` reports.
fn build_http_client() -> reqwest::Result<reqwest::Client> {
    reqwest::ClientBuilder::new().gzip(true).build()
}
/// Program entry point (second version): downloads one dictionary page and
/// dumps the raw HTML with `dbg!`.
///
/// # Errors
/// Propagates failures from building the client, sending the request, or
/// reading the response body.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // We are searching for the word "access" here.
    const WORD_URL: &str =
        "http://tratu.coviet.vn/hoc-tieng-anh/tu-dien/lac-viet/A-V/access.html";

    let http_client = build_http_client()?;
    let response = http_client.get(WORD_URL).send().await?;
    let html = response.text_with_charset("utf-8").await?;
    dbg!(html);
    Ok(())
}
async fn main() -> anyhow::Result<()> {
let http_client = build_http_client()?;
//we are searching for the word "access" here.
let html = http_client
.get("http://tratu.coviet.vn/hoc-tieng-anh/tu-dien/lac-viet/A-V/access.html")
.send()
.await?
.text_with_charset("utf-8")
.await?;
let dom = get_dom(&html)?;
let pronun = extract_pronunciation(&dom, "div.p5l.fl.cB");
println!("{:?}", pronun);
Ok(())
}
fn get_dom(html: &str) -> anyhow::Result<VDom> {
let dom = tl::parse(&html, ParserOptions::default())?;
Ok(dom)
}
/// Returns the inner HTML of the first tag matched by `query_selector`.
///
/// Yields `None` when the selector query itself yields `None`, or when none
/// of the matched handles resolves to a tag node.
fn extract_pronunciation(dom: &VDom, query_selector: &str) -> Option<String> {
    let parser = dom.parser();
    dom.query_selector(query_selector)?
        // Skip handles that do not resolve to a node in this parser.
        .filter_map(|handle| handle.get(parser))
        // Take the first node that is an actual tag.
        .find_map(|node| node.as_tag())
        .map(|tag| tag.inner_html(parser))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment