Created
June 15, 2023 08:18
-
-
Save oldtune/ac043b97cc92577e1fcd35d5b00608eb to your computer and use it in GitHub Desktop.
crawler piece of code reqwest tl-rs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package]
name = "crawler"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
tl = "0.7.7"
reqwest = { version = "0.11.18", features = ["gzip"] }
tokio = { version = "1.28.2", features = ["full"] }
anyhow = "1.0.71"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#[tokio::main] | |
async fn main() -> anyhow::Result<()> { | |
let http_client = build_http_client()?; | |
// Note that the code is incomplete here | |
Ok(()) | |
} | |
fn build_http_client() -> reqwest::Result<reqwest::Client> { | |
let client_builder = reqwest::ClientBuilder::new().gzip(true); | |
let client = client_builder.build(); | |
client | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#[tokio::main] | |
async fn main() -> anyhow::Result<()> { | |
let http_client = build_http_client()?; | |
//we are searching for the word "access" here. | |
let html = http_client.get("http://tratu.coviet.vn/hoc-tieng-anh/tu-dien/lac-viet/A-V/access.html") | |
.send().await? | |
.text_with_charset("utf-8").await?; | |
dbg!(html); | |
Ok(()) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
async fn main() -> anyhow::Result<()> { | |
let http_client = build_http_client()?; | |
//we are searching for the word "access" here. | |
let html = http_client | |
.get("http://tratu.coviet.vn/hoc-tieng-anh/tu-dien/lac-viet/A-V/access.html") | |
.send() | |
.await? | |
.text_with_charset("utf-8") | |
.await?; | |
let dom = get_dom(&html)?; | |
let pronun = extract_pronunciation(&dom, "div.p5l.fl.cB"); | |
println!("{:?}", pronun); | |
Ok(()) | |
} | |
fn get_dom(html: &str) -> anyhow::Result<VDom> { | |
let dom = tl::parse(&html, ParserOptions::default())?; | |
Ok(dom) | |
} | |
fn extract_pronunciation(dom: &VDom, query_selector: &str) -> Option<String> { | |
let query_result = dom.query_selector(query_selector); | |
if query_result.is_none() { | |
return None; | |
} | |
for node_handle in query_result.unwrap() { | |
if let Some(node) = node_handle.get(dom.parser()) { | |
if let Some(tag) = node.as_tag() { | |
return Some(tag.inner_html(dom.parser())); | |
} | |
} | |
} | |
return None; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment