Simple web crawler in Rust
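Two files make up the project: a Cargo manifest and a single main.rs. The crawler starts from https://rolisz.ro/, extracts same-domain links from each page, fetches every batch of newly discovered URLs in parallel, and mirrors each page to disk under a static/ directory, repeating until no unvisited links remain.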
Cargo.toml:

[package]
name = "rust_crawler"
version = "0.1.0"
authors = ["Roland Szabo <[email protected]>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
reqwest = { version = "0.10", features = ["json", "blocking"] }
select = "0.4.3"
rayon = { version = "*" }
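Three crates do the heavy lifting: reqwest (with its blocking feature) provides the synchronous HTTP client, select parses the fetched HTML and pulls out href attributes, and rayon parallelizes the fetching of each batch of new URLs. Note that rayon = { version = "*" } accepts any published version; pinning a major version (e.g. "1") is generally safer for reproducible builds.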
main.rs:

use rayon::prelude::*;
use reqwest::Url;
use select::document::Document;
use select::predicate::Name;
use select::predicate::Predicate;
use std::collections::HashSet;
use std::fs;
use std::io::Error as IoErr;
use std::io::Read;
use std::path::Path;
use std::time::Instant;
// Crawler errors keep the URL that caused them, so failures can be
// reported at the end of each crawl round.
#[derive(Debug)]
enum Error {
    Write { url: String, e: IoErr },
    Fetch { url: String, e: reqwest::Error },
}

type Result<T> = std::result::Result<T, Error>;

// Converting from a (url, error) tuple lets the `?` operator attach the
// offending URL with a simple `.map_err(|e| (url, e))`.
impl<S: AsRef<str>> From<(S, IoErr)> for Error {
    fn from((url, e): (S, IoErr)) -> Self {
        Error::Write {
            url: url.as_ref().to_string(),
            e,
        }
    }
}

impl<S: AsRef<str>> From<(S, reqwest::Error)> for Error {
    fn from((url, e): (S, reqwest::Error)) -> Self {
        Error::Fetch {
            url: url.as_ref().to_string(),
            e,
        }
    }
}
// Extract all candidate links from a page: every `href` on an <a> or
// <link> tag, keeping only extension-less URLs on the crawled domain.
fn get_links_from_html(html: &str) -> HashSet<String> {
    Document::from(html)
        .find(Name("a").or(Name("link")))
        .filter_map(|n| n.attr("href"))
        .filter(has_no_extension)
        .filter_map(normalize_url)
        .collect::<HashSet<String>>()
}
// Keep absolute URLs only if they stay on rolisz.ro; turn site-relative
// paths into absolute URLs; drop everything else.
fn normalize_url(url: &str) -> Option<String> {
    match Url::parse(url) {
        Ok(new_url) => {
            if let Some("rolisz.ro") = new_url.host_str() {
                Some(url.to_string())
            } else {
                None
            }
        }
        Err(_e) => {
            // Relative URLs fail to parse without a base, so rebuild
            // them against the site root.
            if url.starts_with('/') {
                Some(format!("https://rolisz.ro{}", url))
            } else {
                None
            }
        }
    }
}
// Fetch a URL with the blocking client and return the response body.
fn fetch_url(client: &reqwest::blocking::Client, url: &str) -> Result<String> {
    let mut res = client.get(url).send().map_err(|e| (url, e))?;
    println!("Status for {}: {}", url, res.status());
    let mut body = String::new();
    res.read_to_string(&mut body).map_err(|e| (url, e))?;
    Ok(body)
}
// True when the URL's last path segment has no file extension, so the
// crawler skips images, stylesheets, and other assets. (Renamed from
// `has_extension`, which said the opposite of what it checked.)
fn has_no_extension(url: &&str) -> bool {
    Path::new(url).extension().is_none()
}
// Mirror the URL path under ./static, writing each page as
// static/<path>/index.html.
fn write_file(path: &str, content: &str) -> Result<()> {
    let dir = format!("static{}", path);
    fs::create_dir_all(&dir).map_err(|e| (&dir, e))?;
    let index = format!("{}/index.html", dir);
    fs::write(&index, content).map_err(|e| (&index, e))?;
    Ok(())
}
fn main() -> Result<()> {
    let now = Instant::now();
    let client = reqwest::blocking::Client::new();

    // Seed the crawl with the site root.
    let origin_url = "https://rolisz.ro/";
    let body = fetch_url(&client, origin_url)?;
    write_file("", &body)?;

    let mut visited = HashSet::new();
    visited.insert(origin_url.to_string());
    let found_urls = get_links_from_html(&body);
    let mut new_urls = found_urls
        .difference(&visited)
        .map(|x| x.to_string())
        .collect::<HashSet<String>>();

    // Breadth-first crawl: each round fetches the current frontier in
    // parallel, then builds the next frontier from unvisited links.
    while !new_urls.is_empty() {
        let (found_urls, errors): (Vec<Result<HashSet<String>>>, Vec<_>) = new_urls
            .par_iter()
            .map(|url| -> Result<HashSet<String>> {
                let body = fetch_url(&client, url)?;
                // Strip the scheme and host, keeping the leading slash.
                write_file(&url[origin_url.len() - 1..], &body)?;
                let links = get_links_from_html(&body);
                println!("Visited: {} found {} links", url, links.len());
                Ok(links)
            })
            .partition(Result::is_ok);
        visited.extend(new_urls);

        // Merge this round's link sets, then drop already-visited URLs.
        new_urls = found_urls
            .into_par_iter()
            .map(Result::unwrap)
            .reduce(HashSet::new, |mut acc, x| {
                acc.extend(x);
                acc
            })
            .difference(&visited)
            .map(|x| x.to_string())
            .collect::<HashSet<String>>();
        println!("New urls: {}", new_urls.len());
        println!(
            "Errors: {:#?}",
            errors
                .into_iter()
                .map(Result::unwrap_err)
                .collect::<Vec<Error>>()
        )
    }
    println!("Elapsed time: {}s", now.elapsed().as_secs());
    Ok(())
}
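To sanity-check the URL filtering, a small test module can go at the bottom of main.rs. This is a sketch: the test name and cases are illustrative, not part of the original gist.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_url_filters_and_rebuilds() {
        // Absolute URL on the crawled domain is kept as-is.
        assert_eq!(
            normalize_url("https://rolisz.ro/about"),
            Some("https://rolisz.ro/about".to_string())
        );
        // Site-relative path is rebuilt against the site root.
        assert_eq!(
            normalize_url("/about"),
            Some("https://rolisz.ro/about".to_string())
        );
        // Off-domain URL is dropped.
        assert_eq!(normalize_url("https://example.com/page"), None);
    }
}

Run the crawler with cargo run --release and the tests with cargo test; the mirrored site lands in ./static relative to the working directory.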