Simple web crawler in Rust
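
A small parallel mirror of rolisz.ro: the crawler fetches pages with reqwest, pulls internal links out of the HTML with select, follows them level by level in parallel with rayon, and writes each page to static/<path>/index.html. The gist has two files.

Cargo.toml: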
[package]
name = "rust_crawler"
version = "0.1.0"
authors = ["Roland Szabo <[email protected]>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
reqwest = { version = "0.10", features = ["json", "blocking"] }
select = "0.4.3"
rayon = { version = "*" }
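
src/main.rs: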
use rayon::prelude::*;
use reqwest::Url;
use select::document::Document;
use select::predicate::Name;
use select::predicate::Predicate;
use std::collections::HashSet;
use std::fs;
use std::io::Error as IoErr;
use std::io::Read;
use std::path::Path;
use std::time::Instant;

#[derive(Debug)]
enum Error {
    Write { url: String, e: IoErr },
    Fetch { url: String, e: reqwest::Error },
}

type Result<T> = std::result::Result<T, Error>;

impl<S: AsRef<str>> From<(S, IoErr)> for Error {
    fn from((url, e): (S, IoErr)) -> Self {
        Error::Write {
            url: url.as_ref().to_string(),
            e,
        }
    }
}

impl<S: AsRef<str>> From<(S, reqwest::Error)> for Error {
    fn from((url, e): (S, reqwest::Error)) -> Self {
        Error::Fetch {
            url: url.as_ref().to_string(),
            e,
        }
    }
}

/// Collect the href targets of all <a> and <link> elements, keeping only
/// extension-less URLs that point back to rolisz.ro.
fn get_links_from_html(html: &str) -> HashSet<String> {
    Document::from(html)
        .find(Name("a").or(Name("link")))
        .filter_map(|n| n.attr("href"))
        .filter(has_no_extension)
        .filter_map(normalize_url)
        .collect::<HashSet<String>>()
}

/// Keep absolute URLs that stay on rolisz.ro and turn site-relative URLs into
/// absolute ones; everything else is dropped.
fn normalize_url(url: &str) -> Option<String> {
    let new_url = Url::parse(url);
    match new_url {
        Ok(new_url) => {
            if let Some("rolisz.ro") = new_url.host_str() {
                Some(url.to_string())
            } else {
                None
            }
        }
        Err(_e) => {
            // Relative URLs fail to parse on their own, so resolve them against the origin.
            if url.starts_with('/') {
                Some(format!("https://rolisz.ro{}", url))
            } else {
                None
            }
        }
    }
}

fn fetch_url(client: &reqwest::blocking::Client, url: &str) -> Result<String> {
    let mut res = client.get(url).send().map_err(|e| (url, e))?;
    println!("Status for {}: {}", url, res.status());

    let mut body = String::new();
    res.read_to_string(&mut body).map_err(|e| (url, e))?;
    Ok(body)
}

/// True for URLs without a file extension, i.e. pages rather than assets.
fn has_no_extension(url: &&str) -> bool {
    Path::new(&url).extension().is_none()
}

fn write_file(path: &str, content: &str) -> Result<()> {
    let dir = format!("static{}", path);
    fs::create_dir_all(&dir).map_err(|e| (&dir, e))?;

    let index = format!("static{}/index.html", path);
    fs::write(&index, content).map_err(|e| (&index, e))?;
    Ok(())
}

fn main() -> Result<()> {
    let now = Instant::now();
    let client = reqwest::blocking::Client::new();
    let origin_url = "https://rolisz.ro/";

    // Seed the crawl with the homepage.
    let body = fetch_url(&client, origin_url)?;
    write_file("", &body)?;

    let mut visited = HashSet::new();
    visited.insert(origin_url.to_string());
    let found_urls = get_links_from_html(&body);
    let mut new_urls = found_urls
        .difference(&visited)
        .map(|x| x.to_string())
        .collect::<HashSet<String>>();

    while !new_urls.is_empty() {
        // Fetch the current frontier in parallel, splitting successes from failures.
        let (found_urls, errors): (Vec<Result<HashSet<String>>>, Vec<_>) = new_urls
            .par_iter()
            .map(|url| -> Result<HashSet<String>> {
                let body = fetch_url(&client, url)?;
                // Strip the origin so only the path decides the output directory.
                write_file(&url[origin_url.len() - 1..], &body)?;
                let links = get_links_from_html(&body);
                println!("Visited: {} found {} links", url, links.len());
                Ok(links)
            })
            .partition(Result::is_ok);
        visited.extend(new_urls);

        // Merge the link sets and keep only URLs we have not visited yet.
        new_urls = found_urls
            .into_par_iter()
            .map(Result::unwrap)
            .reduce(HashSet::new, |mut acc, x| {
                acc.extend(x);
                acc
            })
            .difference(&visited)
            .map(|x| x.to_string())
            .collect::<HashSet<String>>();
        println!("New urls: {}", new_urls.len());

        println!(
            "Errors: {:#?}",
            errors
                .into_iter()
                .map(Result::unwrap_err)
                .collect::<Vec<Error>>()
        );
    }

    println!("Elapsed time: {}", now.elapsed().as_secs());
    Ok(())
}
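
For a quick sanity check of the two pure helpers, a test module along these lines could be appended at the bottom of the same source file (a sketch, not part of the original gist; the HTML snippet and expected links are made up for illustration):

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_url_keeps_internal_urls_and_resolves_relative_ones() {
        // Absolute URLs on the crawled domain pass through unchanged.
        assert_eq!(
            normalize_url("https://rolisz.ro/about"),
            Some("https://rolisz.ro/about".to_string())
        );
        // Site-relative URLs are resolved against the origin.
        assert_eq!(
            normalize_url("/contact"),
            Some("https://rolisz.ro/contact".to_string())
        );
        // URLs on other hosts are dropped.
        assert_eq!(normalize_url("https://example.com/"), None);
    }

    #[test]
    fn get_links_from_html_keeps_only_internal_page_links() {
        let html = r#"<a href="/posts">posts</a>
                      <a href="https://example.com/">external</a>
                      <link href="/style.css">"#;
        let links = get_links_from_html(html);

        // Only the extension-less internal link should survive the filters.
        let mut expected = HashSet::new();
        expected.insert("https://rolisz.ro/posts".to_string());
        assert_eq!(links, expected);
    }
}

Run the tests with `cargo test`; the crawler itself runs with `cargo run --release`.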