Skip to content

Instantly share code, notes, and snippets.

@rolisz
Last active February 11, 2026 07:02
Show Gist options
  • Select an option

  • Save rolisz/e691aaa3c0d508f739d43d3404d0cf8c to your computer and use it in GitHub Desktop.

Select an option

Save rolisz/e691aaa3c0d508f739d43d3404d0cf8c to your computer and use it in GitHub Desktop.
Simple web crawler in Rust
# Cargo manifest for the crawler binary.
[package]
name = "rust_crawler"
version = "0.1.0"
authors = ["Roland Szabo <rolisz@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# HTTP client; the "blocking" feature provides the `reqwest::blocking::Client` the crawler uses.
reqwest = { version = "0.10", features = ["json", "blocking"] }
# HTML parsing (`select::document::Document` and its element predicates).
select = "0.4.3"
# Parallel iterators (`par_iter`); "*" accepts any published version.
rayon = { version = "*" }
use rayon::prelude::*;
use reqwest::Url;
use select::document::Document;
use select::predicate::Name;
use select::predicate::Predicate;
use std::collections::HashSet;
use std::fs;
use std::io::Error as IoErr;
use std::io::Read;
use std::path::Path;
use std::time::Instant;
/// Failures the crawler can report, each tagged with the URL (or output path)
/// that was being processed when the failure happened.
#[derive(Debug)]
enum Error {
/// An I/O failure; produced from a `(location, IoErr)` pair.
Write { url: String, e: IoErr },
/// A failure reported by reqwest; produced from a `(url, reqwest::Error)` pair.
Fetch { url: String, e: reqwest::Error },
}
/// Result alias whose error type is fixed to this file's `Error`.
type Result<T> = std::result::Result<T, Error>;
impl<S: AsRef<str>> From<(S, IoErr)> for Error {
fn from((url, e): (S, IoErr)) -> Self {
Error::Write {
url: url.as_ref().to_string(),
e,
}
}
}
impl<S: AsRef<str>> From<(S, reqwest::Error)> for Error {
fn from((url, e): (S, reqwest::Error)) -> Self {
Error::Fetch {
url: url.as_ref().to_string(),
e,
}
}
}
fn get_links_from_html(html: &str) -> HashSet<String> {
Document::from(html)
.find(Name("a").or(Name("link")))
.filter_map(|n| n.attr("href"))
.filter(has_extension)
.filter_map(normalize_url)
.collect::<HashSet<String>>()
}
/// Turns an `href` value into an absolute URL on `rolisz.ro`, or returns `None`
/// when the link should not be crawled.
///
/// * Absolute URLs are returned unchanged when their host is `rolisz.ro`,
///   and rejected otherwise.
/// * Protocol-relative URLs (`//host/path`) are resolved against `https:` and
///   then treated like any other absolute URL.
/// * Root-relative URLs (`/path`) are prefixed with `https://rolisz.ro`.
/// * Anything else (e.g. `page.html`, `#anchor`) is rejected.
fn normalize_url(url: &str) -> Option<String> {
    match Url::parse(url) {
        Ok(parsed) => {
            if parsed.host_str() == Some("rolisz.ro") {
                Some(url.to_string())
            } else {
                None
            }
        }
        Err(_) => {
            // Relative URLs cannot be parsed without a base.
            if url.starts_with("//") {
                // A protocol-relative link names its own host, so it must not be
                // glued onto our origin. The recursive call receives a string
                // starting with "https:", so it cannot reach this branch again.
                normalize_url(&format!("https:{}", url))
            } else if url.starts_with('/') {
                Some(format!("https://rolisz.ro{}", url))
            } else {
                None
            }
        }
    }
}
/// Performs a blocking GET request for `url` and returns the response body.
///
/// The response status is printed to stdout but not otherwise inspected.
///
/// # Errors
///
/// A failure to send the request is converted from `(url, reqwest::Error)`;
/// a failure to read the body as a string is converted from `(url, IoErr)`.
fn fetch_url(client: &reqwest::blocking::Client, url: &str) -> Result<String> {
    let mut response = match client.get(url).send() {
        Ok(response) => response,
        Err(e) => return Err((url, e).into()),
    };
    println!("Status for {}: {}", url, response.status());

    let mut body = String::new();
    match response.read_to_string(&mut body) {
        Ok(_) => Ok(body),
        Err(e) => Err((url, e).into()),
    }
}
/// Filter predicate over link strings.
///
/// Despite its name, this returns `true` when the final component of `url`,
/// interpreted as a filesystem path, has **no** extension, and `false` when it
/// has one. The `&&str` parameter matches what `Iterator::filter` passes when
/// iterating over `&str` items.
fn has_extension(url: &&str) -> bool {
    match Path::new(*url).extension() {
        Some(_) => false,
        None => true,
    }
}
fn write_file(path: &str, content: &str) -> Result<()> {
let dir = format!("static{}", path);
fs::create_dir_all(format!("static{}", path)).map_err(|e| (&dir, e))?;
let index = format!("static{}/index.html", path);
fs::write(&index, content).map_err(|e| (&index, e))?;
Ok(())
}
/// Returns the part of `url` that follows its `scheme://authority` prefix,
/// starting at the first `/` (path, plus any query or fragment), or `""` when
/// there is none.
fn local_path(url: &str) -> &str {
    // Both patterns are ASCII, so the byte offsets returned by `find` are
    // always valid slice boundaries.
    let after_scheme = url.find("://").map_or(0, |idx| idx + 3);
    url[after_scheme..]
        .find('/')
        .map_or("", |slash| &url[after_scheme + slash..])
}

/// Crawls `https://rolisz.ro/` level by level and mirrors each page under `static`.
///
/// The origin page is fetched and saved first; a failure there aborts the
/// program. After that, each round fetches all not-yet-visited URLs in
/// parallel, saves each page, and gathers the links found on them. Failures
/// for individual pages are collected and printed per round instead of
/// stopping the crawl. The loop ends when a round discovers no new URLs.
fn main() -> Result<()> {
    let now = Instant::now();
    let client = reqwest::blocking::Client::new();
    let origin_url = "https://rolisz.ro/";

    let body = fetch_url(&client, origin_url)?;
    write_file("", &body)?;

    let mut visited = HashSet::new();
    visited.insert(origin_url.to_string());

    let mut new_urls = get_links_from_html(&body);
    new_urls.retain(|url| !visited.contains(url));

    while !new_urls.is_empty() {
        let results: Vec<Result<HashSet<String>>> = new_urls
            .par_iter()
            .map(|url| -> Result<HashSet<String>> {
                let body = fetch_url(&client, url)?;
                // Derive the output path from the URL's own structure rather
                // than from the origin's length: a URL with another scheme
                // (e.g. `http://`) or one shorter than the origin would
                // otherwise be sliced at the wrong offset or panic.
                write_file(local_path(url), &body)?;
                let links = get_links_from_html(&body);
                println!("Visited: {} found {} links", url, links.len());
                Ok(links)
            })
            .collect();
        visited.extend(new_urls);

        // Split successes from failures in one pass; no unwrap needed.
        let mut discovered = HashSet::new();
        let mut errors = Vec::new();
        for result in results {
            match result {
                Ok(links) => discovered.extend(links),
                Err(e) => errors.push(e),
            }
        }
        discovered.retain(|url| !visited.contains(url));
        new_urls = discovered;

        println!("New urls: {}", new_urls.len());
        println!("Errors: {:#?}", errors);
    }
    println!("Elapsed time: {}", now.elapsed().as_secs());
    Ok(())
}
@SiliconLion
Copy link
Copy Markdown

As an experiment, we asked Claude to make basically the same program, and it produced 39 files and over 8,000 lines of code before it gave up and wrote a "todo" list, lol.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment