The fastest web crawler written in Rust. Maintained by @a11ywatch.
Full Changelog: https://github.com/spider-rs/spider/compare/v1.82.7...v1.83.0
Adds website.set_http_client and website.get_client for swapping out and inspecting the HTTP client used for crawling. Thanks for the help @esemeniuc.
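A minimal sketch of how the new accessors might be used, assuming spider re-exports its reqwest-based client at spider::Client and that set_http_client takes it by value (neither is confirmed by this note):
extern crate spider;
use spider::{tokio, website::Website};

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr").build().unwrap();

    // assumption: spider::Client is the reqwest-based client re-export.
    let client = spider::Client::builder()
        .user_agent("SpiderBot")
        .build()
        .unwrap();

    // assumption: set_http_client replaces the client used for the crawl.
    website.set_http_client(client);
    website.crawl().await;

    // assumption: get_client returns the client currently in use.
    let _client = website.get_client();
}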
Full Changelog: https://github.com/spider-rs/spider/compare/v1.81.2...v1.82.7
This release provides HTTP response headers behind the feature flags [headers] and [decentralized_headers].
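A minimal sketch of reading the captured headers from each page over the subscription channel, assuming the [headers] feature exposes a headers field on Page (the exact field shape is an assumption):
extern crate spider;
use spider::{tokio, website::Website};

// build with the feature enabled, e.g. cargo run --features headers
#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr").build().unwrap();
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // assumption: page.headers holds the response headers when [headers] is enabled.
            println!("{:?} -> {:?}", page.get_url(), page.headers);
        }
    });

    website.crawl().await;
}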
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.85...v1.81.2
Some major changes in this release. It is recommended to upgrade immediately.
Thanks for the help @apsaltis @emgardner
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.63...v1.80.85
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.63...v1.80.68
Example reusing the configuration for crawls.
extern crate spider;

use spider::{tokio, website::Website, configuration::Configuration};
use std::{time::Instant, io::Error};

const CAPACITY: usize = 4;
const CRAWL_LIST: [&str; CAPACITY] = [
    "https://rsseau.fr",
    "https://jeffmendez.com",
    "https://spider-rs.github.io/spider-nodejs/",
    "https://spider-rs.github.io/spider-py/",
];

#[tokio::main]
async fn main() -> Result<(), Error> {
    let config = Configuration::new()
        .with_user_agent(Some("SpiderBot"))
        .with_blacklist_url(Some(Vec::from(["https://rsseau.fr/resume".into()])))
        .with_subdomains(false)
        .with_tld(false)
        .with_redirect_limit(3)
        .with_respect_robots_txt(true)
        .with_external_domains(Some(
            Vec::from(["http://loto.rsseau.fr/"].map(|d| d.to_string())).into_iter(),
        ))
        .build();

    let mut handles = Vec::with_capacity(CAPACITY);

    for website_url in CRAWL_LIST {
        match Website::new(website_url)
            .with_config(config.to_owned())
            .build()
        {
            Ok(mut website) => {
                let handle = tokio::spawn(async move {
                    println!("Starting Crawl - {:?}", website.get_domain().inner());

                    let start = Instant::now();
                    website.crawl().await;
                    let duration = start.elapsed();

                    let links = website.get_links();

                    for link in links {
                        println!("- {:?}", link.as_ref());
                    }

                    println!(
                        "{:?} - Time elapsed in website.crawl() is: {:?} for total pages: {:?}",
                        website.get_domain().inner(),
                        duration,
                        links.len()
                    );
                });

                handles.push(handle);
            }
            Err(e) => println!("{:?}", e),
        }
    }

    for handle in handles {
        let _ = handle.await;
    }

    Ok(())
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.19...v1.80.27
Adds website.with_limit for capping the amount of pages crawled.
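A minimal sketch of the new builder option, assuming with_limit takes the maximum amount of pages to crawl:
extern crate spider;
use spider::{tokio, website::Website};

#[tokio::main]
async fn main() {
    // assumption: with_limit(10) stops the crawl after 10 pages.
    let mut website: Website = Website::new("https://rsseau.fr")
        .with_limit(10)
        .build()
        .unwrap();

    website.crawl().await;

    println!("Links found {:?}", website.get_links().len());
}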
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.19...v1.80.63
Some performance improvements, full builder method defaults, and encoding support.
Example using dynamic streaming encoding. Enable the feature flag [encoding].
extern crate spider;

use spider::{tokio, hashbrown::HashMap, website::Website};

#[tokio::main]
async fn main() {
    let mut website: Website =
        Website::new("https://hoken.kakaku.com/health_check/blood_pressure/")
            .with_budget(Some(HashMap::from([("*", 2)])))
            .build()
            .unwrap();
    let mut rx2 = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(res) = rx2.recv().await {
            println!("{:?}", res.get_url());
            println!("{:?}", res.get_html_encoded("SHIFT_JIS"));
        }
    });

    website.crawl().await;
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.15...v1.80.19
Example configuring crawl depth and redirect handling:
extern crate spider;

use spider::{tokio, website::Website, configuration::RedirectPolicy};
use std::io::Error;

#[tokio::main]
async fn main() -> Result<(), Error> {
    let mut website = Website::new("https://rsseau.fr")
        .with_depth(3)
        .with_redirect_limit(4)
        .with_redirect_policy(RedirectPolicy::Strict)
        .build()
        .unwrap();

    website.crawl().await;

    let links = website.get_links();

    for link in links {
        println!("- {:?}", link.as_ref());
    }

    println!("Total pages: {:?}", links.len());

    Ok(())
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.3...v1.80.15
Example:
Make sure to have the feature flag [cache] enabled. Storing the cache in memory can be done with the flag [cache_mem] instead of using disk space.
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // we can use the builder method to enable caching or set `website.cache` to true directly.
    let mut website: Website = Website::new("https://rsseau.fr")
        .with_caching(true)
        .build()
        .unwrap();

    website.crawl().await;

    println!("Links found {:?}", website.get_links().len());

    // the next run of website.crawl().await will be faster since content is stored on disk.
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.70.4...v1.80.3