The fastest web crawler written in Rust. Maintained by @a11ywatch.
Example reusing the configuration for crawls.
extern crate spider;
use spider::{tokio, website::Website, configuration::Configuration};
use std::{time::Instant, io::Error};

const CAPACITY: usize = 4;
const CRAWL_LIST: [&str; CAPACITY] = [
    "https://rsseau.fr",
    "https://jeffmendez.com",
    "https://spider-rs.github.io/spider-nodejs/",
    "https://spider-rs.github.io/spider-py/",
];

#[tokio::main]
async fn main() -> Result<(), Error> {
    let config = Configuration::new()
        .with_user_agent(Some("SpiderBot"))
        .with_blacklist_url(Some(Vec::from(["https://rsseau.fr/resume".into()])))
        .with_subdomains(false)
        .with_tld(false)
        .with_redirect_limit(3)
        .with_respect_robots_txt(true)
        .with_external_domains(Some(
            Vec::from(["http://loto.rsseau.fr/"].map(|d| d.to_string())).into_iter(),
        ))
        .build();

    let mut handles = Vec::with_capacity(CAPACITY);

    for website_url in CRAWL_LIST {
        match Website::new(website_url)
            .with_config(config.to_owned())
            .build()
        {
            Ok(mut website) => {
                let handle = tokio::spawn(async move {
                    println!("Starting Crawl - {:?}", website.get_domain().inner());

                    let start = Instant::now();
                    website.crawl().await;
                    let duration = start.elapsed();

                    let links = website.get_links();

                    // iterate by reference so `links` can still be used for the total below.
                    for link in links.iter() {
                        println!("- {:?}", link.as_ref());
                    }

                    println!(
                        "{:?} - Time elapsed in website.crawl() is: {:?} for total pages: {:?}",
                        website.get_domain().inner(),
                        duration,
                        links.len()
                    );
                });

                handles.push(handle);
            }
            Err(e) => println!("{:?}", e),
        }
    }

    for handle in handles {
        let _ = handle.await;
    }

    Ok(())
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.19...v1.80.27
This release adds website.with_limit. The configuration-reuse example above applies unchanged.
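A minimal sketch of the new builder method, assuming with_limit takes the maximum number of pages to crawl; the limit value and target URL here are illustrative:

extern crate spider;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // cap the crawl at 10 pages (illustrative value).
    let mut website: Website = Website::new("https://rsseau.fr")
        .with_limit(10)
        .build()
        .unwrap();
    website.crawl().await;
    println!("Links found {:?}", website.get_links().len());
}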
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.19...v1.80.63
Some performance improvements, full builder method defaults, and encoding support.
Example using dynamic streaming encoding. Enable the feature flag [encoding].
extern crate spider;
use spider::{tokio, hashbrown::HashMap, website::Website};

#[tokio::main]
async fn main() {
    let mut website: Website =
        Website::new("https://hoken.kakaku.com/health_check/blood_pressure/")
            .with_budget(Some(HashMap::from([("*", 2)])))
            .build()
            .unwrap();
    let mut rx2 = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(res) = rx2.recv().await {
            println!("{:?}", res.get_url());
            println!("{:?}", res.get_html_encoded("SHIFT_JIS"));
        }
    });

    website.crawl().await;
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.15...v1.80.19
Example:
extern crate spider;
use spider::{tokio, website::Website, configuration::RedirectPolicy};
use std::io::Error;

#[tokio::main]
async fn main() -> Result<(), Error> {
    let mut website = Website::new("https://rsseau.fr")
        .with_depth(3)
        .with_redirect_limit(4)
        .with_redirect_policy(RedirectPolicy::Strict)
        .build()
        .unwrap();

    website.crawl().await;

    let links = website.get_links();

    // iterate by reference so `links` can still be used for the total below.
    for link in links.iter() {
        println!("- {:?}", link.as_ref());
    }

    println!("Total pages: {:?}", links.len());

    Ok(())
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.80.3...v1.80.15
Example:
Make sure to have the feature flag [cache] enabled. Storing the cache in memory can be done with the flag [cache_mem] instead of using disk space.
extern crate spider;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // we can use the builder method to enable caching or set `website.cache` to true directly.
    let mut website: Website = Website::new("https://rsseau.fr")
        .with_caching(true)
        .build()
        .unwrap();

    website.crawl().await;
    println!("Links found {:?}", website.get_links().len());
    // the next run of website.crawl().await will be faster since content is stored on disk.
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.70.4...v1.80.3
Request interception can be done by enabling the [chrome_intercept] feature flag and setting website.chrome_intercept. This blocks all resources that are not related to the domain, speeding up requests when using Chrome.
Ex:
//! `cargo run --example chrome --features chrome_intercept`
extern crate spider;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let block_images = true;
    let mut website: Website = Website::new("https://rsseau.fr")
        .with_chrome_intercept(true, block_images)
        .build()
        .unwrap();
    let mut rx2 = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx2.recv().await {
            println!("{:?}", page.get_url());
        }
    });

    website.crawl().await;
    println!("Links found {:?}", website.get_links().len());
}
Request interception can also be done using the block_images arg with the [chrome_intercept] feature flag enabled.
Ex: --block_images
Full Changelog: https://github.com/spider-rs/spider/compare/v1.60.12...v1.70.5
This release brings a new feature flag (smart), performance improvements, and fixes.
Smart mode brings the best of both worlds when crawling: it runs plain HTTP requests first and only switches to Chrome when JavaScript page rendering is required.
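A minimal sketch of smart mode, assuming the crawl_smart method from the repo examples; the target URL is illustrative:

//! `cargo run --example smart --features smart`
extern crate spider;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr");
    // crawl_smart starts with plain HTTP requests and falls back to
    // Chrome rendering only when a page requires JavaScript.
    website.crawl_smart().await;
    println!("Links found {:?}", website.get_links().len());
}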
Taking a screenshot manually can be done with the [chrome_store_page] feature flag.
extern crate spider;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr");
    let mut rx2 = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx2.recv().await {
            println!("Screenshotting: {:?}", page.get_url());
            let full_page = false;
            let omit_background = true;
            page.screenshot(full_page, omit_background).await;
            // output is stored by default to ./storage/ - use the env variable SCREENSHOT_DIRECTORY to adjust the path.
        }
    });

    website.crawl().await;
    println!("Links found {:?}", website.get_links().len());
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.50.20...v1.60.13
Adds the chrome_screenshot feature flag.
Full Changelog: https://github.com/spider-rs/spider/compare/v1.50.2...v1.50.20
You can now run a cron job at any time to sync data from the crawls. Use the cron with subscribe to handle data curation with ease.
[dependencies]
spider = { version = "1.50.0", features = ["sync", "cron"] }
extern crate spider;
use spider::website::{Website, run_cron};
use spider::tokio;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");
    // set the cron to run or use the builder pattern `website.with_cron`.
    website.cron_str = "1/5 * * * * *".into();

    let mut rx2 = website.subscribe(16).unwrap();

    let join_handle = tokio::spawn(async move {
        while let Ok(res) = rx2.recv().await {
            println!("{:?}", res.get_url());
        }
    });

    // take ownership of the website. You can also use website.run_cron, except you need to perform abort manually on handles created.
    let runner = run_cron(website).await;

    println!("Starting the Runner for 10 seconds");
    tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
    let _ = tokio::join!(runner.stop(), join_handle);
}
Full Changelog: https://github.com/spider-rs/spider/compare/v1.49.10...v1.50.5
You can set a cookie String directly with website.cookie_str; it is added to each request. Using the cookie feature also enables storing cookies that are received.
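A minimal sketch of setting a cookie, assuming the cookie feature flag is enabled; the cookie value and target URL are illustrative:

extern crate spider;
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://rsseau.fr");
    // sent with every request (illustrative value; requires the `cookie` feature).
    website.cookie_str = "sessionid=abc123".into();
    website.crawl().await;
    println!("Links found {:?}", website.get_links().len());
}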
Full Changelog: https://github.com/spider-rs/spider/compare/v1.49.10...v1.49.12