-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathweb-scrape.rs
More file actions
93 lines (80 loc) · 2.84 KB
/
web-scrape.rs
File metadata and controls
93 lines (80 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
use blockless_sdk::bless_crawl::*;
/// This example demonstrates how to use the Blockless SDK to perform web scraping
/// using the BlessCrawl functionality.
///
/// It shows how to:
/// - Create a BlessCrawl instance with default configuration
/// - Scrape content from a single URL with the default configuration (no per-request overrides)
/// - Map links from a webpage to discover available URLs
/// - Recursively crawl a website with depth, page-limit, and path filters
/// - Handle errors and responses appropriately
fn main() {
println!("=== Blockless Web Scraping SDK Example ===\n");
// Run the three examples in order: single-page scrape, link mapping, recursive crawl.
example_scraping();
example_mapping();
example_crawling();
}
/// Example 1: scrape a single page with the default `BlessCrawl`
/// configuration and print the raw response plus its markdown content.
fn example_scraping() {
println!("--- Example 1: Basic Web Scraping ---");

let target = "https://example.com";
println!("scraping: {}...", target);

// Build the client explicitly, then issue the scrape with no
// per-request option overrides (the `None` argument).
let client = BlessCrawl::default();
let response = client.scrape(target, None).expect("Failed to scrape");

println!("response with default config: {:?}", response);
println!();

// Print the extracted markdown between visual separators.
let markdown = &response.data.content;
println!(
    "---------- markdown ----------\n{}\n------------------------------",
    markdown
);
}
/// Example 2: discover the links on a page via `map`, restricting the
/// result to internal/external links with `.html`/`.htm` extensions,
/// then print the link list and the total count.
fn example_mapping() {
println!("--- Example 2: Link Mapping/Discovery ---");

let target = "https://example.com";
println!("Mapping links from: {}", target);

// Assemble the options step by step (shadowing) rather than as one
// chained builder expression; the resulting value is identical.
let opts = MapOptions::new();
let opts = opts.with_link_types(vec!["internal".to_string(), "external".to_string()]);
let opts = opts.with_base_url(target.to_string());
let opts = opts.with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]);

let response = BlessCrawl::default()
    .map(target, Some(opts))
    .expect("Failed to map");

println!("response: {:?}", response);
println!();
println!(
    "------------ links ------------\n{:?}\n------------------------------",
    response.data.links
);
println!();
println!(
    "------------ total links ------------\n{}\n------------------------------",
    response.data.total_links
);
}
/// Example 3: recursively crawl a site with `crawl`, bounded by depth
/// and page limit, with include/exclude path filters, a request delay,
/// and limited parallelism; prints the crawled pages and their count.
fn example_crawling() {
println!("--- Example 3: Recursive Website Crawling ---");

let target = "https://example.com";
println!("Crawling website: {}", target);

// Crawl configuration: depth 2, at most 10 pages, same-origin only,
// skipping admin/API paths, 1000 ms between requests, 3 in flight.
let crawl_opts = CrawlOptions::new()
    .with_max_depth(2)
    .with_limit(10)
    .with_include_paths(vec!["/".to_string()])
    .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()])
    .with_follow_external(false)
    .with_delay_between_requests(1000)
    .with_parallel_requests(3);

let client = BlessCrawl::default();
let response = client.crawl(target, Some(crawl_opts)).expect("Failed to crawl");

println!("response: {:?}", response);
println!();
println!(
    "------------ pages ------------\n{:?}\n------------------------------",
    response.data.pages
);
println!();
println!(
    "------------ total pages ------------\n{}\n------------------------------",
    response.data.total_pages
);
}