Commit cbe78a1

Author: Carlos Damazio
SPAWNER-5: small refactoring (#10)

1 parent a019e94 commit cbe78a1
File tree: 2 files changed (+47, -38 lines)


src/argparser.rs

Lines changed: 2 additions & 2 deletions
@@ -63,7 +63,7 @@ impl Program {
             }
         }
 
-        Program{
+        Self {
             command,
             opts: ProgramOpts::new(args),
         }
@@ -183,7 +183,7 @@ impl ProgramOpts {
             }
         }
 
-        ProgramOpts {
+        Self {
             start_url,
             mode,
             worker
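
Both hunks apply the same idiom: inside an `impl` block, `Self` is an alias for the implementing type, so constructors stop repeating the struct name and survive a later rename. A minimal sketch with a hypothetical type (not from this repo):

```rust
struct Point {
    x: i32,
    y: i32,
}

impl Point {
    // `Self` here means `Point`; if the struct is renamed,
    // only the `struct` and `impl` lines need to change.
    fn new(x: i32, y: i32) -> Self {
        Self { x, y }
    }
}

fn main() {
    let p = Point::new(1, 2);
    assert_eq!((p.x, p.y), (1, 2));
}
```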

src/crawler.rs

Lines changed: 45 additions & 36 deletions
@@ -5,7 +5,7 @@ use std::net::ToSocketAddrs;
 use std::sync::{Mutex, Arc};
 use std::thread::{self, JoinHandle};
 
-use log::{info, error};
+use log::error;
 
 use crate::net;
 use crate::url;
@@ -20,33 +20,32 @@ pub struct CrawlerInner {
     seen: Arc<Mutex<HashMap<String, HashSet<String>>>>
 }
 
+#[derive(Clone)]
 pub struct Crawler {
     depth: u8,
     frontier: Vec<(String, String)>,
     inner: Arc<Mutex<CrawlerInner>>
 }
 
 impl Request for CrawlerInner {
-    fn request(&self, url: String, path: String) -> Result<String, Box<dyn Error + Send + Sync>> {
-        info!("Requesting data from url \"{}\"", url);
-        let ip_addr = url.to_socket_addrs()?.next().unwrap();
-        let hostname = url.split(':').next().unwrap();
-        let port = url.split(':').last().unwrap();
+    // request prepares and sends an HTTP request to the domain, requesting a determined URI
+    fn request(&self, domain: String, uri: String) -> Result<String, Box<dyn Error + Send + Sync>> {
+        let ip_addr = domain.to_socket_addrs()?.next().unwrap();
+        let hostname = domain.split(':').next().unwrap();
         let mut ssl_stream = net::build_ssl_stream(hostname, ip_addr)?;
 
         let mut headers = HashMap::new();
         headers.insert("Host", hostname);
         headers.insert("Connection", "close");
         let http_header = format!(
             "GET {} HTTP/1.1\r\n{}\r\n\r\n",
-            path,
+            uri,
             headers
                 .iter()
                 .map(|(key, val)| format!("{}: {}", key, val))
                 .collect::<Vec<_>>()
                 .join("\r\n")
         );
-        info!("{}:{}", hostname, port);
 
         let mut response = String::new();
         ssl_stream.write(http_header.as_bytes()).expect("could not write to socket");
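
For reference, the `format!` above assembles a bare HTTP/1.1 request by hand. A standalone sketch with hypothetical inputs (`hostname = "example.com"`, `uri = "/index.html"`) showing the bytes that end up on the socket:

```rust
use std::collections::HashMap;

fn main() {
    // Same construction as the hunk above, with hypothetical inputs.
    let hostname = "example.com";
    let uri = "/index.html";

    let mut headers = HashMap::new();
    headers.insert("Host", hostname);
    headers.insert("Connection", "close");

    let http_header = format!(
        "GET {} HTTP/1.1\r\n{}\r\n\r\n",
        uri,
        headers
            .iter()
            .map(|(key, val)| format!("{}: {}", key, val))
            .collect::<Vec<_>>()
            .join("\r\n")
    );

    // Note: HashMap iteration order is unspecified, so the two header
    // lines may print in either order; HTTP accepts both.
    print!("{}", http_header);
    // e.g.:
    // GET /index.html HTTP/1.1
    // Host: example.com
    // Connection: close
}
```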
@@ -59,7 +58,7 @@ impl Request for CrawlerInner {
 
 impl Crawler {
     pub fn new(start_url: String, depth: u8) -> Crawler {
-        Crawler {
+        Self {
             depth,
             frontier: Vec::new(),
             inner: Arc::new(Mutex::new(CrawlerInner {
@@ -69,10 +68,41 @@ impl Crawler {
         }
     }
 
-    // cycle through depth: depth -> frontier -> spawn requests -> extract -> store -> next depth
+    // extract_urls extracts the "base_url:port" pairs from the crawler response, along with the
+    // URIs they cite, and filters out already-seen domain and path combinations
+    fn extract_urls(&mut self, url_data: &str, frontier: &Arc<Mutex<Vec<(String, String)>>>) {
+        for (k, v) in url::extract(url_data.to_string()) {
+            if let Ok(local_self) = self.inner.lock() {
+                if let Ok(mut seen) = local_self.seen.lock() {
+                    if seen.contains_key(k.as_str()) {
+                        for url in v {
+                            if !seen.get_mut(k.as_str()).unwrap().contains(url.as_str()) {
+                                seen.get_mut(k.as_str()).unwrap().insert(url.to_string());
+                                if let Ok(mut frontier) = frontier.lock() {
+                                    frontier.push(url::format(url.to_string()));
+                                }
+                            }
+                        }
+                    } else {
+                        seen.insert(k.to_string(), v.clone());
+                        if let Ok(mut frontier) = frontier.lock() {
+                            frontier.extend(v.iter().map(|x| url::format(x.to_string())));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // evaluate_urls ranks the domains and paths referenced in each iteration
+    // TODO: evaluate domains and rank them according to the number of references in each iteration
+    fn evaluate_urls(&mut self) {}
+
+    // process executes the crawling cycle through the determined depth:
+    // process -> depth -> frontier -> spawn requests -> extract -> eval -> store -> next depth
     pub fn process(&mut self) -> Result<Vec<(String, String)>, Vec<Box<dyn Error + Send + Sync>>> {
         for i in 0..self.depth {
-            // it should be sync at first
+            // the first depth should be sync
             if i == 0 {
                 let data: String;
                 if let Ok(local_self) = self.inner.lock() {
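
The core of the new `extract_urls` is the `seen` map (domain → set of paths) that admits each domain/path pair into the frontier only once. A reduced, standalone sketch of the same dedup pattern, using hypothetical data and `HashMap::entry` in place of the `contains_key`/`get_mut` pair:

```rust
use std::collections::{HashMap, HashSet};

fn main() {
    let mut seen: HashMap<String, HashSet<String>> = HashMap::new();
    let mut frontier: Vec<String> = Vec::new();

    // Hypothetical extraction output: (domain, path) pairs.
    let extracted = [
        ("example.com:443", "/a"),
        ("example.com:443", "/a"), // duplicate, filtered out
        ("example.com:443", "/b"),
    ];

    for (domain, path) in extracted {
        // HashSet::insert returns true only for paths not seen before.
        if seen.entry(domain.to_string()).or_default().insert(path.to_string()) {
            frontier.push(path.to_string());
        }
    }

    assert_eq!(frontier, vec!["/a", "/b"]);
}
```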
@@ -103,12 +133,11 @@ impl Crawler {
             for url in self.frontier.clone() {
                 let errors = Arc::clone(&errors);
                 let frontier = Arc::clone(&new_frontier);
-                let local_self = self.inner.clone();
+                let mut crawler = self.clone();
 
-                // child thread to request each item in the frontier
                 let child_thread = thread::spawn(move || {
                     let mut url_data: String = String::from("");
-                    if let Ok(local_self) = local_self.lock() {
+                    if let Ok(local_self) = crawler.inner.lock() {
                         match local_self.request(url.0, url.1) {
                             Ok(data) => {
                                 url_data = data;
@@ -122,27 +151,7 @@ impl Crawler {
                     }
 
                     if !url_data.is_empty() {
-                        for (k, v) in url::extract(url_data.to_string()) {
-                            if let Ok(local_self) = local_self.lock() {
-                                if let Ok(mut seen) = local_self.seen.lock() {
-                                    if seen.contains_key(k.as_str()) {
-                                        for url in v {
-                                            if !seen.get_mut(k.as_str()).unwrap().contains(url.as_str()) {
-                                                seen.get_mut(k.as_str()).unwrap().insert(url.to_string());
-                                                if let Ok(mut frontier) = frontier.lock() {
-                                                    frontier.push(url::format(url.to_string()));
-                                                }
-                                            }
-                                        }
-                                    } else {
-                                        seen.insert(k.to_string(), v.clone());
-                                        if let Ok(mut frontier) = frontier.lock() {
-                                            frontier.extend(v.iter().map(|x| url::format(x.to_string())));
-                                        }
-                                    }
-                                }
-                            }
-                        }
+                        crawler.extract_urls(url_data.as_str(), &frontier);
                     }
                 });

@@ -157,7 +166,7 @@ impl Crawler {
             }
 
             if let Ok(frontier) = new_frontier.clone().lock() {
-                self.frontier.extend(frontier.clone().into_iter())
+                self.frontier.extend(frontier.clone().into_iter());
             }
         }
     }
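
A note on the new `#[derive(Clone)]` plus `self.clone()`: cloning the crawler clones the `Arc` handle, not the state behind it, so every spawned thread shares the same `inner` data while owning a handle it can move into its closure. A reduced sketch of that design, with a hypothetical counter standing in for `CrawlerInner`:

```rust
use std::sync::{Arc, Mutex};
use std::thread;

#[derive(Clone)]
struct Crawler {
    inner: Arc<Mutex<u32>>, // stands in for Arc<Mutex<CrawlerInner>>
}

fn main() {
    let crawler = Crawler {
        inner: Arc::new(Mutex::new(0)),
    };

    let handles: Vec<_> = (0..4)
        .map(|_| {
            // Cheap clone: bumps the Arc refcount, shares the same Mutex.
            let crawler = crawler.clone();
            thread::spawn(move || {
                *crawler.inner.lock().unwrap() += 1;
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }

    // All four threads mutated the one shared value.
    assert_eq!(*crawler.inner.lock().unwrap(), 4);
}
```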
