@@ -5,7 +5,7 @@ use std::net::ToSocketAddrs;
use std::sync::{Mutex, Arc};
use std::thread::{self, JoinHandle};

-use log::{info, error};
+use log::error;

use crate::net;
use crate::url;
@@ -20,33 +20,32 @@ pub struct CrawlerInner {
    seen: Arc<Mutex<HashMap<String, HashSet<String>>>>
}

+#[derive(Clone)]
pub struct Crawler {
    depth: u8,
    frontier: Vec<(String, String)>,
    inner: Arc<Mutex<CrawlerInner>>
}

impl Request for CrawlerInner {
-    fn request(&self, url: String, path: String) -> Result<String, Box<dyn Error + Send + Sync>> {
-        info!("Requesting data from url \"{}\"", url);
-        let ip_addr = url.to_socket_addrs()?.next().unwrap();
-        let hostname = url.split(':').next().unwrap();
-        let port = url.split(':').last().unwrap();
+    // request prepares and sends an HTTP request to the domain, asking for the given URI
+    fn request(&self, domain: String, uri: String) -> Result<String, Box<dyn Error + Send + Sync>> {
+        let ip_addr = domain.to_socket_addrs()?.next().unwrap();
+        let hostname = domain.split(':').next().unwrap();
        let mut ssl_stream = net::build_ssl_stream(hostname, ip_addr)?;

        let mut headers = HashMap::new();
        headers.insert("Host", hostname);
        headers.insert("Connection", "close");
        let http_header = format!(
            "GET {} HTTP/1.1\r\n{}\r\n\r\n",
-            path,
+            uri,
            headers
                .iter()
                .map(|(key, val)| format!("{}: {}", key, val))
                .collect::<Vec<_>>()
                .join("\r\n")
        );
-        info!("{}:{}", hostname, port);

        let mut response = String::new();
        ssl_stream.write(http_header.as_bytes()).expect("could not write to socket");
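
A minimal sketch, assuming the `Request` trait implemented above is declared elsewhere in
the crate roughly as follows (the trait itself is not part of this diff):

    use std::error::Error;

    pub trait Request {
        fn request(&self, domain: String, uri: String)
            -> Result<String, Box<dyn Error + Send + Sync>>;
    }

With the two headers inserted above, the format! call emits a request such as
"GET /index.html HTTP/1.1\r\nHost: example.com\r\nConnection: close\r\n\r\n" (the URI and
host here are illustrative, and the header order may vary since HashMap iteration order is
unspecified).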
@@ -59,7 +58,7 @@ impl Request for CrawlerInner {

impl Crawler {
    pub fn new(start_url: String, depth: u8) -> Crawler {
-        Crawler {
+        Self {
            depth,
            frontier: Vec::new(),
            inner: Arc::new(Mutex::new(CrawlerInner {
@@ -69,10 +68,41 @@ impl Crawler {
        }
    }

-    // cycle through depth: depth -> frontier -> spawn requests -> extract -> store -> next depth
+    // extract_urls pulls each "base_url:port" out of the crawl response, along with the
+    // URIs it cites, and filters out already-seen domain and path combinations
+    fn extract_urls(&mut self, url_data: &str, frontier: &Arc<Mutex<Vec<(String, String)>>>) {
+        for (k, v) in url::extract(url_data.to_string()) {
+            if let Ok(local_self) = self.inner.lock() {
+                if let Ok(mut seen) = local_self.seen.lock() {
+                    if seen.contains_key(k.as_str()) {
+                        for url in v {
+                            if !seen.get_mut(k.as_str()).unwrap().contains(url.as_str()) {
+                                seen.get_mut(k.as_str()).unwrap().insert(url.to_string());
+                                if let Ok(mut frontier) = frontier.lock() {
+                                    frontier.push(url::format(url.to_string()));
+                                }
+                            }
+                        }
+                    } else {
+                        seen.insert(k.to_string(), v.clone());
+                        if let Ok(mut frontier) = frontier.lock() {
+                            frontier.extend(v.iter().map(|x| url::format(x.to_string())));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // evaluate_urls ranks the domains and paths referenced in each iteration
+    // TODO: evaluate domains and rank them by the number of references in each iteration
+    fn evaluate_urls(&mut self) {}
+
+    // process runs the crawl cycle through the configured depth:
+    // process -> depth -> frontier -> spawn requests -> extract -> eval -> store -> next depth
    pub fn process(&mut self) -> Result<Vec<(String, String)>, Vec<Box<dyn Error + Send + Sync>>> {
        for i in 0..self.depth {
-            // it should be sync at first
+            // the first depth is crawled synchronously
            if i == 0 {
                let data: String;
                if let Ok(local_self) = self.inner.lock() {
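
A minimal sketch of the url-module signatures assumed by extract_urls above (the module is
not shown in this diff): extract appears to map each "host:port" key to the set of paths
cited in the response body, and format to turn one such path back into the (domain, uri)
pair pushed onto the frontier.

    use std::collections::{HashMap, HashSet};

    // hypothetical stubs matching how the crawler calls into crate::url
    pub fn extract(body: String) -> HashMap<String, HashSet<String>> {
        unimplemented!()
    }

    pub fn format(url: String) -> (String, String) {
        unimplemented!()
    }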
@@ -103,12 +133,11 @@ impl Crawler {
            for url in self.frontier.clone() {
                let errors = Arc::clone(&errors);
                let frontier = Arc::clone(&new_frontier);
-                let local_self = self.inner.clone();
+                let mut crawler = self.clone();
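+                // cloning the Crawler deep-copies the frontier Vec but only clones the
+                // Arc handle in `inner`, so every thread shares one CrawlerInner and seen map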

-                // child thread to request each item in the frontier
                let child_thread = thread::spawn(move || {
                    let mut url_data: String = String::from("");
-                    if let Ok(local_self) = local_self.lock() {
+                    if let Ok(local_self) = crawler.inner.lock() {
                        match local_self.request(url.0, url.1) {
                            Ok(data) => {
                                url_data = data;
@@ -122,27 +151,7 @@ impl Crawler {
                    }

                    if !url_data.is_empty() {
-                        for (k, v) in url::extract(url_data.to_string()) {
-                            if let Ok(local_self) = local_self.lock() {
-                                if let Ok(mut seen) = local_self.seen.lock() {
-                                    if seen.contains_key(k.as_str()) {
-                                        for url in v {
-                                            if !seen.get_mut(k.as_str()).unwrap().contains(url.as_str()) {
-                                                seen.get_mut(k.as_str()).unwrap().insert(url.to_string());
-                                                if let Ok(mut frontier) = frontier.lock() {
-                                                    frontier.push(url::format(url.to_string()));
-                                                }
-                                            }
-                                        }
-                                    } else {
-                                        seen.insert(k.to_string(), v.clone());
-                                        if let Ok(mut frontier) = frontier.lock() {
-                                            frontier.extend(v.iter().map(|x| url::format(x.to_string())));
-                                        }
-                                    }
-                                }
-                            }
-                        }
+                        crawler.extract_urls(url_data.as_str(), &frontier);
                    }
                });

@@ -157,7 +166,7 @@ impl Crawler {
            }

            if let Ok(frontier) = new_frontier.clone().lock() {
-                self.frontier.extend(frontier.clone().into_iter())
+                self.frontier.extend(frontier.clone().into_iter());
            }
        }
    }
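
A minimal usage sketch under the assumptions above; the start URL follows the "host:port"
shape that request splits on ':', and the address and depth here are hypothetical:

    fn main() {
        let mut crawler = Crawler::new("example.com:443".to_string(), 2);
        match crawler.process() {
            Ok(frontier) => println!("discovered {} (domain, uri) pairs", frontier.len()),
            Err(errors) => eprintln!("crawl finished with {} errors", errors.len()),
        }
    }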