11defmodule Floki do
2+ alias Floki.Finder
3+ alias Floki.Parser
4+
25 @ moduledoc """
36 An HTML parser and seeker.
47
@@ -10,16 +13,18 @@ defmodule Floki do
1013
1114 Assuming that you have the following HTML:
1215
13- <!doctype html>
14- <html>
15- <body>
16- <section id="content">
17- <p class="headline">Floki</p>
18- <a href="https://github.com/philss/floki">Github page</a>
19- <span data-model="user">philss</span>
20- </section>
21- </body>
22- </html>
16+ ```html
17+ <!doctype html>
18+ <html>
19+ <body>
20+ <section id="content">
21+ <p class="headline">Floki</p>
22+ <a href="https://github.com/philss/floki">Github page</a>
23+ <span data-model="user">philss</span>
24+ </section>
25+ </body>
26+ </html>
27+ ```
2328
2429 You can perform the following queries:
2530
@@ -66,15 +71,10 @@ defmodule Floki do
6671
6772 """
6873
69- @ floki_root_node "floki"
70-
# parse/1 shown as a diff: lines marked - are the removed inline implementation,
# the line marked + is its replacement.
7174 @ spec parse ( binary ) :: html_tree
7275
7376 def parse ( html ) do
# Removed: wrapped the input in a synthetic root element named by @floki_root_node,
# parsed it with :mochiweb_html, and unwrapped the result: a single child was
# returned as-is, several children were returned as a list.
74- html = "<#{ @ floki_root_node } >#{ html } </#{ @ floki_root_node } >"
75- { @ floki_root_node , [ ] , parsed } = :mochiweb_html . parse ( html )
76-
77- if length ( parsed ) == 1 , do: hd ( parsed ) , else: parsed
# Added: parsing is delegated to Parser (aliased from Floki.Parser in the module header).
77+ Parser . parse ( html )
7878 end
7979
8080 @ doc """
@@ -101,66 +101,8 @@ defmodule Floki do
101101
# find/2 shown as a diff: the hand-written selector dispatch (lines marked -) is
# replaced by a single delegation to Finder.find/2 (lines marked +).
102102 @ spec find ( binary | html_tree , binary ) :: html_tree
103103
# Removed clause: binary input was parsed into a tree first, then searched.
104- def find ( html , selector ) when is_binary ( html ) do
105- html_tree = parse ( html )
106-
107- find ( html_tree , selector )
108- end
109-
# Removed clause: tuple selectors were matched with attr_matcher/3; matches were
# accumulated by prepending, hence the final reverse.
110- def find ( html_tree , selector ) when is_tuple ( selector ) do
111- { :ok , nodes } = find_by_selector ( selector , html_tree , & attr_matcher / 3 , { :ok , [ ] } )
112-
113- Enum . reverse ( nodes )
114- end
115-
# Removed clause: string selectors were dispatched on their shape by the cond below.
116- def find ( html_tree , selector ) do
117- tag_attr_val_regex = ~r/ (?'tag'.+)\[ (?'attr'.+)=(?'val'.+)\] /
118- attr_val_regex = ~r/ \[ (?'attr'.+)=(?'val'.+)\] /
119-
120- cond do
# Comma-separated selectors: each part is searched on the whole tree and the
# results are concatenated in order.
121- String . contains? ( selector , "," ) ->
122- selectors = String . split ( selector , "," )
123-
124- Enum . reduce selectors , [ ] , fn ( selector , acc ) ->
125- selector = String . strip ( selector )
126-
127- nodes = find ( html_tree , selector )
128-
# A single node (the id branch returns one node, not a list) is wrapped so it can be concatenated.
# NOTE(review): this relies on the rebinding inside unless being visible afterwards - confirm
# against the Elixir version in use.
129- unless is_list ( nodes ) , do: nodes = [ nodes ]
130-
131- Enum . concat ( acc , nodes )
132- end
# Whitespace-separated selectors: each step searches inside the result of the previous step.
133- String . contains? ( selector , "\s " ) ->
134- descendent_selector = String . split ( selector )
135-
136- Enum . reduce descendent_selector , html_tree , fn ( selector , tree ) ->
137- find ( tree , selector )
138- end
# Class selector: the leading dot is stripped and class_matcher/3 collects the matches.
139- String . starts_with? ( selector , "." ) ->
140- "." <> class = selector
141- { :ok , nodes } = find_by_selector ( class , html_tree , & class_matcher / 3 , { :ok , [ ] } )
142-
143- Enum . reverse ( nodes )
# Id selector: returns only the first collected node (or nil), not a list.
144- String . starts_with? ( selector , "#" ) ->
145- "#" <> id = selector
146- { _status , nodes } = find_by_selector ( id , html_tree , & id_matcher / 3 , { :ok , [ ] } )
147-
148- List . first ( nodes )
# Attribute selector: attr and val are taken from the named captures of attr_val_regex.
149- Regex . match? ( attr_val_regex , selector ) ->
150- % { "attr" => attr , "val" => val } = Regex . named_captures ( attr_val_regex , selector )
151- { :ok , nodes } = find_by_selector ( { attr , val } , html_tree , & attr_matcher / 3 , { :ok , [ ] } )
152-
153- Enum . reverse ( nodes )
# Tag plus attribute selector.
# NOTE(review): the captures are read from attr_val_regex, which defines no tag group, so the
# match on the tag key cannot succeed; the branch also looks unreachable because the
# attr_val_regex branch above is tested first - confirm.
154- Regex . match? ( tag_attr_val_regex , selector ) ->
155- % { "tag" => tag , "attr" => attr , "val" => val } = Regex . named_captures ( attr_val_regex , selector )
156- { :ok , nodes } = find_by_selector ( { tag , attr , val } , html_tree , & attr_matcher / 3 , { :ok , [ ] } )
157-
158- Enum . reverse ( nodes )
# Fallback: the selector is treated as a tag name.
159- true ->
160- { :ok , nodes } = find_by_selector ( selector , html_tree , & tag_matcher / 3 , { :ok , [ ] } )
161-
162- Enum . reverse ( nodes )
163- end
# Added: a single clause that hands both arguments to Finder (aliased from Floki.Finder).
104+ def find ( html , selector ) do
105+ Finder . find ( html , selector )
164106 end
165107
166108 @ doc """
@@ -178,7 +120,7 @@ defmodule Floki do
# attribute/3 shown as a diff: runs find/2 with the selector on html, then extracts the
# values of attribute_name from the result.
178120 def attribute ( html , selector , attribute_name ) do
179121 html
180122 |> find ( selector )
# The extraction step moves from the private attribute_values helper (line marked -)
# to Finder.attribute_values (line marked +).
181- |> attribute_values ( attribute_name )
123+ |> Finder . attribute_values ( attribute_name )
182124 end
183125
184126 @ doc """
@@ -196,11 +138,10 @@ defmodule Floki do
# attribute/2 shown as a diff.
# Binary clause: the input is parsed first. The removed line re-entered attribute/2 with the
# parsed tree; the added line calls Finder.attribute_values directly.
196138 def attribute ( html_tree , attribute_name ) when is_binary ( html_tree ) do
197139 html_tree
198140 |> parse
199- |> attribute ( attribute_name )
141+ |> Finder . attribute_values ( attribute_name )
200142 end
# Non-binary clause: the already-parsed elements are passed straight to the extraction helper,
# formerly the private attribute_values, now Finder.attribute_values.
201143 def attribute ( elements , attribute_name ) do
202- elements
203- |> attribute_values ( attribute_name )
144+ Finder . attribute_values ( elements , attribute_name )
204145 end
205146
206147 @ doc """
@@ -235,114 +176,4 @@ defmodule Floki do
235176
236177 search_strategy . get ( html_tree )
237178 end
238-
# Removed private helper (all lines marked -).
# Two-argument form: returns the first attribute tuple whose name equals attribute_name,
# or nil when Enum.find finds none. The result is the tuple itself, not a boolean.
239- defp attribute_match? ( attributes , attribute_name ) do
240- Enum . find attributes , fn ( { attr_name , _ } ) ->
241- attr_name == attribute_name
242- end
243- end
244-
# Three-argument form: additionally requires value_match? to accept the attribute value
# for selector_value.
245- defp attribute_match? ( attributes , attribute_name , selector_value ) do
246- Enum . find attributes , fn ( attribute ) ->
247- { attr_name , attr_value } = attribute
248-
249- attr_name == attribute_name && value_match? ( attr_value , selector_value )
250- end
251- end
252-
# Removed private helper (all lines marked -): walks the tree and threads an accumulator
# through matcher, a three-argument function called as matcher.(selector, node, acc).
# Empty tuple or empty list: nothing to visit, the accumulator is returned unchanged.
253- defp find_by_selector ( _selector , { } , _ , acc ) , do: acc
254- defp find_by_selector ( _selector , [ ] , _ , acc ) , do: acc
# An accumulator tagged :done stops any further matching (id_matcher produces this tag).
255- defp find_by_selector ( _selector , _ , _ , { :done , nodes } ) , do: { :done , nodes }
# Binaries in the tree are skipped.
256- defp find_by_selector ( _selector , tree , _ , acc ) when is_binary ( tree ) , do: acc
# List of nodes: visit the head, then continue with the tail using the updated accumulator.
257- defp find_by_selector ( selector , [ h | t ] , matcher , acc ) do
258- acc = find_by_selector ( selector , h , matcher , acc )
259- find_by_selector ( selector , t , matcher , acc )
260- end
261- # Ignore comments
262- defp find_by_selector ( _selector , { :comment , _comment } , _ , acc ) , do: acc
263- # Ignore XML document version
264- defp find_by_selector ( _selector , { :pi , _xml , _xml_attrs } , _ , acc ) , do: acc
# Any other node must be a three-element tuple: the matcher runs on the node itself,
# then the search recurses into the tuple's third element.
265- defp find_by_selector ( selector , node , matcher , acc ) do
266- { _ , _ , child_node } = node
267-
268- acc = matcher . ( selector , node , acc )
269-
270- find_by_selector ( selector , child_node , matcher , acc )
271- end
272-
# Removed private helper (all lines marked -): collects the values of attr_name
# from one element or a list of elements.
# A single element tuple is wrapped in a list and handled by the list clause.
273- defp attribute_values ( element , attr_name ) when is_tuple ( element ) do
274- attribute_values ( [ element ] , attr_name )
275- end
# Each element is destructured as a three-element tuple whose second element is the
# attribute list. Elements without the attribute are skipped. Values are prepended and
# reversed once at the end, so the result follows the input order.
276- defp attribute_values ( elements , attr_name ) do
277- values = Enum . reduce elements , [ ] , fn ( { _ , attributes , _ } , acc ) ->
278- case attribute_match? ( attributes , attr_name ) do
279- { _attr_name , value } ->
280- [ value | acc ]
281- _ ->
282- acc
283- end
284- end
285-
286- Enum . reverse ( values )
287- end
288-
# Removed private matcher (all lines marked -), used through find_by_selector.
# Two-element selector: the node is prepended to the accumulated nodes when one of its
# attributes named attr passes attribute_match? with value.
# NOTE(review): acc is rebound inside the if and read after it; this only has an effect on
# Elixir versions where such a rebinding is visible outside the if - confirm.
289- defp attr_matcher ( { attr , value } , node , acc ) do
290- { _ , attributes , _ } = node
291- { :ok , acc_nodes } = acc
292-
293- if attribute_match? ( attributes , attr , value ) do
294- acc = { :ok , [ node | acc_nodes ] }
295- end
296-
297- acc
298- end
# Three-element selector: same as above, but the node's tag must also equal tag_name.
299- defp attr_matcher ( { tag_name , attr , value } , node , acc ) do
300- { tag , attributes , _ } = node
301- { :ok , acc_nodes } = acc
302-
303- if tag == tag_name and attribute_match? ( attributes , attr , value ) do
304- acc = { :ok , [ node | acc_nodes ] }
305- end
306-
307- acc
308- end
309-
# Removed private matcher (all lines marked -): prepends the node to the accumulated nodes
# when its class attribute passes attribute_match? for class_name.
# NOTE(review): acc is rebound inside the if and read after it; this only has an effect on
# Elixir versions where such a rebinding is visible outside the if - confirm.
310- defp class_matcher ( class_name , node , acc ) do
311- { _ , attributes , _ } = node
312- { :ok , acc_nodes } = acc
313-
314- if attribute_match? ( attributes , "class" , class_name ) do
315- acc = { :ok , [ node | acc_nodes ] }
316- end
317-
318- acc
319- end
320-
# Removed private matcher (all lines marked -): prepends the node to the accumulated nodes
# when the first element of the node tuple equals tag_name.
# NOTE(review): acc is rebound inside the if and read after it; this only has an effect on
# Elixir versions where such a rebinding is visible outside the if - confirm.
321- defp tag_matcher ( tag_name , node , acc ) do
322- { tag , _ , _ } = node
323- { :ok , acc_nodes } = acc
324-
325- if tag == tag_name do
326- acc = { :ok , [ node | acc_nodes ] }
327- end
328-
329- acc
330- end
331-
# Removed private matcher (all lines marked -): when the node's id attribute passes
# attribute_match? for id, the accumulator is tagged :done instead of :ok. The :done tag
# is the one find_by_selector treats as a signal to stop matching.
# NOTE(review): acc is rebound inside the if and read after it; this only has an effect on
# Elixir versions where such a rebinding is visible outside the if - confirm.
332- defp id_matcher ( id , node , acc ) do
333- { _ , attributes , _ } = node
334- { :ok , acc_nodes } = acc
335-
336- if attribute_match? ( attributes , "id" , id ) do
337- acc = { :done , [ node | acc_nodes ] }
338- end
339-
340- acc
341- end
342-
# Removed private helper (all lines marked -): splits attribute_value with String.split
# (whitespace-separated words) and returns true when any word equals selector_value,
# so one value in a multi-valued attribute is enough to match.
343- defp value_match? ( attribute_value , selector_value ) do
344- attribute_value
345- |> String . split
346- |> Enum . any? ( fn ( x ) -> x == selector_value end )
347- end
348179end
0 commit comments