Skip to content

Commit 4cde3f4

Browse files
committed
Fix Floki.find/2 when there is a non-HTML input
This commit closes issue #17. It also includes a refactoring that organizes the code: it only moves related functions into their own modules.
1 parent 125252a commit 4cde3f4

File tree

7 files changed

+226
-193
lines changed

7 files changed

+226
-193
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ This project adheres to [Semantic Versioning](http://semver.org/).
55

66
## [Unreleased][unreleased]
77

8+
- Fix `Floki.find/2` when there is a non-HTML input.
9+
It closes the [issue #17](https://github.com/philss/floki/issues/17)
10+
811
## [0.3.2] - 2015-06-27
912

1013
### Fixed

lib/floki.ex

Lines changed: 21 additions & 190 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
defmodule Floki do
2+
alias Floki.Finder
3+
alias Floki.Parser
4+
25
@moduledoc """
36
A HTML parser and seeker.
47
@@ -10,16 +13,18 @@ defmodule Floki do
1013
1114
Assuming that you have the following HTML:
1215
13-
<!doctype html>
14-
<html>
15-
<body>
16-
<section id="content">
17-
<p class="headline">Floki</p>
18-
<a href="https://github.com/philss/floki">Github page</a>
19-
<span data-model="user">philss</span>
20-
</section>
21-
</body>
22-
</html>
16+
```html
17+
<!doctype html>
18+
<html>
19+
<body>
20+
<section id="content">
21+
<p class="headline">Floki</p>
22+
<a href="https://github.com/philss/floki">Github page</a>
23+
<span data-model="user">philss</span>
24+
</section>
25+
</body>
26+
</html>
27+
```
2328
2429
You can perform the following queries:
2530
@@ -66,15 +71,10 @@ defmodule Floki do
6671
6772
"""
6873

69-
@floki_root_node "floki"
70-
7174
@spec parse(binary) :: html_tree
7275

7376
def parse(html) do
74-
html = "<#{@floki_root_node}>#{html}</#{@floki_root_node}>"
75-
{@floki_root_node, [], parsed} = :mochiweb_html.parse(html)
76-
77-
if length(parsed) == 1, do: hd(parsed), else: parsed
77+
Parser.parse(html)
7878
end
7979

8080
@doc """
@@ -101,66 +101,8 @@ defmodule Floki do
101101

102102
@spec find(binary | html_tree, binary) :: html_tree
103103

104-
def find(html, selector) when is_binary(html) do
105-
html_tree = parse(html)
106-
107-
find(html_tree, selector)
108-
end
109-
110-
def find(html_tree, selector) when is_tuple(selector) do
111-
{:ok, nodes} = find_by_selector(selector, html_tree, &attr_matcher/3, {:ok, []})
112-
113-
Enum.reverse(nodes)
114-
end
115-
116-
def find(html_tree, selector) do
117-
tag_attr_val_regex = ~r/(?'tag'.+)\[(?'attr'.+)=(?'val'.+)\]/
118-
attr_val_regex = ~r/\[(?'attr'.+)=(?'val'.+)\]/
119-
120-
cond do
121-
String.contains?(selector, ",") ->
122-
selectors = String.split(selector, ",")
123-
124-
Enum.reduce selectors, [], fn(selector, acc) ->
125-
selector = String.strip(selector)
126-
127-
nodes = find(html_tree, selector)
128-
129-
unless is_list(nodes), do: nodes = [nodes]
130-
131-
Enum.concat(acc, nodes)
132-
end
133-
String.contains?(selector, "\s") ->
134-
descendent_selector = String.split(selector)
135-
136-
Enum.reduce descendent_selector, html_tree, fn(selector, tree) ->
137-
find(tree, selector)
138-
end
139-
String.starts_with?(selector, ".") ->
140-
"." <> class = selector
141-
{:ok, nodes} = find_by_selector(class, html_tree, &class_matcher/3, {:ok, []})
142-
143-
Enum.reverse(nodes)
144-
String.starts_with?(selector, "#") ->
145-
"#" <> id = selector
146-
{_status, nodes} = find_by_selector(id, html_tree, &id_matcher/3, {:ok, []})
147-
148-
List.first(nodes)
149-
Regex.match?(attr_val_regex, selector) ->
150-
%{"attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector)
151-
{:ok, nodes} = find_by_selector({attr, val}, html_tree, &attr_matcher/3, {:ok, []})
152-
153-
Enum.reverse(nodes)
154-
Regex.match?(tag_attr_val_regex, selector) ->
155-
%{"tag" => tag, "attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector)
156-
{:ok, nodes} = find_by_selector({tag, attr, val}, html_tree, &attr_matcher/3, {:ok, []})
157-
158-
Enum.reverse(nodes)
159-
true ->
160-
{:ok, nodes} = find_by_selector(selector, html_tree, &tag_matcher/3, {:ok, []})
161-
162-
Enum.reverse(nodes)
163-
end
104+
def find(html, selector) do
105+
Finder.find(html, selector)
164106
end
165107

166108
@doc """
@@ -178,7 +120,7 @@ defmodule Floki do
178120
def attribute(html, selector, attribute_name) do
179121
html
180122
|> find(selector)
181-
|> attribute_values(attribute_name)
123+
|> Finder.attribute_values(attribute_name)
182124
end
183125

184126
@doc """
@@ -196,11 +138,10 @@ defmodule Floki do
196138
def attribute(html_tree, attribute_name) when is_binary(html_tree) do
197139
html_tree
198140
|> parse
199-
|> attribute(attribute_name)
141+
|> Finder.attribute_values(attribute_name)
200142
end
201143
def attribute(elements, attribute_name) do
202-
elements
203-
|> attribute_values(attribute_name)
144+
Finder.attribute_values(elements, attribute_name)
204145
end
205146

206147
@doc """
@@ -235,114 +176,4 @@ defmodule Floki do
235176

236177
search_strategy.get(html_tree)
237178
end
238-
239-
defp attribute_match?(attributes, attribute_name) do
240-
Enum.find attributes, fn({attr_name, _}) ->
241-
attr_name == attribute_name
242-
end
243-
end
244-
245-
defp attribute_match?(attributes, attribute_name, selector_value) do
246-
Enum.find attributes, fn(attribute) ->
247-
{attr_name, attr_value} = attribute
248-
249-
attr_name == attribute_name && value_match?(attr_value, selector_value)
250-
end
251-
end
252-
253-
defp find_by_selector(_selector, {}, _, acc), do: acc
254-
defp find_by_selector(_selector, [], _, acc), do: acc
255-
defp find_by_selector(_selector, _, _, {:done, nodes}), do: {:done, nodes}
256-
defp find_by_selector(_selector, tree, _, acc) when is_binary(tree), do: acc
257-
defp find_by_selector(selector, [h|t], matcher, acc) do
258-
acc = find_by_selector(selector, h, matcher, acc)
259-
find_by_selector(selector, t, matcher, acc)
260-
end
261-
# Ignore comments
262-
defp find_by_selector(_selector, {:comment, _comment}, _, acc), do: acc
263-
# Ignore XML document version
264-
defp find_by_selector(_selector, {:pi, _xml, _xml_attrs}, _, acc), do: acc
265-
defp find_by_selector(selector, node, matcher, acc) do
266-
{_, _, child_node} = node
267-
268-
acc = matcher.(selector, node, acc)
269-
270-
find_by_selector(selector, child_node, matcher, acc)
271-
end
272-
273-
defp attribute_values(element, attr_name) when is_tuple(element) do
274-
attribute_values([element], attr_name)
275-
end
276-
defp attribute_values(elements, attr_name) do
277-
values = Enum.reduce elements, [], fn({_, attributes, _}, acc) ->
278-
case attribute_match?(attributes, attr_name) do
279-
{_attr_name, value} ->
280-
[value|acc]
281-
_ ->
282-
acc
283-
end
284-
end
285-
286-
Enum.reverse(values)
287-
end
288-
289-
defp attr_matcher({attr, value}, node, acc) do
290-
{_, attributes, _} = node
291-
{:ok, acc_nodes} = acc
292-
293-
if attribute_match?(attributes, attr, value) do
294-
acc = {:ok, [node|acc_nodes]}
295-
end
296-
297-
acc
298-
end
299-
defp attr_matcher({tag_name, attr, value}, node, acc) do
300-
{tag, attributes, _} = node
301-
{:ok, acc_nodes} = acc
302-
303-
if tag == tag_name and attribute_match?(attributes, attr, value) do
304-
acc = {:ok, [node|acc_nodes]}
305-
end
306-
307-
acc
308-
end
309-
310-
defp class_matcher(class_name, node, acc) do
311-
{_, attributes, _} = node
312-
{:ok, acc_nodes} = acc
313-
314-
if attribute_match?(attributes, "class", class_name) do
315-
acc = {:ok, [node|acc_nodes]}
316-
end
317-
318-
acc
319-
end
320-
321-
defp tag_matcher(tag_name, node, acc) do
322-
{tag, _, _} = node
323-
{:ok, acc_nodes} = acc
324-
325-
if tag == tag_name do
326-
acc = {:ok, [node|acc_nodes]}
327-
end
328-
329-
acc
330-
end
331-
332-
defp id_matcher(id, node, acc) do
333-
{_, attributes, _} = node
334-
{:ok, acc_nodes} = acc
335-
336-
if attribute_match?(attributes, "id", id) do
337-
acc = {:done, [node|acc_nodes]}
338-
end
339-
340-
acc
341-
end
342-
343-
defp value_match?(attribute_value, selector_value) do
344-
attribute_value
345-
|> String.split
346-
|> Enum.any?(fn(x) -> x == selector_value end)
347-
end
348179
end

lib/floki/finder.ex

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
defmodule Floki.Finder do
  @moduledoc false

  # Selector engine used by `Floki.find/2` and `Floki.attribute/2,3`.
  #
  # The matcher callbacks (`attr_matcher/3`, `class_matcher/3`, `id_matcher/3`,
  # `tag_matcher/3`) and `attribute_match?/2` are not defined in this module;
  # they are expected to be provided by the `Floki.Matchers` import below.

  import Floki.Matchers

  # Finds the nodes matching `selector`.
  #
  # `html` may be a binary (it is parsed with `Floki.Parser.parse/1` first) or
  # an already parsed tree. `selector` is either a string selector or an
  # `{attr, value}` / `{tag, attr, value}` tuple.
  #
  # Returns a list of nodes in document order, except for a plain `#id`
  # selector, which returns the single matching node or `nil`.
  def find(html, selector) when is_binary(html) do
    html
    |> Floki.Parser.parse()
    |> do_find(selector)
  end

  def find(html_tree, selector), do: do_find(html_tree, selector)

  # Collects the values of the attribute `attr_name` from one element (a tuple)
  # or from a list of `{tag, attributes, children}` elements, keeping the
  # order of the elements. Elements without the attribute are skipped.
  def attribute_values(element, attr_name) when is_tuple(element) do
    attribute_values([element], attr_name)
  end

  def attribute_values(elements, attr_name) do
    values =
      Enum.reduce(elements, [], fn({_, attributes, _}, acc) ->
        case attribute_match?(attributes, attr_name) do
          {_attr_name, value} ->
            [value | acc]

          _ ->
            acc
        end
      end)

    # Values were prepended while reducing; restore element order.
    Enum.reverse(values)
  end

  # Tuple selectors are passed straight to the attribute matcher.
  defp do_find(html_tree, selector) when is_tuple(selector) do
    {:ok, nodes} = find_by_selector(selector, html_tree, &attr_matcher/3, {:ok, []})
    Enum.reverse(nodes)
  end

  # String selectors are dispatched on their shape: groups ("a, b"),
  # descendants ("a b"), ".class", "#id", "tag[attr=val]", "[attr=val]" and
  # finally a bare tag name.
  defp do_find(html_tree, selector) do
    tag_attr_val_regex = ~r/(?'tag'.+)\[(?'attr'.+)=(?'val'.+)\]/
    attr_val_regex = ~r/\[(?'attr'.+)=(?'val'.+)\]/

    cond do
      String.contains?(selector, ",") ->
        # Each sub-selector may yield a list, a single node (for "#id") or
        # `nil` (for an "#id" without a match). `List.wrap/1` normalizes all
        # three so the results can be concatenated, and it does not rely on a
        # rebinding escaping a nested block.
        selector
        |> String.split(",")
        |> Enum.flat_map(fn(sub_selector) ->
          html_tree
          |> do_find(String.strip(sub_selector))
          |> List.wrap()
        end)

      String.contains?(selector, "\s") ->
        # Descendant selector: every step searches inside the previous result.
        selector
        |> String.split()
        |> Enum.reduce(html_tree, fn(step, tree) ->
          do_find(tree, step)
        end)

      String.starts_with?(selector, ".") ->
        "." <> class = selector
        {:ok, nodes} = find_by_selector(class, html_tree, &class_matcher/3, {:ok, []})

        Enum.reverse(nodes)

      String.starts_with?(selector, "#") ->
        "#" <> id = selector
        # The status is ignored: the traversal stops early once the
        # accumulator is tagged `:done`, and stays `:ok` when nothing matched.
        {_status, nodes} = find_by_selector(id, html_tree, &id_matcher/3, {:ok, []})

        List.first(nodes)

      # "tag[attr=val]" has to be tested before "[attr=val]": the latter
      # pattern is unanchored and also matches the former, which would make
      # this branch unreachable and silently drop the tag restriction.
      Regex.match?(tag_attr_val_regex, selector) ->
        %{"tag" => tag, "attr" => attr, "val" => val} =
          Regex.named_captures(tag_attr_val_regex, selector)

        {:ok, nodes} =
          find_by_selector({tag, attr, val}, html_tree, &attr_matcher/3, {:ok, []})

        Enum.reverse(nodes)

      Regex.match?(attr_val_regex, selector) ->
        %{"attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector)
        {:ok, nodes} = find_by_selector({attr, val}, html_tree, &attr_matcher/3, {:ok, []})

        Enum.reverse(nodes)

      true ->
        {:ok, nodes} = find_by_selector(selector, html_tree, &tag_matcher/3, {:ok, []})

        Enum.reverse(nodes)
    end
  end

  # Depth-first traversal. `matcher` is called as
  # `matcher.(selector, node, acc)` for every element node and returns the
  # updated accumulator; matched nodes are accumulated in reverse order.
  defp find_by_selector(_selector, {}, _, acc), do: acc
  defp find_by_selector(_selector, [], _, acc), do: acc
  # A previous step that found nothing (e.g. "#id" without a match) hands over
  # `nil` as the tree; there is nothing to traverse.
  defp find_by_selector(_selector, nil, _, acc), do: acc
  # Once the accumulator is tagged `:done`, the rest of the tree is skipped.
  defp find_by_selector(_selector, _, _, {:done, nodes}), do: {:done, nodes}
  # Text nodes are plain binaries and never match.
  defp find_by_selector(_selector, tree, _, acc) when is_binary(tree), do: acc

  defp find_by_selector(selector, [h | t], matcher, acc) do
    acc = find_by_selector(selector, h, matcher, acc)
    find_by_selector(selector, t, matcher, acc)
  end

  # Ignore comments
  defp find_by_selector(_selector, {:comment, _comment}, _, acc), do: acc
  # Ignore XML document version
  defp find_by_selector(_selector, {:pi, _xml, _xml_attrs}, _, acc), do: acc

  defp find_by_selector(selector, node, matcher, acc) do
    {_, _, child_node} = node

    acc = matcher.(selector, node, acc)

    find_by_selector(selector, child_node, matcher, acc)
  end
end

0 commit comments

Comments
 (0)