Skip to content

Commit 8da3523

Browse files
committed
Unescapes html in PageParser.href_match_to_url
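PageParser breaks when a link contains HTML-escaped characters (for example, `&amp;` in a query string). This commit fixes that bug by unescaping the matched href in `href_match_to_url`.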
1 parent fcdee8a commit 8da3523

File tree

2 files changed

+16
-1
lines changed

2 files changed

+16
-1
lines changed

pex/crawler.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,15 @@
2222
from urlparse import urlparse
2323

2424

25+
def unescape(s):
    """Decode the HTML character references commonly found in attribute values.

    Based on https://wiki.python.org/moin/EscapingHtml, extended with the
    quote entities that HTML escapers emit for attribute values.

    Only semicolon-terminated references are decoded, so a raw query string
    such as ``a=1&lt=2`` is left untouched.

    :param s: text that may contain ``&lt;``, ``&gt;``, ``&quot;``,
        ``&apos;``, ``&#39;``, ``&#x27;`` or ``&amp;``.
    :returns: ``s`` with each of those references replaced by the character
        it stands for.
    """
    replacements = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", '"'),
        ("&apos;", "'"),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        # "&amp;" has to be last: decoding it earlier would turn a
        # double-escaped "&amp;lt;" into "&lt;" and then into "<".
        ("&amp;", "&"),
    )
    for entity, char in replacements:
        s = s.replace(entity, char)
    return s
32+
33+
2534
class PageParser(object):
2635
"""A helper class to extract and differentiate ordinary and download links from webpages."""
2736

@@ -34,7 +43,7 @@ class PageParser(object):
3443
def href_match_to_url(cls, match):
    """Return the unescaped link target captured by ``match``.

    Groups 1, 2 and 3 of ``match`` are examined in order and the first
    non-empty one is selected; a group that did not participate in the
    match counts as the empty string. The selected text is passed through
    ``unescape`` before being returned.
    """
    value = ''
    for index in (1, 2, 3):
        captured = match.group(index)
        value = '' if captured is None else captured
        if value:
            break
    return unescape(value)
3847

3948
@classmethod
4049
def rel_links(cls, page):

tests/test_crawler.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ def test_page_parser_basic():
4646
assert lpp("<a href='stuff'> <a href=%s>" % target) == (['stuff', href], [])
4747

4848

49+
def test_page_parser_escaped_html():
    """An ``&amp;`` inside an href must be reported as a plain ``&``."""
    expected = 'url?param1=val&param2=val2'
    escaped = expected.replace('&', '&amp;')
    markup = 'a href="%s"' % escaped
    assert lpp(markup) == ([expected], [])
53+
54+
4955
def test_page_parser_rels():
5056
VALID_RELS = tuple(PageParser.REL_TYPES)
5157
for rel in VALID_RELS + ('', ' ', 'blah'):

0 commit comments

Comments
 (0)