Unescapes html in PageParser.href_match_to_url

daveFNbuck · daveFNbuck · commit 8da352359a53 · 2016-01-02T10:12:23.000-08:00
PageParser breaks if link hrefs contain HTML-escaped characters (e.g. `&amp;`).
This fixes that bug by unescaping `&lt;`, `&gt;` and `&amp;` in the matched href.
diff --git a/pex/crawler.py b/pex/crawler.py
@@ -22,6 +22,15 @@
   from urlparse import urlparse
 
 
+def unescape(s):
+  """Unescapes &lt;, &gt; and &amp; in s. Taken from https://wiki.python.org/moin/EscapingHtml"""
+  s = s.replace("&lt;", "<")
+  s = s.replace("&gt;", ">")
+  # &amp; must be replaced last so that e.g. "&amp;lt;" becomes "&lt;" rather than "<":
+  s = s.replace("&amp;", "&")
+  return s
+
+
 class PageParser(object):
   """A helper class to extract and differentiate ordinary and download links from webpages."""
 
@@ -34,7 +43,7 @@ class PageParser(object):
   def href_match_to_url(cls, match):
     def pick(group):
       return '' if group is None else group
-    return pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3))
+    return unescape(pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3)))
 
   @classmethod
   def rel_links(cls, page):
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
@@ -46,6 +46,12 @@ def test_page_parser_basic():
       assert lpp("<a href='stuff'> <a href=%s>" % target) == (['stuff', href], [])
 
 
+def test_page_parser_escaped_html():
+  url = 'url?param1=val&param2=val2'
+  link = 'a href="%s"' % url.replace('&', '&amp;')  # href with '&' written as '&amp;'
+  assert lpp(link) == ([url], [])  # the url must come back with '&amp;' unescaped to '&'
+
+
 def test_page_parser_rels():
   VALID_RELS = tuple(PageParser.REL_TYPES)
   for rel in VALID_RELS + ('', ' ', 'blah'):