File tree Expand file tree Collapse file tree 2 files changed +16
-1
lines changed
Expand file tree Collapse file tree 2 files changed +16
-1
lines changed Original file line number Diff line number Diff line change 2222 from urlparse import urlparse
2323
2424
25+ def unescape (s ):
26+ """Unescapes html. Taken from https://wiki.python.org/moin/EscapingHtml"""
27+ s = s .replace ("&lt;" , "<" )
28+ s = s .replace ("&gt;" , ">" )
29+ # this has to be last:
30+ s = s .replace ("&amp;" , "&" )
31+ return s
32+
33+
2534class PageParser (object ):
2635 """A helper class to extract and differentiate ordinary and download links from webpages."""
2736
@@ -34,7 +43,7 @@ class PageParser(object):
3443 def href_match_to_url (cls , match ):
3544 def pick (group ):
3645 return '' if group is None else group
37- return pick (match .group (1 )) or pick (match .group (2 )) or pick (match .group (3 ))
46+ return unescape ( pick (match .group (1 )) or pick (match .group (2 )) or pick (match .group (3 ) ))
3847
3948 @classmethod
4049 def rel_links (cls , page ):
Original file line number Diff line number Diff line change @@ -46,6 +46,12 @@ def test_page_parser_basic():
4646 assert lpp ("<a href='stuff'> <a href=%s>" % target ) == (['stuff' , href ], [])
4747
4848
49+ def test_page_parser_escaped_html ():
50+ url = 'url?param1=val&param2=val2'
51+ link = 'a href="%s"' % url .replace ('&' , '&amp;' )
52+ assert lpp (link ) == ([url ], [])
53+
54+
4955def test_page_parser_rels ():
5056 VALID_RELS = tuple (PageParser .REL_TYPES )
5157 for rel in VALID_RELS + ('' , ' ' , 'blah' ):
You can’t perform that action at this time.
0 commit comments