From 50555985b60ad8666d52449fd564f1337c39fbaa Mon Sep 17 00:00:00 2001
From: daTokenizer
Date: Wed, 14 Oct 2015 20:51:39 +0300
Subject: [PATCH] Handle gzipped pages gracefully instead of just throwing an
 error or returning None

Responses whose Content-Encoding header is 'gzip' are buffered and wrapped
in a gzip.GzipFile so that reading the result yields decompressed data.
All other responses are stored unchanged.
---
 goose/network.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/goose/network.py b/goose/network.py
index 666a7d61..cd7cd2f6 100644
--- a/goose/network.py
+++ b/goose/network.py
@@ -21,6 +21,8 @@ limitations under the License.
 """
 import urllib2
+import StringIO
+import gzip
 
 
 class HtmlFetcher(object):
 
@@ -48,9 +50,16 @@ def get_html(self, url):
                         headers=self.headers)
         # do request
         try:
-            self.result = urllib2.urlopen(
+            response = urllib2.urlopen(
                             self.request,
                             timeout=self.config.http_timeout)
+            if response.info().get('Content-Encoding') == 'gzip':
+                buf = StringIO.StringIO(response.read())
+                f = gzip.GzipFile(fileobj=buf)
+                self.result = f
+            else:
+                self.result = response
+
         except Exception:
             self.result = None
 